diff -Nru hypre-2.11.2/AUTOTEST/autotest.sh hypre-2.13.0/AUTOTEST/autotest.sh
--- hypre-2.11.2/AUTOTEST/autotest.sh	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/AUTOTEST/autotest.sh	1970-01-01 00:00:00.000000000 +0000
@@ -1,283 +0,0 @@
-#!/bin/sh
-#BHEADER**********************************************************************
-# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
-# Produced at the Lawrence Livermore National Laboratory.
-# This file is part of HYPRE.  See file COPYRIGHT for details.
-#
-# HYPRE is free software; you can redistribute it and/or modify it under the
-# terms of the GNU Lesser General Public License (as published by the Free
-# Software Foundation) version 2.1 dated February 1999.
-#
-# $Revision$
-#EHEADER**********************************************************************
-
-# Setup
-testing_dir=`cd ..; pwd`
-autotest_dir="$testing_dir/AUTOTEST"
-finished_dir="$testing_dir/AUTOTEST-FINISHED"
-output_dir="$testing_dir/AUTOTEST-`date +%Y.%m.%d-%a`"
-src_dir="$testing_dir/hypre/src"
-remote_dir="test-hypre"
-summary_file="SUMMARY.html"
-summary_subject="Autotest Error Summary `date +%Y-%m-%d`"
-email_list="rfalgout@llnl.gov, tzanio@llnl.gov, umyang@llnl.gov, schroder2@llnl.gov, oseikuffuor1@llnl.gov, wang84@llnl.gov, li50@llnl.gov"
-
-# Main loop
-test_opts=""
-while [ "$*" ]
-do
-   case $1 in
-      -h|-help)
-         cat <<EOF
-
-   $0 [options] [-checkout | -dist M.mm.rr | -{test1} ... | -summary]
-
-   where:
-
-      -checkout           Checks out the repository and updates the current AUTOTEST
-                          directory.  Should be called before running tests.
-      -dist               Use the hypre release M.mm.rr (e.g. 2.4.0b). This is an
-                          alternative to -checkout and is used by testdist.sh.
-      -{test}             Runs the indicated tests in sequence, which are associated
-                          with specific machine names (e.g., -tux149, -alc, -up).
-      -summary            Generates a summary file of passed, pending, failed tests.
-      -summary-email      Same as -summary, but also sends developers an email.
-      -summary-copy {dir} Same as -summary, but also copies to remote test {dir}.
-
-   with options:
-
-      -h|-help       prints this usage information and exits
-      -t|-trace      echo each command
-
-   The main purpose of this script is to organize the automatic testing process
-   and to ensure that all related files have the appropriate permissions.
-
-   Example usage: $0 -checkout
-                  $0 -tux149 -alc
-                  $0 -summary
-                  $0 -summary-copy tux149:/usr/casc/hypre/testing
-
-EOF
-         exit
-         ;;
-
-      -t|-trace)
-         set -xv
-         shift
-         ;;
-
-      # Checkout the repository and update the global AUTOTEST directory
-      -checkout)
-         cd $testing_dir
-         if [ ! -d hypre ]; then
-            echo "Clone the hypre directory in $testing_dir first"
-            exit
-         else
-            cd hypre; git checkout .; git pull; cd ..
-         fi
-         trap "cp -fR $testing_dir/hypre/AUTOTEST $testing_dir" EXIT
-         test_opts=""
-         break
-         ;;
-
-     -dist)
-         shift
-         finished_dir="$testing_dir/AUTOTEST-hypre-$1"
-         src_dir="$testing_dir/hypre-$1/src"
-         remote_dir="test-hypre-$1"
-         shift
-         ;;
-
-      # Generate a summary file in the output directory
-      -summary*)
-         # move the finished logs to todays output directory
-         # (using 'cp' then 'rm' produces fewer complaints than using 'mv')
-         # (the autotest-* files are removed below if not pending)
-         # (check first that the files exist to reduce error messages from 'cp')
-         mkdir -p $output_dir
-         count=$( find $finished_dir -mindepth 1 -name "*" | wc -m )
-         if [ $count -ne 0 ]; then
-            cp -fr $finished_dir/* $output_dir
-            rm -fr $finished_dir/*
-         fi
-         count=$( find $autotest_dir -mindepth 1 -name "autotest-*" | wc -m )
-         if [ $count -ne 0 ]; then
-            cp -f  $autotest_dir/autotest-* $output_dir
-         fi
-
-         cd $output_dir
-         echo "<html>"          > $summary_file;
-         echo "<head> </head>" >> $summary_file;
-         echo "<PRE>"          >> $summary_file;
-         echo $summary_subject >> $summary_file
-
-
-         # all top-level tests with empty error files are reported as "passed",
-         # not including the cron autotest logs
-         echo ""         >> $summary_file;
-         echo "[PASSED]" >> $summary_file
-         for test in $( find . -maxdepth 1 -size 0 -name "*.err" ! -name "*cron*" )
-         do
-            testname=`basename $test .err`
-            echo "-${testname#machine-}" >> $summary_file
-         done
-
-         # active tests without a *-done file are reported as "pending"
-         echo ""          >> $summary_file;
-         echo "[PENDING]" >> $summary_file
-         for test in $( find . -name "autotest-*-start" )
-         do
-            testbase=`basename $test -start`
-            if [ ! -e $testbase-done ]; then
-               echo $testbase | sed {s/autotest//g} >> $output_dir/$summary_file
-            else
-               rm -f $autotest_dir/$testbase*
-            fi
-         done
-
-         # all top-level tests with non-empty error files are reported as "failed",
-         # including the cron autotest logs
-         echo ""         >> $summary_file;
-         echo "[FAILED]" >> $summary_file
-         for test in $( find . -maxdepth 1 ! -size 0 -name "*.err" )
-         do
-            testname=`basename $test .err`
-            for prefix in "machine-" "autotest-";
-            do
-               testname="${testname#$prefix}"
-            done
-            echo "-$testname" >> $summary_file
-         done
-
-         # keep a time stamp of last runs and report if more than 10 days
-         echo ""           >> $summary_file;
-         echo "[LAST RUN]" >> $summary_file
-         for test in $( find . -maxdepth 1 -name "autotest-*-done" )
-         do
-            testname=`basename $test -done`
-            testname="${testname#autotest-}"
-            touch $testing_dir/lastrun-$testname
-         done
-         for test in $( find $testing_dir -maxdepth 1 -name "lastrun-*" -atime +10 )
-         do
-            testdate=`ls -l $test | awk '{print $6" "$7" "$8}'`
-            testname=`basename $test`
-            testname="${testname#lastrun-}"
-            echo "-$testname  $testdate" >> $summary_file
-         done
-
-         # list all non-empty error files in todays output directory
-         echo ""              >> $summary_file;
-         echo "[ERROR FILES]" >> $summary_file
-         for test in $( find $output_dir ! -size 0 -name "*.err" | sort -r )
-         do
-            echo "<a href=\"file://$test\">$test</a>" >> $summary_file
-         done
-
-         echo "</PRE>"  >> $summary_file;
-         echo "</html>" >> $summary_file;
-
-         if [ "$1" = "-summary-email" ]; then
-            # send the email
-            (
-               echo To: $email_list
-               echo Subject: $summary_subject
-               echo Content-Type: text/html
-               echo MIME-Version: 1.0
-
-               cat $summary_file
-
-            ) | /usr/sbin/sendmail -t
-         fi
-
-         if [ "$1" = "-summary-copy" ]; then
-            # copy output_dir files to the specified remote testing_dir
-            rem_finished_dir="$2/AUTOTEST-FINISHED"
-            scp -q -r * $rem_finished_dir
-         fi
-
-         test_opts=""
-         break
-         ;;
-
-      *)
-         test_opts="$test_opts $1"
-         shift
-         ;;
-   esac
-done
-
-# Ensure that important directories exist
-if [ -n "$test_opts" ]; then
-   cd $testing_dir
-   mkdir -p $autotest_dir
-   mkdir -p $finished_dir
-   cd $autotest_dir
-fi
-
-# Run tests
-for opt in $test_opts
-do
-   # TODO: use a "-<testname>:<hostname>" format to avoid this?
-   case $opt in
-      -tux[0-9]*-compilers)
-         host=`echo $opt | awk -F- '{print $2}'`
-         name="tux-compilers"
-         ;;
-
-      -tux[0-9]*)
-         host=`echo $opt | awk -F- '{print $2}'`
-         name="tux"
-         ;;
-
-      -mac)
-         host="parsol"
-         name="mac"
-         ;;
-
-      *)
-         host=`echo $opt | awk -F- '{print $2}'`
-         name=$host
-         ;;
-   esac
-
-   if [ ! -e autotest-$name-start ]; then
-      echo "Test [machine-$name] started at  `date +%T` on `date +%D`" \
-         >> autotest-$name-start
-      ./testsrc.sh $src_dir $host:$remote_dir/$host machine-$name.sh
-      echo "Test [machine-$name] finished at `date +%T` on `date +%D`" \
-         >> autotest-$name-start
-      mv machine-$name.??? $finished_dir
-      touch autotest-$name-done
-   fi
-done
-
-# Fix permissions
-cd $testing_dir
-ch_dirs="hypre $autotest_dir $finished_dir $output_dir"
-for dir in $ch_dirs lastrun-*
-do
-   if [ -e $dir ]; then
-      chmod -fR a+rX,ug+w,o-w $dir
-      # chgrp -fR hypre         $dir
-   fi
-done
-
-# move all but the last 10 autotest results into yearly subdirectories
-files=`echo AUTOTEST-2*.*`
-count=`echo $files | wc | awk '{print $2}'`
-for i in $files
-do
-   if [ $count -le 10 ]; then
-      break;
-   fi
-   dir=`echo $i | awk -F '.' '{print $1}'`
-   if [ ! -d $dir ]; then
-      mkdir $dir
-      chmod -fR a+rX,ug+w,o-w $dir
-      # chgrp -fR hypre         $dir
-   fi
-   mv $i $dir/$i
-   count=`expr $count - 1`
-done
-
diff -Nru hypre-2.11.2/AUTOTEST/basic.sh hypre-2.13.0/AUTOTEST/basic.sh
--- hypre-2.11.2/AUTOTEST/basic.sh	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/AUTOTEST/basic.sh	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,106 @@
+#!/bin/sh
+#BHEADER**********************************************************************
+# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+# This file is part of HYPRE.  See file COPYRIGHT for details.
+#
+# HYPRE is free software; you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License (as published by the Free
+# Software Foundation) version 2.1 dated February 1999.
+#
+# $Revision$
+#EHEADER**********************************************************************
+
+testname=`basename $0 .sh`
+
+# Echo usage information
+case $1 in
+   -h|-help)
+      cat <<EOF
+
+   $0 [-h] {src_dir} [options]
+
+   where: {src_dir}     is the hypre source directory
+          -co: <opts>   options for configure script
+          -mo: <opts>   options for make script
+          -ro: <opts>   call the run script with these options
+          -eo: <opts>   call the examples script with these options
+          -h|-help      prints this usage information and exits
+
+   This script configures and compiles the source in {src_dir}, then optionally
+   runs driver and example tests.
+
+   Example usage: $0 ../src -ro: -ij -sstruct
+
+EOF
+      exit
+      ;;
+esac
+
+# Set src_dir
+src_dir=`cd $1; pwd`
+shift
+
+# Parse the rest of the command line
+copts=""
+mopts=""
+ropts=""
+eopts=""
+while [ "$*" ]
+do
+   case $1 in
+      -co:)
+         opvar="copts"; shift
+         ;;
+      -mo:)
+         opvar="mopts"; shift
+         ;;
+      -ro:)
+         opvar="ropts"; rset="yes"; shift
+         ;;
+      -eo:)
+         opvar="eopts"; eset="yes"; shift
+         ;;
+      *)
+         eval $opvar=\"\$$opvar $1\"
+         shift
+         ;;
+   esac
+done
+
+# Setup
+test_dir=`pwd`
+output_dir=`pwd`/$testname.dir
+rm -fr $output_dir
+mkdir -p $output_dir
+
+# Configure
+# NOTE: The use of 'eval' is needed to deal properly with nested quotes in argument lists
+eval ./test.sh configure.sh $src_dir $copts
+mv -f configure.??? $output_dir
+
+# Make
+./test.sh make.sh $src_dir $mopts
+mv -f make.??? $output_dir
+
+# Run
+if [ -n "$rset" ]; then
+   ./test.sh run.sh $src_dir $ropts
+   mv -f run.??? $output_dir
+fi
+
+# Examples
+if [ -n "$eset" ]; then
+   ./test.sh examples.sh $src_dir $eopts
+   mv -f examples.??? $output_dir
+fi
+
+# Echo to stderr all nonempty error files in $output_dir
+for errfile in $( find $output_dir ! -size 0 -name "*.err" )
+do
+   echo $errfile >&2
+done
+
+# Clean up
+( cd $src_dir; make distclean )
+
diff -Nru hypre-2.11.2/AUTOTEST/basictest.sh hypre-2.13.0/AUTOTEST/basictest.sh
--- hypre-2.11.2/AUTOTEST/basictest.sh	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/AUTOTEST/basictest.sh	1970-01-01 00:00:00.000000000 +0000
@@ -1,105 +0,0 @@
-#!/bin/sh
-#BHEADER**********************************************************************
-# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
-# Produced at the Lawrence Livermore National Laboratory.
-# This file is part of HYPRE.  See file COPYRIGHT for details.
-#
-# HYPRE is free software; you can redistribute it and/or modify it under the
-# terms of the GNU Lesser General Public License (as published by the Free
-# Software Foundation) version 2.1 dated February 1999.
-#
-# $Revision$
-#EHEADER**********************************************************************
-
-testname=`basename $0 .sh`
-
-# Echo usage information
-case $1 in
-   -h|-help)
-      cat <<EOF
-
-   $0 [-h] {src_dir} [options]
-
-   where: {src_dir}     is the hypre source directory
-          -co: <opts>   options for configure script
-          -mo: <opts>   options for make script
-          -ro: <opts>   call the run script with these options
-          -eo: <opts>   call the examples script with these options
-          -h|-help      prints this usage information and exits
-
-   This script configures and compiles the source in {src_dir}, then optionally
-   runs driver and example tests.
-
-   Example usage: $0 ../src -ro: -ij -sstruct
-
-EOF
-      exit
-      ;;
-esac
-
-# Set src_dir
-src_dir=`cd $1; pwd`
-shift
-
-# Parse the rest of the command line
-copts=""
-mopts=""
-ropts=""
-eopts=""
-while [ "$*" ]
-do
-   case $1 in
-      -co:)
-         opvar="copts"; shift
-         ;;
-      -mo:)
-         opvar="mopts"; shift
-         ;;
-      -ro:)
-         opvar="ropts"; rset="yes"; shift
-         ;;
-      -eo:)
-         opvar="eopts"; eset="yes"; shift
-         ;;
-      *)
-         eval $opvar=\"\$$opvar $1\"
-         shift
-         ;;
-   esac
-done
-
-# Setup
-test_dir=`pwd`
-output_dir=`pwd`/$testname.dir
-rm -fr $output_dir
-mkdir -p $output_dir
-
-# Configure
-./test.sh configure.sh $src_dir $copts
-mv -f configure.??? $output_dir
-
-# Make
-./test.sh make.sh $src_dir $mopts
-mv -f make.??? $output_dir
-
-# Run
-if [ -n "$rset" ]; then
-   ./test.sh run.sh $src_dir $ropts
-   mv -f run.??? $output_dir
-fi
-
-# Examples
-if [ -n "$eset" ]; then
-   ./test.sh examples.sh $src_dir $eopts
-   mv -f examples.??? $output_dir
-fi
-
-# Echo to stderr all nonempty error files in $output_dir
-for errfile in $( find $output_dir ! -size 0 -name "*.err" )
-do
-   echo $errfile >&2
-done
-
-# Clean up
-( cd $src_dir; make distclean )
-
diff -Nru hypre-2.11.2/AUTOTEST/check-double.filters hypre-2.13.0/AUTOTEST/check-double.filters
--- hypre-2.11.2/AUTOTEST/check-double.filters	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/AUTOTEST/check-double.filters	2017-10-20 17:42:22.000000000 +0000
@@ -1,4 +1,11 @@
 /_hypre_utilities.h:
 /HYPRE_utilities.h:
-/mpistubs.c
+/utilities/general.h:
+/utilities/gpuErrorCheck.h:
+/utilities/gpuErrorCheck.c:
+/utilities/gpuMem.h:
+/utilities/hypre_nvtx.h:
+/utilities/mpistubs.c
+/seq_mv/gpukernels.h:
+/seq_mv/seq_mv.h:.*cudaStream_t
 double-check
diff -Nru hypre-2.11.2/AUTOTEST/check-int.filters hypre-2.13.0/AUTOTEST/check-int.filters
--- hypre-2.11.2/AUTOTEST/check-int.filters	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/AUTOTEST/check-int.filters	2017-10-20 17:42:22.000000000 +0000
@@ -3,6 +3,14 @@
 /hypre_printf.c:
 /_hypre_utilities.h:
 /HYPRE_utilities.h:
+/utilities/general.h:
+/utilities/gpuErrorCheck.h:
+/utilities/gpuErrorCheck.c:
+/utilities/gpuMem.h:
+/utilities/hypre_nvtx.h:
+/utilities/mpistubs.c
+/seq_mv/gpukernels.h:
+/seq_mv/seq_mv.h:.*cudaStream_t
 as long as
 too long
 long range interpolation
diff -Nru hypre-2.11.2/AUTOTEST/check-mpi.filters hypre-2.13.0/AUTOTEST/check-mpi.filters
--- hypre-2.11.2/AUTOTEST/check-mpi.filters	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/AUTOTEST/check-mpi.filters	2017-10-20 17:42:22.000000000 +0000
@@ -1,7 +1,10 @@
 /HYPRE_config.h:
 /_hypre_utilities.h:
 /HYPRE_utilities.h:
-/mpistubs.c:
-/mpistubs.h:
-/thread_mpistubs.c:
-/thread_mpistubs.h:
+/utilities/gpuMem.c:
+/utilities/mpistubs.c:
+/utilities/mpistubs.h:
+/parcsr_mv/par_csr_matvec.c:.*MPI_PACK
+/parcsr_mv/par_csr_matvec.c:.*MPI_HALO_EXC_SEND
+/parcsr_mv/par_csr_matvec.c:.*MPI_HALO_EXC_RECV
+/parcsr_mv/par_csr_matvec.c:.*MPI_UNPACK
diff -Nru hypre-2.11.2/AUTOTEST/cmake.sh hypre-2.13.0/AUTOTEST/cmake.sh
--- hypre-2.11.2/AUTOTEST/cmake.sh	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/AUTOTEST/cmake.sh	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,127 @@
+#!/bin/sh
+#BHEADER**********************************************************************
+# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+# This file is part of HYPRE.  See file COPYRIGHT for details.
+#
+# HYPRE is free software; you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License (as published by the Free
+# Software Foundation) version 2.1 dated February 1999.
+#
+# $Revision$
+#EHEADER**********************************************************************
+
+testname=`basename $0 .sh`
+
+drivers="ij sstruct struct ams_driver maxwell_unscaled sstruct_fac ij_mv struct_migrate"
+
+# Echo usage information
+case $1 in
+   -h|-help)
+      cat <<EOF
+
+   $0 [-h] {src_dir} [options]
+
+   where: {src_dir}     is the hypre source directory
+          -co: <opts>   configuration options
+          -mo: <opts>   make options
+          -ro: <opts>   call the run script with these options
+          -eo: <opts>   call the examples script with these options
+          -h|-help      prints this usage information and exits
+
+   This script uses cmake to configure and compile the source in {src_dir}, then
+   optionally runs driver and example tests.
+
+   Example usage: $0 ../src -co -DCMAKE_BUILD_TYPE=Debug -ro: -ij
+
+EOF
+      exit
+      ;;
+esac
+
+# Set src_dir
+src_dir=`cd $1; pwd`
+shift
+
+# Parse the rest of the command line
+copts=""
+mopts=""
+ropts=""
+eopts=""
+while [ "$*" ]
+do
+   case $1 in
+      -co:)
+         opvar="copts"; shift
+         ;;
+      -mo:)
+         opvar="mopts"; shift
+         ;;
+      -ro:)
+         opvar="ropts"; rset="yes"; shift
+         ;;
+      -eo:)
+         opvar="eopts"; eset="yes"; shift
+         ;;
+      *)
+         eval $opvar=\"\$$opvar $1\"
+         shift
+         ;;
+   esac
+done
+
+# Setup
+test_dir=`pwd`
+output_dir=`pwd`/$testname.dir
+rm -fr $output_dir
+mkdir -p $output_dir
+cd $src_dir
+src_dir=`pwd`
+
+# Clean up the cmbuild directories (do it from src_dir as a precaution)
+cd $src_dir
+rm -fr `echo cmbuild/* | sed 's/[^ ]*README.txt//g'`
+rm -fr `echo test/cmbuild/* | sed 's/[^ ]*README.txt//g'`
+
+# Clean up the previous install
+cd $src_dir
+rm -fr hypre
+
+# Configure
+cd $src_dir/cmbuild
+cmake $copts ..
+make $mopts install
+
+# Make
+cd $src_dir/test/cmbuild
+cmake ..
+make $mopts
+mv -f $drivers ..
+
+cd $test_dir
+
+# Run
+if [ -n "$rset" ]; then
+   ./test.sh run.sh $src_dir $ropts
+   mv -f run.??? $output_dir
+fi
+
+# Examples
+if [ -n "$eset" ]; then
+   ./test.sh examples.sh $src_dir $eopts
+   mv -f examples.??? $output_dir
+fi
+
+# Echo to stderr all nonempty error files in $output_dir
+for errfile in $( find $output_dir ! -size 0 -name "*.err" )
+do
+   echo $errfile >&2
+done
+
+# Clean up
+cd $src_dir
+rm -fr `echo cmbuild/* | sed 's/[^ ]*README.txt//g'`
+rm -fr `echo test/cmbuild/* | sed 's/[^ ]*README.txt//g'`
+rm -fr hypre
+( cd $src_dir/test; rm -f $drivers; ./cleantest.sh )
+
diff -Nru hypre-2.11.2/AUTOTEST/cmaketest.sh hypre-2.13.0/AUTOTEST/cmaketest.sh
--- hypre-2.11.2/AUTOTEST/cmaketest.sh	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/AUTOTEST/cmaketest.sh	1970-01-01 00:00:00.000000000 +0000
@@ -1,127 +0,0 @@
-#!/bin/sh
-#BHEADER**********************************************************************
-# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
-# Produced at the Lawrence Livermore National Laboratory.
-# This file is part of HYPRE.  See file COPYRIGHT for details.
-#
-# HYPRE is free software; you can redistribute it and/or modify it under the
-# terms of the GNU Lesser General Public License (as published by the Free
-# Software Foundation) version 2.1 dated February 1999.
-#
-# $Revision$
-#EHEADER**********************************************************************
-
-testname=`basename $0 .sh`
-
-drivers="ij sstruct struct ams_driver maxwell_unscaled sstruct_fac ij_mv struct_migrate"
-
-# Echo usage information
-case $1 in
-   -h|-help)
-      cat <<EOF
-
-   $0 [-h] {src_dir} [options]
-
-   where: {src_dir}     is the hypre source directory
-          -co: <opts>   configuration options
-          -mo: <opts>   make options
-          -ro: <opts>   call the run script with these options
-          -eo: <opts>   call the examples script with these options
-          -h|-help      prints this usage information and exits
-
-   This script uses cmake to configure and compile the source in {src_dir}, then
-   optionally runs driver and example tests.
-
-   Example usage: $0 ../src -co -DCMAKE_BUILD_TYPE=Debug -ro: -ij
-
-EOF
-      exit
-      ;;
-esac
-
-# Set src_dir
-src_dir=`cd $1; pwd`
-shift
-
-# Parse the rest of the command line
-copts=""
-mopts=""
-ropts=""
-eopts=""
-while [ "$*" ]
-do
-   case $1 in
-      -co:)
-         opvar="copts"; shift
-         ;;
-      -mo:)
-         opvar="mopts"; shift
-         ;;
-      -ro:)
-         opvar="ropts"; rset="yes"; shift
-         ;;
-      -eo:)
-         opvar="eopts"; eset="yes"; shift
-         ;;
-      *)
-         eval $opvar=\"\$$opvar $1\"
-         shift
-         ;;
-   esac
-done
-
-# Setup
-test_dir=`pwd`
-output_dir=`pwd`/$testname.dir
-rm -fr $output_dir
-mkdir -p $output_dir
-cd $src_dir
-src_dir=`pwd`
-
-# Clean up the cmbuild directories (do it from src_dir as a precaution)
-cd $src_dir
-rm -fr `echo cmbuild/* | sed 's/[^ ]*README.txt//g'`
-rm -fr `echo test/cmbuild/* | sed 's/[^ ]*README.txt//g'`
-
-# Clean up the previous install
-cd $src_dir
-rm -fr hypre
-
-# Configure
-cd $src_dir/cmbuild
-cmake $copts ..
-make $mopts install
-
-# Make
-cd $src_dir/test/cmbuild
-cmake ..
-make $mopts
-mv -f $drivers ..
-
-cd $test_dir
-
-# Run
-if [ -n "$rset" ]; then
-   ./test.sh run.sh $src_dir $ropts
-   mv -f run.??? $output_dir
-fi
-
-# Examples
-if [ -n "$eset" ]; then
-   ./test.sh examples.sh $src_dir $eopts
-   mv -f examples.??? $output_dir
-fi
-
-# Echo to stderr all nonempty error files in $output_dir
-for errfile in $( find $output_dir ! -size 0 -name "*.err" )
-do
-   echo $errfile >&2
-done
-
-# Clean up
-cd $src_dir
-rm -fr `echo cmbuild/* | sed 's/[^ ]*README.txt//g'`
-rm -fr `echo test/cmbuild/* | sed 's/[^ ]*README.txt//g'`
-rm -fr hypre
-( cd $src_dir/test; rm -f $drivers; cleantest.sh )
-
diff -Nru hypre-2.11.2/AUTOTEST/configure.sh hypre-2.13.0/AUTOTEST/configure.sh
--- hypre-2.11.2/AUTOTEST/configure.sh	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/AUTOTEST/configure.sh	2017-10-20 17:42:22.000000000 +0000
@@ -40,12 +40,13 @@
 shift
 
 # Run configure
+# NOTE: The use of 'eval' is needed to deal properly with nested quotes in argument lists
 cd $src_dir
 if [ "`uname -s`" = "AIX" ]
 then
-   nopoe ./configure $@
+   eval nopoe ./configure $@
 else
-   ./configure $@
+   eval ./configure $@
 fi
 
 # Save config.log, HYPRE_config.h and Makefile.config
diff -Nru hypre-2.11.2/AUTOTEST/cronfile hypre-2.13.0/AUTOTEST/cronfile
--- hypre-2.11.2/AUTOTEST/cronfile	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/AUTOTEST/cronfile	1970-01-01 00:00:00.000000000 +0000
@@ -1,29 +0,0 @@
-# The first five entries on each line correspond respectively to:
-#
-# minute (0-56)
-# hour (0-23)
-# day of month (1-31)
-# month (1-12)
-# day of week (0-6)(0=Sunday)
-#
-# '*' means "every" and '*/m' means "every m-th"
-
-# Rob's crontab (on tux339)
-
-30 23 * * * source /etc/profile; source $HOME/.bashrc; cd /usr/casc/hypre/test-hypre/AUTOTEST; ./autotest.sh -checkout >> autotest-tux-cron.out 2>> autotest-tux-cron.err
-00  1 * * * source /etc/profile; source $HOME/.bashrc; cd /usr/casc/hypre/test-hypre/AUTOTEST; ./autotest.sh -tux339   >> autotest-tux-cron.out 2>> autotest-tux-cron.err
-00  6 * * * source /etc/profile; source $HOME/.bashrc; cd /usr/casc/hypre/test-hypre/AUTOTEST; ./autotest.sh -summary-email
-
-
-# Rob's crontab (on rzcereal2)
-
-40 0 * * * source /etc/profile; source $HOME/.profile; cd $HOME/test-hypre/AUTOTEST; ./autotest.sh -checkout   > autotest-rzcereal2-checkout.out 2>&1
-00 1 * * 2 source /etc/profile; source $HOME/.profile; cd $HOME/test-hypre/AUTOTEST; ./autotest.sh -vulcan    >> autotest-vulcan-cron.out    2>> autotest-vulcan-cron.err
-00 1 * * 3 source /etc/profile; source $HOME/.profile; cd $HOME/test-hypre/AUTOTEST; ./autotest.sh -rzmerl    >> autotest-rzmerl-cron.out    2>> autotest-rzmerl-cron.err
-00 3 * * 3 source /etc/profile; source $HOME/.profile; cd $HOME/test-hypre/AUTOTEST; ./autotest.sh -rzcereal3 >> autotest-rzcereal3-cron.out 2>> autotest-rzcereal3-cron.err
-00 1 * * 4 source /etc/profile; source $HOME/.profile; cd $HOME/test-hypre/AUTOTEST; ./autotest.sh -rzzeus    >> autotest-rzzeus-cron.out    2>> autotest-rzzeus-cron.err
-00 5 * * * source /etc/profile; source $HOME/.profile; cd $HOME/test-hypre/AUTOTEST; ./autotest.sh -summary-copy tux339:/usr/casc/hypre/test-hypre
-
-# Tzanio's crontab (on tux252)
-
-00 1 * * * cd /usr/casc/hypre/test-hypre/AUTOTEST; ./autotest.sh -mac >> autotest-mac-cron.out 2>> autotest-mac-cron.err
diff -Nru hypre-2.11.2/AUTOTEST/machine-mac.sh hypre-2.13.0/AUTOTEST/machine-mac.sh
--- hypre-2.11.2/AUTOTEST/machine-mac.sh	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/AUTOTEST/machine-mac.sh	2017-10-20 17:42:22.000000000 +0000
@@ -51,16 +51,16 @@
 ro="-ams -ij -sstruct -struct -rt -D HYPRE_NO_SAVED"
 
 co="--disable-fortran"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $ro
-renametest.sh basictest $output_dir/basictest-default
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro
+./renametest.sh basic $output_dir/basic-default
 
 co="--enable-debug --disable-fortran"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest--enable-debug
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--enable-debug
 
 co="--enable-bigint --disable-fortran"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest--enable-bigint
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--enable-bigint
 
 # Test linking for different languages
 link_opts="all++"
diff -Nru hypre-2.11.2/AUTOTEST/machine-rzmerl.sh hypre-2.13.0/AUTOTEST/machine-rzmerl.sh
--- hypre-2.11.2/AUTOTEST/machine-rzmerl.sh	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/AUTOTEST/machine-rzmerl.sh	1970-01-01 00:00:00.000000000 +0000
@@ -1,80 +0,0 @@
-#!/bin/sh
-#BHEADER**********************************************************************
-# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
-# Produced at the Lawrence Livermore National Laboratory.
-# This file is part of HYPRE.  See file COPYRIGHT for details.
-#
-# HYPRE is free software; you can redistribute it and/or modify it under the
-# terms of the GNU Lesser General Public License (as published by the Free
-# Software Foundation) version 2.1 dated February 1999.
-#
-# $Revision$
-#EHEADER**********************************************************************
-
-testname=`basename $0 .sh`
-
-# Echo usage information
-case $1 in
-   -h|-help)
-      cat <<EOF
-
-   **** Only run this script on the rzmerl machine ****
-
-   $0 [-h|-help] {src_dir}
-
-   where: -h|-help   prints this usage information and exits
-          {src_dir}  is the hypre source directory
-
-   This script runs a number of tests suitable for the rzmerl machine.
-
-   Example usage: $0 ../src
-
-EOF
-      exit
-      ;;
-esac
-
-# Setup
-test_dir=`pwd`
-output_dir=`pwd`/$testname.dir
-rm -fr $output_dir
-mkdir -p $output_dir
-src_dir=`cd $1; pwd`
-shift
-
-# Basic build and run tests
-mo="test"
-ro="-ams -ij -sstruct -struct -rt -D HYPRE_NO_SAVED"
-
-co=""
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $ro
-renametest.sh basictest $output_dir/basictest-default
-
-co="--with-openmp"
-RO="-ams -ij -sstruct -struct -rt -D HYPRE_NO_SAVED -nthreads 2"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $RO
-renametest.sh basictest $output_dir/basictest--with-openmp
-
-co="--enable-debug"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest--enable-debug
-
-co="--enable-bigint"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest--enable-bigint
-
-# Test linking for different languages
-link_opts="all++ all77"
-for opt in $link_opts
-do
-   output_subdir=$output_dir/link$opt
-   mkdir -p $output_subdir
-   ./test.sh link.sh $src_dir $opt
-   mv -f link.??? $output_subdir
-done
-
-# Echo to stderr all nonempty error files in $output_dir
-for errfile in $( find $output_dir ! -size 0 -name "*.err" )
-do
-   echo $errfile >&2
-done
diff -Nru hypre-2.11.2/AUTOTEST/machine-rztopaz.sh hypre-2.13.0/AUTOTEST/machine-rztopaz.sh
--- hypre-2.11.2/AUTOTEST/machine-rztopaz.sh	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/AUTOTEST/machine-rztopaz.sh	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,80 @@
+#!/bin/sh
+#BHEADER**********************************************************************
+# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+# This file is part of HYPRE.  See file COPYRIGHT for details.
+#
+# HYPRE is free software; you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License (as published by the Free
+# Software Foundation) version 2.1 dated February 1999.
+#
+# $Revision$
+#EHEADER**********************************************************************
+
+testname=`basename $0 .sh`
+
+# Echo usage information
+case $1 in
+   -h|-help)
+      cat <<EOF
+
+   **** Only run this script on the rztopaz machine ****
+
+   $0 [-h|-help] {src_dir}
+
+   where: -h|-help   prints this usage information and exits
+          {src_dir}  is the hypre source directory
+
+   This script runs a number of tests suitable for the rztopaz machine.
+
+   Example usage: $0 ../src
+
+EOF
+      exit
+      ;;
+esac
+
+# Setup
+test_dir=`pwd`
+output_dir=`pwd`/$testname.dir
+rm -fr $output_dir
+mkdir -p $output_dir
+src_dir=`cd $1; pwd`
+shift
+
+# Basic build and run tests
+mo="test"
+ro="-ams -ij -sstruct -struct -rt -D HYPRE_NO_SAVED"
+
+co=""
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro
+./renametest.sh basic $output_dir/basic-default
+
+co="--with-openmp"
+RO="-ams -ij -sstruct -struct -rt -D HYPRE_NO_SAVED -nthreads 2"
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $RO
+./renametest.sh basic $output_dir/basic--with-openmp
+
+co="--enable-debug"
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--enable-debug
+
+co="--enable-bigint"
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--enable-bigint
+
+# Test linking for different languages
+link_opts="all++ all77"
+for opt in $link_opts
+do
+   output_subdir=$output_dir/link$opt
+   mkdir -p $output_subdir
+   ./test.sh link.sh $src_dir $opt
+   mv -f link.??? $output_subdir
+done
+
+# Echo to stderr all nonempty error files in $output_dir
+for errfile in $( find $output_dir ! -size 0 -name "*.err" )
+do
+   echo $errfile >&2
+done
diff -Nru hypre-2.11.2/AUTOTEST/machine-rzzeus.sh hypre-2.13.0/AUTOTEST/machine-rzzeus.sh
--- hypre-2.11.2/AUTOTEST/machine-rzzeus.sh	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/AUTOTEST/machine-rzzeus.sh	1970-01-01 00:00:00.000000000 +0000
@@ -1,79 +0,0 @@
-#!/bin/sh
-#BHEADER**********************************************************************
-# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
-# Produced at the Lawrence Livermore National Laboratory.
-# This file is part of HYPRE.  See file COPYRIGHT for details.
-#
-# HYPRE is free software; you can redistribute it and/or modify it under the
-# terms of the GNU Lesser General Public License (as published by the Free
-# Software Foundation) version 2.1 dated February 1999.
-#
-# $Revision$
-#EHEADER**********************************************************************
-
-testname=`basename $0 .sh`
-
-# Echo usage information
-case $1 in
-   -h|-help)
-      cat <<EOF
-
-   **** Only run this script on the zeus cluster ****
-
-   $0 [-h|-help] {src_dir}
-
-   where: -h|-help   prints this usage information and exits
-          {src_dir}  is the hypre source directory
-
-   This script runs a number of tests suitable for the zeus cluster.
-
-   Example usage: $0 ../src
-
-EOF
-      exit
-      ;;
-esac
-
-# Setup
-test_dir=`pwd`
-output_dir=`pwd`/$testname.dir
-rm -fr $output_dir
-mkdir -p $output_dir
-src_dir=`cd $1; pwd`
-shift
-
-# Basic build and run tests
-mo="test"
-ro="-ams -ij -sstruct -struct -rt -D HYPRE_NO_SAVED"
-
-co=""
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $ro
-renametest.sh basictest $output_dir/basictest-default
-
-co="--enable-debug"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest--enable-debug
-
-co="--enable-bigint"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest--enable-bigint
-
-co="--with-blas --with-lapack --with-blas-lib-dirs=/usr/lib64 --with-lapack-lib-dirs=/usr/lib64 --with-blas-libs=blas --with-lapack-libs=lapack"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest--with-blas
-
-# Test linking for different languages
-link_opts="all++ all77"
-for opt in $link_opts
-do
-   output_subdir=$output_dir/link$opt
-   mkdir -p $output_subdir
-   ./test.sh link.sh $src_dir $opt
-   mv -f link.??? $output_subdir
-done
-
-# Echo to stderr all nonempty error files in $output_dir
-for errfile in $( find $output_dir ! -size 0 -name "*.err" )
-do
-   echo $errfile >&2
-done
diff -Nru hypre-2.11.2/AUTOTEST/machine-syrah.sh hypre-2.13.0/AUTOTEST/machine-syrah.sh
--- hypre-2.11.2/AUTOTEST/machine-syrah.sh	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/AUTOTEST/machine-syrah.sh	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,79 @@
+#!/bin/sh
+#BHEADER**********************************************************************
+# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+# This file is part of HYPRE.  See file COPYRIGHT for details.
+#
+# HYPRE is free software; you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License (as published by the Free
+# Software Foundation) version 2.1 dated February 1999.
+#
+# $Revision$
+#EHEADER**********************************************************************
+
+testname=`basename $0 .sh`
+
+# Echo usage information
+case $1 in
+   -h|-help)
+      cat <<EOF
+
+   **** Only run this script on the syrah cluster ****
+
+   $0 [-h|-help] {src_dir}
+
+   where: -h|-help   prints this usage information and exits
+          {src_dir}  is the hypre source directory
+
+   This script runs a number of tests suitable for the syrah cluster.
+
+   Example usage: $0 ../src
+
+EOF
+      exit
+      ;;
+esac
+
+# Setup
+test_dir=`pwd`
+output_dir=`pwd`/$testname.dir
+rm -fr $output_dir
+mkdir -p $output_dir
+src_dir=`cd $1; pwd`
+shift
+
+# Basic build and run tests
+mo="test"
+ro="-ams -ij -sstruct -struct -rt -D HYPRE_NO_SAVED"
+
+co=""
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro
+./renametest.sh basic $output_dir/basic-default
+
+co="--enable-debug"
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--enable-debug
+
+co="--enable-bigint"
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--enable-bigint
+
+co="--with-blas --with-lapack --with-blas-lib-dirs=/usr/lib64 --with-lapack-lib-dirs=/usr/lib64 --with-blas-libs=blas --with-lapack-libs=lapack"
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--with-blas
+
+# Test linking for different languages
+link_opts="all++ all77"
+for opt in $link_opts
+do
+   output_subdir=$output_dir/link$opt
+   mkdir -p $output_subdir
+   ./test.sh link.sh $src_dir $opt
+   mv -f link.??? $output_subdir
+done
+
+# Echo to stderr all nonempty error files in $output_dir
+for errfile in $( find $output_dir ! -size 0 -name "*.err" )
+do
+   echo $errfile >&2
+done
diff -Nru hypre-2.11.2/AUTOTEST/machine-tux-exlibs.sh hypre-2.13.0/AUTOTEST/machine-tux-exlibs.sh
--- hypre-2.11.2/AUTOTEST/machine-tux-exlibs.sh	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/AUTOTEST/machine-tux-exlibs.sh	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,67 @@
+#!/bin/sh
+#BHEADER**********************************************************************
+# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+# This file is part of HYPRE.  See file COPYRIGHT for details.
+#
+# HYPRE is free software; you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License (as published by the Free
+# Software Foundation) version 2.1 dated February 1999.
+#
+# $Revision$
+#EHEADER**********************************************************************
+
+testname=`basename $0 .sh`
+
+# Echo usage information
+case $1 in
+   -h|-help)
+      cat <<EOF
+
+   **** This script requires certain external libraries. ****
+
+   $0 [-h|-help] {src_dir}
+
+   where: -h|-help   prints this usage information and exits
+          {src_dir}  is the hypre source directory
+          
+
+   This script runs a number of external library tests.
+
+   Example usage: $0 ../src
+
+EOF
+      exit
+      ;;
+esac
+
+# Setup
+test_dir=`pwd`
+output_dir=`pwd`/$testname.dir
+rm -fr $output_dir
+mkdir -p $output_dir
+src_dir=`cd $1; pwd`
+shift
+
+# Basic build and run tests
+mo="-j test"
+ro="-ams -ij -sstruct -struct"
+eo=""
+
+co="--enable-debug --with-blas-lib=\\'-L/home/falgout2/codes/blas/BLAS-3.7.1 -lblas -lgfortran\\'"
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic-blas
+
+co="--enable-debug --with-mli --with-superlu --with-superlu-include=/home/falgout2/codes/superlu/SuperLU_5.2.1/SRC"
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic-superlu
+
+co="--enable-debug --with-mli --with-superlu --with-superlu-include=/home/falgout2/codes/superlu/SuperLU_5.2.1/SRC --with-dsuperlu --with-dsuperlu-include=/home/falgout2/codes/superlu/SuperLU_DIST_5.2.1/SRC"
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic-dsuperlu
+
+# Echo to stderr all nonempty error files in $output_dir
+for errfile in $( find $output_dir ! -size 0 -name "*.err" )
+do
+   echo $errfile >&2
+done
diff -Nru hypre-2.11.2/AUTOTEST/machine-tux.sh hypre-2.13.0/AUTOTEST/machine-tux.sh
--- hypre-2.11.2/AUTOTEST/machine-tux.sh	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/AUTOTEST/machine-tux.sh	2017-10-20 17:42:22.000000000 +0000
@@ -43,73 +43,99 @@
 src_dir=`cd $1; pwd`
 shift
 
+# Organizing the tests from "fast" to "slow"
+
+# Check for 'int', 'double', and 'MPI_'
+./test.sh check-int.sh $src_dir
+mv -f check-int.??? $output_dir
+./test.sh check-double.sh $src_dir
+mv -f check-double.??? $output_dir
+./test.sh check-mpi.sh $src_dir
+mv -f check-mpi.??? $output_dir
+
 # Basic build and run tests
 mo="-j test"
 ro="-ams -ij -sstruct -struct"
 eo=""
 
 co=""
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest-default
-
-co="--enable-debug"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -eo: $eo
-renametest.sh basictest $output_dir/basictest-debug1
-
-co="--enable-debug --enable-global-partition"
-RO="-fac"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $RO -eo: $eo
-renametest.sh basictest $output_dir/basictest-debug2
-
-co="--enable-debug CC=mpiCC"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $ro -eo: $eo
-renametest.sh basictest $output_dir/basictest-debug-cpp
-
-# co="--with-insure --enable-debug --with-print-errors"
-# MO="test"
-# test.sh basictest.sh $src_dir -co: $co -mo: $MO -ro: $ro
-# renametest.sh basictest $output_dir/basictest--with-insure1
-# 
-# co="--with-insure --enable-debug --enable-global-partition"
-# MO="test"
-# test.sh basictest.sh $src_dir -co: $co -mo: $MO -ro: $ro
-# renametest.sh basictest $output_dir/basictest--with-insure2
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic-default
 
-co="--enable-debug --with-print-errors"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $ro -rt -valgrind
-renametest.sh basictest $output_dir/basictest--valgrind1
-
-co="--enable-debug --enable-global-partition"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $ro -rt -valgrind
-renametest.sh basictest $output_dir/basictest--valgrind2
+# Test linking for different languages (depends on previous compile test)
+link_opts="all++ all77"
+for opt in $link_opts
+do
+   output_subdir=$output_dir/link$opt
+   mkdir -p $output_subdir
+   ./test.sh link.sh $src_dir $opt
+   mv -f link.??? $output_subdir
+done
 
 co="--without-MPI"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest--without-MPI
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--without-MPI
 
 co="--with-strict-checking"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest--with-strict-checking
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--with-strict-checking
+
+co="--with-strict-checking --enable-global-partition"
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--with-strict-global
 
 co="--enable-shared"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest--enable-shared
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--enable-shared
 
-co="--enable-bigint --enable-debug"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $ro -eo: -bigint
-renametest.sh basictest $output_dir/basictest--enable-bigint
+co="--enable-debug --with-openmp"
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--enable-openmp
+
+co="--enable-debug"
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -eo: $eo
+./renametest.sh basic $output_dir/basic-debug1
 
 co="--enable-maxdim=4 --enable-debug"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -eo: -maxdim
-renametest.sh basictest $output_dir/basictest--enable-maxdim=4
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -eo: -maxdim
+./renametest.sh basic $output_dir/basic--enable-maxdim=4
 
 co="--enable-complex --enable-maxdim=4 --enable-debug"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -eo: -complex
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -eo: -complex
 # ignore complex compiler output for now
-rm -fr basictest.dir/make.???
-grep -v make.err basictest.err > basictest.tmp
-mv basictest.tmp basictest.err
-renametest.sh basictest $output_dir/basictest--enable-complex
+rm -fr basic.dir/make.???
+grep -v make.err basic.err > basic.tmp
+mv basic.tmp basic.err
+./renametest.sh basic $output_dir/basic--enable-complex
+
+co="--enable-debug --enable-global-partition"
+RO="-fac"
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $RO -eo: $eo
+./renametest.sh basic $output_dir/basic-debug2
+
+co="--enable-single --enable-debug"
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: -single
+./renametest.sh basic $output_dir/basic--enable-single
+
+co="--enable-longdouble --enable-debug"
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: -longdouble
+./renametest.sh basic $output_dir/basic--enable-longdouble
+
+co="--enable-debug CC=mpiCC"
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro -eo: $eo
+./renametest.sh basic $output_dir/basic-debug-cpp
+
+co="--enable-bigint --enable-debug"
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro -eo: -bigint
+./renametest.sh basic $output_dir/basic--enable-bigint
+
+co="--enable-debug --with-print-errors"
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro -rt -valgrind
+./renametest.sh basic $output_dir/basic--valgrind1
+
+co="--enable-debug --enable-global-partition"
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro -rt -valgrind
+./renametest.sh basic $output_dir/basic--valgrind2
 
 # CMake build and run tests
 mo="-j"
@@ -117,50 +143,40 @@
 eo=""
 
 co=""
-test.sh cmaketest.sh $src_dir -co: $co -mo: $mo
-renametest.sh cmaketest $output_dir/cmaketest-default
-
-co="-DCMAKE_BUILD_TYPE=Debug"
-test.sh cmaketest.sh $src_dir -co: $co -mo: $mo -ro: $ro
-renametest.sh cmaketest $output_dir/cmaketest-debug
+./test.sh cmake.sh $src_dir -co: $co -mo: $mo
+./renametest.sh cmake $output_dir/cmake-default
 
 co="-DHYPRE_NO_GLOBAL_PARTITION=OFF"
-test.sh cmaketest.sh $src_dir -co: $co -mo: $mo
-renametest.sh cmaketest $output_dir/cmaketest-global-partition
+./test.sh cmake.sh $src_dir -co: $co -mo: $mo
+./renametest.sh cmake $output_dir/cmake-global-partition
 
 co="-DHYPRE_SEQUENTIAL=ON"
-test.sh cmaketest.sh $src_dir -co: $co -mo: $mo
-renametest.sh cmaketest $output_dir/cmaketest-sequential
+./test.sh cmake.sh $src_dir -co: $co -mo: $mo
+./renametest.sh cmake $output_dir/cmake-sequential
 
 co="-DHYPRE_SHARED=ON"
-test.sh cmaketest.sh $src_dir -co: $co -mo: $mo
-renametest.sh cmaketest $output_dir/cmaketest-shared
+./test.sh cmake.sh $src_dir -co: $co -mo: $mo
+./renametest.sh cmake $output_dir/cmake-shared
+
+co="-DHYPRE_SINGLE=ON"
+./test.sh cmake.sh $src_dir -co: $co -mo: $mo -ro: -single
+./renametest.sh cmake $output_dir/cmake-single
+
+co="-DHYPRE_LONG_DOUBLE=ON"
+./test.sh cmake.sh $src_dir -co: $co -mo: $mo -ro: -longdouble
+./renametest.sh cmake $output_dir/cmake-longdouble
+
+co="-DCMAKE_BUILD_TYPE=Debug"
+./test.sh cmake.sh $src_dir -co: $co -mo: $mo -ro: $ro
+./renametest.sh cmake $output_dir/cmake-debug
 
 co="-DHYPRE_BIGINT=ON"
-test.sh cmaketest.sh $src_dir -co: $co -mo: $mo -ro: $ro
-renametest.sh cmaketest $output_dir/cmaketest-bigint
+./test.sh cmake.sh $src_dir -co: $co -mo: $mo -ro: $ro
+./renametest.sh cmake $output_dir/cmake-bigint
 
 # cmake build doesn't currently support maxdim
 # cmake build doesn't currently support complex
 
-# Test linking for different languages
-link_opts="all++ all77"
-for opt in $link_opts
-do
-   output_subdir=$output_dir/link$opt
-   mkdir -p $output_subdir
-   ./test.sh link.sh $src_dir $opt
-   mv -f link.??? $output_subdir
-done
-
-# Check for 'int', 'double', and 'MPI_'
-./test.sh check-int.sh $src_dir
-mv -f check-int.??? $output_dir
-./test.sh check-double.sh $src_dir
-mv -f check-double.??? $output_dir
-./test.sh check-mpi.sh $src_dir
-mv -f check-mpi.??? $output_dir
-
 # Echo to stderr all nonempty error files in $output_dir
 for errfile in $( find $output_dir ! -size 0 -name "*.err" )
 do
diff -Nru hypre-2.11.2/AUTOTEST/machine-vulcan.sh hypre-2.13.0/AUTOTEST/machine-vulcan.sh
--- hypre-2.11.2/AUTOTEST/machine-vulcan.sh	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/AUTOTEST/machine-vulcan.sh	2017-10-20 17:42:22.000000000 +0000
@@ -47,16 +47,16 @@
 ro="-ams -ij -sstruct -struct -rt -D HYPRE_NO_SAVED"
 
 co=""
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $ro
-renametest.sh basictest $output_dir/basictest-default
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro
+./renametest.sh basic $output_dir/basic-default
 
 co="--enable-debug"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest--enable-debug
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--enable-debug
 
 co="--enable-bigint"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest--enable-bigint
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--enable-bigint
 
 # Test linking for different languages
 link_opts="all++ all77"
diff -Nru hypre-2.11.2/AUTOTEST/README.txt hypre-2.13.0/AUTOTEST/README.txt
--- hypre-2.11.2/AUTOTEST/README.txt	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/AUTOTEST/README.txt	2017-10-20 17:42:22.000000000 +0000
@@ -1,12 +1,10 @@
 
 This directory contains scripts for running various tests on the hypre library.
-They are run automatically as part of hypre's regression testing, and they are
-run manually to test new distributions of hypre before releasing them to the
-public.  The scripts augment the 'runtest.sh' runtime tests in 'test/TEST_*'.
-
-Every test in this directory may be run manually by developers without fear of
-interfering with the auto-testing, as long as they are not run from within the
-auto-testing directory (currently '/usr/casc/hypre/testing').
+The scripts augment the 'runtest.sh' runtime tests in 'test/TEST_*'.
+
+Every test in this directory may be run manually by developers.  Many of the
+scripts are also run as part of the nightly regression testing, currently
+developed and maintained in a separate git repository called 'hypre/autotest'.
 
 =====================
 
@@ -17,20 +15,13 @@
 represents an individual test written by a hypre developer.  The special scripts
 are as follows (note that they are the only scripts with "test" in their names):
 
-1. 'test.sh' - Used to run individual tests locally on a machine.
-2. 'testsrc.sh' - Used to run individual tests on a remote machine.
-3. 'testdist.sh' - Used to test a new distribution before release.
-4. 'autotest.sh' - Usually run in an automatic fashion by 'cron', but may also
-                   be run manually by developers (useful for debugging).
+1. 'test.sh'       - Used to run individual tests.
+2. 'cleantest.sh'  - Used to clean up the output from a test (or tests).
+3. 'renametest.sh' - Used to rename the output from a test.
 
 Usage information for every script (special or individual test) can be obtained
 by running it with the '-h' option (e.g., 'test.sh -h' or 'make.sh -h').
 
-The file 'cronfile' encapsulates the current 'cron' entries for auto-testing.
-It is possible (and probable) to have multiple developers running 'cron' jobs as
-part of the overall auto-testing.  This needs to be coordinated if the output
-files are being written to the global auto-testing directory.
-
 =====================
 
 Writing tests:
@@ -43,8 +34,7 @@
 template and make the appropriate modifications.  Try not to use the word "test"
 in the name of the script so that we can keep the convention of only the special
 scripts having this in their names.  Try not to use absolute directory paths in
-the script.  If in doubt, talk to another developer or send an inquiry to
-hypre-support@llnl.gov.
+the script.
 
 =====================
 
@@ -52,7 +42,6 @@
 
 - Minimal limitations on the types of tests that are possible.
 - Developers should be able to run the tests manually.
-- Tests should be runable on both the repository and each release.
 - Minimal dependence on operating system and software tools (for portability).
 - Developers should be able to easily add new tests.
 - Simplicity and flexibility.
diff -Nru hypre-2.11.2/AUTOTEST/testdist.sh hypre-2.13.0/AUTOTEST/testdist.sh
--- hypre-2.11.2/AUTOTEST/testdist.sh	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/AUTOTEST/testdist.sh	1970-01-01 00:00:00.000000000 +0000
@@ -1,174 +0,0 @@
-#!/bin/sh
-#BHEADER**********************************************************************
-# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
-# Produced at the Lawrence Livermore National Laboratory.
-# This file is part of HYPRE.  See file COPYRIGHT for details.
-#
-# HYPRE is free software; you can redistribute it and/or modify it under the
-# terms of the GNU Lesser General Public License (as published by the Free
-# Software Foundation) version 2.1 dated February 1999.
-#
-# $Revision$
-#EHEADER**********************************************************************
-
-# Which tests to run?
-TEST_PATCH="-tux339"
-TEST_MINOR="$TEST_PATCH -rzzeus -rzmerl -vulcan"
-TEST_MAJOR="$TEST_MINOR"
-TERMCMD=""
-
-while [ "$*" ]
-do
-   case $1 in
-      -h|-help)
-         cat <<EOF
-
-   $0 [options] {release}
-
-   where: {release}  is a hypre release tar file (gzipped, absolute path)
-
-   with options:
-      -xterm         run the tests in parallel using multiple xterm windows
-      -h|-help       prints this usage information and exits
-      -t|-trace      echo each command
-
-   This script unpacks {release} in the parent directory and lists the tests
-   needed to verify it, based on the type of release (MAJOR, MINOR, or PATCH).
-   If all required tests pass, a verification file is generated containing the
-   logs from the runs.  Otherwise, tests that have failed or have not been run
-   yet can be started, and the script will have to be re-run after their
-   completion to generate the verification file.
-
-   Example usage: $0 /usr/casc/hypre/hypre-2.10.1.tar.gz
-
-   NOTE: The absolute path for the release is required.
-
-   NOTE: Because of ssh restrictions at LLNL, run this script on an LC machine.
-   You may need to change the default tux platform at the top of this file to
-   your own tux machine.  Finally, for each release tar file, it is recommended
-   that you run this script inside a separate copy of the AUTOTEST directory
-   (this will avoid result conflicts in common tests).
-
-EOF
-         exit
-         ;;
-      -t|-trace)
-         set -xv
-         shift
-         ;;
-      -xterm)
-         # Get the terminal command and make sure it runs bash
-         TERMCMD="$TERM -e"; SHELL=/bin/sh
-         shift
-         ;;
-      *)
-         break
-         ;;
-   esac
-done
-
-# Setup
-testing_dir=`cd ..; pwd`
-autotest_dir="$testing_dir/AUTOTEST"
-release_file=$1
-release_dir=`basename $release_file | awk -F.tar '{print $1}'`
-release=`echo $release_dir | sed 's/hypre-//' | sed 's/.tar.gz//'`
-output_dir="$testing_dir/AUTOTEST-hypre-$release"
-case $release in
-   [1-9][0-9]*.0.0)                     NAME="MAJOR"; TESTS=$TEST_MAJOR ;;
-   [1-9][0-9]*.[1-9][0-9]*.0)           NAME="MINOR"; TESTS=$TEST_MINOR ;;
-   [1-9][0-9]*.[1-9][0-9]*.[1-9][0-9]*) NAME="PATCH"; TESTS=$TEST_PATCH ;;
-   *)                                   NAME="PATCH"; TESTS=$TEST_PATCH ;;
-esac
-
-# Extract the release
-cd $testing_dir
-echo "Checking the distribution file..."
-tmpdir=$release_dir.TMP
-mkdir -p $tmpdir
-rm -rf $tmpdir/$release_dir
-tar -C $tmpdir -zxf $release_file
-if !(diff -r $release_dir $tmpdir/$release_dir 2>/dev/null 1>&2) then
-   rm -rf $release_dir $output_dir $autotest_dir/autotest-*
-   tar -zxf $release_file
-fi
-rm -rf $tmpdir
-echo ""
-echo "The following tests are needed to verify this $NAME release: $TESTS"
-echo ""
-
-# List the status of the required tests
-cd $autotest_dir
-NOTRUN=""
-FAILED=""
-PENDING=""
-for test in $TESTS
-do
-   name=`echo $test | sed 's/[0-9]//g'`
-   # Determine failed, pending, passed and tests that have not been run
-   if [ -f $output_dir/machine$name.err ]; then
-      if [ -s $output_dir/machine$name.err ]; then
-         status="[FAILED] "; FAILED="$FAILED $test"
-      else
-         status="[PASSED] ";
-      fi
-   elif [ ! -e autotest$name-start ]; then
-      status="[NOT RUN]"; NOTRUN="$NOTRUN $test"
-   elif [ ! -e autotest$name-done ]; then
-      status="[PENDING]"; PENDING="$PENDING $test"
-   else
-      status="[UNKNOWN]";
-   fi
-   if [ "$TERMCMD" == "" ]; then
-      echo "$status ./autotest.sh -dist $release $test"
-   else
-      echo "$status $TERMCMD ./autotest.sh -dist $release $test &"
-   fi
-done
-
-# If all tests have been run, create a tarball of the log files
-if [ "$NOTRUN$PENDING" == "" ]; then
-   echo ""; echo "Generating the verification file AUTOTEST-hypre-$release.tgz"
-   cd $testing_dir
-   mv -f $autotest_dir/autotest-* $output_dir
-   tar -zcf $autotest_dir/AUTOTEST-hypre-$release.tgz `basename $output_dir`
-fi
-
-# If all tests have passed, print a message and exit
-if [ "$NOTRUN$FAILED$PENDING" == "" ]; then
-   echo "The release is verified!"
-   exit
-fi
-
-cat <<EOF
-
-The release can not be automatically verified at this time because not all tests
-are listed as [PASSED].  You may choose to continue with the release anyway, but
-it is your responsibility to ensure that the test errors are acceptable.
-
-This script can start the remaining tests now.  Alternatively, you can run the
-above commands manually (or in a cron job). If you do this, make sure to examine
-the standart error of the autotest.sh script.
-
-EOF
-
-echo -n "Do you want to start the remaining tests? (yes,no) : "
-read -e RUN
-if [ "$RUN" == "yes" ]; then
-   for test in $FAILED $NOTRUN
-   do
-      name=`echo $test | sed 's/[0-9]//g'`
-      rm -rf $output_dir/machine$name.??? autotest$name*
-      if [ "$TERMCMD" == "" ]; then
-         echo "Running test [./autotest.sh -dist $release $test]"
-         ./autotest.sh -dist $release $test 2>> autotest$name.err
-      else
-         echo "Running test [$TERMCMD ./autotest.sh -dist $release $test &]"
-         $TERMCMD "./autotest.sh -dist $release $test 2>> autotest$name.err" 2>> autotest$name.err &
-      fi
-      echo ""
-   done
-fi
-echo ""
-echo "Re-run the script after tests have completed to verify the release."
-echo ""
diff -Nru hypre-2.11.2/AUTOTEST/testsrc.sh hypre-2.13.0/AUTOTEST/testsrc.sh
--- hypre-2.11.2/AUTOTEST/testsrc.sh	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/AUTOTEST/testsrc.sh	1970-01-01 00:00:00.000000000 +0000
@@ -1,89 +0,0 @@
-#!/bin/sh
-#BHEADER**********************************************************************
-# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
-# Produced at the Lawrence Livermore National Laboratory.
-# This file is part of HYPRE.  See file COPYRIGHT for details.
-#
-# HYPRE is free software; you can redistribute it and/or modify it under the
-# terms of the GNU Lesser General Public License (as published by the Free
-# Software Foundation) version 2.1 dated February 1999.
-#
-# $Revision$
-#EHEADER**********************************************************************
-
-while [ "$*" ]
-do
-   case $1 in
-      -h|-help)
-         cat <<EOF
-
-   $0 [options] {src_dir} {machine:rem_path} {testname}.sh
-
-   where: {src_dir}  is the hypre source directory
-          {machine}  is the name of the machine to run on
-          {rem_path} is the remote path where the {src_dir} directory
-                     will be copied
-          {testname} is the user-defined name for the test script
-
-   with options:
-      -h|-help       prints this usage information and exits
-      -t|-trace      echo each command
-
-   This script is a specialized version of 'test.sh' that runs script
-   {testname}.sh remotely on {machine}.  It is assumed that {testname}.sh takes
-   only one argument, which will be set to '..' on the remote machine.
-
-   The script first copies the {src_dir} directory into {machine:rem_path}, then
-   copies the current AUTOTEST script directory there (potentially overwriting
-   an already existing AUTOTEST directory).
-
-   The output is still collected locally in exactly the same way as 'test.sh'.
-
-   Example usage: $0 ../src tux149:. machine-tux.sh
-
-EOF
-         exit
-         ;;
-      -t|-trace)
-         set -xv
-         shift
-         ;;
-      *)
-         break
-         ;;
-   esac
-done
-
-# Setup
-src_dir=`cd $1; pwd`
-machine=`echo $2 | awk -F: '{print $1}'`
-rem_path=`echo $2 | awk -F: '{print $2}'`
-testname=`basename $3 .sh`
-rem_dir=`basename $src_dir`
-
-# Copy the source and AUTOTEST directories using rsync/tar+ssh
-# ssh $machine "rm -fr $rem_path/$rem_dir"
-# scp -r $src_dir $machine:$rem_path/$rem_dir
-# scp -r . $machine:$rem_path/$rem_dir/AUTOTEST
-echo "Copying sources to $machine"
-rem_dir_exists=`ssh -q $machine "(/bin/sh -c \"[ -d $rem_path/$rem_dir ] && echo \"yes\" || (mkdir -p $rem_path/$rem_dir; echo \"no\")\")"`
-if [ "$rem_dir_exists" == "no" ]
-then
-   tar -C `dirname $src_dir` -zcf - $rem_dir | ssh -q $machine tar -C $rem_path -zxf -
-else
-   rsync -zae "ssh -q" --delete $src_dir/ $USER@$machine:$rem_path/$rem_dir
-fi
-rsync -zae "ssh -q" --delete . $USER@$machine:$rem_path/$rem_dir/AUTOTEST
-
-# Run the test and copy the results
-# Use the '.hyprerc' file when needed to customize the environment
-hyprerc_exists=`ssh -q $machine "( /bin/sh -c '[ -f .hyprerc ] && echo yes' )"`
-if [ "$hyprerc_exists" == "yes" ]
-then
-   ssh -q $machine "source .hyprerc; cd $rem_path/$rem_dir/AUTOTEST; ./test.sh ${testname}.sh .."
-else
-   ssh -q $machine "cd $rem_path/$rem_dir/AUTOTEST; ./test.sh ${testname}.sh .."
-fi
-rm -fr $testname.???
-echo "Copying output files from $machine"
-scp -q -r $machine:$rem_path/$rem_dir/AUTOTEST/$testname.\?\?\? .
diff -Nru hypre-2.11.2/CHANGELOG hypre-2.13.0/CHANGELOG
--- hypre-2.11.2/CHANGELOG	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/CHANGELOG	2017-10-20 17:42:22.000000000 +0000
@@ -15,6 +15,38 @@
 # recent release.
 #=============================================================================
 
+Version 2.13.0 released 2017/10/20
+
+- Added new multigrid reduction (MGR) solver
+
+- Added BoomerAMG feature SetCpointsToKeep() to force specific points to keep on
+  all coarse grids, up to a specified coarse level.
+
+- Changed defaults in AMG Hybrid solver to match BoomerAMG defaults
+
+- Updated hypre blas/lapack implementation
+
+- Removed SuperLU/DSuperLU from the distribution and updated the FEI code to
+  work with external installations of SuperLU/SuperLU_DIST (version 5.2.1)
+
+- Fixed a bug in Euclid
+
+- Fixed Windows Visual Studio compile (version 2017)
+
+Version 2.12.1 released 2017/09/29
+
+- Added support for single and quad precision floating point numbers
+
+- Added weighted Jacobi (relax_type 7) to be usable on GPU in BoomerAMG
+
+- Various bug fixes
+
+Version 2.12.0 released 2017/05/02
+
+- Added GPU support to hypre.  The Struct and SStruct code can use CUDA, RAJA,
+  or KOKKOS.  The ParCSR code uses CUDA.  The BoomerAMG setup phase is not yet
+  ported to the GPU.  This release uses unified memory.
+
 Version 2.11.2 released 2017/03/13
 
 - Changed the defaults in hypre to HMIS with ext+i(4) interpolation
diff -Nru hypre-2.11.2/debian/changelog hypre-2.13.0/debian/changelog
--- hypre-2.11.2/debian/changelog	2017-10-18 09:57:39.000000000 +0000
+++ hypre-2.13.0/debian/changelog	2017-12-11 19:47:42.000000000 +0000
@@ -1,3 +1,35 @@
+hypre (2.13.0-2) unstable; urgency=medium
+
+  * Team upload.
+  * Standards-Version: 4.1.2
+
+ -- Drew Parsons <dparsons@debian.org>  Tue, 12 Dec 2017 03:47:42 +0800
+
+hypre (2.13.0-1exp1) experimental; urgency=medium
+
+  * Team upload.
+  * New upstream version.
+    - new multigrid reduction (MGR) solver
+    - updates AMG Hybrid solver with BoomerAMG
+    - now uses external SuperLU (remove debian superlu_internal.patch)
+      and SuperLU-Dist.
+      Build-Depends: libsuperlu-dev, libsuperlu-dist-dev
+    - regression: breaks build of distributed_ls/pilut. Fixed with
+      debian patch pilut_blas_mangle.patch to use external BLAS.
+  * libhypre-dev depends on optimised OpenBLAS or ATLAS to provide
+    blas and lapack, where available. 
+
+ -- Drew Parsons <dparsons@debian.org>  Fri, 24 Nov 2017 13:52:43 +0800
+
+hypre (2.12.1-1exp1) experimental; urgency=medium
+
+  * Team upload.
+  * New upstream version.
+    - adds GPU support (not yet activated in Debian)
+    - adds support for single and quad precision floating point numbers
+
+ -- Drew Parsons <dparsons@debian.org>  Wed, 18 Oct 2017 19:17:17 +0800
+
 hypre (2.11.2-2) unstable; urgency=medium
 
   * Team upload.
diff -Nru hypre-2.11.2/debian/control hypre-2.13.0/debian/control
--- hypre-2.11.2/debian/control	2017-10-18 09:57:39.000000000 +0000
+++ hypre-2.13.0/debian/control	2017-12-11 19:47:42.000000000 +0000
@@ -3,14 +3,15 @@
 Priority: optional
 Maintainer: Debian Science Maintainers <debian-science-maintainers@lists.alioth.debian.org>
 Uploaders: "Adam C. Powell, IV" <hazelsct@debian.org>
-Standards-Version: 4.1.1
+Standards-Version: 4.1.2
 Build-Depends:
  autoconf,
  automake,
  debhelper (>= 10),
  gfortran,
- libblas-dev | libblas.so,
- liblapack-dev | liblapack.so,
+ libblas-dev | libopenblas-dev | libatlas-base-dev | libblas.so,
+ liblapack-dev | libopenblas-dev | libatlas-base-dev | liblapack.so,
+ libsuperlu-dev, libsuperlu-dist-dev,
  libltdl-dev,
  libtool,
  mpi-default-dev,
@@ -18,7 +19,7 @@
 Vcs-Browser: https://anonscm.debian.org/git/debian-science/packages/hypre.git
 Homepage: http://www.llnl.gov/casc/hypre/
 
-Package: libhypre-2.11.2
+Package: libhypre-2.13.0
 Architecture: any
 Multi-Arch: same
 Section: libs
@@ -36,9 +37,9 @@
 Architecture: any
 Section: libdevel
 Depends:
- libblas-dev | libblas.so,
- libhypre-2.11.2 (= ${binary:Version}),
- liblapack-dev | liblapack.so,
+ libhypre-2.13.0 (= ${binary:Version}),
+ libopenblas-dev | libatlas-base-dev | libblas-dev | libblas.so,
+ libopenblas-dev | libatlas-base-dev | liblapack-dev | liblapack.so,
  libsuperlu-dev,
  mpi-default-dev,
  ${misc:Depends},
diff -Nru hypre-2.11.2/debian/patches/cmake-install.patch hypre-2.13.0/debian/patches/cmake-install.patch
--- hypre-2.11.2/debian/patches/cmake-install.patch	2017-10-18 09:57:39.000000000 +0000
+++ hypre-2.13.0/debian/patches/cmake-install.patch	1970-01-01 00:00:00.000000000 +0000
@@ -1,32 +0,0 @@
-Index: hypre-2.10.0b/src/CMakeLists.txt
-===================================================================
---- hypre-2.10.0b.orig/src/CMakeLists.txt
-+++ hypre-2.10.0b/src/CMakeLists.txt
-@@ -8,6 +8,10 @@ set (HYPRE_TIME    00:00:00)
- set (HYPRE_BUGS    hypre-support@llnl.gov)
- set (HYPRE_SRCDIR  "${PROJECT_SOURCE_DIR}")
- 
-+set(HYPRE_MAJOR_VERSION 2)
-+set(HYPRE_MINOR_VERSION 10)
-+set(HYPRE_PATCH_VERSION 0b)
-+
- if (${hypre_SOURCE_DIR} STREQUAL ${hypre_BINARY_DIR})
-   message(FATAL_ERROR "In-place build not allowed! Please use a separate build directory. See the Users Manual or INSTALL file for details.")
- endif ()
-@@ -778,9 +782,13 @@ if (HYPRE_USING_FEI)
- endif ()
- 
- add_library (HYPRE ${HYPRE_SOURCES} ${FEI_LIBS})
--
--install (TARGETS HYPRE DESTINATION lib)
--install (FILES ${HYPRE_HEADERS} DESTINATION include)
-+if (BUILD_SHARED_LIBS)
-+set_target_properties(HYPRE PROPERTIES
-+     VERSION ${HYPRE_MAJOR_VERSION}.${HYPRE_MINOR_VERSION}.${HYPRE_PATCH_VERSION}
-+     SOVERSION ${HYPRE_MAJOR_VERSION})
-+endif (BUILD_SHARED_LIBS)
-+install (TARGETS HYPRE DESTINATION ${INSTALL_LIB_DIR})
-+install (FILES ${HYPRE_HEADERS} DESTINATION include/hypre)
- 
- # add_subdirectory (test EXCLUDE_FROM_ALL)
-  
diff -Nru hypre-2.11.2/debian/patches/cmake.patch hypre-2.13.0/debian/patches/cmake.patch
--- hypre-2.11.2/debian/patches/cmake.patch	2017-10-18 09:57:39.000000000 +0000
+++ hypre-2.13.0/debian/patches/cmake.patch	1970-01-01 00:00:00.000000000 +0000
@@ -1,45 +0,0 @@
-Index: hypre-2.9.1a/src/CMakeLists.txt
-===================================================================
---- hypre-2.9.1a.orig/src/CMakeLists.txt	2013-09-12 08:36:16.000000000 +0200
-+++ hypre-2.9.1a/src/CMakeLists.txt	2013-09-12 08:36:16.000000000 +0200
-@@ -772,7 +772,7 @@
- add_library (HYPRE ${HYPRE_SOURCES} ${FEI_LIBS})
- 
- install (TARGETS HYPRE DESTINATION lib)
--install (FILES ${HYPRE_HEADERS} DESTINATION include)
-+install (FILES ${HYPRE_HEADERS} DESTINATION include/hypre)
- 
- # add_subdirectory (test EXCLUDE_FROM_ALL)
-  
-Index: hypre-2.9.1a/src/FEI_mv/SuperLU/CMakeLists.txt
-===================================================================
---- hypre-2.9.1a.orig/src/FEI_mv/SuperLU/CMakeLists.txt	2012-09-25 00:59:53.000000000 +0200
-+++ hypre-2.9.1a/src/FEI_mv/SuperLU/CMakeLists.txt	2013-09-12 08:40:05.000000000 +0200
-@@ -59,4 +59,5 @@
-   set_target_properties(HYPRE_superlu PROPERTIES COMPILE_FLAGS "-fPIC")
- endif()
- 
--install (FILES ${HYPRE_superlu_HEADERS} DESTINATION include)
-+install (TARGETS HYPRE_superlu DESTINATION lib)
-+install (FILES ${HYPRE_superlu_HEADERS} DESTINATION include/hypre)
-Index: hypre-2.9.1a/src/FEI_mv/fei-hypre/CMakeLists.txt
-===================================================================
---- hypre-2.9.1a.orig/src/FEI_mv/fei-hypre/CMakeLists.txt	2012-09-25 02:13:39.000000000 +0200
-+++ hypre-2.9.1a/src/FEI_mv/fei-hypre/CMakeLists.txt	2013-09-12 08:40:41.000000000 +0200
-@@ -66,4 +66,5 @@
-   set_target_properties(HYPRE_fei PROPERTIES COMPILE_FLAGS "-fPIC")
- endif()
- 
--install (FILES ${HYPRE_fei_HEADERS} DESTINATION include)
-+install (TARGETS HYPRE_fei DESTINATION lib)
-+install (FILES ${HYPRE_fei_HEADERS} DESTINATION include/hypre)
-Index: hypre-2.9.1a/src/FEI_mv/femli/CMakeLists.txt
-===================================================================
---- hypre-2.9.1a.orig/src/FEI_mv/femli/CMakeLists.txt	2012-09-25 00:59:54.000000000 +0200
-+++ hypre-2.9.1a/src/FEI_mv/femli/CMakeLists.txt	2013-09-12 08:49:00.000000000 +0200
-@@ -46,3 +46,5 @@
- if(BUILD_SHARED_LIBS)
-   set_target_properties(HYPRE_mli PROPERTIES COMPILE_FLAGS "-fPIC")
- endif()
-+
-+install (TARGETS HYPRE_mli DESTINATION lib)
diff -Nru hypre-2.11.2/debian/patches/complex.patch hypre-2.13.0/debian/patches/complex.patch
--- hypre-2.11.2/debian/patches/complex.patch	2017-10-18 09:57:39.000000000 +0000
+++ hypre-2.13.0/debian/patches/complex.patch	2017-12-11 19:47:42.000000000 +0000
@@ -27,7 +27,7 @@
 ===================================================================
 --- hypre.orig/src/config/configure.in
 +++ hypre/src/config/configure.in
-@@ -1259,6 +1259,12 @@ HYPRE_INSTALLDIR="${prefix}"
+@@ -1698,6 +1698,12 @@ HYPRE_INSTALLDIR="${prefix}"
  HYPRE_LIBINSTALL="${libdir}"
  HYPRE_INCINSTALL="${includedir}/hypre"
  
diff -Nru hypre-2.11.2/debian/patches/config-for-petsc.patch hypre-2.13.0/debian/patches/config-for-petsc.patch
--- hypre-2.11.2/debian/patches/config-for-petsc.patch	2017-10-18 09:57:39.000000000 +0000
+++ hypre-2.13.0/debian/patches/config-for-petsc.patch	1970-01-01 00:00:00.000000000 +0000
@@ -1,15 +0,0 @@
-Index: hypre-2.10.0b/src/lib/Makefile
-===================================================================
---- hypre-2.10.0b.orig/src/lib/Makefile
-+++ hypre-2.10.0b/src/lib/Makefile
-@@ -53,9 +53,7 @@ LIBS_HYPRE = ../utilities/libHYPRE_utili
-   ../distributed_ls/Euclid/libHYPRE_Euclid.so \
-   ../distributed_ls/ParaSails/libHYPRE_ParaSails.so \
-   ../distributed_ls/pilut/libHYPRE_DistributedMatrixPilutSolver.so \
--  ../parcsr_ls/libHYPRE_parcsr_ls.so \
--  ../FEI_mv/femli/libHYPRE_mli.so \
--  ../FEI_mv/fei-hypre/libHYPRE_FEI.so
-+  ../parcsr_ls/libHYPRE_parcsr_ls.so
- 
- SONAME = libHYPRE-${HYPRE_RELEASE_VERSION}.so
- SOLIBS = ${MPILIBDIRS} ${MPILIBS} ${LAPACKLIBDIRS} ${LAPACKLIBS}\
diff -Nru hypre-2.11.2/debian/patches/config-update-dir.patch hypre-2.13.0/debian/patches/config-update-dir.patch
--- hypre-2.11.2/debian/patches/config-update-dir.patch	2017-10-18 09:57:39.000000000 +0000
+++ hypre-2.13.0/debian/patches/config-update-dir.patch	1970-01-01 00:00:00.000000000 +0000
@@ -1,19 +0,0 @@
-Change include paths so "aclocal -I ." works.
-
-Index: hypre/src/config/configure.in
-===================================================================
---- hypre.orig/src/config/configure.in
-+++ hypre/src/config/configure.in
-@@ -61,9 +61,9 @@
- m4_define([M4_HYPRE_BUGS],    [hypre-support@llnl.gov])
- m4_define([M4_HYPRE_SRCDIR],  [`pwd`])
- 
--m4_include([config/hypre_blas_macros.m4])
--m4_include([config/hypre_lapack_macros.m4])
--m4_include([config/hypre_macros_misc.m4])
-+m4_include([hypre_blas_macros.m4])
-+m4_include([hypre_lapack_macros.m4])
-+m4_include([hypre_macros_misc.m4])
- 
- AC_PREREQ(2.59)
- AC_REVISION($Id: configure.in,v 1.101 2011/11/14 22:36:27 falgout Exp $)
diff -Nru hypre-2.11.2/debian/patches/configure.patch hypre-2.13.0/debian/patches/configure.patch
--- hypre-2.11.2/debian/patches/configure.patch	2017-10-18 09:57:39.000000000 +0000
+++ hypre-2.13.0/debian/patches/configure.patch	2017-12-11 19:47:42.000000000 +0000
@@ -11,7 +11,7 @@
  
  dnl *********************************************************************
  dnl * Initialize some variables
-@@ -161,19 +161,18 @@ hypre_lapack_lib_dir_old_style=no
+@@ -169,19 +169,18 @@ hypre_lapack_lib_dir_old_style=no
  dnl *********************************************************************
  dnl * Determine BUILD, HOST, and TARGET types
  dnl *********************************************************************
@@ -36,7 +36,7 @@
  fi
  
  dnl *********************************************************************
-@@ -1258,7 +1257,7 @@ dnl * Set installation directories
+@@ -1697,7 +1696,7 @@ dnl * Set installation directories
  dnl *********************************************************************
  HYPRE_INSTALLDIR="${prefix}"
  HYPRE_LIBINSTALL="${libdir}"
diff -Nru hypre-2.11.2/debian/patches/examples.patch hypre-2.13.0/debian/patches/examples.patch
--- hypre-2.11.2/debian/patches/examples.patch	2017-10-18 09:57:39.000000000 +0000
+++ hypre-2.13.0/debian/patches/examples.patch	1970-01-01 00:00:00.000000000 +0000
@@ -1,257 +0,0 @@
-Index: hypre-2.10.0b/src/examples/Makefile
-===================================================================
---- hypre-2.10.0b.orig/src/examples/Makefile
-+++ hypre-2.10.0b/src/examples/Makefile
-@@ -11,7 +11,7 @@ HYPRE_DIR = ../hypre
- # Compiling and linking options
- ########################################################################
- COPTS     = -g -Wall
--CINCLUDES = -I$(HYPRE_DIR)/include
-+CINCLUDES = -I${HYPRE_INC_INSTALL}
- CDEFS     = -DHAVE_CONFIG_H -DHYPRE_TIMING
- CFLAGS    = $(COPTS) $(CINCLUDES) $(CDEFS)
- FOPTS     = -g
-@@ -27,7 +27,27 @@ F90FLAGS = $(FFLAGS) $(IF90FLAGS)
- 
- 
- LINKOPTS  = $(COPTS)
--LIBS      = -L$(HYPRE_DIR)/lib -lHYPRE -lm
-+LIBS      = \
-+ -L${HYPRE_LIB_INSTALL} \
-+ -lHYPRE_utilities \
-+ -lHYPRE_multivector \
-+ -lHYPRE_krylov \
-+ -lHYPRE_struct_mv \
-+ -lHYPRE_struct_ls \
-+ -lHYPRE_sstruct_mv \
-+ -lHYPRE_sstruct_ls \
-+ -lHYPRE_seq_mv \
-+ -lHYPRE_parcsr_mv \
-+ -lHYPRE_parcsr_ls \
-+ -lHYPRE_parcsr_block_mv \
-+ -lHYPRE_DistributedMatrix \
-+ -lHYPRE_MatrixMatrix \
-+ -lHYPRE_IJ_mv \
-+ -lHYPRE_Euclid \
-+ -lHYPRE_ParaSails \
-+ -lHYPRE_DistributedMatrixPilutSolver \
-+ -lHYPRE \
-+ -lblas -llapack -lm
- LFLAGS    = $(LINKOPTS) $(LIBS) -lstdc++
- LFLAGS_B =\
-  -L${HYPRE_DIR}/lib\
-@@ -37,7 +57,7 @@ LFLAGS_B =\
-  -lbHYPRE\
-  -lsidl -ldl -lxml2
- LFLAGS77 = $(LFLAGS)
--LFLAGS90 =
-+LFLAGS90 = $(LFLAGS)
- 
- ########################################################################
- # Rules for compiling the source files
-@@ -47,7 +67,7 @@ LFLAGS90 =
- .c.o:
- 	$(CC) $(CFLAGS) -c $<
- .f.o:
--	$(F77) $(FFLAGS) -c $<
-+	$(F90) $(FFLAGS) -c $<
- .cxx.o:
- 	$(CXX) $(CXXFLAGS) -c $<
- 
-@@ -65,9 +85,10 @@ LFLAGS90 =
- ########################################################################
- # List of all programs to be compiled
- ########################################################################
--ALLPROGS = ex1 ex2 ex3 ex4 ex5 ex5f ex6 ex7 ex8 ex9 ex10 ex11 ex12 ex12f \
-+ALLPROGS = ex1 ex2 ex3 ex4 ex5 ex5f ex6 ex7 ex8 ex9 ex11 ex12 ex12f \
-            ex13 ex14 ex15 ex16
- BIGINTPROGS = ex5big ex15big
-+FEIPROGS= ex10
- BABELPROGS = ex5b ex5b77 ex5bxx ex6b ex6b77
- # ... ex5bp and ex5b90 are not in BABELPROGS because they require a
- # software environment which many people haven't set up.
-@@ -89,59 +110,61 @@ maxdim: $(MAXDIMPROGS)
- 
- complex: $(COMPLEXPROGS)
- 
-+fei: $(FEIPROGS)
-+
- ########################################################################
- # Example 1
- ########################################################################
- ex1: ex1.o
--	$(CC) -o $@ $^ $(LFLAGS)
-+	$(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS)
- 
- ########################################################################
- # Example 2
- ########################################################################
- ex2: ex2.o
--	$(CC) -o $@ $^ $(LFLAGS)
-+	$(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS)
- 
- ########################################################################
- # Example 3
- ########################################################################
- ex3: ex3.o
--	$(CC) -o $@ $^ $(LFLAGS)
-+	$(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS)
- 
- ########################################################################
- # Example 4
- ########################################################################
- ex4: ex4.o
--	$(CC) -o $@ $^ $(LFLAGS)
-+	$(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS)
- 
- ########################################################################
- # Example 5
- ########################################################################
- ex5: ex5.o
--	$(CC) -o $@ $^ $(LFLAGS)
-+	$(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS)
- 
- ########################################################################
- # Example 5 with 64-bit integers
- ########################################################################
- ex5big: ex5big.o
--	$(CC) -o $@ $^ $(LFLAGS)
-+	$(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS)
- 
- ########################################################################
- # Example 5 Fortran 77
- ########################################################################
- ex5f: ex5f.o
--	$(F77) -o $@ $^ $(LFLAGS77)
-+	$(F90) -o $@ $^ $(LFLAGS90)
- 
- ########################################################################
- # Example 5 Babel C
- ########################################################################
- ex5b: ex5b.o
--	$(CC) -o $@ $^ $(LFLAGS_B) $(LFLAGS)
-+	$(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS_B) $(LFLAGS)
- 
- ########################################################################
- # Example 5 Babel Fortran 77
- ########################################################################
- ex5b77: ex5b77.o
--	$(F77) -o $@ $^ $(LFLAGS_B) $(LFLAGS)
-+	$(F90) -o $@ $^ $(LFLAGS_B) $(LFLAGS90)
- 
- ########################################################################
- # Example 5 Babel Fortran 90
-@@ -153,97 +176,97 @@ ex5b90: ex5b90.o
- # Example 5 Babel C++
- ########################################################################
- ex5bxx: ex5bxx.o
--	$(CXX) -o $@ $^ $(LFLAGS_B) $(LFLAGS)
-+	$(CXX) -o $@ $^ -Wl,--no-as-needed $(LFLAGS_B) $(LFLAGS)
- 
- ########################################################################
- # Example 6
- ########################################################################
- ex6: ex6.o
--	$(CC) -o $@ $^ $(LFLAGS)
-+	$(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS)
- 
- ########################################################################
- # Example 6 Babel C
- ########################################################################
- ex6b: ex6b.o
--	$(CC) -o $@ $^ $(LFLAGS_B) $(LFLAGS)
-+	$(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS_B) $(LFLAGS)
- 
- ########################################################################
- # Example 6 Babel Fortran 77
- ########################################################################
- ex6b77: ex6b77.o
--	$(F77) -o $@ $^ $(LFLAGS_B) $(LFLAGS)
-+	$(F90) -o $@ $^ $(LFLAGS_B) $(LFLAGS90)
- 
- ########################################################################
- # Example 7
- ########################################################################
- ex7: ex7.o
--	$(CC) -o $@ $^ $(LFLAGS)
-+	$(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS)
- 
- ########################################################################
- # Example 8
- ########################################################################
- ex8: ex8.o
--	$(CC) -o $@ $^ $(LFLAGS)
-+	$(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS)
- 
- ########################################################################
- # Example 9
- ########################################################################
- ex9: ex9.o
--	$(CC) -o $@ $^ $(LFLAGS)
-+	$(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS)
- 
- ########################################################################
- # Example 10
- ########################################################################
- ex10: ex10.o
--	$(CXX) -o $@ $^ $(LFLAGS)
-+	$(CXX) -o $@ $^ -Wl,--no-as-needed $(LFLAGS)
- 
- ########################################################################
- # Example 11
- ########################################################################
- ex11: ex11.o
--	$(CC) -o $@ $^ $(LFLAGS)
-+	$(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS)
- 
- ########################################################################
- # Example 12
- ########################################################################
- ex12: ex12.o
--	$(CC) -o $@ $^ $(LFLAGS)
-+	$(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS)
- 
- ########################################################################
- # Example 12 Fortran 77
- ########################################################################
- ex12f: ex12f.o
--	$(F77) -o $@ $^ $(LFLAGS77)
-+	$(F90) -o $@ $^ $(LFLAGS) $(LFLAGS90)
- 
- ########################################################################
- # Example 13
- ########################################################################
- ex13: ex13.o
--	$(CC) -o $@ $^ $(LFLAGS)
-+	$(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS)
- 
- ########################################################################
- # Example 14
- ########################################################################
- ex14: ex14.o
--	$(CC) -o $@ $^ $(LFLAGS)
-+	$(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS)
- 
- ########################################################################
- # Example 15
- ########################################################################
- ex15: ex15.o
--	$(CC) -o $@ $^ $(LFLAGS)
-+	$(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS)
- 
- ########################################################################
- # Example 15 with 64-bit integers
- ########################################################################
- ex15big: ex15big.o
--	$(CC) -o $@ $^ $(LFLAGS)
-+	$(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS)
- 
- ########################################################################
- # Example 16
- ########################################################################
- ex16: ex16.o
--	$(CC) -o $@ $^ $(LFLAGS)
-+	$(CC) -o $@ $^ -Wl,--no-as-needed $(LFLAGS)
- 
- ########################################################################
- # Example 17
diff -Nru hypre-2.11.2/debian/patches/fgmres-lib.patch hypre-2.13.0/debian/patches/fgmres-lib.patch
--- hypre-2.11.2/debian/patches/fgmres-lib.patch	2017-10-18 09:57:39.000000000 +0000
+++ hypre-2.13.0/debian/patches/fgmres-lib.patch	1970-01-01 00:00:00.000000000 +0000
@@ -1,183 +0,0 @@
-Add libHYPRE_FEI_fgmres.[a|so] to resolve a circular dependency:
-libHYPRE_FEI with fgmres.c depends on libHYPRE_mli.so which links with
-symbols in fgmres.c .  (Resolves issue 182.)
-
-Index: hypre-2.9.1a/src/config/configure.in
-===================================================================
---- hypre-2.9.1a.orig/src/config/configure.in	2013-09-12 17:33:00.000000000 +0200
-+++ hypre-2.9.1a/src/config/configure.in	2013-09-12 17:33:00.000000000 +0200
-@@ -914,7 +914,7 @@
- if test "$hypre_using_fei" = "yes"
- then
-    HYPRE_FEI_SRC_DIR="$HYPRE_SRCDIR/FEI_mv"
--   HYPRE_FEI_SUBDIRS="femli fei-hypre"
-+   HYPRE_FEI_SUBDIRS="fgmres femli fei-hypre"
-    HYPRE_FEI_HYPRE_FILES="$HYPRE_SRCDIR/FEI_mv/fei-hypre/*.o"
-    HYPRE_FEI_FEMLI_FILES="$HYPRE_SRCDIR/FEI_mv/femli/*.o"
-    if test "$hypre_using_superlu" = "yes"
-Index: hypre-2.9.1a/src/FEI_mv/fei-hypre/Makefile
-===================================================================
---- hypre-2.9.1a.orig/src/FEI_mv/fei-hypre/Makefile	2013-09-12 17:33:00.000000000 +0200
-+++ hypre-2.9.1a/src/FEI_mv/fei-hypre/Makefile	2013-09-12 17:33:00.000000000 +0200
-@@ -113,7 +113,6 @@
-  HYPRE_parcsr_TFQmr.c\
-  HYPRE_parcsr_bicgs.c\
-  HYPRE_parcsr_bicgstabl.c\
-- HYPRE_parcsr_fgmres.c\
-  HYPRE_parcsr_lsicg.c\
-  HYPRE_parcsr_symqmr.c\
-  HYPRE_parcsr_maxwell.c\
-@@ -121,11 +120,12 @@
-  TFQmr.c\
-  bicgs.c\
-  bicgstabl.c\
-- fgmres.c\
-  hypre_lsi_amge.c\
-  hypre_lsi_ddamg.c\
-  hypre_lsi_misc.c\
-  lsicg.c
-+# HYPRE_parcsr_fgmres.c
-+# fgmres.c
- 
- FILESCXX = \
-  FEI_HYPRE_Impl.cxx\
-Index: hypre-2.9.1a/src/FEI_mv/fgmres/Makefile
-===================================================================
---- /dev/null	1970-01-01 00:00:00.000000000 +0000
-+++ hypre-2.9.1a/src/FEI_mv/fgmres/Makefile	2013-09-12 17:33:00.000000000 +0200
-@@ -0,0 +1,135 @@
-+#BHEADER**********************************************************************
-+# Copyright (c) 2006   The Regents of the University of California.
-+# Produced at the Lawrence Livermore National Laboratory.
-+# Written by the HYPRE team. UCRL-CODE-222953.
-+# All rights reserved.
-+#
-+# This file is part of HYPRE (see http://www.llnl.gov/CASC/hypre/).
-+# Please see the COPYRIGHT_and_LICENSE file for the copyright notice, 
-+# disclaimer, contact information and the GNU Lesser General Public License.
-+#
-+# HYPRE is free software; you can redistribute it and/or modify it under the
-+# terms of the GNU General Public License (as published by the Free Software 
-+# Foundation) version 2.1 dated February 1999.
-+#
-+# HYPRE is distributed in the hope that it will be useful, but WITHOUT ANY 
-+# WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS 
-+# FOR A PARTICULAR PURPOSE.  See the terms and conditions of the GNU General
-+# Public License for more details.
-+#
-+# You should have received a copy of the GNU Lesser General Public License
-+# along with this program; if not, write to the Free Software Foundation,
-+# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-+#
-+# $Revision: 2.17 $
-+#EHEADER**********************************************************************
-+
-+# why do we create this variable, asked by Charles
-+HYPRE_LIB_SUFFIX = .a
-+
-+include ../../config/Makefile.config
-+
-+BASE_DIR = ${HYPRE_FEI_BASE_DIR}
-+
-+SUPERLU_INCLUDE = -I$(srcdir)/../SuperLU
-+SUPERLU_LIB     = -L$(srcdir)/../SuperLU
-+
-+CINCLUDES=${INCLUDES} ${MPIINCLUDE} -I../ml/src/Include
-+CXXINCLUDES=${INCLUDES} ${MPIINCLUDE} -I../ml/src/Include
-+CDEFS = -DHAVE_SUPERLU -DBOOL_NOT_SUPPORTED
-+CXXDEFS = -DHAVE_SUPERLU -DBOOL_NOT_SUPPORTED -DMPICH_SKIP_MPICXX
-+
-+C_COMPILE_FLAGS = \
-+ ${CDEFS}\
-+ -I../..\
-+ -I$(BASE_DIR)\
-+ -I$(srcdir)\
-+ -I$(srcdir)/../..\
-+ -I$(srcdir)/../../IJ_mv\
-+ -I$(srcdir)/../../utilities\
-+ -I$(srcdir)/../../multivector\
-+ -I$(srcdir)/../../krylov\
-+ -I$(srcdir)/../../parcsr_mv\
-+ -I$(srcdir)/../../parcsr_ls\
-+ -I$(srcdir)/../../seq_mv\
-+ -I$(srcdir)/../../distributed_matrix\
-+ -I$(srcdir)/../../distributed_ls\
-+ -I$(srcdir)/../fei-base\
-+ ${SUPERLU_INCLUDE}\
-+ ${CINCLUDES}
-+
-+CXX_COMPILE_FLAGS = \
-+ ${CXXDEFS}\
-+ -I../..\
-+ -I$(BASE_DIR)\
-+ -I$(srcdir)/../..\
-+ -I$(srcdir)/../../IJ_mv\
-+ -I$(srcdir)/../../utilities\
-+ -I$(srcdir)/../../multivector\
-+ -I$(srcdir)/../../krylov\
-+ -I$(srcdir)/../../parcsr_mv\
-+ -I$(srcdir)/../../parcsr_ls\
-+ -I$(srcdir)/../../seq_mv\
-+ -I$(srcdir)/../../distributed_matrix\
-+ -I$(srcdir)/../../distributed_ls\
-+ -I$(srcdir)/../femli\
-+ -I$(srcdir)/../fei-base\
-+ ${SUPERLU_INCLUDE}\
-+ ${CXXINCLUDES}
-+
-+HYPRE_LIBS = \
-+  ../../utilities/libHYPRE_utilities.so \
-+  ../../parcsr_ls/libHYPRE_parcsr_ls.so
-+#  ../../krylov/libHYPRE_krylov.so
-+#  ../../seq_mv/libHYPRE_seq_mv.so
-+#  ../../parcsr_mv/libHYPRE_parcsr_mv.so
-+#  ../../IJ_mv/libHYPRE_IJ_mv.so
-+
-+SYSTEM_LIBS = -lm
-+
-+HEADERS = 
-+
-+FILESC = \
-+ ../fei-hypre/HYPRE_parcsr_fgmres.c\
-+ ../fei-hypre/fgmres.c
-+
-+FILESCXX = 
-+
-+OBJSC = ${FILESC:.c=.o}
-+OBJSCXX = ${FILESCXX:.cxx=.o}
-+OBJS = ${OBJSC} ${OBJSCXX}
-+
-+##################################################################
-+# Targets
-+##################################################################
-+
-+all: libHYPRE_FEI_fgmres${HYPRE_LIB_SUFFIX}
-+	cp -fpd libHYPRE* $(HYPRE_BUILD_DIR)/lib
-+
-+install: libHYPRE_FEI_fgmres${HYPRE_LIB_SUFFIX}
-+	cp -fpd libHYPRE* $(HYPRE_LIB_INSTALL)
-+	@echo  ""
-+
-+clean:
-+	rm -rf *.o libHYPRE*
-+	rm -rf pchdir tca.map *inslog*
-+
-+distclean: clean
-+
-+##################################################################
-+# Rules
-+##################################################################
-+libHYPRE_FEI_fgmres.a: ${OBJS}
-+	@echo  "Building $@ ... "
-+	mv -f *.o ../fei-hypre/
-+	${AR} $@ ${OBJS}
-+	${RANLIB} $@
-+
-+libHYPRE_FEI_fgmres.so: ${OBJS} ${HYPRE_LIBS}
-+	@echo  "Building $@ ... "
-+	mv -f *.o ../fei-hypre/
-+	${BUILD_CC_SHARED} $^ ${SYSTEM_LIBS} -o libHYPRE_FEI_fgmres-2.8.0b.so -Wl,-soname,libHYPRE_FEI_fgmres-2.8.0b.so \
-+	  -Wl,-z,defs
-+	ln -s libHYPRE_FEI_fgmres-2.8.0b.so $@
-+
-+${OBJS}: ${HEADERS}
diff -Nru hypre-2.11.2/debian/patches/flexgmres-to-parcsr_ls.patch hypre-2.13.0/debian/patches/flexgmres-to-parcsr_ls.patch
--- hypre-2.11.2/debian/patches/flexgmres-to-parcsr_ls.patch	2017-10-18 09:57:39.000000000 +0000
+++ hypre-2.13.0/debian/patches/flexgmres-to-parcsr_ls.patch	1970-01-01 00:00:00.000000000 +0000
@@ -1,42 +0,0 @@
-Move (HYPRE_)flexgmres.c from krylov to parcsr_ls to resolve another
-circular dependency between those two libraries.
-
-Index: hypre-2.9.1a/src/krylov/Makefile
-===================================================================
---- hypre-2.9.1a.orig/src/krylov/Makefile	2013-09-12 17:32:50.000000000 +0200
-+++ hypre-2.9.1a/src/krylov/Makefile	2013-09-12 17:32:50.000000000 +0200
-@@ -39,17 +39,17 @@
-  bicgstab.c\
-  cgnr.c\
-  gmres.c\
-- flexgmres.c\
-  lgmres.c\
-  HYPRE_bicgstab.c\
-  HYPRE_cgnr.c\
-  HYPRE_gmres.c\
-  HYPRE_lgmres.c\
-- HYPRE_flexgmres.c\
-  HYPRE_pcg.c\
-  pcg.c\
-  HYPRE_lobpcg.c\
-  lobpcg.c
-+# flexgmres.c
-+# HYPRE_flexgmres.c
- 
- OBJS = ${FILES:.c=.o}
- 
-Index: hypre-2.9.1a/src/parcsr_ls/Makefile
-===================================================================
---- hypre-2.9.1a.orig/src/parcsr_ls/Makefile	2013-09-12 17:32:50.000000000 +0200
-+++ hypre-2.9.1a/src/parcsr_ls/Makefile	2013-09-12 17:32:50.000000000 +0200
-@@ -120,7 +120,9 @@
-  block_tridiag.c\
-  ams.c\
-  ads.c\
-- ame.c
-+ ame.c\
-+ flexgmres.c\
-+ HYPRE_flexgmres.c
- 
- OBJS = ${FILES:.c=.o}
- 
diff -Nru hypre-2.11.2/debian/patches/install-libs.patch hypre-2.13.0/debian/patches/install-libs.patch
--- hypre-2.11.2/debian/patches/install-libs.patch	2017-10-18 09:57:39.000000000 +0000
+++ hypre-2.13.0/debian/patches/install-libs.patch	1970-01-01 00:00:00.000000000 +0000
@@ -1,379 +0,0 @@
-Remove comments so all libraries install.
-
-Index: hypre/src/FEI_mv/fei-hypre/Makefile
-===================================================================
---- hypre.orig/src/FEI_mv/fei-hypre/Makefile
-+++ hypre/src/FEI_mv/fei-hypre/Makefile
-@@ -182,7 +182,7 @@
- 	cp -fpPR $(BASE_DIR)/base/Lookup.h $(HYPRE_BUILD_DIR)/include
- 	cp -fpPR $(BASE_DIR)/base/LinearSystemCore.h $(HYPRE_BUILD_DIR)/include
- 	cp -fpPR $(BASE_DIR)/fei_defs.h $(HYPRE_BUILD_DIR)/include
--#	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
-+	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
- 
- install: libHYPRE_FEI${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/cfei-hypre.h $(HYPRE_INC_INSTALL)
-@@ -195,7 +195,7 @@
- 	cp -fpPR $(BASE_DIR)/base/Lookup.h $(HYPRE_INC_INSTALL)
- 	cp -fpPR $(BASE_DIR)/base/LinearSystemCore.h $(HYPRE_INC_INSTALL)
- 	cp -fpPR $(BASE_DIR)/fei_defs.h $(HYPRE_INC_INSTALL)
--#	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
-+	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
- 	@echo  ""
- 
- clean:
-Index: hypre/src/FEI_mv/femli/lib/Makefile
-===================================================================
---- hypre.orig/src/FEI_mv/femli/lib/Makefile
-+++ hypre/src/FEI_mv/femli/lib/Makefile
-@@ -145,10 +145,10 @@
- ##################################################################
- 
- all: libHYPRE_mli${HYPRE_LIB_SUFFIX}
--#	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
-+	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
- 
- install: libHYPRE_mli${HYPRE_LIB_SUFFIX}
--#	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
-+	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
- 	@echo  " "
- 
- clean:
-Index: hypre/src/IJ_mv/Makefile
-===================================================================
---- hypre.orig/src/IJ_mv/Makefile
-+++ hypre/src/IJ_mv/Makefile
-@@ -69,12 +69,12 @@
- all: libHYPRE_IJ_mv${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include
- 	cp -fpPR $(srcdir)/_hypre_IJ_mv.h $(HYPRE_BUILD_DIR)/include
--#	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
-+	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
- 
- install: libHYPRE_IJ_mv${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL)
- 	cp -fpPR $(srcdir)/_hypre_IJ_mv.h $(HYPRE_INC_INSTALL)
--#	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
-+	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
- 
- clean:
- 	rm -f *.o libHYPRE*
-Index: hypre/src/distributed_ls/Euclid/Makefile
-===================================================================
---- hypre.orig/src/distributed_ls/Euclid/Makefile
-+++ hypre/src/distributed_ls/Euclid/Makefile
-@@ -105,10 +105,10 @@
- ##################################################################
- 
- all: libHYPRE_Euclid${HYPRE_LIB_SUFFIX}
--#	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
-+	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
- 
- install: libHYPRE_Euclid${HYPRE_LIB_SUFFIX}
--#	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
-+	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
- 
- clean:
- 	rm -f *.o libHYPRE*
-Index: hypre/src/distributed_ls/ParaSails/Makefile
-===================================================================
---- hypre.orig/src/distributed_ls/ParaSails/Makefile
-+++ hypre/src/distributed_ls/ParaSails/Makefile
-@@ -71,10 +71,10 @@
- ##################################################################
- 
- all: libHYPRE_ParaSails${HYPRE_LIB_SUFFIX}
--#	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
-+	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
- 
- install: libHYPRE_ParaSails${HYPRE_LIB_SUFFIX}
--#	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
-+	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
- 
- clean:
- 	rm -f *.o libHYPRE*
-Index: hypre/src/distributed_ls/pilut/Makefile
-===================================================================
---- hypre.orig/src/distributed_ls/pilut/Makefile
-+++ hypre/src/distributed_ls/pilut/Makefile
-@@ -63,11 +63,11 @@
- 
- all: libHYPRE_DistributedMatrixPilutSolver${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include
--#	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
-+	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
- 
- install: libHYPRE_DistributedMatrixPilutSolver${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL)
--#	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
-+	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
- 
- clean:
- 	rm -f *.o libHYPRE*
-Index: hypre/src/distributed_matrix/Makefile
-===================================================================
---- hypre.orig/src/distributed_matrix/Makefile
-+++ hypre/src/distributed_matrix/Makefile
-@@ -50,11 +50,11 @@
- 
- all: libHYPRE_DistributedMatrix${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/distributed_matrix.h $(HYPRE_BUILD_DIR)/include
--#	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
-+	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
- 
- install: libHYPRE_DistributedMatrix${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/distributed_matrix.h $(HYPRE_INC_INSTALL)
--#	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
-+	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
- 
- clean:
- 	rm -f *.o libHYPRE*
-Index: hypre/src/krylov/Makefile
-===================================================================
---- hypre.orig/src/krylov/Makefile
-+++ hypre/src/krylov/Makefile
-@@ -67,13 +67,13 @@
- 	cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include
- 	cp -fpPR $(srcdir)/krylov.h $(HYPRE_BUILD_DIR)/include
- 	cp -fpPR $(srcdir)/lobpcg.h $(HYPRE_BUILD_DIR)/include
--#	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
-+	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
- 
- install: all
- 	cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL)
- 	cp -fpPR $(srcdir)/krylov.h $(HYPRE_INC_INSTALL)
- 	cp -fpPR $(srcdir)/lobpcg.h $(HYPRE_INC_INSTALL)
--#	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
-+	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
- 
- clean:
- 	rm -f *.o libHYPRE*
-Index: hypre/src/matrix_matrix/Makefile
-===================================================================
---- hypre.orig/src/matrix_matrix/Makefile
-+++ hypre/src/matrix_matrix/Makefile
-@@ -49,11 +49,11 @@
- 
- all: libHYPRE_MatrixMatrix${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include
--#	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
-+	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
- 
- install: libHYPRE_MatrixMatrix${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL)
--#	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
-+	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
- 
- clean:
- 	rm -f *.o lib*
-Index: hypre/src/multivector/Makefile
-===================================================================
---- hypre.orig/src/multivector/Makefile
-+++ hypre/src/multivector/Makefile
-@@ -39,11 +39,11 @@
- 
- all: libHYPRE_multivector${HYPRE_LIB_SUFFIX}
- 	cp -fpPR *.h $(HYPRE_BUILD_DIR)/include
--#	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
-+	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
- 
- install: libHYPRE_multivector${HYPRE_LIB_SUFFIX}
- 	cp -fpPR *.h $(HYPRE_INC_INSTALL)
--#	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
-+	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
- 
- clean:
- 	rm -f *.o libHYPRE*
-Index: hypre/src/parcsr_block_mv/Makefile
-===================================================================
---- hypre.orig/src/parcsr_block_mv/Makefile
-+++ hypre/src/parcsr_block_mv/Makefile
-@@ -75,7 +75,7 @@
- all: libHYPRE_parcsr_block_mv${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/par_csr_block_matrix.h $(HYPRE_BUILD_DIR)/include	
- 	cp -fpPR $(srcdir)/csr_block_matrix.h $(HYPRE_BUILD_DIR)/include
--#	cp -fpPR libHYPRE_* $(HYPRE_BUILD_DIR)/lib/.
-+	cp -fpPR libHYPRE_* $(HYPRE_BUILD_DIR)/lib/.
- 
- driver: driver.o libHYPRE_parcsr_block_mv${HYPRE_LIB_SUFFIX}
- 	@echo  "Linking" $@ "... "
-@@ -104,7 +104,7 @@
- install: libHYPRE_parcsr_block_mv${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/par_csr_block_matrix.h $(HYPRE_INC_INSTALL)/.
- 	cp -fpPR $(srcdir)/csr_block_matrix.h $(HYPRE_INC_INSTALL)/.
--#	cp -fpPR libHYPRE_* $(HYPRE_LIB_INSTALL)/.
-+	cp -fpPR libHYPRE_* $(HYPRE_LIB_INSTALL)/.
- 
- clean:
- 	rm -rf *.o libHYPRE_*
-Index: hypre/src/parcsr_ls/Makefile
-===================================================================
---- hypre.orig/src/parcsr_ls/Makefile
-+++ hypre/src/parcsr_ls/Makefile
-@@ -155,12 +155,12 @@
- all: libHYPRE_parcsr_ls${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include
- 	cp -fpPR $(srcdir)/_hypre_parcsr_ls.h $(HYPRE_BUILD_DIR)/include
--#	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
-+	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
- 
- install: libHYPRE_parcsr_ls${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL)
- 	cp -fpPR $(srcdir)/_hypre_parcsr_ls.h $(HYPRE_INC_INSTALL)
--#	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
-+	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
- 
- clean:
- 	rm -f *.o libHYPRE*
-Index: hypre/src/parcsr_mv/Makefile
-===================================================================
---- hypre.orig/src/parcsr_mv/Makefile
-+++ hypre/src/parcsr_mv/Makefile
-@@ -85,7 +85,7 @@
- all: libHYPRE_parcsr_mv${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include
- 	cp -fpPR $(srcdir)/_hypre_parcsr_mv.h $(HYPRE_BUILD_DIR)/include
--#	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
-+	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
- 
- driver: driver.o libHYPRE_parcsr_mv${HYPRE_LIB_SUFFIX} 
- 	@echo  "Linking" $@ "... "
-@@ -114,7 +114,7 @@
- install: libHYPRE_parcsr_mv${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL)
- 	cp -fpPR $(srcdir)/_hypre_parcsr_mv.h $(HYPRE_INC_INSTALL)
--#	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
-+	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
- 
- clean:
- 	rm -f *.o libHYPRE*
-Index: hypre/src/seq_ls/pamg/Makefile
-===================================================================
---- hypre.orig/src/seq_ls/pamg/Makefile
-+++ hypre/src/seq_ls/pamg/Makefile
-@@ -90,8 +90,10 @@
- ##################################################################
- 
- lib: libHYPRE_amg_ls${HYPRE_LIB_SUFFIX}
-+	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
- 
- all: libHYPRE_amg_ls${HYPRE_LIB_SUFFIX} driver
-+	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
- 
- install: all
- 
-Index: hypre/src/seq_mv/Makefile
-===================================================================
---- hypre.orig/src/seq_mv/Makefile
-+++ hypre/src/seq_mv/Makefile
-@@ -55,12 +55,12 @@
- all: libHYPRE_seq_mv${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include
- 	cp -fpPR $(srcdir)/seq_mv.h $(HYPRE_BUILD_DIR)/include
--#	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
-+	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
- 
- install: libHYPRE_seq_mv${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL)
- 	cp -fpPR $(srcdir)/seq_mv.h $(HYPRE_INC_INSTALL)
--#	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
-+	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
- 
- clean:
- 	rm -f *.o libHYPRE*
-Index: hypre/src/sstruct_ls/Makefile
-===================================================================
---- hypre.orig/src/sstruct_ls/Makefile
-+++ hypre/src/sstruct_ls/Makefile
-@@ -133,12 +133,12 @@
- all: libHYPRE_sstruct_ls${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include
- 	cp -fpPR $(srcdir)/_hypre_sstruct_ls.h $(HYPRE_BUILD_DIR)/include
--#	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
-+	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
- 
- install: libHYPRE_sstruct_ls${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL)
- 	cp -fpPR $(srcdir)/_hypre_sstruct_ls.h $(HYPRE_INC_INSTALL)
--#	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
-+	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
- 
- clean:
- 	rm -f *.o libHYPRE*
-Index: hypre/src/sstruct_mv/Makefile
-===================================================================
---- hypre.orig/src/sstruct_mv/Makefile
-+++ hypre/src/sstruct_mv/Makefile
-@@ -78,12 +78,12 @@
- all: libHYPRE_sstruct_mv${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include
- 	cp -fpPR $(srcdir)/_hypre_sstruct_mv.h $(HYPRE_BUILD_DIR)/include
--#	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
-+	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
- 
- install: libHYPRE_sstruct_mv${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL)
- 	cp -fpPR $(srcdir)/_hypre_sstruct_mv.h $(HYPRE_INC_INSTALL)
--#	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
-+	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
- 
- clean:
- 	rm -f *.o libHYPRE*
-Index: hypre/src/struct_ls/Makefile
-===================================================================
---- hypre.orig/src/struct_ls/Makefile
-+++ hypre/src/struct_ls/Makefile
-@@ -119,12 +119,12 @@
- all: libHYPRE_struct_ls${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include
- 	cp -fpPR $(srcdir)/_hypre_struct_ls.h $(HYPRE_BUILD_DIR)/include
--#	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
-+	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
- 
- install: libHYPRE_struct_ls${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL)
- 	cp -fpPR $(srcdir)/_hypre_struct_ls.h $(HYPRE_INC_INSTALL)
--#	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
-+	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
- 
- clean:
- 	rm -f *.o libHYPRE*
-Index: hypre/src/struct_mv/Makefile
-===================================================================
---- hypre.orig/src/struct_mv/Makefile
-+++ hypre/src/struct_mv/Makefile
-@@ -82,12 +82,12 @@
- all: libHYPRE_struct_mv${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include
- 	cp -fpPR $(srcdir)/_hypre_struct_mv.h $(HYPRE_BUILD_DIR)/include
--#	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
-+	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
- 
- install: libHYPRE_struct_mv${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL)
- 	cp -fpPR $(srcdir)/_hypre_struct_mv.h $(HYPRE_INC_INSTALL)
--#	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
-+	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
- 
- clean:
- 	rm -f *.o libHYPRE*
-Index: hypre/src/utilities/Makefile
-===================================================================
---- hypre.orig/src/utilities/Makefile
-+++ hypre/src/utilities/Makefile
-@@ -73,13 +73,13 @@
- 	cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_BUILD_DIR)/include
- 	cp -fpPR $(srcdir)/_hypre_utilities.h $(HYPRE_BUILD_DIR)/include
- 	cp -fpPR $(srcdir)/fortran*.h $(HYPRE_BUILD_DIR)/include
--#	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
-+	cp -fpPR libHYPRE* $(HYPRE_BUILD_DIR)/lib
- 
- install: libHYPRE_utilities${HYPRE_LIB_SUFFIX}
- 	cp -fpPR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL)
- 	cp -fpPR $(srcdir)/_hypre_utilities.h $(HYPRE_INC_INSTALL)
- 	cp -fpPR $(srcdir)/fortran*.h $(HYPRE_INC_INSTALL)
--#	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
-+	cp -fpPR libHYPRE* $(HYPRE_LIB_INSTALL)
- 
- clean:
- 	rm -f *.o libHYPRE* f2c.h *blas.h *lapack.h 
diff -Nru hypre-2.11.2/debian/patches/install.patch hypre-2.13.0/debian/patches/install.patch
--- hypre-2.11.2/debian/patches/install.patch	2017-10-18 09:57:39.000000000 +0000
+++ hypre-2.13.0/debian/patches/install.patch	2017-12-11 19:47:42.000000000 +0000
@@ -2,7 +2,7 @@
 ===================================================================
 --- hypre.orig/src/FEI_mv/fei-hypre/Makefile
 +++ hypre/src/FEI_mv/fei-hypre/Makefile
-@@ -182,7 +182,7 @@ all: libHYPRE_FEI${HYPRE_LIB_SUFFIX}
+@@ -183,7 +183,7 @@ all: libHYPRE_FEI${HYPRE_LIB_SUFFIX}
  install: libHYPRE_FEI${HYPRE_LIB_SUFFIX}
  	cp -fR $(srcdir)/cfei-hypre.h $(HYPRE_INC_INSTALL)
  	cp -fR $(srcdir)/cfei_hypre.h $(HYPRE_INC_INSTALL)
@@ -11,7 +11,7 @@
  	cp -fR $(srcdir)/HYPRE_LinSysCore.h $(HYPRE_INC_INSTALL)
  	cp -fR $(srcdir)/HYPRE_FEI*.h $(HYPRE_INC_INSTALL)
  	cp -fR $(srcdir)/LLNL_FEI*.h $(HYPRE_INC_INSTALL)
-@@ -193,6 +193,7 @@ install: libHYPRE_FEI${HYPRE_LIB_SUFFIX}
+@@ -194,6 +194,7 @@ install: libHYPRE_FEI${HYPRE_LIB_SUFFIX}
  	cp -fR $(BASE_DIR)/fei_bool.h $(HYPRE_BUILD_DIR)/include
  	cp -fR $(BASE_DIR)/fei_defs.h $(HYPRE_BUILD_DIR)/include
  	cp -fR $(BASE_DIR)/fei_mpi.h $(HYPRE_BUILD_DIR)/include
@@ -23,7 +23,7 @@
 ===================================================================
 --- hypre.orig/src/FEI_mv/femli/Makefile
 +++ hypre/src/FEI_mv/femli/Makefile
-@@ -137,7 +137,7 @@ all: libHYPRE_mli${HYPRE_LIB_SUFFIX}
+@@ -138,7 +138,7 @@ all: libHYPRE_mli${HYPRE_LIB_SUFFIX}
  #	cp -fR libHYPRE* $(HYPRE_BUILD_DIR)/lib
  
  install: libHYPRE_mli${HYPRE_LIB_SUFFIX}
@@ -62,7 +62,7 @@
 ===================================================================
 --- hypre.orig/src/distributed_ls/ParaSails/Makefile
 +++ hypre/src/distributed_ls/ParaSails/Makefile
-@@ -67,7 +67,7 @@ all: libHYPRE_ParaSails${HYPRE_LIB_SUFFI
+@@ -69,7 +69,7 @@ all: libHYPRE_ParaSails${HYPRE_LIB_SUFFI
  #	cp -fR libHYPRE* $(HYPRE_BUILD_DIR)/lib
  
  install: libHYPRE_ParaSails${HYPRE_LIB_SUFFIX}
@@ -101,7 +101,7 @@
 ===================================================================
 --- hypre.orig/src/krylov/Makefile
 +++ hypre/src/krylov/Makefile
-@@ -69,7 +69,7 @@ install: all
+@@ -71,7 +71,7 @@ install: all
  	cp -fR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL)
  	cp -fR $(srcdir)/krylov.h $(HYPRE_INC_INSTALL)
  	cp -fR $(srcdir)/lobpcg.h $(HYPRE_INC_INSTALL)
@@ -153,7 +153,7 @@
 ===================================================================
 --- hypre.orig/src/parcsr_ls/Makefile
 +++ hypre/src/parcsr_ls/Makefile
-@@ -140,7 +140,7 @@ all: libHYPRE_parcsr_ls${HYPRE_LIB_SUFFI
+@@ -147,7 +147,7 @@ all: libHYPRE_parcsr_ls${HYPRE_LIB_SUFFI
  install: libHYPRE_parcsr_ls${HYPRE_LIB_SUFFIX}
  	cp -fR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL)
  	cp -fR $(srcdir)/_hypre_parcsr_ls.h $(HYPRE_INC_INSTALL)
@@ -179,7 +179,7 @@
 ===================================================================
 --- hypre.orig/src/seq_mv/Makefile
 +++ hypre/src/seq_mv/Makefile
-@@ -59,7 +59,7 @@ all: libHYPRE_seq_mv${HYPRE_LIB_SUFFIX}
+@@ -63,7 +63,7 @@ all: libHYPRE_seq_mv${HYPRE_LIB_SUFFIX}
  install: libHYPRE_seq_mv${HYPRE_LIB_SUFFIX}
  	cp -fR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL)
  	cp -fR $(srcdir)/seq_mv.h $(HYPRE_INC_INSTALL)
@@ -244,7 +244,7 @@
 ===================================================================
 --- hypre.orig/src/utilities/Makefile
 +++ hypre/src/utilities/Makefile
-@@ -81,7 +81,7 @@ install: libHYPRE_utilities${HYPRE_LIB_S
+@@ -83,7 +83,7 @@ install: libHYPRE_utilities${HYPRE_LIB_S
  	cp -fR $(srcdir)/HYPRE_*.h $(HYPRE_INC_INSTALL)
  	cp -fR $(srcdir)/_hypre_utilities.h $(HYPRE_INC_INSTALL)
  	cp -fR $(srcdir)/fortran*.h $(HYPRE_INC_INSTALL)
@@ -257,7 +257,7 @@
 ===================================================================
 --- hypre.orig/src/Makefile
 +++ hypre/src/Makefile
-@@ -187,23 +187,16 @@ checkpar:
+@@ -188,23 +188,16 @@ checkpar:
  install: all
  	@ \
  	echo "Installing hypre ..."; \
diff -Nru hypre-2.11.2/debian/patches/lapack.patch hypre-2.13.0/debian/patches/lapack.patch
--- hypre-2.11.2/debian/patches/lapack.patch	2017-10-18 09:57:39.000000000 +0000
+++ hypre-2.13.0/debian/patches/lapack.patch	2017-12-11 19:47:42.000000000 +0000
@@ -2,7 +2,7 @@
 ===================================================================
 --- hypre.orig/src/FEI_mv/femli/Makefile
 +++ hypre/src/FEI_mv/femli/Makefile
-@@ -35,6 +35,7 @@ MLI_INCLUDES = \
+@@ -36,6 +36,7 @@ MLI_INCLUDES = \
  
  C_COMPILE_FLAGS =\
   -DMLI_SUPERLU\
diff -Nru hypre-2.11.2/debian/patches/pilut_blas_mangle.patch hypre-2.13.0/debian/patches/pilut_blas_mangle.patch
--- hypre-2.11.2/debian/patches/pilut_blas_mangle.patch	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/debian/patches/pilut_blas_mangle.patch	2017-12-11 19:47:42.000000000 +0000
@@ -0,0 +1,12 @@
+Index: hypre/src/distributed_ls/pilut/macros.h
+===================================================================
+--- hypre.orig/src/distributed_ls/pilut/macros.h
++++ hypre/src/distributed_ls/pilut/macros.h
+@@ -18,6 +18,7 @@
+ 
+ #include "../../utilities/general.h"
+ #include "../../utilities/fortran.h"
++#include "../../blas/_hypre_blas.h"
+ 
+ /*
+  * macros.h
diff -Nru hypre-2.11.2/debian/patches/series hypre-2.13.0/debian/patches/series
--- hypre-2.11.2/debian/patches/series	2017-10-18 09:57:39.000000000 +0000
+++ hypre-2.13.0/debian/patches/series	2017-12-11 19:47:42.000000000 +0000
@@ -1,12 +1,9 @@
+pilut_blas_mangle.patch
 configure.patch
 install.patch
-#examples.patch
-##cmake-install.patch
-#config-for-petsc.patch
 blas.patch
 complex.patch
 shlibs-interlink-libHYPRE.so.patch
 test.patch
 lapack.patch
-superlu_internal.patch
 shlibs-interlink-sublibs.patch
diff -Nru hypre-2.11.2/debian/patches/shlibs-interlink-libHYPRE.so.patch hypre-2.13.0/debian/patches/shlibs-interlink-libHYPRE.so.patch
--- hypre-2.11.2/debian/patches/shlibs-interlink-libHYPRE.so.patch	2017-10-18 09:57:39.000000000 +0000
+++ hypre-2.13.0/debian/patches/shlibs-interlink-libHYPRE.so.patch	2017-12-11 19:47:42.000000000 +0000
@@ -12,7 +12,7 @@
 ===================================================================
 --- hypre.orig/src/lib/Makefile
 +++ hypre/src/lib/Makefile
-@@ -59,6 +59,28 @@ $(UTILITIESFILES)\
+@@ -57,6 +57,27 @@ $(UTILITIESFILES)\
  $(BLASFILES)\
  $(LAPACKFILES)
  
@@ -35,13 +35,12 @@
 +  ../distributed_ls/pilut/libHYPRE_DistributedMatrixPilutSolver.so \
 +  ../parcsr_ls/libHYPRE_parcsr_ls.so \
 +  ../FEI_mv/fei-hypre/libHYPRE_FEI.so \
-+  ../FEI_mv/femli/libHYPRE_mli.so \
-+  ../FEI_mv/SuperLU/SRC/libHYPRE_superlu.so
++  ../FEI_mv/femli/libHYPRE_mli.so
 +
  SONAME = libHYPRE-${HYPRE_RELEASE_VERSION}.so
  SOLIBS = ${MPILIBDIRS} ${MPILIBS} ${LAPACKLIBDIRS} ${LAPACKLIBS}\
   ${BLASLIBDIRS} ${BLASLIBS} ${LIBS} ${FLIBS}
-@@ -71,7 +93,7 @@ SOLIBS = ${MPILIBDIRS} ${MPILIBS} ${LAPA
+@@ -69,7 +90,7 @@ SOLIBS = ${MPILIBDIRS} ${MPILIBS} ${LAPA
  all: libHYPRE${HYPRE_LIB_SUFFIX}
  	cp -fR libHYPRE* ${HYPRE_BUILD_DIR}/lib
  
@@ -50,7 +49,7 @@
  	cp -fR libHYPRE* ${HYPRE_LIB_INSTALL}
  
  clean:
-@@ -105,7 +127,7 @@ libHYPRE.a: ${FILES_HYPRE}
+@@ -103,7 +124,7 @@ libHYPRE.a: ${FILES_HYPRE}
  	${AR} $@ $(UTILITIESFILES) $(BLASFILES) $(LAPACKFILES)
  	${RANLIB} $@
  
diff -Nru hypre-2.11.2/debian/patches/shlibs-interlink-sublibs.patch hypre-2.13.0/debian/patches/shlibs-interlink-sublibs.patch
--- hypre-2.11.2/debian/patches/shlibs-interlink-sublibs.patch	2017-10-18 09:57:39.000000000 +0000
+++ hypre-2.13.0/debian/patches/shlibs-interlink-sublibs.patch	2017-12-11 19:47:42.000000000 +0000
@@ -2,13 +2,12 @@
 ===================================================================
 --- hypre.orig/src/FEI_mv/fei-hypre/Makefile
 +++ hypre/src/FEI_mv/fei-hypre/Makefile
-@@ -63,6 +63,19 @@ CXX_COMPILE_FLAGS = \
-  ${SUPERLU_INCLUDE}\
+@@ -64,6 +64,18 @@ CXX_COMPILE_FLAGS = \
+  ${DSUPERLU_INCLUDE}\
   ${CXXINCLUDES}
  
 +
 +HYPRE_LIBS = \
-+  ../SuperLU/SRC/libHYPRE_superlu.so \
 +  ../femli/libHYPRE_mli.so \
 +  ../../utilities/libHYPRE_utilities.so \
 +  ../../seq_mv/libHYPRE_seq_mv.so \
@@ -22,7 +21,7 @@
  HEADERS =\
   cfei-hypre.h\
   cfei_hypre.h\
-@@ -211,9 +224,9 @@ libHYPRE_FEI.a: ${OBJS}
+@@ -212,9 +224,9 @@ libHYPRE_FEI.a: ${OBJS}
  	${AR} $@ ${OBJS}
  	${RANLIB} $@
  
@@ -30,7 +29,7 @@
 +libHYPRE_FEI.so: ${OBJS} ${HYPRE_LIBS}
  	@echo  "Building $@ ... "
 -	${BUILD_CC_SHARED} -o ${SONAME} ${OBJS} ${SHARED_SET_SONAME}${SONAME}
-+	${BUILD_CC_SHARED} -o ${SONAME} $^ ${SYSTEM_LIBS} ${SHARED_SET_SONAME}${SONAME} ${SHARED_OPTIONS}
++	${BUILD_CC_SHARED} -o ${SONAME} $^ ${SYSTEM_LIBS} ${SUPERLU_LIBS} ${DSUPERLU_LIBS} ${SHARED_SET_SONAME}${SONAME} ${SHARED_OPTIONS}
  	ln -s ${SONAME} $@
  
  ${OBJS}: ${HEADERS}
@@ -38,12 +37,11 @@
 ===================================================================
 --- hypre.orig/src/FEI_mv/femli/Makefile
 +++ hypre/src/FEI_mv/femli/Makefile
-@@ -10,6 +10,17 @@
+@@ -10,6 +10,16 @@
  # $Revision$
  #EHEADER**********************************************************************
  
 +HYPRE_LIBS = \
-+  ../SuperLU/SRC/libHYPRE_superlu.so \
 +  ../../utilities/libHYPRE_utilities.so \
 +  ../../krylov/libHYPRE_krylov.so \
 +  ../../seq_mv/libHYPRE_seq_mv.so \
@@ -56,7 +54,7 @@
  
  include ../../config/Makefile.config
  
-@@ -156,9 +167,10 @@ libHYPRE_mli.a: ${OBJS}
+@@ -157,9 +167,10 @@ libHYPRE_mli.a: ${OBJS}
  	${AR} $@ ${OBJS}
  	${RANLIB} $@
  
@@ -130,7 +128,7 @@
 ===================================================================
 --- hypre.orig/src/distributed_ls/ParaSails/Makefile
 +++ hypre/src/distributed_ls/ParaSails/Makefile
-@@ -23,6 +23,13 @@ C_COMPILE_FLAGS = \
+@@ -25,6 +25,13 @@ C_COMPILE_FLAGS = \
   -I$(srcdir)/../../distributed_matrix\
   ${CINCLUDES}
  
@@ -144,7 +142,7 @@
  HEADERS =\
   Common.h\
   ConjGrad.h\
-@@ -84,9 +91,9 @@ libHYPRE_ParaSails.a: ${OBJS}
+@@ -86,9 +93,9 @@ libHYPRE_ParaSails.a: ${OBJS}
  	${AR} $@ ${OBJS}
  	${RANLIB} $@
  
@@ -214,7 +212,7 @@
 ===================================================================
 --- hypre.orig/src/krylov/Makefile
 +++ hypre/src/krylov/Makefile
-@@ -22,6 +22,10 @@ C_COMPILE_FLAGS = \
+@@ -24,6 +24,10 @@ C_COMPILE_FLAGS = \
   -I$(srcdir)/../utilities\
   ${CINCLUDES}
  
@@ -225,7 +223,7 @@
  HEADERS =\
   HYPRE_krylov.h\
   krylov.h\
-@@ -86,9 +90,9 @@ libHYPRE_krylov.a: ${OBJS}
+@@ -88,9 +92,9 @@ libHYPRE_krylov.a: ${OBJS}
  	${AR} $@ ${OBJS}
  	${RANLIB} $@
  
@@ -319,7 +317,7 @@
 ===================================================================
 --- hypre.orig/src/parcsr_ls/Makefile
 +++ hypre/src/parcsr_ls/Makefile
-@@ -33,6 +33,22 @@ C_COMPILE_FLAGS =\
+@@ -34,6 +34,22 @@ C_COMPILE_FLAGS =\
   -I$(srcdir)/../parcsr_block_mv\
   ${CINCLUDES}
  
@@ -342,7 +340,7 @@
  HEADERS =\
   HYPRE_parcsr_ls.h\
   _hypre_parcsr_ls.h\
-@@ -158,9 +174,9 @@ libHYPRE_parcsr_ls.a: ${OBJS}
+@@ -165,9 +181,9 @@ libHYPRE_parcsr_ls.a: ${OBJS}
  	${AR} $@ ${OBJS}
  	${RANLIB} $@
  
@@ -386,12 +384,12 @@
 ===================================================================
 --- hypre.orig/src/seq_mv/Makefile
 +++ hypre/src/seq_mv/Makefile
-@@ -76,9 +76,9 @@ libHYPRE_seq_mv.a: ${OBJS}
+@@ -80,9 +80,9 @@ libHYPRE_seq_mv.a: ${OBJS} ${CUOBJS}
  	${AR} $@ ${OBJS}
  	${RANLIB} $@
  
--libHYPRE_seq_mv.so: ${OBJS}
-+libHYPRE_seq_mv.so: ${OBJS} ../utilities/libHYPRE_utilities.so
+-libHYPRE_seq_mv.so: ${OBJS} ${CUOBJS}
++libHYPRE_seq_mv.so: ${OBJS} ${CUOBJS} ../utilities/libHYPRE_utilities.so
  	@echo  "Building $@ ... "
 -	${BUILD_CC_SHARED} -o ${SONAME} ${OBJS} ${SHARED_SET_SONAME}${SONAME}
 +	${BUILD_CC_SHARED} -o ${SONAME} $^ ${SHARED_SET_SONAME}${SONAME} ${SHARED_OPTIONS}
@@ -518,7 +516,7 @@
 ===================================================================
 --- hypre.orig/src/utilities/Makefile
 +++ hypre/src/utilities/Makefile
-@@ -100,7 +100,7 @@ libHYPRE_utilities.a: ${OBJS}
+@@ -102,7 +102,7 @@ libHYPRE_utilities.a: ${OBJS}
  
  libHYPRE_utilities.so: ${OBJS}
  	@echo  "Building $@ ... "
diff -Nru hypre-2.11.2/debian/patches/superlu_internal.patch hypre-2.13.0/debian/patches/superlu_internal.patch
--- hypre-2.11.2/debian/patches/superlu_internal.patch	2017-10-18 09:57:39.000000000 +0000
+++ hypre-2.13.0/debian/patches/superlu_internal.patch	1970-01-01 00:00:00.000000000 +0000
@@ -1,106 +0,0 @@
-Index: hypre/src/FEI_mv/SuperLU/SRC/Makefile
-===================================================================
---- hypre.orig/src/FEI_mv/SuperLU/SRC/Makefile
-+++ hypre/src/FEI_mv/SuperLU/SRC/Makefile
-@@ -55,6 +55,9 @@ HEADERS =\
-  slu_ddefs.h\
-  slu_util.h\
-  supermatrix.h
-+
-+SYSTEM_LIBS = ${BLASLIBDIRS} ${BLASLIBS} ${LAPACKLIBDIRS} ${LAPACKLIBS} -lm
-+
- #######################################
- 
- ### LAPACK 
-@@ -153,13 +156,15 @@ FILES =\
- 
- OBJS = ${FILES:.c=.o}
- 
-+SONAME = libHYPRE_superlu-${HYPRE_RELEASE_VERSION}.so
-+
- all: libHYPRE_superlu${HYPRE_LIB_SUFFIX}
- 	cp -fp *.h $(HYPRE_BUILD_DIR)/include
- #	cp -fp libHYPRE* $(HYPRE_BUILD_DIR)/lib
- 
- install: libHYPRE_superlu${HYPRE_LIB_SUFFIX}
--	cp -f *.h $(HYPRE_INC_INSTALL)
--#	cp -f libHYPRE* $(HYPRE_LIB_INSTALL)
-+#	cp -fR *.h $(HYPRE_INC_INSTALL)
-+	cp -fR libHYPRE* $(HYPRE_LIB_INSTALL)
- 	@echo  " "
- 
- clean:
-@@ -174,7 +179,8 @@ libHYPRE_superlu.a: ${OBJS}
- 
- libHYPRE_superlu.so: ${OBJS}
- 	@echo  "Building $@ ... "
--	${BUILD_CC_SHARED} -o $@ ${OBJS}
-+	${BUILD_CC_SHARED} -o ${SONAME} ${OBJS} ${SYSTEM_LIBS} ${SHARED_SET_SONAME}${SONAME} ${SHARED_OPTIONS}
-+	ln -s ${SONAME} $@
- 
- ${OBJS}: ${HEADERS}
- 
-Index: hypre/src/config/configure.in
-===================================================================
---- hypre.orig/src/config/configure.in
-+++ hypre/src/config/configure.in
-@@ -1045,13 +1045,6 @@ then
-    HYPRE_FEI_SRC_DIR="$HYPRE_SRCDIR/FEI_mv"
-    HYPRE_FEI_SUBDIRS="fei-hypre"
-    HYPRE_FEI_HYPRE_FILES="$HYPRE_SRCDIR/FEI_mv/fei-hypre/*.o"
--   if test "$hypre_using_superlu" = "yes"
--   then
--      HYPRE_FEI_SUBDIRS="SuperLU $HYPRE_FEI_SUBDIRS"
--      HYPRE_FEI_SUPERLU_FILES="$HYPRE_SRCDIR/FEI_mv/SuperLU/SRC/*.o"
--   else
--      HYPRE_FEI_SUPERLU_FILES=
--   fi
-    if test "$hypre_using_mli" = "yes"
-    then
-       HYPRE_FEI_SUBDIRS="femli $HYPRE_FEI_SUBDIRS"
-@@ -1059,6 +1052,13 @@ then
-    else
-       HYPRE_FEI_FEMLI_FILES=
-    fi
-+   if test "$hypre_using_superlu" = "yes"
-+   then
-+      HYPRE_FEI_SUBDIRS="SuperLU $HYPRE_FEI_SUBDIRS"
-+      HYPRE_FEI_SUPERLU_FILES="$HYPRE_SRCDIR/FEI_mv/SuperLU/SRC/*.o"
-+   else
-+      HYPRE_FEI_SUPERLU_FILES=
-+   fi
-    if test "$hypre_user_chose_fei" = "no"
-    then
-       HYPRE_FEI_BASE_DIR="$HYPRE_SRCDIR/FEI_mv/fei-base"
-Index: hypre/src/configure
-===================================================================
---- hypre.orig/src/configure
-+++ hypre/src/configure
-@@ -6731,13 +6731,6 @@ then
-    HYPRE_FEI_SRC_DIR="$HYPRE_SRCDIR/FEI_mv"
-    HYPRE_FEI_SUBDIRS="fei-hypre"
-    HYPRE_FEI_HYPRE_FILES="$HYPRE_SRCDIR/FEI_mv/fei-hypre/*.o"
--   if test "$hypre_using_superlu" = "yes"
--   then
--      HYPRE_FEI_SUBDIRS="SuperLU $HYPRE_FEI_SUBDIRS"
--      HYPRE_FEI_SUPERLU_FILES="$HYPRE_SRCDIR/FEI_mv/SuperLU/SRC/*.o"
--   else
--      HYPRE_FEI_SUPERLU_FILES=
--   fi
-    if test "$hypre_using_mli" = "yes"
-    then
-       HYPRE_FEI_SUBDIRS="femli $HYPRE_FEI_SUBDIRS"
-@@ -6745,6 +6738,13 @@ then
-    else
-       HYPRE_FEI_FEMLI_FILES=
-    fi
-+   if test "$hypre_using_superlu" = "yes"
-+   then
-+      HYPRE_FEI_SUBDIRS="SuperLU $HYPRE_FEI_SUBDIRS"
-+      HYPRE_FEI_SUPERLU_FILES="$HYPRE_SRCDIR/FEI_mv/SuperLU/SRC/*.o"
-+   else
-+      HYPRE_FEI_SUPERLU_FILES=
-+   fi
-    if test "$hypre_user_chose_fei" = "no"
-    then
-       HYPRE_FEI_BASE_DIR="$HYPRE_SRCDIR/FEI_mv/fei-base"
diff -Nru hypre-2.11.2/debian/patches/test.patch hypre-2.13.0/debian/patches/test.patch
--- hypre-2.11.2/debian/patches/test.patch	2017-10-18 09:57:39.000000000 +0000
+++ hypre-2.13.0/debian/patches/test.patch	2017-12-11 19:47:42.000000000 +0000
@@ -224,7 +224,7 @@
 ===================================================================
 --- hypre.orig/src/test/Makefile
 +++ hypre/src/test/Makefile
-@@ -44,7 +44,27 @@ BLASLIBFLAGS = ${BLASLIBDIRS} ${BLASLIBS
+@@ -46,7 +46,27 @@ BLASLIBFLAGS = ${BLASLIBDIRS} ${BLASLIBS
  LIBFLAGS = ${LDFLAGS} ${LIBS}
  
  LFLAGS =\
diff -Nru hypre-2.11.2/debian/rules hypre-2.13.0/debian/rules
--- hypre-2.11.2/debian/rules	2017-10-18 09:57:39.000000000 +0000
+++ hypre-2.13.0/debian/rules	2017-12-11 19:47:42.000000000 +0000
@@ -2,13 +2,13 @@
 # Made with the aid of debmake, by Christoph Lameter,
 # based on the sample debian/rules file for GNU hello by Ian Jackson.
 
-export HYPRE_SOVERSION=2.11.2
+export HYPRE_SOVERSION=2.13.0
 
 export DEB_CFLAGS_MAINT_APPEND  = -Wall -pedantic -O3
 export DEB_CXXFLAGS_MAINT_APPEND  = -Wall -pedantic -O3
 export DEB_LDFLAGS_MAINT_APPEND = -Wl,--no-as-needed
-export DEB_HOST_MULTIARCH := $(shell dpkg-architecture -qDEB_HOST_MULTIARCH)
-export DEB_HOST_ARCH := $(shell dpkg-architecture -qDEB_HOST_ARCH)
+export DEB_HOST_MULTIARCH ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH)
+export DEB_HOST_ARCH ?= $(shell dpkg-architecture -qDEB_HOST_ARCH)
 
 %:
 	dh $@ --sourcedirectory=src --with autoreconf
@@ -22,8 +22,9 @@
 # extra flags set to be similar to what petsc requires
 extra_flags += \
   --with-fei --with-mli \
-  --with-superlu \
-  --with-blas \
+  --with-superlu --with-superlu-include=/usr/include/superlu --with-superlu-lib="-lsuperlu" \
+  --with-dsuperlu --with-dsuperlu-include=/usr/include/superlu-dist --with-dsuperlu-lib="-lsuperlu_dist" \
+  --with-blas --with-blas-lib="-lblas" \
   --with-lapack-libs="lapack" --with-lapack-lib-dirs="/usr/lib/lapack" --with-fmangle-lapack="one-underscore" \
   --prefix=$(CURDIR)/debian/tmp/usr
 
diff -Nru hypre-2.11.2/debian/source/lintian-overrides hypre-2.13.0/debian/source/lintian-overrides
--- hypre-2.11.2/debian/source/lintian-overrides	2017-10-18 09:57:39.000000000 +0000
+++ hypre-2.13.0/debian/source/lintian-overrides	1970-01-01 00:00:00.000000000 +0000
@@ -1,8 +0,0 @@
-# Lintian thinks uploader Adam Powell's name violates policy
-hypre source: uploader-address-missing "Adam C. Powell
-hypre source: uploader-not-full-name IV"
-
-# source for these files is provided in the same directory
-hypre source: source-is-missing src/FEI_mv/DSuperLU/INSTALL/testdlamch
-hypre source: source-is-missing src/FEI_mv/DSuperLU/INSTALL/testslamch
-hypre source: source-is-missing src/FEI_mv/DSuperLU/INSTALL/testtimer
Binary files /tmp/tmp6BLdKS/KeOIccOR8n/hypre-2.11.2/docs/HYPRE_ref_manual.pdf and /tmp/tmp6BLdKS/zS5BJEUpFu/hypre-2.13.0/docs/HYPRE_ref_manual.pdf differ
Binary files /tmp/tmp6BLdKS/KeOIccOR8n/hypre-2.11.2/docs/HYPRE_usr_manual.pdf and /tmp/tmp6BLdKS/zS5BJEUpFu/hypre-2.13.0/docs/HYPRE_usr_manual.pdf differ
diff -Nru hypre-2.11.2/src/blas/blas_utils.c hypre-2.13.0/src/blas/blas_utils.c
--- hypre-2.11.2/src/blas/blas_utils.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/blas_utils.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,219 +0,0 @@
-
-#include "hypre_blas.h"
-#include "f2c.h"
-	
-logical hypre_lsame_(const char *ca,const char *cb)
-{
-/*  -- LAPACK auxiliary routine (version 3.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       September 30, 1994   
-
-
-    Purpose   
-    =======   
-
-    LSAME returns .TRUE. if CA is the same letter as CB regardless of   
-    case.   
-
-    Arguments   
-    =========   
-
-    CA      (input) CHARACTER*1   
-    CB      (input) CHARACTER*1   
-            CA and CB specify the single characters to be compared.   
-
-   ===================================================================== 
-  
-
-
-       Test if the characters are equal */
-    /* System generated locals */
-    logical ret_val;
-    /* Local variables */
-    static integer inta, intb, zcode;
-
-
-    ret_val = *(unsigned char *)ca == *(unsigned char *)cb;
-    if (ret_val) {
-	return ret_val;
-    }
-
-/*     Now test for equivalence if both characters are alphabetic. */
-
-    zcode = 'Z';
-
-/*     Use 'Z' rather than 'A' so that ASCII can be detected on Prime   
-       machines, on which ICHAR returns a value with bit 8 set.   
-       ICHAR('A') on Prime machines returns 193 which is the same as   
-       ICHAR('A') on an EBCDIC machine. */
-
-    inta = *(unsigned char *)ca;
-    intb = *(unsigned char *)cb;
-
-    if (zcode == 90 || zcode == 122) {
-
-/*        ASCII is assumed - ZCODE is the ASCII code of either lower o
-r   
-          upper case 'Z'. */
-
-	if (inta >= 97 && inta <= 122) {
-	    inta += -32;
-	}
-	if (intb >= 97 && intb <= 122) {
-	    intb += -32;
-	}
-
-    } else if (zcode == 233 || zcode == 169) {
-
-/*        EBCDIC is assumed - ZCODE is the EBCDIC code of either lower
- or   
-          upper case 'Z'. */
-
-	if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || 
-                (inta >= 162 && inta <= 169)) {
-	    inta += 64;
-	}
-	if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || 
-                (intb >= 162 && intb <= 169)) {
-	    intb += 64;
-	}
-
-    } else if (zcode == 218 || zcode == 250) {
-
-/*        ASCII is assumed, on Prime machines - ZCODE is the ASCII cod
-e   
-          plus 128 of either lower or upper case 'Z'. */
-
-	if (inta >= 225 && inta <= 250) {
-	    inta += -32;
-	}
-	if (intb >= 225 && intb <= 250) {
-	    intb += -32;
-	}
-    }
-    ret_val = inta == intb;
-
-/*     RETURN   
-
-       End of LSAME */
-
-    return ret_val;
-} /* hypre_lsame_ */
-
-#include <stdio.h>
-#include "f2c.h"
-
-/* Subroutine */ HYPRE_Int hypre_xerbla_(const char *srname, integer *info)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       September 30, 1994   
-
-
-    Purpose   
-    =======   
-
-    XERBLA  is an error handler for the LAPACK routines.   
-    It is called by an LAPACK routine if an input parameter has an   
-    invalid value.  A message is printed and execution stops.   
-
-    Installers may consider modifying the STOP statement in order to   
-    call system-specific exception-handling facilities.   
-
-    Arguments   
-    =========   
-
-    SRNAME  (input) CHARACTER*6   
-            The name of the routine which called XERBLA.   
-
-    INFO    (input) INTEGER   
-            The position of the invalid parameter in the parameter list   
-
-            of the calling routine.   
-
-   ===================================================================== 
-*/
-
-    hypre_printf("** On entry to %6s, parameter number %2i had an illegal value\n",
-		srname, *info);
-
-/*     End of XERBLA */
-
-    return 0;
-} /* hypre_xerbla_ */
-
-#include "f2c.h"
-#include "hypre_blas.h"
-
-/* compare two strings */
-
-#ifdef KR_headers
-integer s_cmp(a0, b0, la, lb) char *a0;const char *b0; ftnlen la, lb;
-#else
-integer s_cmp(char *a0,const char *b0, ftnlen la, ftnlen lb)
-#endif
-{
-register unsigned char *a, *aend, *b, *bend;
-a = (unsigned char *)a0;
-b = (unsigned char *)b0;
-aend = a + la;
-bend = b + lb;
-
-if(la <= lb)
-	{
-	while(a < aend)
-		if(*a != *b)
-			return( *a - *b );
-		else
-			{ ++a; ++b; }
-
-	while(b < bend)
-		if(*b != ' ')
-			return( ' ' - *b );
-		else	++b;
-	}
-
-else
-	{
-	while(b < bend)
-		if(*a == *b)
-			{ ++a; ++b; }
-		else
-			return( *a - *b );
-	while(a < aend)
-		if(*a != ' ')
-			return(*a - ' ');
-		else	++a;
-	}
-return(0);
-}
-//#include "f2c.h"
-//#include "hypre_blas.h"
-
-/* assign strings:  a = b */
-
-#ifdef KR_headers
-VOID hypre_s_copy(a, b, la, lb) char *a,const char *b; ftnlen la, lb;
-#else
-void hypre_s_copy(char *a,const char *b, ftnlen la, ftnlen lb)
-#endif
-{
-register char *aend, *bend;
-
-aend = a + la;
-
-if(la <= lb)
-	while(a < aend)
-		*a++ = *b++;
-
-else
-	{
-		bend = (char*)b + lb;
-	while(b < bend)
-		*a++ = *b++;
-	while(a < aend)
-		*a++ = ' ';
-	}
-}
diff -Nru hypre-2.11.2/src/blas/dasum.c hypre-2.13.0/src/blas/dasum.c
--- hypre-2.11.2/src/blas/dasum.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/dasum.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,4 +1,6 @@
-
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 /*  -- translated by f2c (version 19940927).
    You must link the resulting object file with the libraries:
@@ -83,3 +85,6 @@
     return ret_val;
 } /* dasum_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/blas/daxpy.c hypre-2.13.0/src/blas/daxpy.c
--- hypre-2.11.2/src/blas/daxpy.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/daxpy.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,4 +1,6 @@
-
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 /*  -- translated by f2c (version 19940927).
    You must link the resulting object file with the libraries:
@@ -8,7 +10,7 @@
 #include "f2c.h"
 #include "hypre_blas.h"
 
-/* Subroutine */ HYPRE_Int daxpy_(integer *n, doublereal *da, doublereal *dx, 
+/* Subroutine */ integer daxpy_(integer *n, doublereal *da, doublereal *dx, 
 	integer *incx, doublereal *dy, integer *incy)
 {
 
@@ -90,3 +92,6 @@
     return 0;
 } /* daxpy_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/blas/dcopy.c hypre-2.13.0/src/blas/dcopy.c
--- hypre-2.11.2/src/blas/dcopy.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/dcopy.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,3 +1,6 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 /* dcopy.f -- translated by f2c (version 19960315).
    You must link the resulting object file with the libraries:
@@ -7,7 +10,7 @@
 #include "f2c.h"
 #include "hypre_blas.h"
 
-/* Subroutine */ HYPRE_Int dcopy_(integer* n, doublereal* dx,integer* incx,doublereal* dy,integer* incy)
+/* Subroutine */ integer dcopy_(integer* n, doublereal* dx,integer* incx,doublereal* dy,integer* incy)
 {
     /* System generated locals */
     integer i__1;
@@ -87,3 +90,6 @@
     return 0;
 } /* dcopy_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/blas/ddot.c hypre-2.13.0/src/blas/ddot.c
--- hypre-2.11.2/src/blas/ddot.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/ddot.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,3 +1,6 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 /* ddot.f -- translated by f2c (version 19960315).
    You must link the resulting object file with the libraries:
@@ -91,3 +94,6 @@
     return ret_val;
 } /* ddot_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/blas/dgemm.c hypre-2.13.0/src/blas/dgemm.c
--- hypre-2.11.2/src/blas/dgemm.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/dgemm.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,3 +1,6 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 /*  -- translated by f2c (version 19940927).
    You must link the resulting object file with the libraries:
@@ -7,7 +10,7 @@
 #include "f2c.h"
 #include "hypre_blas.h"
 
-/* Subroutine */ HYPRE_Int dgemm_(const char *transa,const char *transb, integer *m, integer *
+/* Subroutine */ integer dgemm_(const char *transa,const char *transb, integer *m, integer *
 	n, integer *k, doublereal *alpha, doublereal *a, integer *lda, 
 	doublereal *b, integer *ldb, doublereal *beta, doublereal *c, integer 
 	*ldc)
@@ -21,9 +24,9 @@
     static logical nota, notb;
     static doublereal temp;
     static integer i, j, l;
-    extern logical hypre_lsame_(const char *,const char *);
+    extern logical lsame_(const char *,const char *);
     static integer nrowa, nrowb;
-    extern /* Subroutine */ HYPRE_Int hypre_xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
 
 
 /*  Purpose   
@@ -192,8 +195,8 @@
 #define B(I,J) b[(I)-1 + ((J)-1)* ( *ldb)]
 #define C(I,J) c[(I)-1 + ((J)-1)* ( *ldc)]
 
-    nota = hypre_lsame_(transa, "N");
-    notb = hypre_lsame_(transb, "N");
+    nota = lsame_(transa, "N");
+    notb = lsame_(transb, "N");
     if (nota) {
 	nrowa = *m;
     } else {
@@ -208,9 +211,9 @@
 /*     Test the input parameters. */
 
     info = 0;
-    if (! nota && ! hypre_lsame_(transa, "C") && ! hypre_lsame_(transa, "T")) {
+    if (! nota && ! lsame_(transa, "C") && ! lsame_(transa, "T")) {
 	info = 1;
-    } else if (! notb && ! hypre_lsame_(transb, "C") && ! hypre_lsame_(transb, 
+    } else if (! notb && ! lsame_(transb, "C") && ! lsame_(transb, 
 	    "T")) {
 	info = 2;
     } else if (*m < 0) {
@@ -227,7 +230,7 @@
 	info = 13;
     }
     if (info != 0) {
-	hypre_xerbla_("DGEMM ", &info);
+	xerbla_("DGEMM ", &info);
 	return 0;
     }
 
@@ -369,3 +372,7 @@
 /*     End of DGEMM . */
 
 } /* dgemm_ */
+
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/blas/dgemv.c hypre-2.13.0/src/blas/dgemv.c
--- hypre-2.11.2/src/blas/dgemv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/dgemv.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,4 +1,6 @@
-
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 /*  -- translated by f2c (version 19940927).
    You must link the resulting object file with the libraries:
@@ -8,7 +10,7 @@
 #include "f2c.h"
 #include "hypre_blas.h"
 
-/* Subroutine */ HYPRE_Int dgemv_(const char *trans, integer *m, integer *n, doublereal *
+/* Subroutine */ integer dgemv_(const char *trans, integer *m, integer *n, doublereal *
 	alpha, doublereal *a, integer *lda, doublereal *x, integer *incx, 
 	doublereal *beta, doublereal *y, integer *incy)
 {
@@ -20,9 +22,9 @@
     static integer info;
     static doublereal temp;
     static integer lenx, leny, i, j;
-    extern logical hypre_lsame_(const char *,const char *);
+    extern logical lsame_(const char *,const char *);
     static integer ix, iy, jx, jy, kx, ky;
-    extern /* Subroutine */ HYPRE_Int hypre_xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
 
 
 /*  Purpose   
@@ -131,8 +133,8 @@
 #define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
 
     info = 0;
-    if (! hypre_lsame_(trans, "N") && ! hypre_lsame_(trans, "T") && ! 
-	    hypre_lsame_(trans, "C")) {
+    if (! lsame_(trans, "N") && ! lsame_(trans, "T") && ! 
+	    lsame_(trans, "C")) {
 	info = 1;
     } else if (*m < 0) {
 	info = 2;
@@ -146,7 +148,7 @@
 	info = 11;
     }
     if (info != 0) {
-	hypre_xerbla_("DGEMV ", &info);
+	xerbla_("DGEMV ", &info);
 	return 0;
     }
 
@@ -160,7 +162,7 @@
   
        up the start points in  X  and  Y. */
 
-    if (hypre_lsame_(trans, "N")) {
+    if (lsame_(trans, "N")) {
 	lenx = *n;
 	leny = *m;
     } else {
@@ -216,7 +218,7 @@
     if (*alpha == 0.) {
 	return 0;
     }
-    if (hypre_lsame_(trans, "N")) {
+    if (lsame_(trans, "N")) {
 
 /*        Form  y := alpha*A*x + y. */
 
@@ -285,3 +287,7 @@
 /*     End of DGEMV . */
 
 } /* dgemv_ */
+
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/blas/dger.c hypre-2.13.0/src/blas/dger.c
--- hypre-2.11.2/src/blas/dger.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/dger.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,7 +1,11 @@
-#include "hypre_blas.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include "f2c.h"
+#include "hypre_blas.h"
 
-/* Subroutine */ HYPRE_Int dger_(integer *m, integer *n, doublereal *alpha, 
+/* Subroutine */ integer dger_(integer *m, integer *n, doublereal *alpha, 
 	doublereal *x, integer *incx, doublereal *y, integer *incy, 
 	doublereal *a, integer *lda)
 {
@@ -11,7 +15,7 @@
     static integer info;
     static doublereal temp;
     static integer i__, j, ix, jy, kx;
-    extern /* Subroutine */ HYPRE_Int hypre_xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
 #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1]
 /*  Purpose   
     =======   
@@ -86,7 +90,7 @@
 	info = 9;
     }
     if (info != 0) {
-	hypre_xerbla_("DGER  ", &info);
+	xerbla_("DGER  ", &info);
 	return 0;
     }
 /*     Quick return if possible. */
@@ -141,3 +145,6 @@
 } /* dger_ */
 #undef a_ref
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/blas/dnrm2.c hypre-2.13.0/src/blas/dnrm2.c
--- hypre-2.11.2/src/blas/dnrm2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/dnrm2.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,3 +1,6 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 /* dnrm2.f -- translated by f2c (version 19960315).
    You must link the resulting object file with the libraries:
@@ -6,7 +9,6 @@
 
 #include "f2c.h"
 #include "hypre_blas.h"
-#include "math.h"
 
 doublereal dnrm2_(integer*n,doublereal* dx,integer* incx)
 {
@@ -24,7 +26,7 @@
     doublereal ret_val, d__1;
 
     /* Builtin functions */
-    /*HYPRE_Real sqrt(doublereal);*/
+    /*doublereal sqrt(doublereal);*/
 
     /* Local variables */
     static doublereal xmax;
@@ -90,7 +92,7 @@
     ix = 1;
 /*                                                 begin main loop */
 L20:
-    switch ((HYPRE_Int)next) {
+    switch ((integer)next) {
 	case 0: goto L30;
 	case 1: goto L50;
 	case 2: goto L70;
@@ -200,3 +202,6 @@
     return ret_val;
 } /* dnrm2_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/blas/drot.c hypre-2.13.0/src/blas/drot.c
--- hypre-2.11.2/src/blas/drot.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/drot.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,4 +1,6 @@
-
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 /*  -- translated by f2c (version 19940927).
    You must link the resulting object file with the libraries:
@@ -8,7 +10,7 @@
 #include "f2c.h"
 #include "hypre_blas.h"
 
-/* Subroutine */ HYPRE_Int drot_(integer *n, doublereal *dx, integer *incx, 
+/* Subroutine */ integer drot_(integer *n, doublereal *dx, integer *incx, 
 	doublereal *dy, integer *incy, doublereal *c, doublereal *s)
 {
 
@@ -73,3 +75,6 @@
     return 0;
 } /* drot_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/blas/dscal.c hypre-2.13.0/src/blas/dscal.c
--- hypre-2.11.2/src/blas/dscal.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/dscal.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,4 +1,6 @@
-
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 /*  -- translated by f2c (version 19940927).
    You must link the resulting object file with the libraries:
@@ -8,7 +10,7 @@
 #include "f2c.h"
 #include "hypre_blas.h"
 
-/* Subroutine */ HYPRE_Int dscal_(integer *n, doublereal *da, doublereal *dx, 
+/* Subroutine */ integer dscal_(integer *n, doublereal *da, doublereal *dx, 
 	integer *incx)
 {
 
@@ -78,3 +80,6 @@
     return 0;
 } /* dscal_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/blas/dswap.c hypre-2.13.0/src/blas/dswap.c
--- hypre-2.11.2/src/blas/dswap.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/dswap.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_blas.h"
 #include "f2c.h"
+#include "hypre_blas.h"
 
-/* Subroutine */ HYPRE_Int dswap_(integer *n, doublereal *dx, integer *incx, 
+/* Subroutine */ integer dswap_(integer *n, doublereal *dx, integer *incx, 
 	doublereal *dy, integer *incy)
 {
     /* System generated locals */
@@ -80,3 +83,6 @@
     return 0;
 } /* dswap_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/blas/dsymm.c hypre-2.13.0/src/blas/dsymm.c
--- hypre-2.11.2/src/blas/dsymm.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/dsymm.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_blas.h"
 #include "f2c.h"
+#include "hypre_blas.h"
 
-/* Subroutine */ HYPRE_Int dsymm_(const char *side,const char *uplo, integer *m, integer *n, 
+/* Subroutine */ integer dsymm_(const char *side,const char *uplo, integer *m, integer *n, 
 	doublereal *alpha, doublereal *a, integer *lda, doublereal *b, 
 	integer *ldb, doublereal *beta, doublereal *c__, integer *ldc)
 {
@@ -13,10 +16,10 @@
     static integer info;
     static doublereal temp1, temp2;
     static integer i__, j, k;
-    extern logical hypre_lsame_(const char *,const char *);
+    extern logical lsame_(const char *,const char *);
     static integer nrowa;
     static logical upper;
-    extern /* Subroutine */ HYPRE_Int hypre_xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
 #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1]
 #define b_ref(a_1,a_2) b[(a_2)*b_dim1 + a_1]
 #define c___ref(a_1,a_2) c__[(a_2)*c_dim1 + a_1]
@@ -127,17 +130,17 @@
     c_offset = 1 + c_dim1 * 1;
     c__ -= c_offset;
     /* Function Body */
-    if (hypre_lsame_(side, "L")) {
+    if (lsame_(side, "L")) {
 	nrowa = *m;
     } else {
 	nrowa = *n;
     }
-    upper = hypre_lsame_(uplo, "U");
+    upper = lsame_(uplo, "U");
 /*     Test the input parameters. */
     info = 0;
-    if (! hypre_lsame_(side, "L") && ! hypre_lsame_(side, "R")) {
+    if (! lsame_(side, "L") && ! lsame_(side, "R")) {
 	info = 1;
-    } else if (! upper && ! hypre_lsame_(uplo, "L")) {
+    } else if (! upper && ! lsame_(uplo, "L")) {
 	info = 2;
     } else if (*m < 0) {
 	info = 3;
@@ -151,7 +154,7 @@
 	info = 12;
     }
     if (info != 0) {
-	hypre_xerbla_("DSYMM ", &info);
+	xerbla_("DSYMM ", &info);
 	return 0;
     }
 /*     Quick return if possible. */
@@ -184,7 +187,7 @@
 	return 0;
     }
 /*     Start the operations. */
-    if (hypre_lsame_(side, "L")) {
+    if (lsame_(side, "L")) {
 /*        Form  C := alpha*A*B + beta*C. */
 	if (upper) {
 	    i__1 = *n;
@@ -291,3 +294,6 @@
 #undef b_ref
 #undef a_ref
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/blas/dsymv.c hypre-2.13.0/src/blas/dsymv.c
--- hypre-2.11.2/src/blas/dsymv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/dsymv.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,4 +1,6 @@
-
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 /*  -- translated by f2c (version 19940927).
    You must link the resulting object file with the libraries:
@@ -8,7 +10,7 @@
 #include "f2c.h"
 #include "hypre_blas.h"
 
-/* Subroutine */ HYPRE_Int dsymv_(const char *uplo, integer *n, doublereal *alpha, 
+/* Subroutine */ integer dsymv_(const char *uplo, integer *n, doublereal *alpha, 
 	doublereal *a, integer *lda, doublereal *x, integer *incx, doublereal 
 	*beta, doublereal *y, integer *incy)
 {
@@ -20,9 +22,9 @@
     static integer info;
     static doublereal temp1, temp2;
     static integer i, j;
-    extern logical hypre_lsame_(const char *,const char *);
+    extern logical lsame_(const char *,const char *);
     static integer ix, iy, jx, jy, kx, ky;
-    extern /* Subroutine */ HYPRE_Int hypre_xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
 
 
 /*  Purpose   
@@ -129,7 +131,7 @@
 #define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
 
     info = 0;
-    if (! hypre_lsame_(uplo, "U") && ! hypre_lsame_(uplo, "L")) {
+    if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
 	info = 1;
     } else if (*n < 0) {
 	info = 2;
@@ -141,7 +143,7 @@
 	info = 10;
     }
     if (info != 0) {
-	hypre_xerbla_("DSYMV ", &info);
+	xerbla_("DSYMV ", &info);
 	return 0;
     }
 
@@ -203,7 +205,7 @@
     if (*alpha == 0.) {
 	return 0;
     }
-    if (hypre_lsame_(uplo, "U")) {
+    if (lsame_(uplo, "U")) {
 
 /*        Form  y  when A is stored in upper triangle. */
 
@@ -287,3 +289,6 @@
 
 } /* dsymv_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/blas/dsyr2.c hypre-2.13.0/src/blas/dsyr2.c
--- hypre-2.11.2/src/blas/dsyr2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/dsyr2.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,4 +1,6 @@
-
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 /*  -- translated by f2c (version 19940927).
    You must link the resulting object file with the libraries:
@@ -8,7 +10,7 @@
 #include "f2c.h"
 #include "hypre_blas.h"
 
-/* Subroutine */ HYPRE_Int dsyr2_(const char *uplo, integer *n, doublereal *alpha, 
+/* Subroutine */ integer dsyr2_(const char *uplo, integer *n, doublereal *alpha, 
 	doublereal *x, integer *incx, doublereal *y, integer *incy, 
 	doublereal *a, integer *lda)
 {
@@ -20,9 +22,9 @@
     static integer info;
     static doublereal temp1, temp2;
     static integer i, j;
-    extern logical hypre_lsame_(const char *,const char *);
+    extern logical lsame_(const char *,const char *);
     static integer ix, iy, jx, jy, kx, ky;
-    extern /* Subroutine */ HYPRE_Int hypre_xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
 
 
 /*  Purpose   
@@ -128,7 +130,7 @@
 #define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
 
     info = 0;
-    if (! hypre_lsame_(uplo, "U") && ! hypre_lsame_(uplo, "L")) {
+    if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
 	info = 1;
     } else if (*n < 0) {
 	info = 2;
@@ -140,7 +142,7 @@
 	info = 9;
     }
     if (info != 0) {
-	hypre_xerbla_("DSYR2 ", &info);
+	xerbla_("DSYR2 ", &info);
 	return 0;
     }
 
@@ -173,7 +175,7 @@
        accessed sequentially with one pass through the triangular part   
        of A. */
 
-    if (hypre_lsame_(uplo, "U")) {
+    if (lsame_(uplo, "U")) {
 
 /*        Form  A  when A is stored in the upper triangle. */
 
@@ -255,3 +257,6 @@
 
 } /* dsyr2_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/blas/dsyr2k.c hypre-2.13.0/src/blas/dsyr2k.c
--- hypre-2.11.2/src/blas/dsyr2k.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/dsyr2k.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 #include "f2c.h"
 #include "hypre_blas.h"
 
-/* Subroutine */ HYPRE_Int dsyr2k_(const char *uplo,const char *trans, integer *n, integer *k, 
+/* Subroutine */ integer dsyr2k_(const char *uplo,const char *trans, integer *n, integer *k, 
 	doublereal *alpha, doublereal *a, integer *lda, doublereal *b, 
 	integer *ldb, doublereal *beta, doublereal *c__, integer *ldc)
 {
@@ -13,10 +16,10 @@
     static integer info;
     static doublereal temp1, temp2;
     static integer i__, j, l;
-    extern logical hypre_lsame_(const char *,const char *);
+    extern logical lsame_(const char *,const char *);
     static integer nrowa;
     static logical upper;
-    extern /* Subroutine */ HYPRE_Int hypre_xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
 #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1]
 #define b_ref(a_1,a_2) b[(a_2)*b_dim1 + a_1]
 #define c___ref(a_1,a_2) c__[(a_2)*c_dim1 + a_1]
@@ -128,17 +131,17 @@
     c_offset = 1 + c_dim1 * 1;
     c__ -= c_offset;
     /* Function Body */
-    if (hypre_lsame_(trans, "N")) {
+    if (lsame_(trans, "N")) {
 	nrowa = *n;
     } else {
 	nrowa = *k;
     }
-    upper = hypre_lsame_(uplo, "U");
+    upper = lsame_(uplo, "U");
     info = 0;
-    if (! upper && ! hypre_lsame_(uplo, "L")) {
+    if (! upper && ! lsame_(uplo, "L")) {
 	info = 1;
-    } else if (! hypre_lsame_(trans, "N") && ! hypre_lsame_(trans, 
-	    "T") && ! hypre_lsame_(trans, "C")) {
+    } else if (! lsame_(trans, "N") && ! lsame_(trans, 
+	    "T") && ! lsame_(trans, "C")) {
 	info = 2;
     } else if (*n < 0) {
 	info = 3;
@@ -152,7 +155,7 @@
 	info = 12;
     }
     if (info != 0) {
-	hypre_xerbla_("DSYR2K", &info);
+	xerbla_("DSYR2K", &info);
 	return 0;
     }
 /*     Quick return if possible. */
@@ -209,7 +212,7 @@
 	return 0;
     }
 /*     Start the operations. */
-    if (hypre_lsame_(trans, "N")) {
+    if (lsame_(trans, "N")) {
 /*        Form  C := alpha*A*B' + alpha*B*A' + C. */
 	if (upper) {
 	    i__1 = *n;
@@ -333,3 +336,6 @@
 #undef b_ref
 #undef a_ref
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/blas/dsyrk.c hypre-2.13.0/src/blas/dsyrk.c
--- hypre-2.11.2/src/blas/dsyrk.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/dsyrk.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,4 +1,6 @@
-
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 /*  -- translated by f2c (version 19940927).
    You must link the resulting object file with the libraries:
@@ -8,7 +10,7 @@
 #include "f2c.h"
 #include "hypre_blas.h"
 
-/* Subroutine */ HYPRE_Int dsyrk_(const char *uplo,const char *trans, integer *n, integer *k, 
+/* Subroutine */ integer dsyrk_(const char *uplo,const char *trans, integer *n, integer *k, 
 	doublereal *alpha, doublereal *a, integer *lda, doublereal *beta, 
 	doublereal *c, integer *ldc)
 {
@@ -20,10 +22,10 @@
     static integer info;
     static doublereal temp;
     static integer i, j, l;
-    extern logical hypre_lsame_(const char *,const char *);
+    extern logical lsame_(const char *,const char *);
     static integer nrowa;
     static logical upper;
-    extern /* Subroutine */ HYPRE_Int hypre_xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
 
 
 /*  Purpose   
@@ -174,18 +176,18 @@
 #define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
 #define C(I,J) c[(I)-1 + ((J)-1)* ( *ldc)]
 
-    if (hypre_lsame_(trans, "N")) {
+    if (lsame_(trans, "N")) {
 	nrowa = *n;
     } else {
 	nrowa = *k;
     }
-    upper = hypre_lsame_(uplo, "U");
+    upper = lsame_(uplo, "U");
 
     info = 0;
-    if (! upper && ! hypre_lsame_(uplo, "L")) {
+    if (! upper && ! lsame_(uplo, "L")) {
 	info = 1;
-    } else if (! hypre_lsame_(trans, "N") && ! hypre_lsame_(trans, "T") &&
-	     ! hypre_lsame_(trans, "C")) {
+    } else if (! lsame_(trans, "N") && ! lsame_(trans, "T") &&
+	     ! lsame_(trans, "C")) {
 	info = 2;
     } else if (*n < 0) {
 	info = 3;
@@ -197,7 +199,7 @@
 	info = 10;
     }
     if (info != 0) {
-	hypre_xerbla_("DSYRK ", &info);
+	xerbla_("DSYRK ", &info);
 	return 0;
     }
 
@@ -252,7 +254,7 @@
 
 /*     Start the operations. */
 
-    if (hypre_lsame_(trans, "N")) {
+    if (lsame_(trans, "N")) {
 
 /*        Form  C := alpha*A*A' + beta*C. */
 
@@ -353,3 +355,7 @@
 /*     End of DSYRK . */
 
 } /* dsyrk_ */
+
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/blas/dtrmm.c hypre-2.13.0/src/blas/dtrmm.c
--- hypre-2.11.2/src/blas/dtrmm.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/dtrmm.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_blas.h"
 #include "f2c.h"
+#include "hypre_blas.h"
 
-/* Subroutine */ HYPRE_Int dtrmm_(const char *side,const char *uplo,const char *transa,const char *diag, 
+/* Subroutine */ integer dtrmm_(const char *side,const char *uplo,const char *transa,const char *diag, 
 	integer *m, integer *n, doublereal *alpha, doublereal *a, integer *
 	lda, doublereal *b, integer *ldb)
 {
@@ -13,10 +16,10 @@
     static doublereal temp;
     static integer i__, j, k;
     static logical lside;
-    extern logical hypre_lsame_(const char *,const char *);
+    extern logical lsame_(const char *,const char *);
     static integer nrowa;
     static logical upper;
-    extern /* Subroutine */ HYPRE_Int hypre_xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
     static logical nounit;
 #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1]
 #define b_ref(a_1,a_2) b[(a_2)*b_dim1 + a_1]
@@ -111,23 +114,23 @@
     b_offset = 1 + b_dim1 * 1;
     b -= b_offset;
     /* Function Body */
-    lside = hypre_lsame_(side, "L");
+    lside = lsame_(side, "L");
     if (lside) {
 	nrowa = *m;
     } else {
 	nrowa = *n;
     }
-    nounit = hypre_lsame_(diag, "N");
-    upper = hypre_lsame_(uplo, "U");
+    nounit = lsame_(diag, "N");
+    upper = lsame_(uplo, "U");
     info = 0;
-    if (! lside && ! hypre_lsame_(side, "R")) {
+    if (! lside && ! lsame_(side, "R")) {
 	info = 1;
-    } else if (! upper && ! hypre_lsame_(uplo, "L")) {
+    } else if (! upper && ! lsame_(uplo, "L")) {
 	info = 2;
-    } else if (! hypre_lsame_(transa, "N") && ! hypre_lsame_(transa,
-	     "T") && ! hypre_lsame_(transa, "C")) {
+    } else if (! lsame_(transa, "N") && ! lsame_(transa,
+	     "T") && ! lsame_(transa, "C")) {
 	info = 3;
-    } else if (! hypre_lsame_(diag, "U") && ! hypre_lsame_(diag, 
+    } else if (! lsame_(diag, "U") && ! lsame_(diag, 
 	    "N")) {
 	info = 4;
     } else if (*m < 0) {
@@ -140,7 +143,7 @@
 	info = 11;
     }
     if (info != 0) {
-	hypre_xerbla_("DTRMM ", &info);
+	xerbla_("DTRMM ", &info);
 	return 0;
     }
 /*     Quick return if possible. */
@@ -162,7 +165,7 @@
     }
 /*     Start the operations. */
     if (lside) {
-	if (hypre_lsame_(transa, "N")) {
+	if (lsame_(transa, "N")) {
 /*           Form  B := alpha*A*B. */
 	    if (upper) {
 		i__1 = *n;
@@ -250,7 +253,7 @@
 	    }
 	}
     } else {
-	if (hypre_lsame_(transa, "N")) {
+	if (lsame_(transa, "N")) {
 /*           Form  B := alpha*B*A. */
 	    if (upper) {
 		for (j = *n; j >= 1; --j) {
@@ -374,3 +377,6 @@
 #undef b_ref
 #undef a_ref
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/blas/dtrmv.c hypre-2.13.0/src/blas/dtrmv.c
--- hypre-2.11.2/src/blas/dtrmv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/dtrmv.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_blas.h"
 #include "f2c.h"
+#include "hypre_blas.h"
 
-/* Subroutine */ HYPRE_Int dtrmv_(const char *uplo,const char *trans,const char *diag, integer *n, 
+/* Subroutine */ integer dtrmv_(const char *uplo,const char *trans,const char *diag, integer *n, 
 	doublereal *a, integer *lda, doublereal *x, integer *incx)
 {
     /* System generated locals */
@@ -11,9 +14,9 @@
     static integer info;
     static doublereal temp;
     static integer i__, j;
-    extern logical hypre_lsame_(const char *,const char *);
+    extern logical lsame_(const char *,const char *);
     static integer ix, jx, kx;
-    extern /* Subroutine */ HYPRE_Int hypre_xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
     static logical nounit;
 #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1]
 /*  Purpose   
@@ -88,12 +91,12 @@
     --x;
     /* Function Body */
     info = 0;
-    if (! hypre_lsame_(uplo, "U") && ! hypre_lsame_(uplo, "L")) {
+    if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
 	info = 1;
-    } else if (! hypre_lsame_(trans, "N") && ! hypre_lsame_(trans, 
-	    "T") && ! hypre_lsame_(trans, "C")) {
+    } else if (! lsame_(trans, "N") && ! lsame_(trans, 
+	    "T") && ! lsame_(trans, "C")) {
 	info = 2;
-    } else if (! hypre_lsame_(diag, "U") && ! hypre_lsame_(diag, 
+    } else if (! lsame_(diag, "U") && ! lsame_(diag, 
 	    "N")) {
 	info = 3;
     } else if (*n < 0) {
@@ -104,14 +107,14 @@
 	info = 8;
     }
     if (info != 0) {
-	hypre_xerbla_("DTRMV ", &info);
+	xerbla_("DTRMV ", &info);
 	return 0;
     }
 /*     Quick return if possible. */
     if (*n == 0) {
 	return 0;
     }
-    nounit = hypre_lsame_(diag, "N");
+    nounit = lsame_(diag, "N");
 /*     Set up the start point in X if the increment is not unity. This   
        will be  ( N - 1 )*INCX  too small for descending loops. */
     if (*incx <= 0) {
@@ -121,9 +124,9 @@
     }
 /*     Start the operations. In this version the elements of A are   
        accessed sequentially with one pass through A. */
-    if (hypre_lsame_(trans, "N")) {
+    if (lsame_(trans, "N")) {
 /*        Form  x := A*x. */
-	if (hypre_lsame_(uplo, "U")) {
+	if (lsame_(uplo, "U")) {
 	    if (*incx == 1) {
 		i__1 = *n;
 		for (j = 1; j <= i__1; ++j) {
@@ -201,7 +204,7 @@
 	}
     } else {
 /*        Form  x := A'*x. */
-	if (hypre_lsame_(uplo, "U")) {
+	if (lsame_(uplo, "U")) {
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
 		    temp = x[j];
@@ -276,3 +279,6 @@
 } /* dtrmv_ */
 #undef a_ref
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/blas/dtrsm.c hypre-2.13.0/src/blas/dtrsm.c
--- hypre-2.11.2/src/blas/dtrsm.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/dtrsm.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,4 +1,6 @@
-
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 /*  -- translated by f2c (version 19940927).
    You must link the resulting object file with the libraries:
@@ -8,7 +10,7 @@
 #include "f2c.h"
 #include "hypre_blas.h"
 
-/* Subroutine */ HYPRE_Int dtrsm_(const char *side,const char *uplo,const char *transa,const char *diag, 
+/* Subroutine */ integer dtrsm_(const char *side,const char *uplo,const char *transa,const char *diag, 
 	integer *m, integer *n, doublereal *alpha, doublereal *a, integer *
 	lda, doublereal *b, integer *ldb)
 {
@@ -21,10 +23,10 @@
     static doublereal temp;
     static integer i, j, k;
     static logical lside;
-    extern logical hypre_lsame_(const char *,const char *);
+    extern logical lsame_(const char *,const char *);
     static integer nrowa;
     static logical upper;
-    extern /* Subroutine */ HYPRE_Int hypre_xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
     static logical nounit;
 
 
@@ -184,24 +186,24 @@
 #define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
 #define B(I,J) b[(I)-1 + ((J)-1)* ( *ldb)]
 
-    lside = hypre_lsame_(side, "L");
+    lside = lsame_(side, "L");
     if (lside) {
 	nrowa = *m;
     } else {
 	nrowa = *n;
     }
-    nounit = hypre_lsame_(diag, "N");
-    upper = hypre_lsame_(uplo, "U");
+    nounit = lsame_(diag, "N");
+    upper = lsame_(uplo, "U");
 
     info = 0;
-    if (! lside && ! hypre_lsame_(side, "R")) {
+    if (! lside && ! lsame_(side, "R")) {
 	info = 1;
-    } else if (! upper && ! hypre_lsame_(uplo, "L")) {
+    } else if (! upper && ! lsame_(uplo, "L")) {
 	info = 2;
-    } else if (! hypre_lsame_(transa, "N") && ! hypre_lsame_(transa, "T") 
-	    && ! hypre_lsame_(transa, "C")) {
+    } else if (! lsame_(transa, "N") && ! lsame_(transa, "T") 
+	    && ! lsame_(transa, "C")) {
 	info = 3;
-    } else if (! hypre_lsame_(diag, "U") && ! hypre_lsame_(diag, "N")) {
+    } else if (! lsame_(diag, "U") && ! lsame_(diag, "N")) {
 	info = 4;
     } else if (*m < 0) {
 	info = 5;
@@ -213,7 +215,7 @@
 	info = 11;
     }
     if (info != 0) {
-	hypre_xerbla_("DTRSM ", &info);
+	xerbla_("DTRSM ", &info);
 	return 0;
     }
 
@@ -239,7 +241,7 @@
 /*     Start the operations. */
 
     if (lside) {
-	if (hypre_lsame_(transa, "N")) {
+	if (lsame_(transa, "N")) {
 
 /*           Form  B := alpha*inv( A )*B. */
 
@@ -327,7 +329,7 @@
 	    }
 	}
     } else {
-	if (hypre_lsame_(transa, "N")) {
+	if (lsame_(transa, "N")) {
 
 /*           Form  B := alpha*B*inv( A ). */
 
@@ -451,3 +453,7 @@
 /*     End of DTRSM . */
 
 } /* dtrsm_ */
+
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/blas/dtrsv.c hypre-2.13.0/src/blas/dtrsv.c
--- hypre-2.11.2/src/blas/dtrsv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/dtrsv.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,4 +1,6 @@
-
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 /*  -- translated by f2c (version 19940927).
    You must link the resulting object file with the libraries:
@@ -8,7 +10,7 @@
 #include "f2c.h"
 #include "hypre_blas.h"
 
-/* Subroutine */ HYPRE_Int dtrsv_(const char *uplo,const char *trans,const char *diag, integer *n, 
+/* Subroutine */ integer dtrsv_(const char *uplo,const char *trans,const char *diag, integer *n, 
 	doublereal *a, integer *lda, doublereal *x, integer *incx)
 {
 
@@ -19,9 +21,9 @@
     static integer info;
     static doublereal temp;
     static integer i, j;
-    extern logical hypre_lsame_(const char *,const char *);
+    extern logical lsame_(const char *,const char *);
     static integer ix, jx, kx;
-    extern /* Subroutine */ HYPRE_Int hypre_xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
     static logical nounit;
 
 
@@ -137,12 +139,12 @@
 #define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
 
     info = 0;
-    if (! hypre_lsame_(uplo, "U") && ! hypre_lsame_(uplo, "L")) {
+    if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
 	info = 1;
-    } else if (! hypre_lsame_(trans, "N") && ! hypre_lsame_(trans, "T") &&
-	     ! hypre_lsame_(trans, "C")) {
+    } else if (! lsame_(trans, "N") && ! lsame_(trans, "T") &&
+	     ! lsame_(trans, "C")) {
 	info = 2;
-    } else if (! hypre_lsame_(diag, "U") && ! hypre_lsame_(diag, "N")) {
+    } else if (! lsame_(diag, "U") && ! lsame_(diag, "N")) {
 	info = 3;
     } else if (*n < 0) {
 	info = 4;
@@ -152,7 +154,7 @@
 	info = 8;
     }
     if (info != 0) {
-	hypre_xerbla_("DTRSV ", &info);
+	xerbla_("DTRSV ", &info);
 	return 0;
     }
 
@@ -162,7 +164,7 @@
 	return 0;
     }
 
-    nounit = hypre_lsame_(diag, "N");
+    nounit = lsame_(diag, "N");
 
 /*     Set up the start point in X if the increment is not unity. This   
        will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -176,11 +178,11 @@
 /*     Start the operations. In this version the elements of A are   
        accessed sequentially with one pass through A. */
 
-    if (hypre_lsame_(trans, "N")) {
+    if (lsame_(trans, "N")) {
 
 /*        Form  x := inv( A )*x. */
 
-	if (hypre_lsame_(uplo, "U")) {
+	if (lsame_(uplo, "U")) {
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
 		    if (X(j) != 0.) {
@@ -253,7 +255,7 @@
 
 /*        Form  x := inv( A' )*x. */
 
-	if (hypre_lsame_(uplo, "U")) {
+	if (lsame_(uplo, "U")) {
 	    if (*incx == 1) {
 		for (j = 1; j <= *n; ++j) {
 		    temp = X(j);
@@ -327,3 +329,6 @@
 
 } /* dtrsv_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/blas/f2c.c hypre-2.13.0/src/blas/f2c.c
--- hypre-2.11.2/src/blas/f2c.c	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/blas/f2c.c	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,149 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*-----------------------------------------------------------------------------
+ * Contains functions found in the f2c library to avoid needing -lf2c
+ *-----------------------------------------------------------------------------*/
+
+#include "f2c.h"
+#include "hypre_blas.h"
+	
+/* compare two strings */
+
+integer s_cmp(char *a0,const char *b0, ftnlen la, ftnlen lb)
+{
+register unsigned char *a, *aend, *b, *bend;
+a = (unsigned char *)a0;
+b = (unsigned char *)b0;
+aend = a + la;
+bend = b + lb;
+
+if(la <= lb)
+	{
+	while(a < aend)
+		if(*a != *b)
+			return( *a - *b );
+		else
+			{ ++a; ++b; }
+
+	while(b < bend)
+		if(*b != ' ')
+			return( ' ' - *b );
+		else	++b;
+	}
+
+else
+	{
+	while(b < bend)
+		if(*a == *b)
+			{ ++a; ++b; }
+		else
+			return( *a - *b );
+	while(a < aend)
+		if(*a != ' ')
+			return(*a - ' ');
+		else	++a;
+	}
+return(0);
+}
+
+/* assign strings:  a = b */
+
+integer s_copy(char *a,const char *b, ftnlen la, ftnlen lb)
+{
+register char *aend, *bend;
+
+aend = a + la;
+
+if(la <= lb)
+	while(a < aend)
+		*a++ = *b++;
+
+else
+	{
+		bend = (char*)b + lb;
+	while(b < bend)
+		*a++ = *b++;
+	while(a < aend)
+		*a++ = ' ';
+	}
+return(0);
+}
+
+integer s_cat(char *lp, char *rpp[], ftnlen rnp[], ftnlen *np, ftnlen ll)
+{
+ftnlen i, n, nc;
+char *f__rp;
+
+n = (integer)*np;
+for(i = 0 ; i < n ; ++i)
+	{
+	nc = ll;
+	if(rnp[i] < nc)
+		nc = rnp[i];
+	ll -= nc;
+	f__rp = rpp[i];
+	while(--nc >= 0)
+		*lp++ = *f__rp++;
+	}
+while(--ll >= 0)
+	*lp++ = ' ';
+return 0;
+}
+
+#define log10e 0.43429448190325182765
+
+#undef abs
+#include "math.h"					 
+doublereal d_lg10(doublereal *x)
+{
+return( log10e * log(*x) );
+}
+
+doublereal d_sign(doublereal *a, doublereal *b)
+{
+doublereal x;
+x = (*a >= 0 ? *a : - *a);
+return( *b >= 0 ? x : -x);
+}
+
+doublereal pow_di(doublereal *ap, integer *bp)
+{
+doublereal pow, x;
+integer n;
+
+pow = 1;
+x = *ap;
+n = *bp;
+
+if(n != 0)
+	{
+	if(n < 0)
+		{
+		n = -n;
+		x = 1/x;
+		}
+	for( ; ; )
+		{
+		if(n & 01)
+			pow *= x;
+		if(n >>= 1)
+			x *= x;
+		else
+			break;
+		}
+	}
+return(pow);
+}
+
+#undef abs
+#include "math.h"
+doublereal pow_dd(doublereal *ap, doublereal *bp)
+{
+return(pow(*ap, *bp) );
+}
+
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/blas/f2c.h hypre-2.13.0/src/blas/f2c.h
--- hypre-2.11.2/src/blas/f2c.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/f2c.h	2017-10-20 17:42:22.000000000 +0000
@@ -25,6 +25,13 @@
 #define HYPRE_SEQUENTIAL
 #endif
 #include "_hypre_utilities.h"
+#include "math.h"
+
+#if defined(HYPRE_SINGLE)
+#define sqrt sqrtf
+#elif defined(HYPRE_LONG_DOUBLE)
+#define sqrt sqrtl
+#endif
 
 #ifdef HYPRE_BIGINT
 typedef long long int HYPRE_LongInt;
diff -Nru hypre-2.11.2/src/blas/_hypre_blas.h hypre-2.13.0/src/blas/_hypre_blas.h
--- hypre-2.11.2/src/blas/_hypre_blas.h	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/blas/_hypre_blas.h	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,130 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+/******************************************************************************
+ *
+ * Header file for HYPRE BLAS
+ *
+ *****************************************************************************/
+
+#ifndef HYPRE_BLAS_H
+#define HYPRE_BLAS_H
+
+#include "_hypre_utilities.h"
+#include "fortran.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*--------------------------------------------------------------------------
+ * Change all 'hypre_' names based on using HYPRE or external library
+ *--------------------------------------------------------------------------*/
+
+#ifndef HYPRE_USING_HYPRE_BLAS
+
+#define hypre_dasum   hypre_F90_NAME_BLAS(dasum ,DASUM )
+#define hypre_daxpy   hypre_F90_NAME_BLAS(daxpy ,DAXPY )
+#define hypre_dcopy   hypre_F90_NAME_BLAS(dcopy ,DCOPY )
+#define hypre_ddot    hypre_F90_NAME_BLAS(ddot  ,DDOT  )
+#define hypre_dgemm   hypre_F90_NAME_BLAS(dgemm ,DGEMM )
+#define hypre_dgemv   hypre_F90_NAME_BLAS(dgemv ,DGEMV )
+#define hypre_dger    hypre_F90_NAME_BLAS(dger  ,DGER  )
+#define hypre_dnrm2   hypre_F90_NAME_BLAS(dnrm2 ,DNRM2 )
+#define hypre_drot    hypre_F90_NAME_BLAS(drot  ,DROT  )
+#define hypre_dscal   hypre_F90_NAME_BLAS(dscal ,DSCAL )
+#define hypre_dswap   hypre_F90_NAME_BLAS(dswap ,DSWAP )
+#define hypre_dsymm   hypre_F90_NAME_BLAS(dsymm ,DSYMM )
+#define hypre_dsymv   hypre_F90_NAME_BLAS(dsymv ,DSYMV )
+#define hypre_dsyr2   hypre_F90_NAME_BLAS(dsyr2 ,DSYR2 )
+#define hypre_dsyr2k  hypre_F90_NAME_BLAS(dsyr2k,DSYR2K)
+#define hypre_dsyrk   hypre_F90_NAME_BLAS(dsyrk ,DSYRK )
+#define hypre_dtrmm   hypre_F90_NAME_BLAS(dtrmm ,DTRMM )
+#define hypre_dtrmv   hypre_F90_NAME_BLAS(dtrmv ,DTRMV )
+#define hypre_dtrsm   hypre_F90_NAME_BLAS(dtrsm ,DTRSM )
+#define hypre_dtrsv   hypre_F90_NAME_BLAS(dtrsv ,DTRSV )
+#define hypre_idamax  hypre_F90_NAME_BLAS(idamax,IDAMAX)
+
+#endif
+
+/*--------------------------------------------------------------------------
+ * Prototypes
+ *--------------------------------------------------------------------------*/
+
+/* dasum.c */
+HYPRE_Real hypre_dasum ( HYPRE_Int *n , HYPRE_Real *dx , HYPRE_Int *incx );
+
+/* daxpy.c */
+HYPRE_Int hypre_daxpy ( HYPRE_Int *n , HYPRE_Real *da , HYPRE_Real *dx , HYPRE_Int *incx , HYPRE_Real *dy , HYPRE_Int *incy );
+
+/* dcopy.c */
+HYPRE_Int hypre_dcopy ( HYPRE_Int *n , HYPRE_Real *dx , HYPRE_Int *incx , HYPRE_Real *dy , HYPRE_Int *incy );
+
+/* ddot.c */
+HYPRE_Real hypre_ddot ( HYPRE_Int *n , HYPRE_Real *dx , HYPRE_Int *incx , HYPRE_Real *dy , HYPRE_Int *incy );
+
+/* dgemm.c */
+HYPRE_Int hypre_dgemm ( const char *transa , const char *transb , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *alpha , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *b , HYPRE_Int *ldb , HYPRE_Real *beta , HYPRE_Real *c , HYPRE_Int *ldc );
+
+/* dgemv.c */
+HYPRE_Int hypre_dgemv ( const char *trans , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *alpha , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *x , HYPRE_Int *incx , HYPRE_Real *beta , HYPRE_Real *y , HYPRE_Int *incy );
+
+/* dger.c */
+HYPRE_Int hypre_dger ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *alpha , HYPRE_Real *x , HYPRE_Int *incx , HYPRE_Real *y , HYPRE_Int *incy , HYPRE_Real *a , HYPRE_Int *lda );
+
+/* dnrm2.c */
+HYPRE_Real hypre_dnrm2 ( HYPRE_Int *n , HYPRE_Real *dx , HYPRE_Int *incx );
+
+/* drot.c */
+HYPRE_Int hypre_drot ( HYPRE_Int *n , HYPRE_Real *dx , HYPRE_Int *incx , HYPRE_Real *dy , HYPRE_Int *incy , HYPRE_Real *c , HYPRE_Real *s );
+
+/* dscal.c */
+HYPRE_Int hypre_dscal ( HYPRE_Int *n , HYPRE_Real *da , HYPRE_Real *dx , HYPRE_Int *incx );
+
+/* dswap.c */
+HYPRE_Int hypre_dswap ( HYPRE_Int *n , HYPRE_Real *dx , HYPRE_Int *incx , HYPRE_Real *dy , HYPRE_Int *incy );
+
+/* dsymm.c */
+HYPRE_Int hypre_dsymm ( const char *side , const char *uplo , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *alpha , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *b , HYPRE_Int *ldb , HYPRE_Real *beta , HYPRE_Real *c__ , HYPRE_Int *ldc );
+
+/* dsymv.c */
+HYPRE_Int hypre_dsymv ( const char *uplo , HYPRE_Int *n , HYPRE_Real *alpha , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *x , HYPRE_Int *incx , HYPRE_Real *beta , HYPRE_Real *y , HYPRE_Int *incy );
+
+/* dsyr2.c */
+HYPRE_Int hypre_dsyr2 ( const char *uplo , HYPRE_Int *n , HYPRE_Real *alpha , HYPRE_Real *x , HYPRE_Int *incx , HYPRE_Real *y , HYPRE_Int *incy , HYPRE_Real *a , HYPRE_Int *lda );
+
+/* dsyr2k.c */
+HYPRE_Int hypre_dsyr2k ( const char *uplo , const char *trans , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *alpha , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *b , HYPRE_Int *ldb , HYPRE_Real *beta , HYPRE_Real *c__ , HYPRE_Int *ldc );
+
+/* dsyrk.c */
+HYPRE_Int hypre_dsyrk ( const char *uplo , const char *trans , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *alpha , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *beta , HYPRE_Real *c , HYPRE_Int *ldc );
+
+/* dtrmm.c */
+HYPRE_Int hypre_dtrmm ( const char *side , const char *uplo , const char *transa , const char *diag , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *alpha , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *b , HYPRE_Int *ldb );
+
+/* dtrmv.c */
+HYPRE_Int hypre_dtrmv ( const char *uplo , const char *trans , const char *diag , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *x , HYPRE_Int *incx );
+
+/* dtrsm.c */
+HYPRE_Int hypre_dtrsm ( const char *side , const char *uplo , const char *transa , const char *diag , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *alpha , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *b , HYPRE_Int *ldb );
+
+/* dtrsv.c */
+HYPRE_Int hypre_dtrsv ( const char *uplo , const char *trans , const char *diag , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *x , HYPRE_Int *incx );
+
+/* idamax.c */
+HYPRE_Int hypre_idamax ( HYPRE_Int *n , HYPRE_Real *dx , HYPRE_Int *incx );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff -Nru hypre-2.11.2/src/blas/hypre_blas.h hypre-2.13.0/src/blas/hypre_blas.h
--- hypre-2.11.2/src/blas/hypre_blas.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/hypre_blas.h	2017-10-20 17:42:22.000000000 +0000
@@ -10,116 +10,45 @@
  * $Revision$
  ***********************************************************************EHEADER*/
 
-/* hypre_blas.h  --  Contains BLAS prototypes needed by Hypre */
+/***** DO NOT use this file outside of the BLAS directory *****/
 
-#ifndef HYPRE_BLAS_H
-#define HYPRE_BLAS_H
-#include "f2c.h"
-#include "fortran.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* --------------------------------------------------------------------------
- *   Change all names to hypre_ to avoid link conflicts
- * --------------------------------------------------------------------------*/
-
-#define dasum_   hypre_F90_NAME_BLAS(dasum,DASUM)
-#define daxpy_   hypre_F90_NAME_BLAS(daxpy,DAXPY)
-#define dcopy_   hypre_F90_NAME_BLAS(dcopy,DCOPY)
-#define ddot_    hypre_F90_NAME_BLAS(ddot,DDOT)
-#define dgemm_   hypre_F90_NAME_BLAS(dgemm,DGEMM)
-#define dgemv_   hypre_F90_NAME_BLAS(dgemv,DGEMV)
-#define dger_    hypre_F90_NAME_BLAS(dger,DGER)
-#define dnrm2_   hypre_F90_NAME_BLAS(dnrm2,DNRM2)
-#define drot_    hypre_F90_NAME_BLAS(drot,DROT)
-#define dscal_   hypre_F90_NAME_BLAS(dscal,DSCAL)
-#define dswap_   hypre_F90_NAME_BLAS(dswap,DSWAP)
-#define dsymm_   hypre_F90_NAME_BLAS(dsymm,DSYMM)
-#define dsymv_   hypre_F90_NAME_BLAS(dsymv,DSYMV)
-#define dsyr2_   hypre_F90_NAME_BLAS(dsyr2,DSYR2)
-#define dsyr2k_  hypre_F90_NAME_BLAS(dsyr2k,DSYR2K)
-#define dsyrk_   hypre_F90_NAME_BLAS(dsyrk,DSYRK)
-#define dtrmm_   hypre_F90_NAME_BLAS(dtrmm,DTRMM)
-#define dtrmv_   hypre_F90_NAME_BLAS(dtrmv,DTRMV)
-#define dtrsm_   hypre_F90_NAME_BLAS(dtrsm,DTRSM)
-#define dtrsv_   hypre_F90_NAME_BLAS(dtrsv,DTRSV)
-#define idamax_  hypre_F90_NAME_BLAS(idamax,IDAMAX)
-#define s_cmp    hypre_F90_NAME_BLAS(s_cmp,S_CMP)
-#define s_copy   hypre_F90_NAME_BLAS(s_copy,S_COPY)
-
-/* blas_utils.c */
-logical lsame_ ( const char *ca ,const char *cb );
-HYPRE_Int xerbla_ ( const char *srname , integer *info );
-integer s_cmp ( char *a0 , const char *b0 , ftnlen la , ftnlen lb );
-VOID s_copy ( char *a , const char *b , ftnlen la , ftnlen lb );
-
-/* dasum.c */
-doublereal dasum_ ( integer *n , doublereal *dx , integer *incx );
-
-/* daxpy.c */
-HYPRE_Int daxpy_ ( integer *n , doublereal *da , doublereal *dx , integer *incx , doublereal *dy , integer *incy );
-
-/* dcopy.c */
-HYPRE_Int dcopy_ ( integer *n , doublereal *dx , integer *incx , doublereal *dy , integer *incy );
-
-/* ddot.c */
-doublereal ddot_ ( integer *n , doublereal *dx , integer *incx , doublereal *dy , integer *incy );
-
-/* dgemm.c */
-HYPRE_Int dgemm_ ( const char *transa , const char *transb , integer *m , integer *n , integer *k , doublereal *alpha , doublereal *a , integer *lda , doublereal *b , integer *ldb , doublereal *beta , doublereal *c , integer *ldc );
-
-/* dgemv.c */
-HYPRE_Int dgemv_ ( const char *trans , integer *m , integer *n , doublereal *alpha , doublereal *a , integer *lda , doublereal *x , integer *incx , doublereal *beta , doublereal *y , integer *incy );
-
-/* dger.c */
-HYPRE_Int dger_ ( integer *m , integer *n , doublereal *alpha , doublereal *x , integer *incx , doublereal *y , integer *incy , doublereal *a , integer *lda );
-
-/* dnrm2.c */
-doublereal dnrm2_ ( integer *n , doublereal *dx , integer *incx );
-
-/* drot.c */
-HYPRE_Int drot_ ( integer *n , doublereal *dx , integer *incx , doublereal *dy , integer *incy , doublereal *c , doublereal *s );
-
-/* dscal.c */
-HYPRE_Int dscal_ ( integer *n , doublereal *da , doublereal *dx , integer *incx );
-
-/* dswap.c */
-HYPRE_Int dswap_ ( integer *n , doublereal *dx , integer *incx , doublereal *dy , integer *incy );
-
-/* dsymm.c */
-HYPRE_Int dsymm_ ( const char *side , const char *uplo , integer *m , integer *n , doublereal *alpha , doublereal *a , integer *lda , doublereal *b , integer *ldb , doublereal *beta , doublereal *c__ , integer *ldc );
-
-/* dsymv.c */
-HYPRE_Int dsymv_ ( const char *uplo , integer *n , doublereal *alpha , doublereal *a , integer *lda , doublereal *x , integer *incx , doublereal *beta , doublereal *y , integer *incy );
-
-/* dsyr2.c */
-HYPRE_Int dsyr2_ ( const char *uplo , integer *n , doublereal *alpha , doublereal *x , integer *incx , doublereal *y , integer *incy , doublereal *a , integer *lda );
-
-/* dsyr2k.c */
-HYPRE_Int dsyr2k_ ( const char *uplo , const char *trans , integer *n , integer *k , doublereal *alpha , doublereal *a , integer *lda , doublereal *b , integer *ldb , doublereal *beta , doublereal *c__ , integer *ldc );
-
-/* dsyrk.c */
-HYPRE_Int dsyrk_ ( const char *uplo , const char *trans , integer *n , integer *k , doublereal *alpha , doublereal *a , integer *lda , doublereal *beta , doublereal *c , integer *ldc );
-
-/* dtrmm.c */
-HYPRE_Int dtrmm_ ( const char *side , const char *uplo , const char *transa , const char *diag , integer *m , integer *n , doublereal *alpha , doublereal *a , integer *lda , doublereal *b , integer *ldb );
-
-/* dtrmv.c */
-HYPRE_Int dtrmv_ ( const char *uplo , const char *trans , const char *diag , integer *n , doublereal *a , integer *lda , doublereal *x , integer *incx );
-
-/* dtrsm.c */
-HYPRE_Int dtrsm_ ( const char *side , const char *uplo , const char *transa , const char *diag , integer *m , integer *n , doublereal *alpha , doublereal *a , integer *lda , doublereal *b , integer *ldb );
-
-/* dtrsv.c */
-HYPRE_Int dtrsv_ ( const char *uplo , const char *trans , const char *diag , integer *n , doublereal *a , integer *lda , doublereal *x , integer *incx );
-
-/* idamax.c */
-integer idamax_ ( integer *n , doublereal *dx , integer *incx );
-
-#ifdef __cplusplus
-}
-#endif
+/*--------------------------------------------------------------------------
+ * This header renames the functions in BLAS to avoid conflicts
+ *--------------------------------------------------------------------------*/
+
+/* blas */
+#define dasum_   hypre_dasum
+#define daxpy_   hypre_daxpy
+#define dcopy_   hypre_dcopy
+#define ddot_    hypre_ddot
+#define dgemm_   hypre_dgemm
+#define dgemv_   hypre_dgemv
+#define dger_    hypre_dger
+#define dnrm2_   hypre_dnrm2
+#define drot_    hypre_drot
+#define dscal_   hypre_dscal
+#define dswap_   hypre_dswap
+#define dsymm_   hypre_dsymm
+#define dsymv_   hypre_dsymv
+#define dsyr2_   hypre_dsyr2
+#define dsyr2k_  hypre_dsyr2k
+#define dsyrk_   hypre_dsyrk
+#define dtrmm_   hypre_dtrmm
+#define dtrmv_   hypre_dtrmv
+#define dtrsm_   hypre_dtrsm
+#define dtrsv_   hypre_dtrsv
+#define idamax_  hypre_idamax
+
+/* f2c library routines */
+#define s_cmp    hypre_s_cmp
+#define s_copy   hypre_s_copy
+#define s_cat    hypre_s_cat
+#define d_lg10   hypre_d_lg10
+#define d_sign   hypre_d_sign
+#define pow_dd   hypre_pow_dd
+#define pow_di   hypre_pow_di
+
+/* these auxiliary routines have a different definition in LAPACK */
+#define lsame_   hypre_blas_lsame
+#define xerbla_  hypre_blas_xerbla
 
-#endif
diff -Nru hypre-2.11.2/src/blas/idamax.c hypre-2.13.0/src/blas/idamax.c
--- hypre-2.11.2/src/blas/idamax.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/idamax.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,4 +1,6 @@
-
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 /*  -- translated by f2c (version 19940927).
    You must link the resulting object file with the libraries:
@@ -78,3 +80,6 @@
     return ret_val;
 } /* idamax_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/blas/lsame.c hypre-2.13.0/src/blas/lsame.c
--- hypre-2.11.2/src/blas/lsame.c	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/blas/lsame.c	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,125 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* lsame.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "hypre_blas.h"
+
+logical lsame_(const char *ca, const char *cb)
+{
+    /* System generated locals */
+    logical ret_val;
+
+    /* Local variables */
+    integer inta, intb, zcode;
+
+
+/*  -- LAPACK auxiliary routine (version 3.1) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  LSAME returns .TRUE. if CA is the same letter as CB regardless of */
+/*  case. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  CA      (input) CHARACTER*1 */
+
+/*  CB      (input) CHARACTER*1 */
+/*          CA and CB specify the single characters to be compared. */
+
+/* ===================================================================== */
+
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+
+/*     Test if the characters are equal */
+
+    ret_val = *(unsigned char *)ca == *(unsigned char *)cb;
+    if (ret_val) {
+	return ret_val;
+    }
+
+/*     Now test for equivalence if both characters are alphabetic. */
+
+    zcode = 'Z';
+
+/*     Use 'Z' rather than 'A' so that ASCII can be detected on Prime */
+/*     machines, on which ICHAR returns a value with bit 8 set. */
+/*     ICHAR('A') on Prime machines returns 193 which is the same as */
+/*     ICHAR('A') on an EBCDIC machine. */
+
+    inta = *(unsigned char *)ca;
+    intb = *(unsigned char *)cb;
+
+    if (zcode == 90 || zcode == 122) {
+
+/*        ASCII is assumed - ZCODE is the ASCII code of either lower or */
+/*        upper case 'Z'. */
+
+	if (inta >= 97 && inta <= 122) {
+	    inta += -32;
+	}
+	if (intb >= 97 && intb <= 122) {
+	    intb += -32;
+	}
+
+    } else if (zcode == 233 || zcode == 169) {
+
+/*        EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or */
+/*        upper case 'Z'. */
+
+       if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) ||
+           (inta >= 162 && inta <= 169)) {
+	    inta += 64;
+	}
+       if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) ||
+           (intb >= 162 && intb <= 169)) {
+	    intb += 64;
+	}
+
+    } else if (zcode == 218 || zcode == 250) {
+
+/*        ASCII is assumed, on Prime machines - ZCODE is the ASCII code */
+/*        plus 128 of either lower or upper case 'Z'. */
+
+	if (inta >= 225 && inta <= 250) {
+	    inta += -32;
+	}
+	if (intb >= 225 && intb <= 250) {
+	    intb += -32;
+	}
+    }
+    ret_val = inta == intb;
+
+/*     RETURN */
+
+/*     End of LSAME */
+
+    return ret_val;
+} /* lsame_ */
+
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/blas/Makefile hypre-2.13.0/src/blas/Makefile
--- hypre-2.11.2/src/blas/Makefile	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/Makefile	2017-10-20 17:42:22.000000000 +0000
@@ -22,28 +22,30 @@
 
 BLAS_HEADERS =  f2c.h  hypre_blas.h
 BLAS_FILES = \
-blas_utils.c\
-dasum.c\
-daxpy.c\
-dcopy.c\
-ddot.c\
-dgemm.c\
-dgemv.c\
-dger.c\
-dnrm2.c\
-drot.c\
-dscal.c\
-dswap.c\
-dsymm.c\
-dsymv.c\
-dsyr2.c\
-dsyr2k.c\
-dsyrk.c\
-dtrmm.c\
-dtrmv.c\
-dtrsm.c\
-dtrsv.c\
-idamax.c
+ dasum.c\
+ daxpy.c\
+ dcopy.c\
+ ddot.c\
+ dgemm.c\
+ dgemv.c\
+ dger.c\
+ dnrm2.c\
+ drot.c\
+ dscal.c\
+ dswap.c\
+ dsymm.c\
+ dsymv.c\
+ dsyr2.c\
+ dsyr2k.c\
+ dsyrk.c\
+ dtrmm.c\
+ dtrmv.c\
+ dtrsm.c\
+ dtrsv.c\
+ f2c.c\
+ idamax.c\
+ lsame.c\
+ xerbla.c
 
 OBJS = ${BLAS_FILES:.c=.o}
 
@@ -54,6 +56,7 @@
 all: ${OBJS}
 
 install: all
+	cp -fR $(srcdir)/_hypre_blas.h $(HYPRE_INC_INSTALL)
 
 clean:
 	rm -rf *.o
diff -Nru hypre-2.11.2/src/blas/README hypre-2.13.0/src/blas/README
--- hypre-2.11.2/src/blas/README	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/blas/README	2017-10-20 17:42:22.000000000 +0000
@@ -10,22 +10,29 @@
 # $Revision$
 #EHEADER**********************************************************************
 
+HYPRE BLAS README file
 
+The source in the HYPRE BLAS and LAPACK is taken from CLAPACK and most recently
+based on release 3.2.1 (though many of the files here are much older).  To add a
+new BLAS or LAPACK routine, copy the file to the appropriate directory, then do
+the following:
 
+- Add C include guards at the beginning and end of the file to allow for C++ compilation
 
+- Change the 'blaswrap.h' include file to either 'hypre_blas.h' or 'hypre_lapack.h'
 
+- Change 'int' to 'integer' to avoid errors in the autotest check-int script
 
+- Add 'const' in front of 'char *' in prototypes as required by C++ (use the
+  warnings from the C++ compiler to determine where the changes are needed)
 
-HYPRE BLAS README file
+- Add the #define name changes to 'hypre_blas.h' and/or 'hypre_lapack.h'.
+  Organize things alphabetically and by routine type.  Note that the blas
+  renaming needs to be replicated in 'hypre_lapack.h'.
 
-This is a subset of the CBLAS from netlib.org.
-o The LAPACK/blas auxiliary routine xerbla_ has been
-  renamed through this directory as hypre_xerbla_, to
-  reduce name conflicts with the same routine in the
-  HYPRE lapack directory.
-o The LAPACK/blas auxiliary routine lsame_ has been
-  renamed through this directory as hypre_lsame_, for
-  the same reason as above.
-o The routines xerbla_ and lsame_ have been moved into
-  the blas_utils.c file.
+- Create a hypre_ prototype for the main BLAS and LAPACK routines in either
+  '_hypre_blas.h' or '_hypre_lapack.h'.  Do not create prototypes for auxiliary
+  routines or f2c-library routines.
 
+- To determine which routines are auxiliary routines, look at the comments in
+  the C files (search for 'auxiliary').
diff -Nru hypre-2.11.2/src/blas/xerbla.c hypre-2.13.0/src/blas/xerbla.c
--- hypre-2.11.2/src/blas/xerbla.c	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/blas/xerbla.c	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,86 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* xerbla.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+        on Microsoft Windows system, link with libf2c.lib;
+        on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+        or, if you install libf2c.a in a standard place, with -lf2c -lm
+        -- in that order, at the end of the command line, as in
+                cc *.o -lf2c -lm
+        Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+                http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "hypre_blas.h"
+
+/* Table of constant values */
+
+/*static integer c__1 = 1;*/
+
+/* Subroutine */ integer xerbla_(const char *srname, integer *info)
+{
+    /* Format strings */
+    /*
+    static char fmt_9999[] = "(\002 ** On entry to \002,a,\002 parameter num"
+            "ber \002,i2,\002 had \002,\002an illegal value\002)";
+    */
+
+    /* Builtin functions */
+    integer s_wsfe(cilist *), i_len_trim(char *, ftnlen), do_fio(integer *, 
+            char *, ftnlen), e_wsfe(void);
+    /* Subroutine */ integer s_stop(char *, ftnlen);
+
+    /* Fortran I/O blocks */
+    /*static cilist io___1 = { 0, 6, 0, fmt_9999, 0 };*/
+
+
+
+/*  -- LAPACK auxiliary routine (preliminary version) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  XERBLA  is an error handler for the LAPACK routines. */
+/*  It is called by an LAPACK routine if an input parameter has an */
+/*  invalid value.  A message is printed and execution stops. */
+
+/*  Installers may consider modifying the STOP statement in order to */
+/*  call system-specific exception-handling facilities. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  SRNAME  (input) CHARACTER*(*) */
+/*          The name of the routine which called XERBLA. */
+
+/*  INFO    (input) INTEGER */
+/*          The position of the invalid parameter in the parameter list */
+/*          of the calling routine. */
+
+/* ===================================================================== */
+
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+        hypre_printf("** On entry to %6s, parameter number %2i had an illegal value\n",
+                srname, *info);
+
+
+/*     End of XERBLA */
+
+    return 0;
+} /* xerbla_ */
+
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/CMakeLists.txt hypre-2.13.0/src/CMakeLists.txt
--- hypre-2.11.2/src/CMakeLists.txt	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/CMakeLists.txt	2017-10-20 17:42:22.000000000 +0000
@@ -2,8 +2,8 @@
 project (hypre)
 
 # The version number.
-set (HYPRE_VERSION 2.11.2)
-set (HYPRE_DATE    2017/03/13)
+set (HYPRE_VERSION 2.13.0)
+set (HYPRE_DATE    2017/10/20)
 set (HYPRE_TIME    00:00:00)
 set (HYPRE_BUGS    hypre-support@llnl.gov)
 set (HYPRE_SRCDIR  "${PROJECT_SOURCE_DIR}")
@@ -25,6 +25,8 @@
 # Configuration options
 option(HYPRE_SHARED               "Build a shared library" OFF)
 option(HYPRE_BIGINT               "Use long long int for HYPRE_Int" OFF)
+option(HYPRE_SINGLE               "Use float for HYPRE_Real" OFF)
+option(HYPRE_LONG_DOUBLE          "Use long double for HYPRE_Real" OFF)
 option(HYPRE_SEQUENTIAL           "Compile without MPI" OFF)
 option(HYPRE_TIMING               "Use HYPRE timing routines" OFF)
 option(HYPRE_USING_HYPRE_BLAS     "Use internal BLAS library" ON)
@@ -44,6 +46,14 @@
   set (HYPRE_USING_FEI OFF CACHE BOOL "" FORCE)
 endif ()
 
+if (HYPRE_SINGLE)
+  set (HYPRE_USING_FEI OFF CACHE BOOL "" FORCE)
+endif ()
+
+if (HYPRE_LONG_DOUBLE)
+  set (HYPRE_USING_FEI OFF CACHE BOOL "" FORCE)
+endif ()
+
 if (HYPRE_SEQUENTIAL)
   set (HYPRE_NO_GLOBAL_PARTITION OFF CACHE BOOL "" FORCE)
 endif ()
@@ -65,9 +75,7 @@
 )
 
 # Headers and sources: blas
-if (HYPRE_USING_HYPRE_BLAS)
 list (APPEND HYPRE_SOURCES
-  blas/blas_utils.c
   blas/dasum.c
   blas/daxpy.c
   blas/dcopy.c
@@ -88,12 +96,13 @@
   blas/dtrmv.c
   blas/dtrsm.c
   blas/dtrsv.c
+  blas/f2c.c
   blas/idamax.c
+  blas/lsame.c
+  blas/xerbla.c
 )
-endif (HYPRE_USING_HYPRE_BLAS)
 
 # Headers and sources: lapack
-if (HYPRE_USING_HYPRE_LAPACK)
 list (APPEND HYPRE_SOURCES
   lapack/dbdsqr.c
   lapack/dgebd2.c
@@ -163,11 +172,9 @@
   lapack/dsytrd.c
   lapack/ieeeck.c
   lapack/ilaenv.c
-  lapack/lapack_utils.c
   lapack/lsame.c
   lapack/xerbla.c
 )
-endif (HYPRE_USING_HYPRE_LAPACK)
 
 # Headers and sources: utilities
 list (APPEND HYPRE_HEADERS
@@ -456,6 +463,7 @@
   parcsr_ls/HYPRE_parcsr_lgmres.c
   parcsr_ls/HYPRE_parcsr_hybrid.c
   parcsr_ls/HYPRE_parcsr_int.c
+  parcsr_ls/HYPRE_parcsr_mgr.c
   parcsr_ls/HYPRE_parcsr_ParaSails.c
   parcsr_ls/HYPRE_parcsr_pcg.c
   parcsr_ls/HYPRE_parcsr_pilut.c
@@ -470,6 +478,7 @@
   parcsr_ls/par_cg_relax_wt.c
   parcsr_ls/par_coarsen.c
   parcsr_ls/par_cgc_coarsen.c
+  parcsr_ls/par_cheby.c
   parcsr_ls/par_coarse_parms.c
   parcsr_ls/par_coordinates.c
   parcsr_ls/par_cr.c
@@ -485,12 +494,16 @@
   parcsr_ls/par_laplace_9pt.c
   parcsr_ls/par_laplace.c
   parcsr_ls/par_lr_interp.c
+  parcsr_ls/par_mgr.c
+  parcsr_ls/par_mgr_setup.c
+  parcsr_ls/par_mgr_solve.c
   parcsr_ls/par_nongalerkin.c
   parcsr_ls/par_nodal_systems.c
   parcsr_ls/par_rap.c
   parcsr_ls/par_rap_communication.c
   parcsr_ls/par_rotate_7pt.c
   parcsr_ls/par_vardifconv.c
+  parcsr_ls/par_vardifconv_rs.c
   parcsr_ls/par_relax.c
   parcsr_ls/par_relax_more.c
   parcsr_ls/par_relax_interface.c
diff -Nru hypre-2.11.2/src/config/config.guess hypre-2.13.0/src/config/config.guess
--- hypre-2.11.2/src/config/config.guess	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/config/config.guess	2017-10-20 17:42:22.000000000 +0000
@@ -2,7 +2,7 @@
 # Attempt to guess a canonical system name.
 #   Copyright 1992-2016 Free Software Foundation, Inc.
 
-timestamp='2016-10-02'
+timestamp='2017-03-02'
 
 # This file is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by
@@ -1009,7 +1009,23 @@
     or32:Linux:*:* | or1k*:Linux:*:*)
 	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
-    padre:Linux:*:*)
+    ppc64le:Linux:*:*)
+	echo powerpc64-unknown-linux-gnu
+	exit ;;
+    alpha:Linux:*:*)
+	case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
+	  EV5)   UNAME_MACHINE=alphaev5 ;;
+	  EV56)  UNAME_MACHINE=alphaev56 ;;
+	  PCA56) UNAME_MACHINE=alphapca56 ;;
+	  PCA57) UNAME_MACHINE=alphapca56 ;;
+	  EV6)   UNAME_MACHINE=alphaev6 ;;
+	  EV67)  UNAME_MACHINE=alphaev67 ;;
+	  EV68*) UNAME_MACHINE=alphaev68 ;;
+        esac
+	objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null
+	if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
+	echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
+    padre:Linux:*:*
 	echo sparc-unknown-linux-${LIBC}
 	exit ;;
     parisc64:Linux:*:* | hppa64:Linux:*:*)
diff -Nru hypre-2.11.2/src/config/configure.in hypre-2.13.0/src/config/configure.in
--- hypre-2.11.2/src/config/configure.in	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/config/configure.in	2017-10-20 17:42:22.000000000 +0000
@@ -55,8 +55,8 @@
 dnl *********************************************************************
 
 m4_define([M4_HYPRE_NAME],    [hypre])
-m4_define([M4_HYPRE_VERSION], [2.11.2])
-m4_define([M4_HYPRE_DATE],    [2017/03/13])
+m4_define([M4_HYPRE_VERSION], [2.13.0])
+m4_define([M4_HYPRE_DATE],    [2017/10/20])
 m4_define([M4_HYPRE_TIME],    [00:00:00])
 m4_define([M4_HYPRE_BUGS],    [hypre-support@llnl.gov])
 m4_define([M4_HYPRE_SRCDIR],  [`pwd`])
@@ -133,18 +133,26 @@
 hypre_user_chose_blas=no
 hypre_user_chose_lapack=no
 hypre_user_chose_fei=no
+hypre_user_chose_cuda=no
+hypre_user_chose_raja=no
+hypre_user_chose_kokkos=no
 
 hypre_using_c=yes
 hypre_using_cxx=yes
 
 hypre_using_mpi=yes
 
+hypre_using_superlu=no
+hypre_using_dsuperlu=no
+
 hypre_using_fei=yes
-hypre_using_superlu=yes
-hypre_using_mli=yes
+hypre_using_mli=no
 
 hypre_using_openmp=no
 hypre_using_insure=no
+hypre_using_cuda=no
+hypre_using_gpu=no
+hypre_using_um=no
 
 hypre_using_caliper=no
 hypre_user_gave_caliper_lib=no
@@ -217,6 +225,38 @@
    AC_DEFINE(HYPRE_BIGINT, 1)
 fi
 
+AC_ARG_ENABLE(single,
+AS_HELP_STRING([--enable-single],
+               [Use single precision values (default is NO).]),
+[case "${enableval}" in
+    yes) hypre_using_fei=no
+         hypre_using_single=yes ;;
+    no)  hypre_using_single=no ;;
+    *)   AC_MSG_ERROR([Bad value ${enableval} for --enable-single]) ;;
+ esac],
+[hypre_using_single=no]
+)
+if test "$hypre_using_single" = "yes"
+then
+   AC_DEFINE(HYPRE_SINGLE, 1)
+fi
+
+AC_ARG_ENABLE(longdouble,
+AS_HELP_STRING([--enable-longdouble],
+               [Use long double precision values (default is NO).]),
+[case "${enableval}" in
+    yes) hypre_using_fei=no
+         hypre_using_longdouble=yes ;;
+    no)  hypre_using_longdouble=no ;;
+    *)   AC_MSG_ERROR([Bad value ${enableval} for --enable-longdouble]) ;;
+ esac],
+[hypre_using_longdouble=no]
+)
+if test "$hypre_using_longdouble" = "yes"
+then
+   AC_DEFINE(HYPRE_LONG_DOUBLE, 1)
+fi
+
 AC_ARG_ENABLE(complex,
 AS_HELP_STRING([--enable-complex],
                [Use complex values (default is NO).]),
@@ -293,8 +333,6 @@
     *)   hypre_using_global_partition=yes ;;
  esac]
 )
-dnl * The AC_DEFINE is below, after hypre_using_mpi is completely set
-dnl * Need to change to a new approach that always defines variable to some value
 
 AC_ARG_ENABLE(fortran,
 AS_HELP_STRING([--enable-fortran],
@@ -307,6 +345,20 @@
 [hypre_using_fortran=yes]
 )
 
+AC_ARG_ENABLE(unified-memory,
+AS_HELP_STRING([--enable-unified-memory],
+               [Use unified memory for allocating the memory (default is NO).]),
+[case "${enableval}" in
+    yes) hypre_using_um=yes ;;
+    no)  hypre_using_um=no ;;
+    *)   hypre_using_um=no ;;
+ esac],
+[hypre_using_um=no]
+)
+
+dnl * The AC_DEFINE is below, after hypre_using_mpi is completely set
+dnl * Need to change to a new approach that always defines variable to some value
+
 dnl *********************************************************************
 dnl * Determine if user provided C compiler or flags
 dnl *********************************************************************
@@ -493,6 +545,8 @@
  AC_DEFINE(HYPRE_SEQUENTIAL,1,[No MPI being used])]
 )
 
+dnl ***** MPI
+
 AC_ARG_WITH(MPI-include,
 AS_HELP_STRING([--with-MPI-include=DIR],
                [User specifies that mpi.h is in DIR.  The options
@@ -546,6 +600,8 @@
 [MPIFLAGS=""]
 )
 
+dnl ***** BLAS
+
 AC_ARG_WITH(blas-lib,
 AS_HELP_STRING([--with-blas-lib=LIBS],
                [LIBS is space-separated linkable list (enclosed in quotes) of libraries
@@ -588,6 +644,8 @@
  hypre_blas_lib_dir_old_style=yes]
 )
 
+dnl ***** LAPACK
+
 AC_ARG_WITH(lapack-lib,
 AS_HELP_STRING([--with-lapack-lib=LIBS],
                [LIBS is space-separated linkable list (enclosed in quotes) of libraries
@@ -725,13 +783,78 @@
 AS_HELP_STRING([--with-openmp],
                [Use OpenMP.  This may affect which compiler is chosen.]),
 [case "${withval}" in
-    yes) hypre_using_openmp=yes
-         AC_DEFINE([HYPRE_USING_OPENMP],1,[Enable OpenMP support]) ;;
+    yes) hypre_using_openmp=yes;;
     no)  hypre_using_openmp=no ;;
  esac],
 [hypre_using_openmp=no]
 )
 
+dnl ***** SuperLU
+
+AC_ARG_WITH(superlu,
+AS_HELP_STRING([--with-superlu],
+               [Use external SuperLU library.]),
+[case "${withval}" in
+    no) hypre_using_superlu=no ;;
+    *)  hypre_using_superlu=yes ;;
+ esac]
+)
+
+AS_IF([test "x$with_superlu" = "xyes"],
+      [AC_DEFINE(HAVE_SUPERLU, 1, [Have external SuperLU library.])],
+      [])
+
+AC_ARG_WITH(superlu-include,
+AS_HELP_STRING([--with-superlu-include=DIR],
+               [Directory where SuperLU is installed.]),
+[for superlu_inc_dir in $withval; do
+    SUPERLU_INCLUDE="-I$superlu_inc_dir $SUPERLU_INCLUDE"
+ done]
+)
+
+AC_ARG_WITH(superlu-lib,
+AS_HELP_STRING([--with-superlu-lib=LIBS],
+               [LIBS is space-separated linkable list (enclosed in quotes) of libraries
+                needed for SuperLU. OK to use -L and -l flags in the list]),
+[for superlu_lib in $withval; do
+    SUPERLU_LIBS="$SUPERLU_LIBS $superlu_lib"
+ done]
+)
+
+dnl ***** DSuperLU
+
+AC_ARG_WITH(dsuperlu,
+AS_HELP_STRING([--with-dsuperlu],
+               [Use external DSuperLU library.]),
+[case "${withval}" in
+    no) hypre_using_dsuperlu=no ;;
+    *)  hypre_using_dsuperlu=yes ;;
+ esac]
+)
+
+AS_IF([test "x$with_dsuperlu" = "xyes"],
+      [AC_DEFINE(HAVE_DSUPERLU, 1, [Have external DSuperLU library.])],
+      [])
+
+AC_ARG_WITH(dsuperlu-include,
+AS_HELP_STRING([--with-dsuperlu-include=DIR],
+               [Directory where DSuperLU is installed.]),
+[for dsuperlu_inc_dir in $withval; do
+    DSUPERLU_INCLUDE="-I$dsuperlu_inc_dir $DSUPERLU_INCLUDE"
+ done]
+)
+
+AC_ARG_WITH(dsuperlu-lib,
+AS_HELP_STRING([--with-dsuperlu-lib=LIBS],
+               [LIBS is space-separated linkable list (enclosed in quotes) of libraries
+                needed for DSuperLU. OK to use -L and -l flags in the list]),
+[for dsuperlu_lib in $withval; do
+    DSUPERLU_LIBS="$DSUPERLU_LIBS $dsuperlu_lib"
+ done]
+)
+
+dnl ***** FEI
+
 AC_ARG_WITH(fei,
 AS_HELP_STRING([--with-fei],
                [Use internal FEI routines.]),
@@ -741,14 +864,7 @@
  esac]
 )
 
-AC_ARG_WITH(superlu,
-AS_HELP_STRING([--with-superlu],
-               [Use internal SuperLU routines.]),
-[case "${withval}" in
-    no) hypre_using_superlu=no ;;
-    *)  hypre_using_superlu=yes ;;
- esac]
-)
+dnl ***** MLI
 
 AC_ARG_WITH(mli,
 AS_HELP_STRING([--with-mli],
@@ -759,6 +875,8 @@
  esac]
 )
 
+dnl ***** MPI
+
 AC_ARG_WITH(MPI,
 AS_HELP_STRING([--with-MPI],
                [DEFAULT: Compile with MPI.  Selecting --without-MPI
@@ -769,6 +887,147 @@
  esac]
 )
 
+dnl ***** CUDA
+
+AC_ARG_WITH(cuda,
+AS_HELP_STRING([--with-cuda],
+               [Use CUDA. Require cuda-8.0 or higher (default is NO).]),
+[case "$withval" in
+    yes) hypre_user_chose_cuda=yes
+    	 hypre_using_cuda=yes ;;
+    no)  hypre_using_cuda=no ;;
+    *)   hypre_using_cuda=no ;;
+ esac],
+[hypre_using_cuda=no]
+)
+
+dnl ***** RAJA
+
+AC_ARG_WITH(raja,
+AS_HELP_STRING([--with-raja],
+               [Use RAJA. Require RAJA package to be compiled properly (default is NO).]),
+[case "$withval" in
+    yes) hypre_user_chose_raja=yes;;
+    no)  hypre_user_chose_raja=no ;;
+    *)   hypre_user_chose_raja=no ;;
+ esac],
+[hypre_using_raja=no]
+)
+
+dnl ***** Kokkos
+
+AC_ARG_WITH(kokkos,
+AS_HELP_STRING([--with-kokkos],
+               [Use Kokkos. Require kokkos package to be compiled properly(default is NO).]),
+[case "$withval" in
+    yes) hypre_user_chose_kokkos=yes ;;
+    no)  hypre_user_chose_kokkos=no ;;
+    *)   hypre_user_chose_kokkos=no ;;
+ esac]
+)
+
+AC_ARG_WITH(raja-include,
+AS_HELP_STRING([--with-raja-include=DIR],
+               [User specifies that RAJA/*.h is in DIR.  The options
+                --with-raja-include --with-raja-libs and 
+                --with-raja-lib-dirs must be used together.]),
+[for raja_dir in $withval; do
+    HYPRE_RAJA_INCLUDE="$HYPRE_RAJA_INCLUDE -I$raja_dir"
+ done;
+ hypre_user_chose_raja=yes]
+)
+
+AC_ARG_WITH(raja-lib,
+AS_HELP_STRING([--with-raja-lib=LIBS],
+               [LIBS is space-separated linkable list (enclosed in quotes) of libraries
+                needed for RAJA. OK to use -L and -l flags in the list]),
+[for raja_lib in $withval; do
+       HYPRE_RAJA_LIB="$HYPRE_RAJA_LIB $raja_lib"
+ done;
+hypre_user_chose_raja=yes]
+)
+
+AC_ARG_WITH(raja-libs,
+AS_HELP_STRING([--with-raja-libs=LIBS],
+               [LIBS is space-separated list (enclosed in quotes) of libraries
+                needed for RAJA (base name only). The options --with-raja-libs and
+                --with-raja-lib-dirs must be used together.]),
+[for raja_lib in $withval; do
+    HYPRE_RAJA_LIB="$HYPRE_RAJA_LIB -l$raja_lib"
+ done;
+hypre_user_chose_raja=yes]
+)
+
+AC_ARG_WITH(raja-lib-dirs,
+AS_HELP_STRING([--with-raja-lib-dirs=DIRS],
+               [DIRS is space-separated list (enclosed in quotes) of 
+                directories containing the libraries specified by 
+                --with-raja-libs, e.g "usr/lib /usr/local/lib".
+                The  options --with-raja-libs and --raja-blas-lib-dirs
+                must be used together.]),
+[for raja_lib_dir in $withval; do
+    HYPRE_RAJA_LIB_DIR="-L$raja_lib_dir $HYPRE_RAJA_LIB_DIR"
+ done;
+ hypre_user_chose_raja=yes]
+)
+
+AC_ARG_WITH(kokkos-include,
+AS_HELP_STRING([--with-kokkos-include=DIR],
+               [User specifies that KOKKOS headers is in DIR.  The options
+                --with-kokkos-include --with-kokkos-libs and
+                --with-kokkos-dirs must be used together.]), 
+[for kokkos_dir in $withval; do
+HYPRE_KOKKOS_INCLUDE="$HYPRE_KOKKOS_INCLUDE -I$kokkos_dir"
+done;
+hypre_user_chose_kokkos=yes]
+) 
+
+AC_ARG_WITH(kokkos-lib,
+AS_HELP_STRING([--with-kokkos-lib=LIBS],
+               [LIBS is space-separated linkable list (enclosed in quotes) of libraries
+                needed for KOKKOS. OK to use -L and -l flags in the list]),
+[for kokkos_lib in $withval; do
+       HYPRE_KOKKOS_LIB="$HYPRE_KOKKOS_LIB $kokkos_lib"
+ done;
+hypre_user_chose_kokkos=yes]
+)
+
+AC_ARG_WITH(kokkos-libs,
+AS_HELP_STRING([--with-kokkos-libs=LIBS],
+               [LIBS is space-separated list (enclosed in quotes) of libraries
+                needed for KOKKOS (base name only). The options --with-kokkos-libs and
+                --with-kokkos-dirs must be used together.]),
+[for kokkos_lib in $withval; do
+    HYPRE_KOKKOS_LIB="$HYPRE_KOKKOS_LIB -l$kokkos_lib"
+ done;
+hypre_user_chose_kokkos=yes]
+)
+
+AC_ARG_WITH(kokkos-dirs,
+AS_HELP_STRING([--with-kokkos-dirs=DIRS],
+               [DIRS is space-separated list (enclosed in quotes) of 
+                directories containing Makefile.kokkos.
+                The  options --with-kokkos-libs and --with-kokkos-dirs
+                must be used together.]),
+[for kokkos_lib_dir in $withval; do
+    HYPRE_KOKKOS_SRC_DIR="$kokkos_lib_dir"
+ done;
+hypre_user_chose_kokkos=yes]
+)
+
+AC_ARG_WITH(nvcc,
+AS_HELP_STRING([--with-nvcc],
+               [Use NVCC compiler (default is NO).]),
+[case "${withval}" in
+    yes) hypre_using_nvcc=yes ;;
+    no)  hypre_using_nvcc=no ;;
+    *)   AC_MSG_ERROR([Bad value ${withval} for --with-nvcc]) ;;
+ esac],
+[hypre_using_nvcc=no]
+)
+
+dnl ***** Caliper
+
 AC_ARG_WITH(caliper,
 AS_HELP_STRING([--with-caliper],
                [Use Caliper instrumentation (default is NO).]),
@@ -801,22 +1060,48 @@
 dnl *********************************************************************
 dnl * Select compilers if not already defined by command line options 
 dnl *********************************************************************
+if test "$hypre_using_cuda" = "yes"
+then
+	hypre_using_fortran=no
+	AC_CHECK_PROGS(CXX, [nvcc])
+	AC_CHECK_PROGS(CC, [nvcc])
+	if test "$hypre_user_chose_cxxcompilers" = "no"
+	then
+	   if test "$hypre_using_mpi" = "no"
+	   then
+	      if test "$hypre_using_openmp" = "yes"
+	      then
+	         AC_CHECK_PROGS(CUDACXX, [xlC_r xlc_r icpc icc g++ gcc pgCC pgcc CC cc KCC kcc])
+              else
+		 AC_CHECK_PROGS(CUDACXX, [xlC xlc icpc icc g++ gcc pgCC pgcc CC cc KCC kcc])
+	      fi
+	   else
+	      if test "$hypre_using_openmp" = "yes"
+	      then
+	         AC_CHECK_PROGS(CUDACXX, [mpxlC mpixlcxx_r mpixlcxx mpixlC mpiicpc mpig++ mpiCC mpicxx mpipgCC])
+	      else
+	         AC_CHECK_PROGS(CUDACXX, [mpxlC mpixlcxx mpixlC mpiicpc mpig++ mpiCC mpicxx mpipgCC])
+	      fi
+	   fi
+	fi
+fi
+
 if test "$hypre_user_chose_ccompilers" = "no"
 then
    if test "$hypre_using_mpi" = "no"
    then
       if test "$hypre_using_openmp" = "yes"
       then
-         AC_CHECK_PROGS(CC, [xlc_r xlC_r icc icpc gcc g++ pgcc pgCC cc CC kcc KCC])
+         AC_CHECK_PROGS(CC, [xlc_r xlC_r xlc xlC icc icpc gcc g++ pgcc pgCC cc CC kcc KCC])
       else
          AC_CHECK_PROGS(CC, [xlc xlC icc icpc gcc g++ pgcc pgCC cc CC kcc KCC])
       fi
    else
       if test "$hypre_using_openmp" = "yes"
       then
-         AC_CHECK_PROGS(CC, [mpxlc mpixlc_r mpiicc mpicc mpipgcc])
+         AC_CHECK_PROGS(CC, [mpxlc mpixlc_r mpixlc mpiicc mpigcc mpicc mpipgcc])
       else
-         AC_CHECK_PROGS(CC, [mpxlc mpixlc mpiicc mpicc mpipgcc])
+         AC_CHECK_PROGS(CC, [mpxlc mpixlc mpiicc mpigcc mpicc mpipgcc])
       fi
    fi
 
@@ -832,16 +1117,16 @@
    then
       if test "$hypre_using_openmp" = "yes"
       then
-         AC_CHECK_PROGS(CXX, [xlC_r xlc_r icpc icc g++ gcc pgCC pgcc CC cc KCC kcc])
+         AC_CHECK_PROGS(CXX, [xlC_r xlc_r xlC xlc icpc icc g++ gcc pgCC pgcc CC cc KCC kcc])
       else
          AC_CHECK_PROGS(CXX, [xlC xlc icpc icc g++ gcc pgCC pgcc CC cc KCC kcc])
       fi
    else
       if test "$hypre_using_openmp" = "yes"
       then
-         AC_CHECK_PROGS(CXX, [mpxlC mpixlcxx_r mpiicpc mpiCC mpicxx mpipgCC])
+         AC_CHECK_PROGS(CXX, [mpxlC mpixlcxx_r mpixlcxx mpixlC mpiicpc mpig++ mpiCC mpicxx mpipgCC])
       else
-         AC_CHECK_PROGS(CXX, [mpxlC mpixlcxx mpiicpc mpiCC mpicxx mpipgCC])
+         AC_CHECK_PROGS(CXX, [mpxlC mpixlcxx mpixlC mpiicpc mpig++ mpiCC mpicxx mpipgCC])
       fi
    fi
 
@@ -933,10 +1218,10 @@
 dnl * to make that happen.
 if test "$hypre_using_global_partition" = "no"
 then
-   if test "$hypre_using_mpi" != "no"
-   then
+dnl   if test "$hypre_using_mpi" != "no"
+dnl   then
       AC_DEFINE(HYPRE_NO_GLOBAL_PARTITION, 1)
-   fi
+dnl   fi
 fi
 
 dnl *********************************************************************
@@ -981,9 +1266,6 @@
    fi
    if test "$hypre_using_hypre_blas" = "yes"
    then
-      HYPRE_BLAS_SRC_DIR="$HYPRE_SRCDIR/blas"
-      HYPRE_BLAS_FILES="$HYPRE_SRCDIR/blas/*.o"
-dnl      BLAS_DIR="$HYPRE_SRCDIR/hypre/lib"
       BLASLIBDIRS=""
       BLASLIBS=""
       AC_DEFINE(HYPRE_USING_HYPRE_BLAS, 1, [Using internal HYPRE routines])
@@ -1027,9 +1309,6 @@
    fi
    if test "$hypre_using_hypre_lapack" = "yes"
    then
-      HYPRE_LAPACK_SRC_DIR="$HYPRE_SRCDIR/lapack"
-      HYPRE_LAPACK_FILES="$HYPRE_SRCDIR/lapack/*.o"
-dnl      LAPACK_DIR="$HYPRE_SRCDIR/hypre/lib"
       LAPACKLIBDIRS=""
       LAPACKLIBS=""
       AC_DEFINE(HYPRE_USING_HYPRE_LAPACK, 1, [Using internal HYPRE routines])
@@ -1037,28 +1316,23 @@
 fi
 
 dnl *********************************************************************
-dnl * Determine if FEI libraries are needed.
-dnl * if so, determine whether internal routines or user-specified 
-dnl * libraries are to be used.
+dnl * Determine if FEI and MLI are needed.
+dnl * Note that MLI requires both FEI and SuperLU.
 dnl *********************************************************************
 if test "$hypre_using_fei" = "yes"
 then
    HYPRE_FEI_SRC_DIR="$HYPRE_SRCDIR/FEI_mv"
    HYPRE_FEI_SUBDIRS="fei-hypre"
    HYPRE_FEI_HYPRE_FILES="$HYPRE_SRCDIR/FEI_mv/fei-hypre/*.o"
-   if test "$hypre_using_superlu" = "yes"
-   then
-      HYPRE_FEI_SUBDIRS="SuperLU $HYPRE_FEI_SUBDIRS"
-      HYPRE_FEI_SUPERLU_FILES="$HYPRE_SRCDIR/FEI_mv/SuperLU/SRC/*.o"
-   else
-      HYPRE_FEI_SUPERLU_FILES=
-   fi
+   HYPRE_FEI_FEMLI_FILES=
    if test "$hypre_using_mli" = "yes"
    then
-      HYPRE_FEI_SUBDIRS="femli $HYPRE_FEI_SUBDIRS"
-      HYPRE_FEI_FEMLI_FILES="$HYPRE_SRCDIR/FEI_mv/femli/*.o"
-   else
-      HYPRE_FEI_FEMLI_FILES=
+      if test "$hypre_using_superlu" = "yes"
+      then
+         HYPRE_FEI_SUBDIRS="femli $HYPRE_FEI_SUBDIRS"
+         HYPRE_FEI_FEMLI_FILES="$HYPRE_SRCDIR/FEI_mv/femli/*.o"
+         AC_DEFINE(HAVE_MLI, 1, [Using MLI.])
+      fi
    fi
    if test "$hypre_user_chose_fei" = "no"
    then
@@ -1070,7 +1344,6 @@
    HYPRE_FEI_BASE_DIR=
    HYPRE_FEI_HYPRE_FILES=
    HYPRE_FEI_FEMLI_FILES=
-   HYPRE_FEI_SUPERLU_FILES=
 fi
 
 dnl *********************************************************************
@@ -1254,6 +1527,172 @@
 fi
 
 dnl *********************************************************************
+dnl * Set nvcc options
+dnl *********************************************************************
+
+if test "$hypre_using_nvcc" = "yes"
+then
+   AC_DEFINE(HYPRE_USING_NVCC, 1, [Using nvcc compiler])
+   NVCCFLAGS="-O3 -arch=sm_60  -ccbin=xlc  -I ../hypre/include -I /usr/tcetmp/packages/spectrum_mpi/spectrum_mpi-10.1-xl-gcc-4.9.3/mpi/include/ -DUSE_NVTX -c -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED -I /usr/local/cuda/include"
+   NVCCLIBS="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+   HYPRE_NVCC_MAKEFILE="Makefile.nvcc"
+   CFLAGS="${CFLAGS} -DUSE_NVTX -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED "
+   CXXFLAGS="${CXXFLAGS} -DUSE_NVTX -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED "
+else
+   NVCCFLAGS=
+   NVCCLIBS=
+   HYPRE_NVCC_MAKEFILE="Makefile.empty"
+fi
+
+dnl *********************************************************************
+dnl * Set raja options
+dnl *********************************************************************
+if test "$hypre_user_chose_raja" = "yes"
+then
+   RAJA_LIBS=" $HYPRE_RAJA_LIB_DIR $HYPRE_RAJA_LIB "
+   if [test "$CXX" = "mpixlC" || test "$CXX" = "xlC_r"]
+   then 
+   	CFLAGS+=" -+ "
+   fi
+   if test "$hypre_using_cuda" = "yes"
+      then
+        RAJAFLAGS=" -lRAJA "
+	LDFLAGS=" -ccbin=$CUDACXX -expt-extended-lambda -Xcompiler -fopenmp -arch compute_35 -lcudart -lcuda $RAJAFLAGS "
+   	CFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp --x cu -lcudart -lcuda -DHYPRE_USE_RAJA -Xcompiler -Wno-deprecated-register -Xcompiler $RAJAFLAGS "
+   	CXXFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp --x cu -lcudart -lcuda -DHYPRE_USE_RAJA -Xcompiler -Wno-deprecated-register  $RAJAFLAGS "
+   	if test "$hypre_using_um" != "yes"
+   	then
+		CFLAGS+=" -DHYPRE_MEMORY_GPU=1 "
+	   	CXXFLAGS+=" -DHYPRE_MEMORY_GPU=1 "
+		AC_MSG_NOTICE([*******************************************************])
+       		AC_MSG_NOTICE([Configuring with --with-raja and --with-cuda without unified memory.]) 
+		AC_MSG_NOTICE([It only works for struct interface.])
+		AC_MSG_NOTICE([Try to confiure with --wiht-raja --with-cuda --enable-unified-memory])
+		AC_MSG_NOTICE([to use the cuda feature for the whold package])
+       		AC_MSG_NOTICE([*******************************************************])
+	else
+		CFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU "
+		CXXFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU " 
+   	fi
+	hypre_user_chose_cuda=no
+   else 
+   	if test "$hypre_using_openmp" = "yes"
+	then
+	   CC=${CXX}
+	   CFLAGS+=" -std=c++11 -DHYPRE_USE_RAJA -DHYPRE_USE_OPENMP "
+	   CXXFLAGS+=" -std=c++11 -DHYPRE_USE_RAJA -DHYPRE_USE_OPENMP "
+	   hypre_using_openmp=no
+	else
+	   CC=${CXX}
+   	   CFLAGS+=" -std=c++11 -DHYPRE_USE_RAJA "
+   	   CXXFLAGS+=" -std=c++11 -DHYPRE_USE_RAJA "
+	fi
+   fi
+fi
+dnl *********************************************************************
+dnl * Set kokkos options
+dnl *********************************************************************
+
+if test "$hypre_user_chose_kokkos" = "yes"
+then
+   if [test "$CXX" = "mpixlC" || test "$CXX" = "xlC_r"]
+   then 
+   	CFLAGS+=" -+ "
+   fi
+   if test "$hypre_using_cuda" = "yes"
+   then
+      LDFLAGS=" -ccbin=$CUDACXX -arch compute_35 -lcudart -lcuda"
+      HYPRE_KOKKOS_INC_FILE="include $HYPRE_KOKKOS_SRC_DIR/Makefile.kokkos"
+      HYPRE_KOKKOS_LIB_DIR="-L$HYPRE_KOKKOS_SRC_DIR/lib"
+      CC=${CXX}
+      LINK_CC=$LINK_CXX
+      CFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp --x cu -DHYPRE_USE_KOKKOS "
+      CXXFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp --x cu -DHYPRE_USE_KOKKOS "
+      if test "$hypre_using_um" != "yes"
+      then
+          CFLAGS+=" -DHYPRE_MEMORY_GPU=1 "
+	  CXXFLAGS+=" -DHYPRE_MEMORY_GPU=1 "
+          AC_MSG_NOTICE([*******************************************************])
+	  AC_MSG_NOTICE([Configuring with --with-kokkos and --with-cuda, but not with unified memory])
+	  AC_MSG_NOTICE([It only works for struct interface.])
+	  AC_MSG_NOTICE([Try to confiure with --wiht-raja --with-cuda --enable-unified-memory])
+	  AC_MSG_NOTICE([to use the cuda feature for the whold package])
+	  AC_MSG_NOTICE([*******************************************************])
+      else
+	  CFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU "
+	  CXXFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU "
+	  LDFLAGS+="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+	  AC_DEFINE(HYPRE_USING_NVCC, 1, [Using nvcc compiler])
+	NVCCFLAGS="-O3 -arch=sm_60  -ccbin=xlc  -I ../hypre/include -I /usr/tcetmp/packages/spectrum_mpi/spectrum_mpi-10.1-xl-gcc-4.9.3/mpi/include/ -DUSE_NVTX -c -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED -I /usr/local/cuda/include"
+   NVCCLIBS="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+   HYPRE_NVCC_MAKEFILE="Makefile.nvcc"
+      fi
+      hypre_user_chose_cuda=no
+   else
+      if test "$hypre_using_openmp" = "yes"  
+      then
+         CC=${CXX}
+	 CFLAGS+=" -fopenmp -std=c++11 -DHYPRE_USE_KOKKOS -DHYPRE_USE_OPENMP"
+	 CXXFLAGS+=" -fopenmp -std=c++11 -DHYPRE_USE_KOKKOS -DHYPRE_USE_OPENMP"
+	 HYPRE_KOKKOS_INC_FILE="include $HYPRE_KOKKOS_SRC_DIR/Makefile.kokkos"
+	 HYPRE_KOKKOS_LIB_DIR="-L$HYPRE_KOKKOS_SRC_DIR/lib"
+	 hypre_using_openmp=no
+      else
+      	 CC=${CXX}
+	 CFLAGS+=" -std=c++11 -DHYPRE_USE_KOKKOS "
+	 CXXFLAGS+=" -std=c++11 -DHYPRE_USE_KOKKOS "
+	 HYPRE_KOKKOS_INC_FILE="include $HYPRE_KOKKOS_SRC_DIR/Makefile.kokkos"
+	 HYPRE_KOKKOS_LIB_DIR="-L$HYPRE_KOKKOS_SRC_DIR/lib"
+	 LDFLAGS+="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+	 AC_DEFINE(HYPRE_USING_NVCC, 1, [Using nvcc compiler])
+	NVCCFLAGS="-O3 -arch=sm_60  -ccbin=xlc  -I ../hypre/include -I /usr/tcetmp/packages/spectrum_mpi/spectrum_mpi-10.1-xl-gcc-4.9.3/mpi/include/ -DUSE_NVTX -c -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED -I /usr/local/cuda/include"
+   NVCCLIBS="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+   HYPRE_NVCC_MAKEFILE="Makefile.nvcc"
+      fi   
+  fi
+  KOKKOS_LIBS=" $HYPRE_KOKKOS_LIB_DIR $HYPRE_KOKKOS_LIB "
+fi
+dnl *********************************************************************
+dnl * Set cuda options
+dnl *********************************************************************
+if test "$hypre_user_chose_cuda" = "yes"
+then
+   LDFLAGS+=" -ccbin=$CUDACXX -arch compute_35 "
+   CFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp -Xcompiler -Wno-deprecated-register --x cu -DHYPRE_USE_CUDA "
+   CXXFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp -Xcompiler -Wno-deprecated-register --x cu -DHYPRE_USE_CUDA "
+   if test "$hypre_using_um" != "yes"
+   then
+	CFLAGS+=" -DHYPRE_MEMORY_GPU"
+	CXXFLAGS+=" -DHYPRE_MEMORY_GPU"
+	AC_MSG_NOTICE([*******************************************************])
+       	AC_MSG_NOTICE([Configuring with --with-cuda=yes without unified memory.]) 
+	AC_MSG_NOTICE([It only works for struct interface.])
+	AC_MSG_NOTICE([Use --enable-unified-memory to compile with unified memory.])
+       	AC_MSG_NOTICE([*******************************************************])
+   else
+	CFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU "
+   	CXXFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU "
+	LDFLAGS+="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+	AC_DEFINE(HYPRE_USING_NVCC, 1, [Using nvcc compiler])
+	NVCCFLAGS="-O3 -arch=sm_60  -ccbin=xlc  -I ../hypre/include -I /usr/tcetmp/packages/spectrum_mpi/spectrum_mpi-10.1-xl-gcc-4.9.3/mpi/include/ -DUSE_NVTX -c -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED -I /usr/local/cuda/include"
+   NVCCLIBS="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+   HYPRE_NVCC_MAKEFILE="Makefile.nvcc"
+   fi
+fi
+
+if test "$hypre_using_um" = "yes"
+then
+   LDFLAGS+=" -lcudart -lcuda "
+   CFLAGS+=" -DHYPRE_USE_MANAGED -I /usr/local/cuda/include "
+   CXXFLAGS+=" -DHYPRE_USE_MANAGED -I /usr/local/cuda/include "
+fi
+
+if test "$hypre_using_openmp" = "yes"
+then
+   AC_DEFINE([HYPRE_USING_OPENMP],1,[Enable OpenMP support])
+fi
+
+dnl *********************************************************************
 dnl * Set installation directories
 dnl *********************************************************************
 HYPRE_INSTALLDIR="${prefix}"
@@ -1340,27 +1779,60 @@
 AC_SUBST(HYPRE_FEI_SUBDIRS)
 AC_SUBST(HYPRE_FEI_HYPRE_FILES)
 AC_SUBST(HYPRE_FEI_FEMLI_FILES)
-AC_SUBST(HYPRE_FEI_SUPERLU_FILES)
 
 dnl *********************************************************************
 dnl * BLAS & LAPACK related information
 dnl *********************************************************************
-AC_SUBST(HYPRE_BLAS_SRC_DIR)
-AC_SUBST(HYPRE_BLAS_FILES)
+AC_SUBST(HYPRE_KOKKOS_PATH)
 AC_SUBST(BLASLIBDIRS)
 AC_SUBST(BLASLIBS)
-AC_SUBST(HYPRE_LAPACK_SRC_DIR)
-AC_SUBST(HYPRE_LAPACK_FILES)
 AC_SUBST(LAPACKLIBDIRS)
 AC_SUBST(LAPACKLIBS)
 
 dnl *********************************************************************
+dnl * RAJA information
+dnl *********************************************************************
+AC_SUBST(HYPRE_RAJA_LIB_DIR)
+AC_SUBST(HYPRE_RAJA_INCLUDE)
+AC_SUBST(HYPRE_RAJA_LIB)
+AC_SUBST(RAJA_LIBS)
+
+dnl *********************************************************************
+dnl * KOKKOS information
+dnl *********************************************************************
+AC_SUBST(HYPRE_KOKKOS_SRC_DIR)
+AC_SUBST(HYPRE_KOKKOS_LIB_DIR)
+AC_SUBST(HYPRE_KOKKOS_INCLUDE)
+AC_SUBST(HYPRE_KOKKOS_INC_FILE)	
+AC_SUBST(HYPRE_KOKKOS_LIB)
+AC_SUBST(KOKKOS_LIBS)
+
+dnl *********************************************************************
+dnl * NVCC stuff
+dnl *********************************************************************
+AC_SUBST(NVCCFLAGS)
+AC_SUBST(NVCCLIBS)
+AC_SUBST(HYPRE_NVCC_MAKEFILE)
+
+dnl *********************************************************************
 dnl * Caliper instrumentation
 dnl *********************************************************************
 AC_SUBST(CALIPER_INCLUDE)
 AC_SUBST(CALIPER_LIBS)
 
 dnl *********************************************************************
+dnl * SuperLU instrumentation
+dnl *********************************************************************
+AC_SUBST(SUPERLU_INCLUDE)
+AC_SUBST(SUPERLU_LIBS)
+
+dnl *********************************************************************
+dnl * DSuperLU instrumentation
+dnl *********************************************************************
+AC_SUBST(DSUPERLU_INCLUDE)
+AC_SUBST(DSUPERLU_LIBS)
+
+dnl *********************************************************************
 dnl * ar & ranlib substitution
 dnl *********************************************************************
 AC_SUBST(AR)
diff -Nru hypre-2.11.2/src/config/HYPRE_config.h.cmake.in hypre-2.13.0/src/config/HYPRE_config.h.cmake.in
--- hypre-2.11.2/src/config/HYPRE_config.h.cmake.in	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/config/HYPRE_config.h.cmake.in	2017-10-20 17:42:22.000000000 +0000
@@ -19,6 +19,12 @@
 /* Use long long int for HYPRE_Int */
 #cmakedefine HYPRE_BIGINT
 
+/* Use single precision values for HYPRE_Real */
+#cmakedefine HYPRE_SINGLE
+
+/* Use quad precision values for HYPRE_Real */
+#cmakedefine HYPRE_LONG_DOUBLE
+
 /* Use complex values */
 #cmakedefine HYPRE_COMPLEX
 
diff -Nru hypre-2.11.2/src/config/HYPRE_config.h.in hypre-2.13.0/src/config/HYPRE_config.h.in
--- hypre-2.11.2/src/config/HYPRE_config.h.in	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/config/HYPRE_config.h.in	2017-10-20 17:42:22.000000000 +0000
@@ -49,6 +49,12 @@
 /* Define to 1 if using long long int for HYPRE_Int */
 #undef HYPRE_BIGINT
 
+/* Define to 1 if using single precision values for HYPRE_Real */
+#undef HYPRE_SINGLE
+
+/* Define to 1 if using quad precision values for HYPRE_Real */
+#undef HYPRE_LONG_DOUBLE
+
 /* Define to 1 if using complex values */
 #undef HYPRE_COMPLEX
 
@@ -115,5 +121,18 @@
 /* As HYPRE_FC_FUNC, but for C identifiers containing underscores. */
 #undef FC_FUNC_
 
+/* Define to 1 if nvcc is enabled */
+#undef HYPRE_USING_NVCC
+
 /* Define to 1 if Caliper instrumentation is enabled */
 #undef HYPRE_USING_CALIPER
+
+/* Define to 1 if using SuperLU */
+#undef HAVE_SUPERLU
+
+/* Define to 1 if using DSuperLU */
+#undef HAVE_DSUPERLU
+
+/* Define to 1 if using MLI */
+#undef HAVE_MLI
+
diff -Nru hypre-2.11.2/src/config/Makefile.config.in hypre-2.13.0/src/config/Makefile.config.in
--- hypre-2.11.2/src/config/Makefile.config.in	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/config/Makefile.config.in	2017-10-20 17:42:22.000000000 +0000
@@ -10,7 +10,6 @@
 # $Revision$
 #EHEADER**********************************************************************
 
-
 ###############################################################################
 ##  All configurable variables are defined in the file named Makefile.config.in
 ##  When Autoconf is run, it will create a file named Makefile.config which
@@ -42,7 +41,7 @@
 HYPRE_LIB_SUFFIX = @HYPRE_LIBSUFFIX@
 
 .SUFFIXES:
-.SUFFIXES: .o .f .c .C .cxx .cc
+.SUFFIXES: .o .f .c .C .cxx .cc .cu
 
 .f.o:
 	$(FC) $(FFLAGS) -c $< 
@@ -54,6 +53,8 @@
 	$(CXX) $(CXXFLAGS) -c $<
 .cc.o:
 	$(CXX) $(CXXFLAGS) -c $<
+.cu.o:
+	$(NVCC) $(NVCCFLAGS) -c $<
 
 FC       = @FC@
 FFLAGS   = @FFLAGS@ @FCFLAGS@ $(FC_COMPILE_FLAGS)
@@ -64,6 +65,10 @@
 CXX      = @CXX@
 CXXFLAGS = @CXXFLAGS@ @DEFS@ $(CXX_COMPILE_FLAGS)
 
+NVCC 	  = nvcc
+NVCCFLAGS = @NVCCFLAGS@
+NVCCLIBS  = @NVCCLIBS@
+
 LINK_FC  = @LINK_FC@
 LINK_CC  = @LINK_CC@
 LINK_CXX = @LINK_CXX@
@@ -85,26 +90,22 @@
 RANLIB = @RANLIB@
 
 LDFLAGS = @LDFLAGS@
-LIBS    = @LIBS@ @CALIPER_LIBS@
+LIBS    = @LIBS@ @CALIPER_LIBS@ @NVCCLIBS@  @RAJA_LIBS@ @KOKKOS_LIBS@
 FLIBS   = @FLIBS@
 
-INCLUDES = @CALIPER_INCLUDE@
+INCLUDES = @CALIPER_INCLUDE@ @HYPRE_RAJA_INCLUDE@ @HYPRE_KOKKOS_INCLUDE@
 
 ##################################################################
 ##  LAPACK Library Flags 
 ##################################################################
-HYPRE_LAPACK_SRC_DIR = @HYPRE_LAPACK_SRC_DIR@
-HYPRE_LAPACK_FILES   = @HYPRE_LAPACK_FILES@
-LAPACKLIBS           = @LAPACKLIBS@
-LAPACKLIBDIRS        = @LAPACKLIBDIRS@
+LAPACKLIBS      = @LAPACKLIBS@
+LAPACKLIBDIRS   = @LAPACKLIBDIRS@
 
 ##################################################################
 ##  BLAS Library Flags 
 ##################################################################
-HYPRE_BLAS_SRC_DIR = @HYPRE_BLAS_SRC_DIR@
-HYPRE_BLAS_FILES   = @HYPRE_BLAS_FILES@
-BLASLIBS           = @BLASLIBS@
-BLASLIBDIRS        = @BLASLIBDIRS@
+BLASLIBS        = @BLASLIBS@
+BLASLIBDIRS     = @BLASLIBDIRS@
 
 ##################################################################
 ##  MPI options
@@ -115,17 +116,49 @@
 MPIFLAGS   = @MPIFLAGS@
 
 ##################################################################
+##  NVCC options
+##################################################################
+HYPRE_NVCC_MAKEFILE = @HYPRE_NVCC_MAKEFILE@
+
+##################################################################
 ##  Caliper options
 ##################################################################
 CALIPER_INCLUDE = @CALIPER_INCLUDE@
 CALIPER_LIBS    = @CALIPER_LIBS@
 
 ##################################################################
+##  SuperLU options
+##################################################################
+SUPERLU_INCLUDE = @SUPERLU_INCLUDE@
+SUPERLU_LIBS    = @SUPERLU_LIBS@
+
+##################################################################
+##  DsuperLU options
+##################################################################
+DSUPERLU_INCLUDE = @DSUPERLU_INCLUDE@
+DSUPERLU_LIBS    = @DSUPERLU_LIBS@
+
+##################################################################
 ##  FEI options
 ##################################################################
 HYPRE_FEI_SRC_DIR       = @HYPRE_FEI_SRC_DIR@
 HYPRE_FEI_BASE_DIR      = @HYPRE_FEI_BASE_DIR@
 HYPRE_FEI_SUBDIRS       = @HYPRE_FEI_SUBDIRS@
-HYPRE_FEI_SUPERLU_FILES = @HYPRE_FEI_SUPERLU_FILES@
 HYPRE_FEI_HYPRE_FILES   = @HYPRE_FEI_HYPRE_FILES@
 HYPRE_FEI_FEMLI_FILES   = @HYPRE_FEI_FEMLI_FILES@
+
+##################################################################
+##  RAJA options
+##################################################################
+HYPRE_RAJA_LIB_DIR      = @HYPRE_RAJA_LIB_DIR@
+HYPRE_RAJA_INCLUDE      = @HYPRE_RAJA_INCLUDE@
+HYPRE_RAJA_LIB          = @HYPRE_RAJA_LIB@
+
+##################################################################
+##  kokkos options
+##################################################################
+HYPRE_KOKKOS_SRC_DIR    = @HYPRE_KOKKOS_SRC_DIR@
+HYPRE_KOKKOS_LIB_DIR    = @HYPRE_KOKKOS_LIB_DIR@
+HYPRE_KOKKOS_INCLUDE    = @HYPRE_KOKKOS_INCLUDE@
+HYPRE_KOKKOS_LIB        = @HYPRE_KOKKOS_LIB@
+@HYPRE_KOKKOS_INC_FILE@
diff -Nru hypre-2.11.2/src/configure hypre-2.13.0/src/configure
--- hypre-2.11.2/src/configure	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/configure	2017-10-20 17:42:22.000000000 +0000
@@ -1,7 +1,7 @@
 #! /bin/sh
 # From configure.in Id.
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for hypre 2.11.2.
+# Generated by GNU Autoconf 2.69 for hypre 2.13.0.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -586,8 +586,8 @@
 # Identity of this package.
 PACKAGE_NAME='hypre'
 PACKAGE_TARNAME='hypre'
-PACKAGE_VERSION='2.11.2'
-PACKAGE_STRING='hypre 2.11.2'
+PACKAGE_VERSION='2.13.0'
+PACKAGE_STRING='hypre 2.13.0'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
 
@@ -596,17 +596,30 @@
 ac_subst_vars='LTLIBOBJS
 LIBOBJS
 AR
+DSUPERLU_LIBS
+DSUPERLU_INCLUDE
+SUPERLU_LIBS
+SUPERLU_INCLUDE
 CALIPER_LIBS
 CALIPER_INCLUDE
+HYPRE_NVCC_MAKEFILE
+NVCCLIBS
+NVCCFLAGS
+KOKKOS_LIBS
+HYPRE_KOKKOS_LIB
+HYPRE_KOKKOS_INC_FILE
+HYPRE_KOKKOS_INCLUDE
+HYPRE_KOKKOS_LIB_DIR
+HYPRE_KOKKOS_SRC_DIR
+RAJA_LIBS
+HYPRE_RAJA_LIB
+HYPRE_RAJA_INCLUDE
+HYPRE_RAJA_LIB_DIR
 LAPACKLIBS
 LAPACKLIBDIRS
-HYPRE_LAPACK_FILES
-HYPRE_LAPACK_SRC_DIR
 BLASLIBS
 BLASLIBDIRS
-HYPRE_BLAS_FILES
-HYPRE_BLAS_SRC_DIR
-HYPRE_FEI_SUPERLU_FILES
+HYPRE_KOKKOS_PATH
 HYPRE_FEI_FEMLI_FILES
 HYPRE_FEI_HYPRE_FILES
 HYPRE_FEI_SUBDIRS
@@ -653,6 +666,7 @@
 CFLAGS
 RANLIB
 SET_MAKE
+CUDACXX
 FC
 CXX
 CC
@@ -714,6 +728,8 @@
 enable_debug
 enable_shared
 enable_bigint
+enable_single
+enable_longdouble
 enable_complex
 enable_maxdim
 enable_persistent
@@ -721,6 +737,7 @@
 with_no_global_partition
 enable_global_partition
 enable_fortran
+enable_unified_memory
 with_LD
 with_LDFLAGS
 with_extra_incpath
@@ -744,10 +761,27 @@
 with_print_errors
 with_timing
 with_openmp
-with_fei
 with_superlu
+with_superlu_include
+with_superlu_lib
+with_dsuperlu
+with_dsuperlu_include
+with_dsuperlu_lib
+with_fei
 with_mli
 with_MPI
+with_cuda
+with_raja
+with_kokkos
+with_raja_include
+with_raja_lib
+with_raja_libs
+with_raja_lib_dirs
+with_kokkos_include
+with_kokkos_lib
+with_kokkos_libs
+with_kokkos_dirs
+with_nvcc
 with_caliper
 with_caliper_include
 with_caliper_lib
@@ -1308,7 +1342,7 @@
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures hypre 2.11.2 to adapt to many kinds of systems.
+\`configure' configures hypre 2.13.0 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1373,7 +1407,7 @@
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of hypre 2.11.2:";;
+     short | recursive ) echo "Configuration of hypre 2.13.0:";;
    esac
   cat <<\_ACEOF
 
@@ -1384,6 +1418,8 @@
   --enable-debug          Set compiler flags for debugging.
   --enable-shared         Build shared libraries (default is NO).
   --enable-bigint         Use long long int for HYPRE_Int (default is NO).
+  --enable-single         Use single precision values (default is NO).
+  --enable-longdouble     Use long double precision values (default is NO).
   --enable-complex        Use complex values (default is NO).
   --enable-maxdim=MAXDIM  Change max dimension size to MAXDIM (default is 3).
                           Currently must be at least 3.
@@ -1393,6 +1429,8 @@
   --enable-global-partition
                           Use global partitioning (default is NO).
   --enable-fortran        Require a working Fortran compiler (default is YES).
+  --enable-unified-memory Use unified memory for allocating the memory
+                          (default is NO).
 
 Optional Packages:
   --with-PACKAGE[=ARG]    use PACKAGE [ARG=yes]
@@ -1476,11 +1514,61 @@
   --with-timing           Use HYPRE timing routines.
   --with-openmp           Use OpenMP. This may affect which compiler is
                           chosen.
+  --with-superlu          Use external SuperLU library.
+  --with-superlu-include=DIR
+                          Directory where SuperLU is installed.
+  --with-superlu-lib=LIBS LIBS is space-separated linkable list (enclosed in
+                          quotes) of libraries needed for SuperLU. OK to use
+                          -L and -l flags in the list
+  --with-dsuperlu         Use external DSuperLU library.
+  --with-dsuperlu-include=DIR
+                          Directory where DSuperLU is installed.
+  --with-dsuperlu-lib=LIBS
+                          LIBS is space-separated linkable list (enclosed in
+                          quotes) of libraries needed for DSuperLU. OK to use
+                          -L and -l flags in the list
   --with-fei              Use internal FEI routines.
-  --with-superlu          Use internal SuperLU routines.
   --with-mli              Use MLI
   --with-MPI              DEFAULT: Compile with MPI. Selecting --without-MPI
                           may affect which compiler is chosen.
+  --with-cuda             Use CUDA. Require cuda-8.0 or higher (default is
+                          NO).
+  --with-raja             Use RAJA. Require RAJA package to be compiled
+                          properly (default is NO).
+  --with-kokkos           Use Kokkos. Require kokkos package to be compiled
+                          properly(default is NO).
+  --with-raja-include=DIR User specifies that RAJA/*.h is in DIR. The options
+                          --with-raja-include --with-raja-libs and
+                          --with-raja-lib-dirs must be used together.
+  --with-raja-lib=LIBS    LIBS is space-separated linkable list (enclosed in
+                          quotes) of libraries needed for RAJA. OK to use -L
+                          and -l flags in the list
+  --with-raja-libs=LIBS   LIBS is space-separated list (enclosed in quotes) of
+                          libraries needed for RAJA (base name only). The
+                          options --with-raja-libs and --with-raja-lib-dirs
+                          must be used together.
+  --with-raja-lib-dirs=DIRS
+                          DIRS is space-separated list (enclosed in quotes) of
+                          directories containing the libraries specified by
+                          --with-raja-libs, e.g "usr/lib /usr/local/lib". The
+                          options --with-raja-libs and --raja-blas-lib-dirs
+                          must be used together.
+  --with-kokkos-include=DIR
+                          User specifies that KOKKOS headers is in DIR. The
+                          options --with-kokkos-include --with-kokkos-libs and
+                          --with-kokkos-dirs must be used together.
+  --with-kokkos-lib=LIBS  LIBS is space-separated linkable list (enclosed in
+                          quotes) of libraries needed for KOKKOS. OK to use -L
+                          and -l flags in the list
+  --with-kokkos-libs=LIBS LIBS is space-separated list (enclosed in quotes) of
+                          libraries needed for KOKKOS (base name only). The
+                          options --with-kokkos-libs and --with-kokkos-dirs
+                          must be used together.
+  --with-kokkos-dirs=DIRS DIRS is space-separated list (enclosed in quotes) of
+                          directories containing Makefile.kokkos. The options
+                          --with-kokkos-libs and --with-kokkos-dirs must be
+                          used together.
+  --with-nvcc             Use NVCC compiler (default is NO).
   --with-caliper          Use Caliper instrumentation (default is NO).
   --with-caliper-include=DIR
                           Directory where Caliper is installed.
@@ -1570,7 +1658,7 @@
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-hypre configure 2.11.2
+hypre configure 2.13.0
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -1867,7 +1955,7 @@
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by hypre $as_me 2.11.2, which was
+It was created by hypre $as_me 2.13.0, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -2253,8 +2341,8 @@
 
 
 HYPRE_NAME="hypre"
-HYPRE_VERSION="2.11.2"
-HYPRE_DATE="2017/03/13"
+HYPRE_VERSION="2.13.0"
+HYPRE_DATE="2017/10/20"
 HYPRE_TIME="00:00:00"
 HYPRE_BUGS="hypre-support@llnl.gov"
 HYPRE_SRCDIR="`pwd`"
@@ -2306,18 +2394,26 @@
 hypre_user_chose_blas=no
 hypre_user_chose_lapack=no
 hypre_user_chose_fei=no
+hypre_user_chose_cuda=no
+hypre_user_chose_raja=no
+hypre_user_chose_kokkos=no
 
 hypre_using_c=yes
 hypre_using_cxx=yes
 
 hypre_using_mpi=yes
 
+hypre_using_superlu=no
+hypre_using_dsuperlu=no
+
 hypre_using_fei=yes
-hypre_using_superlu=yes
-hypre_using_mli=yes
+hypre_using_mli=no
 
 hypre_using_openmp=no
 hypre_using_insure=no
+hypre_using_cuda=no
+hypre_using_gpu=no
+hypre_using_um=no
 
 hypre_using_caliper=no
 hypre_user_gave_caliper_lib=no
@@ -2457,6 +2553,44 @@
 
 fi
 
+# Check whether --enable-single was given.
+if test "${enable_single+set}" = set; then :
+  enableval=$enable_single; case "${enableval}" in
+    yes) hypre_using_fei=no
+         hypre_using_single=yes ;;
+    no)  hypre_using_single=no ;;
+    *)   as_fn_error $? "Bad value ${enableval} for --enable-single" "$LINENO" 5 ;;
+ esac
+else
+  hypre_using_single=no
+
+fi
+
+if test "$hypre_using_single" = "yes"
+then
+   $as_echo "#define HYPRE_SINGLE 1" >>confdefs.h
+
+fi
+
+# Check whether --enable-longdouble was given.
+if test "${enable_longdouble+set}" = set; then :
+  enableval=$enable_longdouble; case "${enableval}" in
+    yes) hypre_using_fei=no
+         hypre_using_longdouble=yes ;;
+    no)  hypre_using_longdouble=no ;;
+    *)   as_fn_error $? "Bad value ${enableval} for --enable-longdouble" "$LINENO" 5 ;;
+ esac
+else
+  hypre_using_longdouble=no
+
+fi
+
+if test "$hypre_using_longdouble" = "yes"
+then
+   $as_echo "#define HYPRE_LONG_DOUBLE 1" >>confdefs.h
+
+fi
+
 # Check whether --enable-complex was given.
 if test "${enable_complex+set}" = set; then :
   enableval=$enable_complex; case "${enableval}" in
@@ -2562,6 +2696,20 @@
 fi
 
 
+# Check whether --enable-unified-memory was given.
+if test "${enable_unified_memory+set}" = set; then :
+  enableval=$enable_unified_memory; case "${enableval}" in
+    yes) hypre_using_um=yes ;;
+    no)  hypre_using_um=no ;;
+    *)   hypre_using_um=no ;;
+ esac
+else
+  hypre_using_um=no
+
+fi
+
+
+
 if test "x$CC" = "x"
 then
    hypre_user_chose_ccompilers=no
@@ -2862,6 +3010,7 @@
 
 
 
+
 # Check whether --with-MPI-include was given.
 if test "${with_MPI_include+set}" = set; then :
   withval=$with_MPI_include; for mpi_dir in $withval; do
@@ -2911,6 +3060,7 @@
 
 
 
+
 # Check whether --with-blas-lib was given.
 if test "${with_blas_lib+set}" = set; then :
   withval=$with_blas_lib; for blas_lib in $withval; do
@@ -2946,6 +3096,7 @@
 
 
 
+
 # Check whether --with-lapack-lib was given.
 if test "${with_lapack_lib+set}" = set; then :
   withval=$with_lapack_lib; for lapack_lib in $withval; do
@@ -3092,10 +3243,7 @@
 # Check whether --with-openmp was given.
 if test "${with_openmp+set}" = set; then :
   withval=$with_openmp; case "${withval}" in
-    yes) hypre_using_openmp=yes
-
-$as_echo "#define HYPRE_USING_OPENMP 1" >>confdefs.h
- ;;
+    yes) hypre_using_openmp=yes;;
     no)  hypre_using_openmp=no ;;
  esac
 else
@@ -3105,16 +3253,6 @@
 
 
 
-# Check whether --with-fei was given.
-if test "${with_fei+set}" = set; then :
-  withval=$with_fei; case "${withval}" in
-    no) hypre_using_fei=no ;;
-    *)  hypre_using_fei=yes ;;
- esac
-
-fi
-
-
 
 # Check whether --with-superlu was given.
 if test "${with_superlu+set}" = set; then :
@@ -3126,6 +3264,83 @@
 fi
 
 
+if test "x$with_superlu" = "xyes"; then :
+
+$as_echo "#define HAVE_SUPERLU 1" >>confdefs.h
+
+fi
+
+
+# Check whether --with-superlu-include was given.
+if test "${with_superlu_include+set}" = set; then :
+  withval=$with_superlu_include; for superlu_inc_dir in $withval; do
+    SUPERLU_INCLUDE="-I$superlu_inc_dir $SUPERLU_INCLUDE"
+ done
+
+fi
+
+
+
+# Check whether --with-superlu-lib was given.
+if test "${with_superlu_lib+set}" = set; then :
+  withval=$with_superlu_lib; for superlu_lib in $withval; do
+    SUPERLU_LIBS="$SUPERLU_LIBS $superlu_lib"
+ done
+
+fi
+
+
+
+
+# Check whether --with-dsuperlu was given.
+if test "${with_dsuperlu+set}" = set; then :
+  withval=$with_dsuperlu; case "${withval}" in
+    no) hypre_using_dsuperlu=no ;;
+    *)  hypre_using_dsuperlu=yes ;;
+ esac
+
+fi
+
+
+if test "x$with_dsuperlu" = "xyes"; then :
+
+$as_echo "#define HAVE_DSUPERLU 1" >>confdefs.h
+
+fi
+
+
+# Check whether --with-dsuperlu-include was given.
+if test "${with_dsuperlu_include+set}" = set; then :
+  withval=$with_dsuperlu_include; for dsuperlu_inc_dir in $withval; do
+    DSUPERLU_INCLUDE="-I$dsuperlu_inc_dir $DSUPERLU_INCLUDE"
+ done
+
+fi
+
+
+
+# Check whether --with-dsuperlu-lib was given.
+if test "${with_dsuperlu_lib+set}" = set; then :
+  withval=$with_dsuperlu_lib; for dsuperlu_lib in $withval; do
+    DSUPERLU_LIBS="$DSUPERLU_LIBS $dsuperlu_lib"
+ done
+
+fi
+
+
+
+
+# Check whether --with-fei was given.
+if test "${with_fei+set}" = set; then :
+  withval=$with_fei; case "${withval}" in
+    no) hypre_using_fei=no ;;
+    *)  hypre_using_fei=yes ;;
+ esac
+
+fi
+
+
+
 
 # Check whether --with-mli was given.
 if test "${with_mli+set}" = set; then :
@@ -3138,6 +3353,7 @@
 
 
 
+
 # Check whether --with-MPI was given.
 if test "${with_MPI+set}" = set; then :
   withval=$with_MPI; case "$withval" in
@@ -3149,6 +3365,153 @@
 
 
 
+
+# Check whether --with-cuda was given.
+if test "${with_cuda+set}" = set; then :
+  withval=$with_cuda; case "$withval" in
+    yes) hypre_user_chose_cuda=yes
+    	 hypre_using_cuda=yes ;;
+    no)  hypre_using_cuda=no ;;
+    *)   hypre_using_cuda=no ;;
+ esac
+else
+  hypre_using_cuda=no
+
+fi
+
+
+
+
+# Check whether --with-raja was given.
+if test "${with_raja+set}" = set; then :
+  withval=$with_raja; case "$withval" in
+    yes) hypre_user_chose_raja=yes;;
+    no)  hypre_user_chose_raja=no ;;
+    *)   hypre_user_chose_raja=no ;;
+ esac
+else
+  hypre_using_raja=no
+
+fi
+
+
+
+
+# Check whether --with-kokkos was given.
+if test "${with_kokkos+set}" = set; then :
+  withval=$with_kokkos; case "$withval" in
+    yes) hypre_user_chose_kokkos=yes ;;
+    no)  hypre_user_chose_kokkos=no ;;
+    *)   hypre_user_chose_kokkos=no ;;
+ esac
+
+fi
+
+
+
+# Check whether --with-raja-include was given.
+if test "${with_raja_include+set}" = set; then :
+  withval=$with_raja_include; for raja_dir in $withval; do
+    HYPRE_RAJA_INCLUDE="$HYPRE_RAJA_INCLUDE -I$raja_dir"
+ done;
+ hypre_user_chose_raja=yes
+
+fi
+
+
+
+# Check whether --with-raja-lib was given.
+if test "${with_raja_lib+set}" = set; then :
+  withval=$with_raja_lib; for raja_lib in $withval; do
+       HYPRE_RAJA_LIB="$HYPRE_RAJA_LIB $raja_lib"
+ done;
+hypre_user_chose_raja=yes
+
+fi
+
+
+
+# Check whether --with-raja-libs was given.
+if test "${with_raja_libs+set}" = set; then :
+  withval=$with_raja_libs; for raja_lib in $withval; do
+    HYPRE_RAJA_LIB="$HYPRE_RAJA_LIB -l$raja_lib"
+ done;
+hypre_user_chose_raja=yes
+
+fi
+
+
+
+# Check whether --with-raja-lib-dirs was given.
+if test "${with_raja_lib_dirs+set}" = set; then :
+  withval=$with_raja_lib_dirs; for raja_lib_dir in $withval; do
+    HYPRE_RAJA_LIB_DIR="-L$raja_lib_dir $HYPRE_RAJA_LIB_DIR"
+ done;
+ hypre_user_chose_raja=yes
+
+fi
+
+
+
+# Check whether --with-kokkos-include was given.
+if test "${with_kokkos_include+set}" = set; then :
+  withval=$with_kokkos_include; for kokkos_dir in $withval; do
+HYPRE_KOKKOS_INCLUDE="$HYPRE_KOKKOS_INCLUDE -I$kokkos_dir"
+done;
+hypre_user_chose_kokkos=yes
+
+fi
+
+
+
+# Check whether --with-kokkos-lib was given.
+if test "${with_kokkos_lib+set}" = set; then :
+  withval=$with_kokkos_lib; for kokkos_lib in $withval; do
+       HYPRE_KOKKOS_LIB="$HYPRE_KOKKOS_LIB $kokkos_lib"
+ done;
+hypre_user_chose_kokkos=yes
+
+fi
+
+
+
+# Check whether --with-kokkos-libs was given.
+if test "${with_kokkos_libs+set}" = set; then :
+  withval=$with_kokkos_libs; for kokkos_lib in $withval; do
+    HYPRE_KOKKOS_LIB="$HYPRE_KOKKOS_LIB -l$kokkos_lib"
+ done;
+hypre_user_chose_kokkos=yes
+
+fi
+
+
+
+# Check whether --with-kokkos-dirs was given.
+if test "${with_kokkos_dirs+set}" = set; then :
+  withval=$with_kokkos_dirs; for kokkos_lib_dir in $withval; do
+    HYPRE_KOKKOS_SRC_DIR="$kokkos_lib_dir"
+ done;
+hypre_user_chose_kokkos=yes
+
+fi
+
+
+
+# Check whether --with-nvcc was given.
+if test "${with_nvcc+set}" = set; then :
+  withval=$with_nvcc; case "${withval}" in
+    yes) hypre_using_nvcc=yes ;;
+    no)  hypre_using_nvcc=no ;;
+    *)   as_fn_error $? "Bad value ${withval} for --with-nvcc" "$LINENO" 5 ;;
+ esac
+else
+  hypre_using_nvcc=no
+
+fi
+
+
+
+
 # Check whether --with-caliper was given.
 if test "${with_caliper+set}" = set; then :
   withval=$with_caliper; hypre_using_caliper=yes
@@ -3185,13 +3548,285 @@
 fi
 
 
+if test "$hypre_using_cuda" = "yes"
+then
+	hypre_using_fortran=no
+	for ac_prog in nvcc
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CXX+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$CXX"; then
+  ac_cv_prog_CXX="$CXX" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CXX="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+CXX=$ac_cv_prog_CXX
+if test -n "$CXX"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CXX" >&5
+$as_echo "$CXX" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  test -n "$CXX" && break
+done
+
+	for ac_prog in nvcc
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CC+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$CC"; then
+  ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CC="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  test -n "$CC" && break
+done
+
+	if test "$hypre_user_chose_cxxcompilers" = "no"
+	then
+	   if test "$hypre_using_mpi" = "no"
+	   then
+	      if test "$hypre_using_openmp" = "yes"
+	      then
+	         for ac_prog in xlC_r xlc_r icpc icc g++ gcc pgCC pgcc CC cc KCC kcc
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CUDACXX+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$CUDACXX"; then
+  ac_cv_prog_CUDACXX="$CUDACXX" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CUDACXX="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+CUDACXX=$ac_cv_prog_CUDACXX
+if test -n "$CUDACXX"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CUDACXX" >&5
+$as_echo "$CUDACXX" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  test -n "$CUDACXX" && break
+done
+
+              else
+		 for ac_prog in xlC xlc icpc icc g++ gcc pgCC pgcc CC cc KCC kcc
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CUDACXX+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$CUDACXX"; then
+  ac_cv_prog_CUDACXX="$CUDACXX" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CUDACXX="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+CUDACXX=$ac_cv_prog_CUDACXX
+if test -n "$CUDACXX"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CUDACXX" >&5
+$as_echo "$CUDACXX" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  test -n "$CUDACXX" && break
+done
+
+	      fi
+	   else
+	      if test "$hypre_using_openmp" = "yes"
+	      then
+	         for ac_prog in mpxlC mpixlcxx_r mpixlcxx mpixlC mpiicpc mpig++ mpiCC mpicxx mpipgCC
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CUDACXX+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$CUDACXX"; then
+  ac_cv_prog_CUDACXX="$CUDACXX" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CUDACXX="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+CUDACXX=$ac_cv_prog_CUDACXX
+if test -n "$CUDACXX"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CUDACXX" >&5
+$as_echo "$CUDACXX" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  test -n "$CUDACXX" && break
+done
+
+	      else
+	         for ac_prog in mpxlC mpixlcxx mpixlC mpiicpc mpig++ mpiCC mpicxx mpipgCC
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CUDACXX+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$CUDACXX"; then
+  ac_cv_prog_CUDACXX="$CUDACXX" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CUDACXX="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+CUDACXX=$ac_cv_prog_CUDACXX
+if test -n "$CUDACXX"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CUDACXX" >&5
+$as_echo "$CUDACXX" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  test -n "$CUDACXX" && break
+done
+
+	      fi
+	   fi
+	fi
+fi
+
 if test "$hypre_user_chose_ccompilers" = "no"
 then
    if test "$hypre_using_mpi" = "no"
    then
       if test "$hypre_using_openmp" = "yes"
       then
-         for ac_prog in xlc_r xlC_r icc icpc gcc g++ pgcc pgCC cc CC kcc KCC
+         for ac_prog in xlc_r xlC_r xlc xlC icc icpc gcc g++ pgcc pgCC cc CC kcc KCC
 do
   # Extract the first word of "$ac_prog", so it can be a program name with args.
 set dummy $ac_prog; ac_word=$2
@@ -3280,7 +3915,7 @@
    else
       if test "$hypre_using_openmp" = "yes"
       then
-         for ac_prog in mpxlc mpixlc_r mpiicc mpicc mpipgcc
+         for ac_prog in mpxlc mpixlc_r mpixlc mpiicc mpigcc mpicc mpipgcc
 do
   # Extract the first word of "$ac_prog", so it can be a program name with args.
 set dummy $ac_prog; ac_word=$2
@@ -3323,7 +3958,7 @@
 done
 
       else
-         for ac_prog in mpxlc mpixlc mpiicc mpicc mpipgcc
+         for ac_prog in mpxlc mpixlc mpiicc mpigcc mpicc mpipgcc
 do
   # Extract the first word of "$ac_prog", so it can be a program name with args.
 set dummy $ac_prog; ac_word=$2
@@ -3380,7 +4015,7 @@
    then
       if test "$hypre_using_openmp" = "yes"
       then
-         for ac_prog in xlC_r xlc_r icpc icc g++ gcc pgCC pgcc CC cc KCC kcc
+         for ac_prog in xlC_r xlc_r xlC xlc icpc icc g++ gcc pgCC pgcc CC cc KCC kcc
 do
   # Extract the first word of "$ac_prog", so it can be a program name with args.
 set dummy $ac_prog; ac_word=$2
@@ -3469,7 +4104,7 @@
    else
       if test "$hypre_using_openmp" = "yes"
       then
-         for ac_prog in mpxlC mpixlcxx_r mpiicpc mpiCC mpicxx mpipgCC
+         for ac_prog in mpxlC mpixlcxx_r mpixlcxx mpixlC mpiicpc mpig++ mpiCC mpicxx mpipgCC
 do
   # Extract the first word of "$ac_prog", so it can be a program name with args.
 set dummy $ac_prog; ac_word=$2
@@ -3512,7 +4147,7 @@
 done
 
       else
-         for ac_prog in mpxlC mpixlcxx mpiicpc mpiCC mpicxx mpipgCC
+         for ac_prog in mpxlC mpixlcxx mpixlC mpiicpc mpig++ mpiCC mpicxx mpipgCC
 do
   # Extract the first word of "$ac_prog", so it can be a program name with args.
 set dummy $ac_prog; ac_word=$2
@@ -6307,11 +6942,8 @@
 
 if test "$hypre_using_global_partition" = "no"
 then
-   if test "$hypre_using_mpi" != "no"
-   then
       $as_echo "#define HYPRE_NO_GLOBAL_PARTITION 1" >>confdefs.h
 
-   fi
 fi
 
 if test "$hypre_user_chose_blas" = "yes"
@@ -6521,8 +7153,6 @@
    fi
    if test "$hypre_using_hypre_blas" = "yes"
    then
-      HYPRE_BLAS_SRC_DIR="$HYPRE_SRCDIR/blas"
-      HYPRE_BLAS_FILES="$HYPRE_SRCDIR/blas/*.o"
       BLASLIBDIRS=""
       BLASLIBS=""
 
@@ -6716,8 +7346,6 @@
    fi
    if test "$hypre_using_hypre_lapack" = "yes"
    then
-      HYPRE_LAPACK_SRC_DIR="$HYPRE_SRCDIR/lapack"
-      HYPRE_LAPACK_FILES="$HYPRE_SRCDIR/lapack/*.o"
       LAPACKLIBDIRS=""
       LAPACKLIBS=""
 
@@ -6731,19 +7359,17 @@
    HYPRE_FEI_SRC_DIR="$HYPRE_SRCDIR/FEI_mv"
    HYPRE_FEI_SUBDIRS="fei-hypre"
    HYPRE_FEI_HYPRE_FILES="$HYPRE_SRCDIR/FEI_mv/fei-hypre/*.o"
-   if test "$hypre_using_superlu" = "yes"
-   then
-      HYPRE_FEI_SUBDIRS="SuperLU $HYPRE_FEI_SUBDIRS"
-      HYPRE_FEI_SUPERLU_FILES="$HYPRE_SRCDIR/FEI_mv/SuperLU/SRC/*.o"
-   else
-      HYPRE_FEI_SUPERLU_FILES=
-   fi
+   HYPRE_FEI_FEMLI_FILES=
    if test "$hypre_using_mli" = "yes"
    then
-      HYPRE_FEI_SUBDIRS="femli $HYPRE_FEI_SUBDIRS"
-      HYPRE_FEI_FEMLI_FILES="$HYPRE_SRCDIR/FEI_mv/femli/*.o"
-   else
-      HYPRE_FEI_FEMLI_FILES=
+      if test "$hypre_using_superlu" = "yes"
+      then
+         HYPRE_FEI_SUBDIRS="femli $HYPRE_FEI_SUBDIRS"
+         HYPRE_FEI_FEMLI_FILES="$HYPRE_SRCDIR/FEI_mv/femli/*.o"
+
+$as_echo "#define HAVE_MLI 1" >>confdefs.h
+
+      fi
    fi
    if test "$hypre_user_chose_fei" = "no"
    then
@@ -6802,7 +7428,6 @@
    HYPRE_FEI_BASE_DIR=
    HYPRE_FEI_HYPRE_FILES=
    HYPRE_FEI_FEMLI_FILES=
-   HYPRE_FEI_SUPERLU_FILES=
 fi
 
 if test "$hypre_using_debug" = "yes"
@@ -7232,6 +7857,187 @@
    fi
 fi
 
+
+if test "$hypre_using_nvcc" = "yes"
+then
+
+$as_echo "#define HYPRE_USING_NVCC 1" >>confdefs.h
+
+   NVCCFLAGS="-O3 -arch=sm_60  -ccbin=xlc  -I ../hypre/include -I /usr/tcetmp/packages/spectrum_mpi/spectrum_mpi-10.1-xl-gcc-4.9.3/mpi/include/ -DUSE_NVTX -c -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED -I /usr/local/cuda/include"
+   NVCCLIBS="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+   HYPRE_NVCC_MAKEFILE="Makefile.nvcc"
+   CFLAGS="${CFLAGS} -DUSE_NVTX -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED "
+   CXXFLAGS="${CXXFLAGS} -DUSE_NVTX -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED "
+else
+   NVCCFLAGS=
+   NVCCLIBS=
+   HYPRE_NVCC_MAKEFILE="Makefile.empty"
+fi
+
+if test "$hypre_user_chose_raja" = "yes"
+then
+   RAJA_LIBS=" $HYPRE_RAJA_LIB_DIR $HYPRE_RAJA_LIB "
+   if test "$CXX" = "mpixlC" || test "$CXX" = "xlC_r"
+   then
+   	CFLAGS+=" -+ "
+   fi
+   if test "$hypre_using_cuda" = "yes"
+      then
+        RAJAFLAGS=" -lRAJA "
+	LDFLAGS=" -ccbin=$CUDACXX -expt-extended-lambda -Xcompiler -fopenmp -arch compute_35 -lcudart -lcuda $RAJAFLAGS "
+   	CFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp --x cu -lcudart -lcuda -DHYPRE_USE_RAJA -Xcompiler -Wno-deprecated-register -Xcompiler $RAJAFLAGS "
+   	CXXFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp --x cu -lcudart -lcuda -DHYPRE_USE_RAJA -Xcompiler -Wno-deprecated-register  $RAJAFLAGS "
+   	if test "$hypre_using_um" != "yes"
+   	then
+		CFLAGS+=" -DHYPRE_MEMORY_GPU=1 "
+	   	CXXFLAGS+=" -DHYPRE_MEMORY_GPU=1 "
+		{ $as_echo "$as_me:${as_lineno-$LINENO}: *******************************************************" >&5
+$as_echo "$as_me: *******************************************************" >&6;}
+       		{ $as_echo "$as_me:${as_lineno-$LINENO}: Configuring with --with-raja and --with-cuda without unified memory." >&5
+$as_echo "$as_me: Configuring with --with-raja and --with-cuda without unified memory." >&6;}
+		{ $as_echo "$as_me:${as_lineno-$LINENO}: It only works for struct interface." >&5
+$as_echo "$as_me: It only works for struct interface." >&6;}
+		{ $as_echo "$as_me:${as_lineno-$LINENO}: Try to confiure with --wiht-raja --with-cuda --enable-unified-memory" >&5
+$as_echo "$as_me: Try to confiure with --wiht-raja --with-cuda --enable-unified-memory" >&6;}
+		{ $as_echo "$as_me:${as_lineno-$LINENO}: to use the cuda feature for the whold package" >&5
+$as_echo "$as_me: to use the cuda feature for the whold package" >&6;}
+       		{ $as_echo "$as_me:${as_lineno-$LINENO}: *******************************************************" >&5
+$as_echo "$as_me: *******************************************************" >&6;}
+	else
+		CFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU "
+		CXXFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU "
+   	fi
+	hypre_user_chose_cuda=no
+   else
+   	if test "$hypre_using_openmp" = "yes"
+	then
+	   CC=${CXX}
+	   CFLAGS+=" -std=c++11 -DHYPRE_USE_RAJA -DHYPRE_USE_OPENMP "
+	   CXXFLAGS+=" -std=c++11 -DHYPRE_USE_RAJA -DHYPRE_USE_OPENMP "
+	   hypre_using_openmp=no
+	else
+	   CC=${CXX}
+   	   CFLAGS+=" -std=c++11 -DHYPRE_USE_RAJA "
+   	   CXXFLAGS+=" -std=c++11 -DHYPRE_USE_RAJA "
+	fi
+   fi
+fi
+
+if test "$hypre_user_chose_kokkos" = "yes"
+then
+   if test "$CXX" = "mpixlC" || test "$CXX" = "xlC_r"
+   then
+   	CFLAGS+=" -+ "
+   fi
+   if test "$hypre_using_cuda" = "yes"
+   then
+      LDFLAGS=" -ccbin=$CUDACXX -arch compute_35 -lcudart -lcuda"
+      HYPRE_KOKKOS_INC_FILE="include $HYPRE_KOKKOS_SRC_DIR/Makefile.kokkos"
+      HYPRE_KOKKOS_LIB_DIR="-L$HYPRE_KOKKOS_SRC_DIR/lib"
+      CC=${CXX}
+      LINK_CC=$LINK_CXX
+      CFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp --x cu -DHYPRE_USE_KOKKOS "
+      CXXFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp --x cu -DHYPRE_USE_KOKKOS "
+      if test "$hypre_using_um" != "yes"
+      then
+          CFLAGS+=" -DHYPRE_MEMORY_GPU=1 "
+	  CXXFLAGS+=" -DHYPRE_MEMORY_GPU=1 "
+          { $as_echo "$as_me:${as_lineno-$LINENO}: *******************************************************" >&5
+$as_echo "$as_me: *******************************************************" >&6;}
+	  { $as_echo "$as_me:${as_lineno-$LINENO}: Configuring with --with-kokkos and --with-cuda, but not with unified memory" >&5
+$as_echo "$as_me: Configuring with --with-kokkos and --with-cuda, but not with unified memory" >&6;}
+	  { $as_echo "$as_me:${as_lineno-$LINENO}: It only works for struct interface." >&5
+$as_echo "$as_me: It only works for struct interface." >&6;}
+	  { $as_echo "$as_me:${as_lineno-$LINENO}: Try to confiure with --wiht-raja --with-cuda --enable-unified-memory" >&5
+$as_echo "$as_me: Try to confiure with --wiht-raja --with-cuda --enable-unified-memory" >&6;}
+	  { $as_echo "$as_me:${as_lineno-$LINENO}: to use the cuda feature for the whold package" >&5
+$as_echo "$as_me: to use the cuda feature for the whold package" >&6;}
+	  { $as_echo "$as_me:${as_lineno-$LINENO}: *******************************************************" >&5
+$as_echo "$as_me: *******************************************************" >&6;}
+      else
+	  CFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU "
+	  CXXFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU "
+	  LDFLAGS+="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+
+$as_echo "#define HYPRE_USING_NVCC 1" >>confdefs.h
+
+	NVCCFLAGS="-O3 -arch=sm_60  -ccbin=xlc  -I ../hypre/include -I /usr/tcetmp/packages/spectrum_mpi/spectrum_mpi-10.1-xl-gcc-4.9.3/mpi/include/ -DUSE_NVTX -c -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED -I /usr/local/cuda/include"
+   NVCCLIBS="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+   HYPRE_NVCC_MAKEFILE="Makefile.nvcc"
+      fi
+      hypre_user_chose_cuda=no
+   else
+      if test "$hypre_using_openmp" = "yes"
+      then
+         CC=${CXX}
+	 CFLAGS+=" -fopenmp -std=c++11 -DHYPRE_USE_KOKKOS -DHYPRE_USE_OPENMP"
+	 CXXFLAGS+=" -fopenmp -std=c++11 -DHYPRE_USE_KOKKOS -DHYPRE_USE_OPENMP"
+	 HYPRE_KOKKOS_INC_FILE="include $HYPRE_KOKKOS_SRC_DIR/Makefile.kokkos"
+	 HYPRE_KOKKOS_LIB_DIR="-L$HYPRE_KOKKOS_SRC_DIR/lib"
+	 hypre_using_openmp=no
+      else
+      	 CC=${CXX}
+	 CFLAGS+=" -std=c++11 -DHYPRE_USE_KOKKOS "
+	 CXXFLAGS+=" -std=c++11 -DHYPRE_USE_KOKKOS "
+	 HYPRE_KOKKOS_INC_FILE="include $HYPRE_KOKKOS_SRC_DIR/Makefile.kokkos"
+	 HYPRE_KOKKOS_LIB_DIR="-L$HYPRE_KOKKOS_SRC_DIR/lib"
+	 LDFLAGS+="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+
+$as_echo "#define HYPRE_USING_NVCC 1" >>confdefs.h
+
+	NVCCFLAGS="-O3 -arch=sm_60  -ccbin=xlc  -I ../hypre/include -I /usr/tcetmp/packages/spectrum_mpi/spectrum_mpi-10.1-xl-gcc-4.9.3/mpi/include/ -DUSE_NVTX -c -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED -I /usr/local/cuda/include"
+   NVCCLIBS="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+   HYPRE_NVCC_MAKEFILE="Makefile.nvcc"
+      fi
+  fi
+  KOKKOS_LIBS=" $HYPRE_KOKKOS_LIB_DIR $HYPRE_KOKKOS_LIB "
+fi
+if test "$hypre_user_chose_cuda" = "yes"
+then
+   LDFLAGS+=" -ccbin=$CUDACXX -arch compute_35 "
+   CFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp -Xcompiler -Wno-deprecated-register --x cu -DHYPRE_USE_CUDA "
+   CXXFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp -Xcompiler -Wno-deprecated-register --x cu -DHYPRE_USE_CUDA "
+   if test "$hypre_using_um" != "yes"
+   then
+	CFLAGS+=" -DHYPRE_MEMORY_GPU"
+	CXXFLAGS+=" -DHYPRE_MEMORY_GPU"
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: *******************************************************" >&5
+$as_echo "$as_me: *******************************************************" >&6;}
+       	{ $as_echo "$as_me:${as_lineno-$LINENO}: Configuring with --with-cuda=yes without unified memory." >&5
+$as_echo "$as_me: Configuring with --with-cuda=yes without unified memory." >&6;}
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: It only works for struct interface." >&5
+$as_echo "$as_me: It only works for struct interface." >&6;}
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: Use --enable-unified-memory to compile with unified memory." >&5
+$as_echo "$as_me: Use --enable-unified-memory to compile with unified memory." >&6;}
+       	{ $as_echo "$as_me:${as_lineno-$LINENO}: *******************************************************" >&5
+$as_echo "$as_me: *******************************************************" >&6;}
+   else
+	CFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU "
+   	CXXFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU "
+	LDFLAGS+="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+
+$as_echo "#define HYPRE_USING_NVCC 1" >>confdefs.h
+
+	NVCCFLAGS="-O3 -arch=sm_60  -ccbin=xlc  -I ../hypre/include -I /usr/tcetmp/packages/spectrum_mpi/spectrum_mpi-10.1-xl-gcc-4.9.3/mpi/include/ -DUSE_NVTX -c -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED -I /usr/local/cuda/include"
+   NVCCLIBS="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+   HYPRE_NVCC_MAKEFILE="Makefile.nvcc"
+   fi
+fi
+
+if test "$hypre_using_um" = "yes"
+then
+   LDFLAGS+=" -lcudart -lcuda "
+   CFLAGS+=" -DHYPRE_USE_MANAGED -I /usr/local/cuda/include "
+   CXXFLAGS+=" -DHYPRE_USE_MANAGED -I /usr/local/cuda/include "
+fi
+
+if test "$hypre_using_openmp" = "yes"
+then
+
+$as_echo "#define HYPRE_USING_OPENMP 1" >>confdefs.h
+
+fi
+
 HYPRE_INSTALLDIR="${prefix}"
 HYPRE_LIBINSTALL="${libdir}"
 HYPRE_INCINSTALL="${includedir}"
@@ -7410,6 +8216,24 @@
 
 
 
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 ac_config_files="$ac_config_files config/Makefile.config"
 
 
@@ -7919,7 +8743,7 @@
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by hypre $as_me 2.11.2, which was
+This file was extended by hypre $as_me 2.13.0, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -7981,7 +8805,7 @@
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-hypre config.status 2.11.2
+hypre config.status 2.13.0
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
 
diff -Nru hypre-2.11.2/src/distributed_ls/Euclid/blas_dh.c hypre-2.13.0/src/distributed_ls/Euclid/blas_dh.c
--- hypre-2.11.2/src/distributed_ls/Euclid/blas_dh.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/distributed_ls/Euclid/blas_dh.c	2017-10-20 17:42:22.000000000 +0000
@@ -119,7 +119,7 @@
     }
 
     if (np_dh > 1) {
-      hypre_MPI_Allreduce(&local_result, &result, 1, hypre_MPI_DOUBLE, hypre_MPI_SUM, comm_dh);
+      hypre_MPI_Allreduce(&local_result, &result, 1, hypre_MPI_REAL, hypre_MPI_SUM, comm_dh);
     } else {
       result = local_result;
     }
@@ -145,7 +145,7 @@
   }
 
   if (np_dh > 1) {
-    hypre_MPI_Allreduce(&local_result, &result, 1, hypre_MPI_DOUBLE, hypre_MPI_SUM, comm_dh);
+    hypre_MPI_Allreduce(&local_result, &result, 1, hypre_MPI_REAL, hypre_MPI_SUM, comm_dh);
   } else {
     result = local_result;
   }
diff -Nru hypre-2.11.2/src/distributed_ls/Euclid/Euclid_dh.c hypre-2.13.0/src/distributed_ls/Euclid/Euclid_dh.c
--- hypre-2.11.2/src/distributed_ls/Euclid/Euclid_dh.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/distributed_ls/Euclid/Euclid_dh.c	2017-10-20 17:42:22.000000000 +0000
@@ -75,7 +75,7 @@
   strcpy(ctx->krylovMethod, "bicgstab");
   ctx->maxIts = 200;
   ctx->rtol = 1e-5;
-  ctx->atol = 1e-50;
+  ctx->atol = _ATOL_;
   ctx->its = 0;
   ctx->itsTotal = 0;
   ctx->setupCount = 0;
@@ -132,11 +132,15 @@
 void Euclid_dhSetup(Euclid_dh ctx)
 {
   START_FUNC_DH
-  HYPRE_Int m, n, beg_row;
+  HYPRE_Int m, n, beg_row, ierr;
   HYPRE_Real t1;
   bool isSetup = ctx->isSetup;
   bool bj = false;
 
+  /* clear error flag if previously setup - DOK */
+  if(isSetup)
+     ierr = HYPRE_GetError(); HYPRE_ClearAllErrors();
+
   /*----------------------------------------------------
    * If Euclid was previously setup, print summary of
    * what happened during previous setup/solve
@@ -169,7 +173,9 @@
   if (ctx->A == NULL) {
     SET_V_ERROR("must set ctx->A before calling init");
   }
+
   EuclidGetDimensions(ctx->A, &beg_row, &m, &n); CHECK_V_ERROR;
+  
   ctx->m = m;
   ctx->n = n;
 
@@ -293,6 +299,9 @@
 
   ctx->isSetup = true;
 
+  /* setup done. Reset error flag - DOK*/
+  hypre_error_flag |= ierr;
+
   END_FUNC_DH
 }
 
@@ -410,7 +419,7 @@
       bufGlobal[1] = bufLocal[1];
       bufGlobal[2] = bufLocal[2];
     } else {
-      hypre_MPI_Reduce(bufLocal, bufGlobal, 3, hypre_MPI_DOUBLE, hypre_MPI_SUM, 0, comm_dh);
+      hypre_MPI_Reduce(bufLocal, bufGlobal, 3, hypre_MPI_REAL, hypre_MPI_SUM, 0, comm_dh);
     }
 
     if (myid_dh == 0) {
@@ -888,7 +897,7 @@
     HYPRE_Real bufOUT[TIMING_BINS];
 
     memcpy(bufOUT, ctx->timing, TIMING_BINS*sizeof(HYPRE_Real));
-    hypre_MPI_Reduce(bufOUT, ctx->timing, TIMING_BINS, hypre_MPI_DOUBLE, hypre_MPI_MAX, 0, comm_dh);
+    hypre_MPI_Reduce(bufOUT, ctx->timing, TIMING_BINS, hypre_MPI_REAL, hypre_MPI_MAX, 0, comm_dh);
   }
 
   ctx->timingsWereReduced = true;
diff -Nru hypre-2.11.2/src/distributed_ls/Euclid/ExternalRows_dh.c hypre-2.13.0/src/distributed_ls/Euclid/ExternalRows_dh.c
--- hypre-2.11.2/src/distributed_ls/Euclid/ExternalRows_dh.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/distributed_ls/Euclid/ExternalRows_dh.c	2017-10-20 17:42:22.000000000 +0000
@@ -317,7 +317,7 @@
     nz = rcv_nz_counts[i];
     hypre_MPI_Irecv(extRowCval+offset, nz, HYPRE_MPI_INT,    nabor, CVAL_TAG, comm_dh, er->req1+i);
     hypre_MPI_Irecv(extRowFill+offset, nz, HYPRE_MPI_INT,    nabor, FILL_TAG, comm_dh, er->req2+i);
-    hypre_MPI_Irecv(extRowAval+offset, nz, hypre_MPI_DOUBLE, nabor, AVAL_TAG, comm_dh, er->req3+i);
+    hypre_MPI_Irecv(extRowAval+offset, nz, hypre_MPI_REAL, nabor, AVAL_TAG, comm_dh, er->req3+i);
     offset += nz;
   }
 
@@ -537,7 +537,7 @@
     HYPRE_Int nabor = hiNabors[i];
     hypre_MPI_Isend(cvalSend, nz, HYPRE_MPI_INT,    nabor, CVAL_TAG, comm_dh, er->cval_req+i);
     hypre_MPI_Isend(fillSend, nz, HYPRE_MPI_INT,    nabor, FILL_TAG, comm_dh, er->fill_req+i); 
-    hypre_MPI_Isend(avalSend, nz, hypre_MPI_DOUBLE, nabor, AVAL_TAG, comm_dh, er->aval_req+i);
+    hypre_MPI_Isend(avalSend, nz, hypre_MPI_REAL, nabor, AVAL_TAG, comm_dh, er->aval_req+i);
   }
   END_FUNC_DH
 }
diff -Nru hypre-2.11.2/src/distributed_ls/Euclid/Factor_dh.c hypre-2.13.0/src/distributed_ls/Euclid/Factor_dh.c
--- hypre-2.11.2/src/distributed_ls/Euclid/Factor_dh.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/distributed_ls/Euclid/Factor_dh.c	2017-10-20 17:42:22.000000000 +0000
@@ -379,7 +379,7 @@
     hypre_MPI_Request_free(&request); 
 
     /* set up persistent comms for receiving the values from this_pe */
-    hypre_MPI_Recv_init(recvBuf+i, j-i, hypre_MPI_DOUBLE, this_pe, 555,
+    hypre_MPI_Recv_init(recvBuf+i, j-i, hypre_MPI_REAL, this_pe, 555,
                         comm_dh, req+num_recv); 
     ++num_recv;
   }
@@ -457,7 +457,7 @@
       ++count;
 
       /* Set up the send */
-      hypre_MPI_Send_init(sendBuf, inlist[i], hypre_MPI_DOUBLE, i, 555, comm_dh, sendReq); 
+      hypre_MPI_Send_init(sendBuf, inlist[i], hypre_MPI_REAL, i, 555, comm_dh, sendReq); 
     }
   }
 
@@ -1128,7 +1128,7 @@
   if (np_dh == 1) {
     minGlobal = min;
   } else {
-    hypre_MPI_Reduce(&min, &minGlobal, 1, hypre_MPI_DOUBLE, hypre_MPI_MIN, 0, comm_dh);
+    hypre_MPI_Reduce(&min, &minGlobal, 1, hypre_MPI_REAL, hypre_MPI_MIN, 0, comm_dh);
   }
 
   if (minGlobal == 0) {
@@ -1155,7 +1155,7 @@
   if (np_dh == 1) {
     maxGlobal = max;
   } else {
-    hypre_MPI_Reduce(&max, &maxGlobal, 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, 0, comm_dh);
+    hypre_MPI_Reduce(&max, &maxGlobal, 1, hypre_MPI_REAL, hypre_MPI_MAX, 0, comm_dh);
   }
   END_FUNC_VAL(maxGlobal)
 }
@@ -1185,7 +1185,7 @@
   if (np_dh == 1) {
     maxGlobal = max;
   } else {
-    hypre_MPI_Reduce(&max, &maxGlobal, 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, 0, comm_dh);
+    hypre_MPI_Reduce(&max, &maxGlobal, 1, hypre_MPI_REAL, hypre_MPI_MAX, 0, comm_dh);
   }
   END_FUNC_VAL(maxGlobal)
 }
diff -Nru hypre-2.11.2/src/distributed_ls/Euclid/globalObjects.c hypre-2.13.0/src/distributed_ls/Euclid/globalObjects.c
--- hypre-2.11.2/src/distributed_ls/Euclid/globalObjects.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/distributed_ls/Euclid/globalObjects.c	2017-10-20 17:42:22.000000000 +0000
@@ -30,7 +30,10 @@
 TimeLog_dh  tlog_dh = NULL;     /* internal timing  functionality */
 Mem_dh      mem_dh = NULL;      /* memory management */
 FILE        *logFile = NULL;
+#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS) || defined(HYPRE_USE_CUDA)
+#else
 char        msgBuf_dh[MSG_BUF_SIZE_DH]; /* for internal use */
+#endif
 HYPRE_Int         np_dh = 1;     /* number of processors and subdomains */
 HYPRE_Int         myid_dh = 0;   /* rank of this processor (and subdomain) */
 MPI_Comm    comm_dh = 0;
diff -Nru hypre-2.11.2/src/distributed_ls/Euclid/_hypre_Euclid.h hypre-2.13.0/src/distributed_ls/Euclid/_hypre_Euclid.h
--- hypre-2.11.2/src/distributed_ls/Euclid/_hypre_Euclid.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/distributed_ls/Euclid/_hypre_Euclid.h	2017-10-20 17:42:22.000000000 +0000
@@ -198,8 +198,14 @@
 #define FABS(a)    ((a) < 0 ? -(a) : a)
 #endif
 
-/* used in Mat_SEQ_PrintTriples, so matlab won't discard zeros (yuck!) */
+#ifdef HYPRE_SINGLE
+#define _ATOL_ 1.0e-16   /* used to compute absolute tolerance for Euclid's internal Krylov solvers */
+#define _MATLAB_ZERO_  1e-30 /* used in Mat_SEQ_PrintTriples, so matlab won't discard zeros (yuck!) */
+#else // default
+#define _ATOL_ 1.0e-50
 #define _MATLAB_ZERO_  1e-100
+#endif
+
 
 
 /*---------------------------------------------------------------------- 
@@ -481,7 +487,11 @@
 #endif
 
 #define MSG_BUF_SIZE_DH MAX(1024, hypre_MPI_MAX_ERROR_STRING)
+#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS) || defined(HYPRE_USE_CUDA)
+static char  msgBuf_dh[MSG_BUF_SIZE_DH];
+#else
 extern char  msgBuf_dh[MSG_BUF_SIZE_DH];
+#endif
 
 /* Each processor (may) open a logfile.
  * The bools are switches for controlling the amount of informational 
diff -Nru hypre-2.11.2/src/distributed_ls/Euclid/macros_dh.h hypre-2.13.0/src/distributed_ls/Euclid/macros_dh.h
--- hypre-2.11.2/src/distributed_ls/Euclid/macros_dh.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/distributed_ls/Euclid/macros_dh.h	2017-10-20 17:42:22.000000000 +0000
@@ -34,7 +34,11 @@
 #endif
 
 /* used in Mat_SEQ_PrintTriples, so matlab won't discard zeros (yuck!) */
+#ifdef HYPRE_SINGLE
+#define _MATLAB_ZERO_  1e-30
+#else // default
 #define _MATLAB_ZERO_  1e-100
+#endif
 
 
 /*---------------------------------------------------------------------- 
diff -Nru hypre-2.11.2/src/distributed_ls/Euclid/Mat_dh.c hypre-2.13.0/src/distributed_ls/Euclid/Mat_dh.c
--- hypre-2.11.2/src/distributed_ls/Euclid/Mat_dh.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/distributed_ls/Euclid/Mat_dh.c	2017-10-20 17:42:22.000000000 +0000
@@ -251,7 +251,7 @@
     /* Count of number of number of indices needed from this_pe */
     outlist[this_pe] = j-i;
 
-    ierr = hypre_MPI_Recv_init(&mat->recvbuf[i+m], j-i, hypre_MPI_DOUBLE, this_pe, 555,
+    ierr = hypre_MPI_Recv_init(&mat->recvbuf[i+m], j-i, hypre_MPI_REAL, this_pe, 555,
             comm_dh, &mat->recv_req[mat->num_recv]); CHECK_MPI_V_ERROR(ierr);
 
     mat->num_recv++;
@@ -289,7 +289,7 @@
       ierr = hypre_MPI_Irecv(&mat->sendind[j], inlist[i], HYPRE_MPI_INT, i, 444, comm_dh,
                             &requests[mat->num_send]); CHECK_MPI_V_ERROR(ierr);
       /* Set up the send */
-      ierr = hypre_MPI_Send_init(&mat->sendbuf[j], inlist[i], hypre_MPI_DOUBLE, i, 555, comm_dh,
+      ierr = hypre_MPI_Send_init(&mat->sendbuf[j], inlist[i], hypre_MPI_REAL, i, 555, comm_dh,
                        &mat->send_req[mat->num_send]); CHECK_MPI_V_ERROR(ierr);
 
       mat->num_send++;
@@ -601,8 +601,8 @@
   if (mat->time[MATVEC_MPI_TIME]) {
     mat->time[MATVEC_RATIO] = mat->time[MATVEC_TIME] / mat->time[MATVEC_MPI_TIME];
   }
-  hypre_MPI_Allreduce(mat->time, mat->time_min, MAT_DH_BINS, hypre_MPI_DOUBLE, hypre_MPI_MIN, comm_dh);
-  hypre_MPI_Allreduce(mat->time, mat->time_max, MAT_DH_BINS, hypre_MPI_DOUBLE, hypre_MPI_MAX, comm_dh);
+  hypre_MPI_Allreduce(mat->time, mat->time_min, MAT_DH_BINS, hypre_MPI_REAL, hypre_MPI_MIN, comm_dh);
+  hypre_MPI_Allreduce(mat->time, mat->time_max, MAT_DH_BINS, hypre_MPI_REAL, hypre_MPI_MAX, comm_dh);
   END_FUNC_DH
 }
 
diff -Nru hypre-2.11.2/src/distributed_ls/Euclid/mat_dh_private.c hypre-2.13.0/src/distributed_ls/Euclid/mat_dh_private.c
--- hypre-2.11.2/src/distributed_ls/Euclid/mat_dh_private.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/distributed_ls/Euclid/mat_dh_private.c	2017-10-20 17:42:22.000000000 +0000
@@ -1082,7 +1082,7 @@
       }
 
       hypre_MPI_Isend(cval+rp[i], count, HYPRE_MPI_INT, owner, CVAL_TAG, comm_dh, send_req+2*i);
-      hypre_MPI_Isend(aval+rp[i], count, hypre_MPI_DOUBLE, owner, AVAL_TAG, comm_dh, send_req+2*i+1);
+      hypre_MPI_Isend(aval+rp[i], count, hypre_MPI_REAL, owner, AVAL_TAG, comm_dh, send_req+2*i+1);
     }
   } 
 
@@ -1105,7 +1105,7 @@
       }
 
       hypre_MPI_Irecv(cval+rp[i], count, HYPRE_MPI_INT, 0, CVAL_TAG, comm_dh, rcv_req+2*i);
-      hypre_MPI_Irecv(aval+rp[i], count, hypre_MPI_DOUBLE, 0, AVAL_TAG, comm_dh, rcv_req+2*i+1);
+      hypre_MPI_Irecv(aval+rp[i], count, hypre_MPI_REAL, 0, AVAL_TAG, comm_dh, rcv_req+2*i+1);
     }
   }
 
@@ -1195,7 +1195,7 @@
       }
 
       hypre_MPI_Isend(cval+rp[i], count, HYPRE_MPI_INT, owner, CVAL_TAG, comm_dh, send_req+2*i);
-      hypre_MPI_Isend(aval+rp[i], count, hypre_MPI_DOUBLE, owner, AVAL_TAG, comm_dh, send_req+2*i+1);
+      hypre_MPI_Isend(aval+rp[i], count, hypre_MPI_REAL, owner, AVAL_TAG, comm_dh, send_req+2*i+1);
     }
   } 
 
@@ -1218,7 +1218,7 @@
       }
 
       hypre_MPI_Irecv(cval+rp[i], count, HYPRE_MPI_INT, 0, CVAL_TAG, comm_dh, rcv_req+2*i);
-      hypre_MPI_Irecv(aval+rp[i], count, hypre_MPI_DOUBLE, 0, AVAL_TAG, comm_dh, rcv_req+2*i+1);
+      hypre_MPI_Irecv(aval+rp[i], count, hypre_MPI_REAL, 0, AVAL_TAG, comm_dh, rcv_req+2*i+1);
     }
   }
 
diff -Nru hypre-2.11.2/src/distributed_ls/Euclid/Numbering_dh.c hypre-2.13.0/src/distributed_ls/Euclid/Numbering_dh.c
--- hypre-2.11.2/src/distributed_ls/Euclid/Numbering_dh.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/distributed_ls/Euclid/Numbering_dh.c	2017-10-20 17:42:22.000000000 +0000
@@ -111,8 +111,9 @@
            space.  The global_to_local hash table may also need
            to be enlarged, but the hash object will take care of that.
          */
+        /* RL : why ``m+num_ext'' instead of ``num_ext+1'' ??? */
         if (m+num_ext >= size) {
-          HYPRE_Int newSize = size*1.5;  /* heuristic */
+          HYPRE_Int newSize = hypre_max(m+num_ext+1, size*1.5);  /* heuristic */
           HYPRE_Int *tmp = (HYPRE_Int*)MALLOC_DH(newSize*sizeof(HYPRE_Int)); CHECK_V_ERROR;
           memcpy(tmp, idx_ext, size*sizeof(size));
           FREE_DH(idx_ext); CHECK_V_ERROR;
diff -Nru hypre-2.11.2/src/distributed_ls/Euclid/TimeLog_dh.c hypre-2.13.0/src/distributed_ls/Euclid/TimeLog_dh.c
--- hypre-2.11.2/src/distributed_ls/Euclid/TimeLog_dh.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/distributed_ls/Euclid/TimeLog_dh.c	2017-10-20 17:42:22.000000000 +0000
@@ -121,8 +121,8 @@
     hypre_sprintf(t->desc[t->last], "========== totals, and reset ==========\n");
     t->last += 1;
 
-    hypre_MPI_Allreduce(t->time, timeMax, t->last, hypre_MPI_DOUBLE, hypre_MPI_MAX, comm_dh);
-    hypre_MPI_Allreduce(t->time, timeMin, t->last, hypre_MPI_DOUBLE, hypre_MPI_MIN, comm_dh);
+    hypre_MPI_Allreduce(t->time, timeMax, t->last, hypre_MPI_REAL, hypre_MPI_MAX, comm_dh);
+    hypre_MPI_Allreduce(t->time, timeMin, t->last, hypre_MPI_REAL, hypre_MPI_MIN, comm_dh);
     wasSummed = true;
   }
 
diff -Nru hypre-2.11.2/src/distributed_ls/ParaSails/ConjGrad.c hypre-2.13.0/src/distributed_ls/ParaSails/ConjGrad.c
--- hypre-2.11.2/src/distributed_ls/ParaSails/ConjGrad.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/distributed_ls/ParaSails/ConjGrad.c	2017-10-20 17:42:22.000000000 +0000
@@ -10,9 +10,6 @@
  * $Revision$
  ***********************************************************************EHEADER*/
 
-
-
-
 /******************************************************************************
  *
  * ConjGrad - Preconditioned conjugate gradient algorithm using the
@@ -24,20 +21,16 @@
 #include "Common.h"
 #include "Matrix.h"
 #include "ParaSails.h"
-
-HYPRE_Real hypre_F90_NAME_BLAS(ddot, DDOT)(HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *);
-HYPRE_Int hypre_F90_NAME_BLAS(dcopy, DCOPY)(HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *);
-HYPRE_Int hypre_F90_NAME_BLAS(dscal, DSCAL)(HYPRE_Int *, HYPRE_Real *, HYPRE_Real *, HYPRE_Int *);
-HYPRE_Int hypre_F90_NAME_BLAS(daxpy, DAXPY)(HYPRE_Int *, HYPRE_Real *, HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *);
+#include "_hypre_blas.h"
 
 static HYPRE_Real InnerProd(HYPRE_Int n, HYPRE_Real *x, HYPRE_Real *y, MPI_Comm comm)
 {
     HYPRE_Real local_result, result;
 
     HYPRE_Int one = 1;
-    local_result = hypre_F90_NAME_BLAS(ddot, DDOT)(&n, x, &one, y, &one);
+    local_result = hypre_ddot(&n, x, &one, y, &one);
 
-    hypre_MPI_Allreduce(&local_result, &result, 1, hypre_MPI_DOUBLE, hypre_MPI_SUM, comm);
+    hypre_MPI_Allreduce(&local_result, &result, 1, hypre_MPI_REAL, hypre_MPI_SUM, comm);
 
     return result;
 }
diff -Nru hypre-2.11.2/src/distributed_ls/ParaSails/DiagScale.c hypre-2.13.0/src/distributed_ls/ParaSails/DiagScale.c
--- hypre-2.11.2/src/distributed_ls/ParaSails/DiagScale.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/distributed_ls/ParaSails/DiagScale.c	2017-10-20 17:42:22.000000000 +0000
@@ -76,7 +76,7 @@
         }
 
         /* Post receive for diagonal values */
-        hypre_MPI_Irecv(&diags[i], j-i, hypre_MPI_DOUBLE, this_pe, DIAG_VALS_TAG, 
+        hypre_MPI_Irecv(&diags[i], j-i, hypre_MPI_REAL, this_pe, DIAG_VALS_TAG, 
 	    comm, &requests[*num_requests]);
 
         /* Request rows in reqind[i..j-1] */
@@ -130,7 +130,7 @@
 	    sendbuf[j] = local_diags[recvbuf[j] - mat->beg_row];
 
 	/* Use ready-mode send, since receives already posted */
-	hypre_MPI_Irsend(sendbuf, count, hypre_MPI_DOUBLE, source, 
+	hypre_MPI_Irsend(sendbuf, count, hypre_MPI_REAL, source, 
 	    DIAG_VALS_TAG, comm, &requests[i]);
     }
 }
diff -Nru hypre-2.11.2/src/distributed_ls/ParaSails/driver.c hypre-2.13.0/src/distributed_ls/ParaSails/driver.c
--- hypre-2.11.2/src/distributed_ls/ParaSails/driver.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/distributed_ls/ParaSails/driver.c	2017-10-20 17:42:22.000000000 +0000
@@ -139,11 +139,11 @@
 #endif
 	    }
 
-	    hypre_MPI_Bcast(&threshg, 1, hypre_MPI_DOUBLE, 0, hypre_MPI_COMM_WORLD);
-	    hypre_MPI_Bcast(&thresh,  1, hypre_MPI_DOUBLE, 0, hypre_MPI_COMM_WORLD);
+	    hypre_MPI_Bcast(&threshg, 1, hypre_MPI_REAL, 0, hypre_MPI_COMM_WORLD);
+	    hypre_MPI_Bcast(&thresh,  1, hypre_MPI_REAL, 0, hypre_MPI_COMM_WORLD);
 	    hypre_MPI_Bcast(&nlevels, 1, HYPRE_MPI_INT,    0, hypre_MPI_COMM_WORLD);
-	    hypre_MPI_Bcast(&filter,  1, hypre_MPI_DOUBLE, 0, hypre_MPI_COMM_WORLD);
-	    hypre_MPI_Bcast(&loadbal, 1, hypre_MPI_DOUBLE, 0, hypre_MPI_COMM_WORLD);
+	    hypre_MPI_Bcast(&filter,  1, hypre_MPI_REAL, 0, hypre_MPI_COMM_WORLD);
+	    hypre_MPI_Bcast(&loadbal, 1, hypre_MPI_REAL, 0, hypre_MPI_COMM_WORLD);
 
             if (nlevels < 0)
                 break;
@@ -220,9 +220,9 @@
         time1 = hypre_MPI_Wtime();
 	solve_time = time1-time0;
 
-        hypre_MPI_Reduce(&setup_time, &max_setup_time, 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, 0, 
+        hypre_MPI_Reduce(&setup_time, &max_setup_time, 1, hypre_MPI_REAL, hypre_MPI_MAX, 0, 
 	    hypre_MPI_COMM_WORLD);
-        hypre_MPI_Reduce(&solve_time, &max_solve_time, 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, 0, 
+        hypre_MPI_Reduce(&solve_time, &max_solve_time, 1, hypre_MPI_REAL, hypre_MPI_MAX, 0, 
 	    hypre_MPI_COMM_WORLD);
 
 	if (mype == 0)
diff -Nru hypre-2.11.2/src/distributed_ls/ParaSails/FGmres.c hypre-2.13.0/src/distributed_ls/ParaSails/FGmres.c
--- hypre-2.11.2/src/distributed_ls/ParaSails/FGmres.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/distributed_ls/ParaSails/FGmres.c	2017-10-20 17:42:22.000000000 +0000
@@ -10,9 +10,6 @@
  * $Revision$
  ***********************************************************************EHEADER*/
 
-
-
-
 /******************************************************************************
  *
  * FlexGmres - Preconditioned flexible GMRES algorithm using the
@@ -24,20 +21,16 @@
 #include "Common.h"
 #include "Matrix.h"
 #include "ParaSails.h"
-
-HYPRE_Real hypre_F90_NAME_BLAS(ddot, DDOT)(HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *);
-HYPRE_Int hypre_F90_NAME_BLAS(dcopy, DCOPY)(HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *);
-HYPRE_Int hypre_F90_NAME_BLAS(dscal, DSCAL)(HYPRE_Int *, HYPRE_Real *, HYPRE_Real *, HYPRE_Int *);
-HYPRE_Int hypre_F90_NAME_BLAS(daxpy, DAXPY)(HYPRE_Int *, HYPRE_Real *, HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *);
+#include "_hypre_blas.h"
 
 static HYPRE_Real InnerProd(HYPRE_Int n, HYPRE_Real *x, HYPRE_Real *y, MPI_Comm comm)
 {
     HYPRE_Real local_result, result;
 
     HYPRE_Int one = 1;
-    local_result = hypre_F90_NAME_BLAS(ddot, DDOT)(&n, x, &one, y, &one);
+    local_result = hypre_ddot(&n, x, &one, y, &one);
 
-    hypre_MPI_Allreduce(&local_result, &result, 1, hypre_MPI_DOUBLE, hypre_MPI_SUM, comm);
+    hypre_MPI_Allreduce(&local_result, &result, 1, hypre_MPI_REAL, hypre_MPI_SUM, comm);
 
     return result;
 }
@@ -45,19 +38,19 @@
 static void CopyVector(HYPRE_Int n, HYPRE_Real *x, HYPRE_Real *y)
 {
     HYPRE_Int one = 1;
-    hypre_F90_NAME_BLAS(dcopy, DCOPY)(&n, x, &one, y, &one);
+    hypre_dcopy(&n, x, &one, y, &one);
 }
 
 static void ScaleVector(HYPRE_Int n, HYPRE_Real alpha, HYPRE_Real *x)
 {
     HYPRE_Int one = 1;
-    hypre_F90_NAME_BLAS(dscal, DSCAL)(&n, &alpha, x, &one);
+    hypre_dscal(&n, &alpha, x, &one);
 }
 
 static void Axpy(HYPRE_Int n, HYPRE_Real alpha, HYPRE_Real *x, HYPRE_Real *y)
 {
     HYPRE_Int one = 1;
-    hypre_F90_NAME_BLAS(daxpy, DAXPY)(&n, &alpha, x, &one, y, &one);
+    hypre_daxpy(&n, &alpha, x, &one, y, &one);
 }
 
 /* simulate 2-D arrays at the cost of some arithmetic */
diff -Nru hypre-2.11.2/src/distributed_ls/ParaSails/lapdriver.c hypre-2.13.0/src/distributed_ls/ParaSails/lapdriver.c
--- hypre-2.11.2/src/distributed_ls/ParaSails/lapdriver.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/distributed_ls/ParaSails/lapdriver.c	2017-10-20 17:42:22.000000000 +0000
@@ -304,9 +304,9 @@
         ParaSailsStatsPattern(ps, A);
         ParaSailsStatsValues(ps, A);
 
-        hypre_MPI_Reduce(&setup_time, &max_setup_time, 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, 0,
+        hypre_MPI_Reduce(&setup_time, &max_setup_time, 1, hypre_MPI_REAL, hypre_MPI_MAX, 0,
             hypre_MPI_COMM_WORLD);
-        hypre_MPI_Reduce(&solve_time, &max_solve_time, 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, 0,
+        hypre_MPI_Reduce(&solve_time, &max_solve_time, 1, hypre_MPI_REAL, hypre_MPI_MAX, 0,
             hypre_MPI_COMM_WORLD);
 
         if (mype == 0)
diff -Nru hypre-2.11.2/src/distributed_ls/ParaSails/LoadBal.c hypre-2.13.0/src/distributed_ls/ParaSails/LoadBal.c
--- hypre-2.11.2/src/distributed_ls/ParaSails/LoadBal.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/distributed_ls/ParaSails/LoadBal.c	2017-10-20 17:42:22.000000000 +0000
@@ -57,7 +57,7 @@
 
     cost = (HYPRE_Real *) malloc(npes * sizeof(HYPRE_Real));
 
-    hypre_MPI_Allgather(&local_cost, 1, hypre_MPI_DOUBLE, cost, 1, hypre_MPI_DOUBLE, comm);
+    hypre_MPI_Allgather(&local_cost, 1, hypre_MPI_REAL, cost, 1, hypre_MPI_REAL, comm);
 
     /* Compute the average cost */
     average = 0.0;
@@ -284,7 +284,7 @@
             bufferp += len;
         }
 
-        hypre_MPI_Isend(recip_data[i].buffer, buflen, hypre_MPI_DOUBLE, recip_data[i].pe,
+        hypre_MPI_Isend(recip_data[i].buffer, buflen, hypre_MPI_REAL, recip_data[i].pe,
             LOADBAL_REP_TAG, comm, &request[i]);
 
         MatrixDestroy(mat);
@@ -311,10 +311,10 @@
     {
         hypre_MPI_Probe(hypre_MPI_ANY_SOURCE, LOADBAL_REP_TAG, comm, &status);
         source = status.hypre_MPI_SOURCE;
-        hypre_MPI_Get_count(&status, hypre_MPI_DOUBLE, &count);
+        hypre_MPI_Get_count(&status, hypre_MPI_REAL, &count);
 
         buffer = (HYPRE_Real *) malloc(count * sizeof(HYPRE_Real));
-        hypre_MPI_Recv(buffer, count, hypre_MPI_DOUBLE, source, LOADBAL_REP_TAG, 
+        hypre_MPI_Recv(buffer, count, hypre_MPI_REAL, source, LOADBAL_REP_TAG, 
            comm, &status);
 
 	/* search for which entry in donor_data this message corresponds to */
diff -Nru hypre-2.11.2/src/distributed_ls/ParaSails/Makefile hypre-2.13.0/src/distributed_ls/ParaSails/Makefile
--- hypre-2.11.2/src/distributed_ls/ParaSails/Makefile	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/distributed_ls/ParaSails/Makefile	2017-10-20 17:42:22.000000000 +0000
@@ -19,6 +19,8 @@
  -I../..\
  -I$(srcdir)\
  -I$(srcdir)/../..\
+ -I$(srcdir)/../../blas\
+ -I$(srcdir)/../../lapack\
  -I$(srcdir)/../../utilities\
  -I$(srcdir)/../../distributed_matrix\
  ${CINCLUDES}
diff -Nru hypre-2.11.2/src/distributed_ls/ParaSails/Matrix.c hypre-2.13.0/src/distributed_ls/ParaSails/Matrix.c
--- hypre-2.11.2/src/distributed_ls/ParaSails/Matrix.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/distributed_ls/ParaSails/Matrix.c	2017-10-20 17:42:22.000000000 +0000
@@ -532,7 +532,7 @@
 
     if (mype != 0)
     {
-	hypre_MPI_Recv(rhs, num_local, hypre_MPI_DOUBLE, 0, 0, mat->comm, &status);
+	hypre_MPI_Recv(rhs, num_local, hypre_MPI_REAL, 0, 0, mat->comm, &status);
 	return;
     }
 
@@ -567,7 +567,7 @@
 	  else
             hypre_fscanf(file, "%lf", &buffer[i]);
 
-	hypre_MPI_Send(buffer, num_local, hypre_MPI_DOUBLE, pe, 0, mat->comm);
+	hypre_MPI_Send(buffer, num_local, hypre_MPI_REAL, pe, 0, mat->comm);
     }
 
     free(buffer);
@@ -614,10 +614,10 @@
 	/* Count of number of number of indices needed from this_pe */
         outlist[this_pe] = j-i;
 
-        hypre_MPI_Recv_init(&mat->recvbuf[i+num_local], j-i, hypre_MPI_DOUBLE, this_pe, 555,
+        hypre_MPI_Recv_init(&mat->recvbuf[i+num_local], j-i, hypre_MPI_REAL, this_pe, 555,
 	    comm, &mat->recv_req[mat->num_recv]);
 
-        hypre_MPI_Send_init(&mat->recvbuf[i+num_local], j-i, hypre_MPI_DOUBLE, this_pe, 666,
+        hypre_MPI_Send_init(&mat->recvbuf[i+num_local], j-i, hypre_MPI_REAL, this_pe, 666,
 	    comm, &mat->send_req2[mat->num_recv]);
 
         mat->num_recv++;
@@ -665,11 +665,11 @@
                 &requests[mat->num_send]);
 
 	    /* Set up the send */
-	    hypre_MPI_Send_init(&mat->sendbuf[j], inlist[i], hypre_MPI_DOUBLE, i, 555, comm,
+	    hypre_MPI_Send_init(&mat->sendbuf[j], inlist[i], hypre_MPI_REAL, i, 555, comm,
 		&mat->send_req[mat->num_send]);
 
 	    /* Set up the receive for the transpose  */
-	    hypre_MPI_Recv_init(&mat->sendbuf[j], inlist[i], hypre_MPI_DOUBLE, i, 666, comm,
+	    hypre_MPI_Recv_init(&mat->sendbuf[j], inlist[i], hypre_MPI_REAL, i, 666, comm,
 		&mat->recv_req2[mat->num_send]);
 
 	    mat->num_send++;
diff -Nru hypre-2.11.2/src/distributed_ls/ParaSails/ParaSails.c hypre-2.13.0/src/distributed_ls/ParaSails/ParaSails.c
--- hypre-2.11.2/src/distributed_ls/ParaSails/ParaSails.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/distributed_ls/ParaSails/ParaSails.c	2017-10-20 17:42:22.000000000 +0000
@@ -10,9 +10,6 @@
  * $Revision$
  ***********************************************************************EHEADER*/
 
-
-
-
 /******************************************************************************
  *
  * ParaSails - Parallel sparse approximate inverse least squares.
@@ -33,6 +30,8 @@
 #include "LoadBal.h"
 #include "ParaSails.h"
 
+#include "_hypre_lapack.h"
+
 #define ROW_PRUNED_REQ_TAG        221
 #define ROW_STORED_REQ_TAG        222
 #define ROW_REPI_TAG              223
@@ -40,20 +39,6 @@
 
 #ifdef ESSL
 #include <essl.h>
-#else
-#ifdef __cplusplus
-extern "C" 
-{	
-#endif
-HYPRE_Int hypre_F90_NAME_LAPACK(dpotrf, DPOTRF)(char *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *);
-HYPRE_Int hypre_F90_NAME_LAPACK(dpotrs, DPOTRS)(char *, HYPRE_Int *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, 
-  HYPRE_Real *, HYPRE_Int *, HYPRE_Int *);
-HYPRE_Int hypre_F90_NAME_LAPACK(dgels, DGELS)(char *, HYPRE_Int *, HYPRE_Int *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *,
-  HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *);
-#ifdef __cplusplus
-}
-#endif
-
 #endif
 
 #if 0 /* no longer need this since using 'memset' now */
@@ -371,7 +356,7 @@
 
     hypre_MPI_Request_free(request);
 
-    hypre_MPI_Isend(valbuf, valbufp-valbuf, hypre_MPI_DOUBLE, dest, ROW_REPV_TAG,
+    hypre_MPI_Isend(valbuf, valbufp-valbuf, hypre_MPI_REAL, dest, ROW_REPV_TAG,
         comm, request);
 }
 
@@ -400,7 +385,7 @@
     ind = StoredRowsAllocInd(stored_rows, count);
     hypre_MPI_Recv(ind, count, HYPRE_MPI_INT, source, ROW_REPI_TAG, comm, &status);
     val = StoredRowsAllocVal(stored_rows, count);
-    hypre_MPI_Recv(val, count, hypre_MPI_DOUBLE, source, ROW_REPV_TAG, comm, &status);
+    hypre_MPI_Recv(val, count, hypre_MPI_REAL, source, ROW_REPV_TAG, comm, &status);
 
     /* Parse the message */
     num_rows = *ind++; /* number of rows */
@@ -1149,7 +1134,7 @@
         dpps(ahat, len, val, 1);
 #else
         /* Solve local linear system - factor phase */
-        hypre_F90_NAME_LAPACK(dpotrf, DPOTRF)(&uplo, &len, ahat, &len, &info);
+        hypre_dpotrf(&uplo, &len, ahat, &len, &info);
         if (info != 0)
         {
 #if 0
@@ -1163,8 +1148,7 @@
         }
 
         /* Solve local linear system - solve phase */
-        hypre_F90_NAME_LAPACK(dpotrs, DPOTRS)(&uplo, &len, &one, ahat, &len, val, &len, 
-          &info);
+        hypre_dpotrs(&uplo, &len, &one, ahat, &len, val, &len, &info);
         if (info != 0)
         {
 #if 0
@@ -1339,7 +1323,7 @@
             &info, work, work_size);
 #else
         /* rhs in bhat, and put solution in bhat */
-        hypre_F90_NAME_LAPACK(dgels, DGELS)(&trans, &npat, &len, &one, ahat, &npat,
+        hypre_dgels(&trans, &npat, &len, &one, ahat, &npat,
             bhat, &npat, work, &work_size, &info);
 
         if (info != 0)
@@ -1430,7 +1414,7 @@
     }
 
     /* Find the average across all processors */
-    hypre_MPI_Allreduce(&localsum, &sum, 1, hypre_MPI_DOUBLE, hypre_MPI_SUM, comm);
+    hypre_MPI_Allreduce(&localsum, &sum, 1, hypre_MPI_REAL, hypre_MPI_SUM, comm);
     hypre_MPI_Comm_size(comm, &npes);
 
     free(buffer);
@@ -1486,7 +1470,7 @@
     }
 
     /* Find the average across all processors */
-    hypre_MPI_Allreduce(&localsum, &sum, 1, hypre_MPI_DOUBLE, hypre_MPI_SUM, comm);
+    hypre_MPI_Allreduce(&localsum, &sum, 1, hypre_MPI_REAL, hypre_MPI_SUM, comm);
     hypre_MPI_Comm_size(comm, &npes);
 
     free(buffer);
@@ -1980,9 +1964,9 @@
     }
 
     hypre_MPI_Allreduce(&ps->setup_pattern_time, &max_pattern_time, 
-	1, hypre_MPI_DOUBLE, hypre_MPI_MAX, comm);
-    hypre_MPI_Allreduce(&ps->cost, &max_cost, 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, comm);
-    hypre_MPI_Allreduce(&ps->cost, &ave_cost, 1, hypre_MPI_DOUBLE, hypre_MPI_SUM, comm);
+	1, hypre_MPI_REAL, hypre_MPI_MAX, comm);
+    hypre_MPI_Allreduce(&ps->cost, &max_cost, 1, hypre_MPI_REAL, hypre_MPI_MAX, comm);
+    hypre_MPI_Allreduce(&ps->cost, &ave_cost, 1, hypre_MPI_REAL, hypre_MPI_SUM, comm);
     ave_cost = ave_cost / (HYPRE_Real) npes;
 
     if (mype)
@@ -2029,13 +2013,13 @@
     }
 
     hypre_MPI_Allreduce(&ps->setup_values_time, &max_values_time, 
-	1, hypre_MPI_DOUBLE, hypre_MPI_MAX, comm);
+	1, hypre_MPI_REAL, hypre_MPI_MAX, comm);
 
     if (!mype)
         setup_times = (HYPRE_Real *) malloc(npes * sizeof(HYPRE_Real));
 
     temp = ps->setup_pattern_time + ps->setup_values_time;
-    hypre_MPI_Gather(&temp, 1, hypre_MPI_DOUBLE, setup_times, 1, hypre_MPI_DOUBLE, 0, comm);
+    hypre_MPI_Gather(&temp, 1, hypre_MPI_REAL, setup_times, 1, hypre_MPI_REAL, 0, comm);
 
     if (mype)
         return;
diff -Nru hypre-2.11.2/src/distributed_ls/pilut/comm.c hypre-2.13.0/src/distributed_ls/pilut/comm.c
--- hypre-2.11.2/src/distributed_ls/pilut/comm.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/distributed_ls/pilut/comm.c	2017-10-20 17:42:22.000000000 +0000
@@ -77,7 +77,7 @@
 HYPRE_Real hypre_GlobalSEMaxDouble(HYPRE_Real value, MPI_Comm hypre_MPI_Context)
 {
   HYPRE_Real max;
-  hypre_MPI_Allreduce( &value, &max, 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, hypre_MPI_Context );
+  hypre_MPI_Allreduce( &value, &max, 1, hypre_MPI_REAL, hypre_MPI_MAX, hypre_MPI_Context );
 
   return max;
 }
@@ -88,7 +88,7 @@
 HYPRE_Real hypre_GlobalSEMinDouble(HYPRE_Real value, MPI_Comm hypre_MPI_Context)
 {
   HYPRE_Real min;
-  hypre_MPI_Allreduce( &value, &min, 1, hypre_MPI_DOUBLE, hypre_MPI_MIN, hypre_MPI_Context );
+  hypre_MPI_Allreduce( &value, &min, 1, hypre_MPI_REAL, hypre_MPI_MIN, hypre_MPI_Context );
 
   return min;
 }
@@ -99,7 +99,7 @@
 HYPRE_Real hypre_GlobalSESumDouble(HYPRE_Real value, MPI_Comm hypre_MPI_Context)
 {
   HYPRE_Real sum;
-  hypre_MPI_Allreduce( &value, &sum, 1, hypre_MPI_DOUBLE, hypre_MPI_SUM, hypre_MPI_Context );
+  hypre_MPI_Allreduce( &value, &sum, 1, hypre_MPI_REAL, hypre_MPI_SUM, hypre_MPI_Context );
 
   return sum;
 }
diff -Nru hypre-2.11.2/src/distributed_ls/pilut/macros.h hypre-2.13.0/src/distributed_ls/pilut/macros.h
--- hypre-2.11.2/src/distributed_ls/pilut/macros.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/distributed_ls/pilut/macros.h	2017-10-20 17:42:22.000000000 +0000
@@ -83,13 +83,13 @@
 #else
 #ifdef MACHINE_IS_SOLARIS
 #ifdef USE_SHORT
-#define SNRM2 hypre_F90_NAME_BLAS(snrm2, SNRM2)
-#define SDOT hypre_F90_NAME_BLAS(sdot, SDOT)
-#define SCOPY hypre_F90_NAME_BLAS(scopy, SCOPY)
+#define SNRM2 hypre_snrm2
+#define SDOT hypre_sdot
+#define SCOPY hypre_scopy
 #else
-#define SNRM2 hypre_F90_NAME_BLAS(dnrm2, DNRM2)
-#define SDOT hypre_F90_NAME_BLAS(ddot, DDOT)
-#define SCOPY hypre_F90_NAME_BLAS(dcopy, DCOPY)
+#define SNRM2 hypre_dnrm2
+#define SDOT hypre_ddot
+#define SCOPY hypre_dcopy
 #endif
 #else
 #ifdef USE_SHORT
diff -Nru hypre-2.11.2/src/distributed_ls/pilut/parilut.c hypre-2.13.0/src/distributed_ls/pilut/parilut.c
--- hypre-2.11.2/src/distributed_ls/pilut/parilut.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/distributed_ls/pilut/parilut.c	2017-10-20 17:42:22.000000000 +0000
@@ -503,7 +503,7 @@
     hypre_MPI_Irecv( incolind+j, cnt, HYPRE_MPI_INT,
 	      penum, TAG_Send_colind, pilut_comm, &index_requests[i] );
 
-    hypre_MPI_Irecv( invalues+j, cnt, hypre_MPI_DOUBLE,
+    hypre_MPI_Irecv( invalues+j, cnt, hypre_MPI_REAL,
 	      penum, TAG_Send_values, pilut_comm, &value_requests[i] );
 
     j += cnt;
@@ -550,7 +550,7 @@
 
   /* send values to each neighbor */
   for (i=0; i<snnbr; i++) {
-    hypre_MPI_Send( dgatherbuf, l, hypre_MPI_DOUBLE,
+    hypre_MPI_Send( dgatherbuf, l, hypre_MPI_REAL,
 	      snbrind[i], TAG_Send_values, pilut_comm );
   }
 
diff -Nru hypre-2.11.2/src/distributed_ls/pilut/trifactor.c hypre-2.13.0/src/distributed_ls/pilut/trifactor.c
--- hypre-2.11.2/src/distributed_ls/pilut/trifactor.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/distributed_ls/pilut/trifactor.c	2017-10-20 17:42:22.000000000 +0000
@@ -126,7 +126,7 @@
     /* Recv the required lx elements from the appropriate processors */
     for (i=0; i<rnbrpes; i++) {
       if ( rnum[i] > 0 ) { /* Something to recv */
-	hypre_MPI_Irecv( raddr[i]+rdone[i], rnum[i], hypre_MPI_DOUBLE,
+	hypre_MPI_Irecv( raddr[i]+rdone[i], rnum[i], hypre_MPI_REAL,
 		  rpes[i], TAG, pilut_comm, &receive_requests[i] );
 
 	rdone[i] += rnum[i] ;
@@ -139,7 +139,7 @@
         for (j=auxsptr[i], l=0;   j<sptr[i+1] && sindex[j]<nnodes[ii];   j++, l++) 
           gatherbuf[l] = lx[sindex[j]];
 
-	hypre_MPI_Send( gatherbuf, l, hypre_MPI_DOUBLE,
+	hypre_MPI_Send( gatherbuf, l, hypre_MPI_REAL,
 		  spes[i], TAG, pilut_comm );
 
         auxsptr[i] = j;
@@ -212,7 +212,7 @@
     /* Recv the required ux elements from the appropriate processors */
     for (i=0; i<rnbrpes; i++) {
       if ( rnum[i] > 0 ) { /* Something to recv */
-	hypre_MPI_Irecv( raddr[i]+rdone[i], rnum[i], hypre_MPI_DOUBLE,
+	hypre_MPI_Irecv( raddr[i]+rdone[i], rnum[i], hypre_MPI_REAL,
 		  rpes[i], TAG, pilut_comm, &receive_requests[ i ] );
 
 	rdone[i] += rnum[i] ;
@@ -225,7 +225,7 @@
         for (j=auxsptr[i], l=0;   j<sptr[i+1] && sindex[j]>=nnodes[ii-1];   j++, l++) 
           gatherbuf[l] = ux[sindex[j]];
 
-	hypre_MPI_Send( gatherbuf, l, hypre_MPI_DOUBLE,
+	hypre_MPI_Send( gatherbuf, l, hypre_MPI_REAL,
 		  spes[i], TAG, pilut_comm );
 
         auxsptr[i] = j;
diff -Nru hypre-2.11.2/src/docs/usr_solvers.tex hypre-2.13.0/src/docs/usr_solvers.tex
--- hypre-2.11.2/src/docs/usr_solvers.tex	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/docs/usr_solvers.tex	2017-10-20 17:42:22.000000000 +0000
@@ -36,6 +36,7 @@
 AMS        &   & X & X & X \\
 ADS        &   & X & X & X \\
 MLI        &   & X & X & X \\
+MGR        &   &   &   & X \\
 ParaSails  &   & X & X & X \\
 Euclid     &   & X & X & X \\
 PILUT      &   & X & X & X \\
@@ -476,10 +477,10 @@
 e.g. block, Schwarz, ILU smoothers, it is necessary to use \code{HYPRE_BoomerAMGSetSmoothType}
 and \code{HYPRE_BoomerAMGSetSmoothNumLevels}. There are further parameter choices for the
 individual smoothers, which are described in the reference manual.
-The default relaxation type is hybrid Gauss-Seidel with CF-relaxation (relax first the C-,
-then the F-points) on the down cycle and FC-relaxation on the up-cycle. Note that if
+The default relaxation type is l1-Gauss-Seidel, using a forward solve
+on the down cycle and a backward solve on the up-cycle, to keep symmetry. Note that if
 BoomerAMG is used as a preconditioner for conjugate gradient, it is necessary to use
-a symmetric smoother such as weighted Jacobi or hybrid symmetric Gauss-Seidel.
+a symmetric smoother. Other symmetric options are weighted Jacobi or hybrid symmetric Gauss-Seidel.
 
 \subsection{AMG for systems of PDEs}
  
@@ -1190,6 +1191,49 @@
 
 %-----------------------------------------------------------------------------
 
+\section{Multigrid Reduction (MGR)}
+                                                                                   
+MGR is a parallel multigrid reduction solver and preconditioner designed to take advantage 
+of use-provided information to solve systems of equations with multiple vatiable types. 
+The algorithm is similar to two-stage preconditioner strategies and other reduction 
+techniques like ARMS, but in a standard multigrid framework.
+
+The MGR algorithm accepts information about the variables in block form from the 
+user and uses it to define the appropriate C/F splitting for the multigrid scheme. 
+The linear system solve proceeds with an F-relaxation solve on the F points, 
+folowed by a coarse grid correction. The coarse grid solve is handled by scalar 
+AMG (BoomerAMG). MGR provides users with more control over the coarsening process, 
+and can potentially be a starting point for designing multigrid-based physics-based preconditioners.
+
+The following represents a minimal set of functions, and some optional functions, to call to 
+use the MGR solver. For simplicity, we ignore the function parameters here, and refer the 
+reader to the reference manual for more details on the parameters and their defaults.
+
+\begin{itemize}
+\item {{\tt HYPRE\_MGRCreate:} Create the MGR solver object.}
+\item {{\tt HYPRE\_MGRSetCpointsByBlock:} Set up block data with information about coarse indexes
+for reduction. Here, the user specifies the number of reduction levels, as well as the the coarse nodes 
+for each level of the reduction. These coarse nodes are indexed by their index in the block of unknowns. 
+This is used internally to tag the appropriate indexes of the linear system matrix as coarse nodes.}
+\item {(Optional) {\tt HYPRE\_MGRSetReservedCoarseNodes:} Prescribe a subset of nodes to be kept as coarse 
+nodes until the coarsest level. These nodes are transferred onto the coarsest grid of the BoomerAMG 
+coarse grid solver.}
+\item {(Optional) {\tt HYPRE\_MGRSetNonCpointsToFpoints:} Set points not prescribed as C points to be fixed 
+as F points for intermediate levels. Setting this to 1 uses the user input to define the C/F splitting. 
+Otherwise, a BoomerAMG coarsening routine is used to determine the C/F splitting for intermediate levels.}
+\item {(Optional) {\tt HYPRE\_MGRSetCoarseSolver:} This function sets the BoomerAMG solver to be used for the 
+solve on the coarse grid. The user can define their own BoomerAMG solver with their preferred options and 
+pass this to the MGR solver. Otherwise, an internal BoomerAMG solver is used as the coarse grid solver 
+instead.}
+\item {{\tt HYPRE\_MGRSetup:} Setup and MGR solver object.}
+\item {{\tt HYPRE\_MGRSolve:} Solve the linear system.}
+\item {{\tt HYPRE\_MGRDestroy:} Destroy the MGR solver object}
+\end{itemize}
+
+For more details about additional solver options and parameters, please refer to the reference manual. 
+NOTE: The MGR solver is currently only supported by the IJ interface.
+%-----------------------------------------------------------------------------
+
 \section{ParaSails}
 
 ParaSails is a parallel implementation of a sparse approximate inverse
diff -Nru hypre-2.11.2/src/examples/ex12f.f hypre-2.13.0/src/examples/ex12f.f
--- hypre-2.11.2/src/examples/ex12f.f	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/examples/ex12f.f	2017-10-20 17:42:22.000000000 +0000
@@ -405,7 +405,7 @@
          call HYPRE_StructPFMGCreate(MPI_COMM_WORLD, precond, ierr)
 c        Set PFMG parameters
          call HYPRE_StructPFMGSetMaxIter(precond, 1, ierr)
-         call HYPRE_StructPFMGSetTol(precond, 0.0, ierr)
+         call HYPRE_StructPFMGSetTol(precond, 0.0d0, ierr)
          call HYPRE_StructPFMGSetZeroGuess(precond, ierr)
          call HYPRE_StructPFMGSetNumPreRelax(precond, 2, ierr)
          call HYPRE_StructPFMGSetNumPostRelax(precond, 2, ierr)
diff -Nru hypre-2.11.2/src/examples/README_files/ex12f.f.html hypre-2.13.0/src/examples/README_files/ex12f.f.html
--- hypre-2.11.2/src/examples/README_files/ex12f.f.html	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/examples/README_files/ex12f.f.html	2017-10-20 17:42:22.000000000 +0000
@@ -412,7 +412,7 @@
 <a name="line405">405</a>          <strong>call</strong> <font color="#2040a0">HYPRE_StructPFMGCreate</font><font color="4444FF">(</font><font color="#2040a0">MPI_COMM_WORLD</font>, <font color="#2040a0">precond</font>, <font color="#2040a0">ierr</font><font color="4444FF">)</font>
 <a name="line406">406</a> <font color="#444444">c        Set PFMG parameters</font>
 <a name="line407">407</a>          <strong>call</strong> <font color="#2040a0">HYPRE_StructPFMGSetMaxIter</font><font color="4444FF">(</font><font color="#2040a0">precond</font>, <font color="#FF0000">1</font>, <font color="#2040a0">ierr</font><font color="4444FF">)</font>
-<a name="line408">408</a>          <strong>call</strong> <font color="#2040a0">HYPRE_StructPFMGSetTol</font><font color="4444FF">(</font><font color="#2040a0">precond</font>, <font color="#FF0000">0.0</font>, <font color="#2040a0">ierr</font><font color="4444FF">)</font>
+<a name="line408">408</a>          <strong>call</strong> <font color="#2040a0">HYPRE_StructPFMGSetTol</font><font color="4444FF">(</font><font color="#2040a0">precond</font>, <font color="#FF0000">0.0d0</font>, <font color="#2040a0">ierr</font><font color="4444FF">)</font>
 <a name="line409">409</a>          <strong>call</strong> <font color="#2040a0">HYPRE_StructPFMGSetZeroGuess</font><font color="4444FF">(</font><font color="#2040a0">precond</font>, <font color="#2040a0">ierr</font><font color="4444FF">)</font>
 <a name="line410">410</a>          <strong>call</strong> <font color="#2040a0">HYPRE_StructPFMGSetNumPreRelax</font><font color="4444FF">(</font><font color="#2040a0">precond</font>, <font color="#FF0000">2</font>, <font color="#2040a0">ierr</font><font color="4444FF">)</font>
 <a name="line411">411</a>          <strong>call</strong> <font color="#2040a0">HYPRE_StructPFMGSetNumPostRelax</font><font color="4444FF">(</font><font color="#2040a0">precond</font>, <font color="#FF0000">2</font>, <font color="#2040a0">ierr</font><font color="4444FF">)</font>
diff -Nru hypre-2.11.2/src/FEI_mv/CMakeLists.txt hypre-2.13.0/src/FEI_mv/CMakeLists.txt
--- hypre-2.11.2/src/FEI_mv/CMakeLists.txt	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/CMakeLists.txt	2017-10-20 17:42:22.000000000 +0000
@@ -5,25 +5,25 @@
 include_directories(fei-hypre)
 include_directories(ml/src/include)
 
-option(HYPRE_USING_SUPERLU "Use internal SuperLU routines" ON)
-option(HYPRE_USING_MLI     "Use MLI" ON)
-
-if(HYPRE_USING_SUPERLU)
-  include_directories(SuperLU/SRC)
-  add_definitions(-DHAVE_SUPERLU)
-  add_subdirectory(SuperLU)
-  set(FEI_LIBS ${FEI_LIBS} $<TARGET_OBJECTS:HYPRE_superlu>)
-endif()
-
-if(HYPRE_USING_MLI)
-  include_directories(femli)
-  add_definitions(-DHAVE_MLI)
-  if(HYPRE_USING_SUPERLU)
-    add_definitions(-DMLI_SUPERLU)
-  endif()
-  add_subdirectory(femli)
-  set(FEI_LIBS ${FEI_LIBS} $<TARGET_OBJECTS:HYPRE_mli>)
-endif()
+# option(HYPRE_USING_SUPERLU "Use internal SuperLU routines" ON)
+# option(HYPRE_USING_MLI     "Use MLI" ON)
+# 
+# if(HYPRE_USING_SUPERLU)
+#   include_directories(SuperLU/SRC)
+#   add_definitions(-DHAVE_SUPERLU)
+#   add_subdirectory(SuperLU)
+#   set(FEI_LIBS ${FEI_LIBS} $<TARGET_OBJECTS:HYPRE_superlu>)
+# endif()
+# 
+# if(HYPRE_USING_MLI)
+#   include_directories(femli)
+#   add_definitions(-DHAVE_MLI)
+#   if(HYPRE_USING_SUPERLU)
+#     add_definitions(-DMLI_SUPERLU)
+#   endif()
+#   add_subdirectory(femli)
+#   set(FEI_LIBS ${FEI_LIBS} $<TARGET_OBJECTS:HYPRE_mli>)
+# endif()
 
 add_subdirectory(fei-hypre)
 set(FEI_LIBS ${FEI_LIBS} $<TARGET_OBJECTS:HYPRE_fei>)
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/caxpy.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/caxpy.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/caxpy.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/caxpy.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,90 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int caxpy_(integer *n, complex *ca, complex *cx, integer *
-	incx, complex *cy, integer *incy)
-{
-
-
-    /* System generated locals */
-    integer i__1, i__2, i__3, i__4;
-    real r__1, r__2;
-    complex q__1, q__2;
-
-    /* Builtin functions */
-    double r_imag(complex *);
-
-    /* Local variables */
-    static integer i, ix, iy;
-
-
-/*     constant times a vector plus a vector.   
-       jack dongarra, linpack, 3/11/78.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define CY(I) cy[(I)-1]
-#define CX(I) cx[(I)-1]
-
-
-    if (*n <= 0) {
-	return 0;
-    }
-    if ((r__1 = ca->r, dabs(r__1)) + (r__2 = r_imag(ca), dabs(r__2)) == 0.f) {
-	return 0;
-    }
-    if (*incx == 1 && *incy == 1) {
-	goto L20;
-    }
-
-/*        code for unequal increments or equal increments   
-            not equal to 1 */
-
-    ix = 1;
-    iy = 1;
-    if (*incx < 0) {
-	ix = (-(*n) + 1) * *incx + 1;
-    }
-    if (*incy < 0) {
-	iy = (-(*n) + 1) * *incy + 1;
-    }
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	i__2 = iy;
-	i__3 = iy;
-	i__4 = ix;
-	q__2.r = ca->r * CX(ix).r - ca->i * CX(ix).i, q__2.i = ca->r * CX(
-		ix).i + ca->i * CX(ix).r;
-	q__1.r = CY(iy).r + q__2.r, q__1.i = CY(iy).i + q__2.i;
-	CY(iy).r = q__1.r, CY(iy).i = q__1.i;
-	ix += *incx;
-	iy += *incy;
-/* L10: */
-    }
-    return 0;
-
-/*        code for both increments equal to 1 */
-
-L20:
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	i__2 = i;
-	i__3 = i;
-	i__4 = i;
-	q__2.r = ca->r * CX(i).r - ca->i * CX(i).i, q__2.i = ca->r * CX(
-		i).i + ca->i * CX(i).r;
-	q__1.r = CY(i).r + q__2.r, q__1.i = CY(i).i + q__2.i;
-	CY(i).r = q__1.r, CY(i).i = q__1.i;
-/* L30: */
-    }
-    return 0;
-} /* caxpy_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ccopy.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ccopy.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ccopy.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ccopy.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,74 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int ccopy_(integer *n, complex *cx, integer *incx, complex *
-	cy, integer *incy)
-{
-
-
-    /* System generated locals */
-    integer i__1, i__2, i__3;
-
-    /* Local variables */
-    static integer i, ix, iy;
-
-
-/*     copies a vector, x, to a vector, y.   
-       jack dongarra, linpack, 3/11/78.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define CY(I) cy[(I)-1]
-#define CX(I) cx[(I)-1]
-
-
-    if (*n <= 0) {
-	return 0;
-    }
-    if (*incx == 1 && *incy == 1) {
-	goto L20;
-    }
-
-/*        code for unequal increments or equal increments   
-            not equal to 1 */
-
-    ix = 1;
-    iy = 1;
-    if (*incx < 0) {
-	ix = (-(*n) + 1) * *incx + 1;
-    }
-    if (*incy < 0) {
-	iy = (-(*n) + 1) * *incy + 1;
-    }
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	i__2 = iy;
-	i__3 = ix;
-	CY(iy).r = CX(ix).r, CY(iy).i = CX(ix).i;
-	ix += *incx;
-	iy += *incy;
-/* L10: */
-    }
-    return 0;
-
-/*        code for both increments equal to 1 */
-
-L20:
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	i__2 = i;
-	i__3 = i;
-	CY(i).r = CX(i).r, CY(i).i = CX(i).i;
-/* L30: */
-    }
-    return 0;
-} /* ccopy_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/cdotc.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/cdotc.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/cdotc.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/cdotc.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,87 +0,0 @@
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Complex */ VOID cdotc_(complex * ret_val, integer *n, complex *cx, integer 
-	*incx, complex *cy, integer *incy)
-{
-    /* System generated locals */
-    integer i__1, i__2;
-    complex q__1, q__2, q__3;
-
-    /* Builtin functions */
-    void r_cnjg(complex *, complex *);
-
-    /* Local variables */
-    static integer i;
-    static complex ctemp;
-    static integer ix, iy;
-
-
-/*     forms the dot product of two vectors, conjugating the first   
-       vector.   
-       jack dongarra, linpack,  3/11/78.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-
-    
-   Parameter adjustments */
-    --cy;
-    --cx;
-
-    /* Function Body */
-    ctemp.r = 0.f, ctemp.i = 0.f;
-     ret_val->r = 0.f,  ret_val->i = 0.f;
-    if (*n <= 0) {
-	return ;
-    }
-    if (*incx == 1 && *incy == 1) {
-	goto L20;
-    }
-
-/*        code for unequal increments or equal increments   
-            not equal to 1 */
-
-    ix = 1;
-    iy = 1;
-    if (*incx < 0) {
-	ix = (-(*n) + 1) * *incx + 1;
-    }
-    if (*incy < 0) {
-	iy = (-(*n) + 1) * *incy + 1;
-    }
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	r_cnjg(&q__3, &cx[ix]);
-	i__2 = iy;
-	q__2.r = q__3.r * cy[iy].r - q__3.i * cy[iy].i, q__2.i = q__3.r * 
-		cy[iy].i + q__3.i * cy[iy].r;
-	q__1.r = ctemp.r + q__2.r, q__1.i = ctemp.i + q__2.i;
-	ctemp.r = q__1.r, ctemp.i = q__1.i;
-	ix += *incx;
-	iy += *incy;
-/* L10: */
-    }
-     ret_val->r = ctemp.r,  ret_val->i = ctemp.i;
-    return ;
-
-/*        code for both increments equal to 1 */
-
-L20:
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	r_cnjg(&q__3, &cx[i]);
-	i__2 = i;
-	q__2.r = q__3.r * cy[i].r - q__3.i * cy[i].i, q__2.i = q__3.r * 
-		cy[i].i + q__3.i * cy[i].r;
-	q__1.r = ctemp.r + q__2.r, q__1.i = ctemp.i + q__2.i;
-	ctemp.r = q__1.r, ctemp.i = q__1.i;
-/* L30: */
-    }
-     ret_val->r = ctemp.r,  ret_val->i = ctemp.i;
-    return ;
-} /* cdotc_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/cgemv.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/cgemv.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/cgemv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/cgemv.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,399 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int cgemv_(char *trans, integer *m, integer *n, complex *
-	alpha, complex *a, integer *lda, complex *x, integer *incx, complex *
-	beta, complex *y, integer *incy)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
-    complex q__1, q__2, q__3;
-
-    /* Builtin functions */
-    void r_cnjg(complex *, complex *);
-
-    /* Local variables */
-    static integer info;
-    static complex temp;
-    static integer lenx, leny, i, j;
-    extern logical lsame_(char *, char *);
-    static integer ix, iy, jx, jy, kx, ky;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-    static logical noconj;
-
-
-/*  Purpose   
-    =======   
-
-    CGEMV  performs one of the matrix-vector operations   
-
-       y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   or   
-
-       y := alpha*conjg( A' )*x + beta*y,   
-
-    where alpha and beta are scalars, x and y are vectors and A is an   
-    m by n matrix.   
-
-    Parameters   
-    ==========   
-
-    TRANS  - CHARACTER*1.   
-             On entry, TRANS specifies the operation to be performed as   
-             follows:   
-
-                TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.   
-
-                TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.   
-
-                TRANS = 'C' or 'c'   y := alpha*conjg( A' )*x + beta*y.   
-
-             Unchanged on exit.   
-
-    M      - INTEGER.   
-             On entry, M specifies the number of rows of the matrix A.   
-             M must be at least zero.   
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry, N specifies the number of columns of the matrix A. 
-  
-             N must be at least zero.   
-             Unchanged on exit.   
-
-    ALPHA  - COMPLEX         .   
-             On entry, ALPHA specifies the scalar alpha.   
-             Unchanged on exit.   
-
-    A      - COMPLEX          array of DIMENSION ( LDA, n ).   
-             Before entry, the leading m by n part of the array A must   
-             contain the matrix of coefficients.   
-             Unchanged on exit.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program. LDA must be at least   
-             max( 1, m ).   
-             Unchanged on exit.   
-
-    X      - COMPLEX          array of DIMENSION at least   
-             ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'   
-             and at least   
-             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.   
-             Before entry, the incremented array X must contain the   
-             vector x.   
-             Unchanged on exit.   
-
-    INCX   - INTEGER.   
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-             Unchanged on exit.   
-
-    BETA   - COMPLEX         .   
-             On entry, BETA specifies the scalar beta. When BETA is   
-             supplied as zero then Y need not be set on input.   
-             Unchanged on exit.   
-
-    Y      - COMPLEX          array of DIMENSION at least   
-             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'   
-             and at least   
-             ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.   
-             Before entry with BETA non-zero, the incremented array Y   
-             must contain the vector y. On exit, Y is overwritten by the 
-  
-             updated vector y.   
-
-    INCY   - INTEGER.   
-             On entry, INCY specifies the increment for the elements of   
-             Y. INCY must not be zero.   
-             Unchanged on exit.   
-
-
-    Level 2 Blas routine.   
-
-    -- Written on 22-October-1986.   
-       Jack Dongarra, Argonne National Lab.   
-       Jeremy Du Croz, Nag Central Office.   
-       Sven Hammarling, Nag Central Office.   
-       Richard Hanson, Sandia National Labs.   
-
-
-
-       Test the input parameters.   
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-#define Y(I) y[(I)-1]
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-
-    info = 0;
-    if (! lsame_(trans, "N") && ! lsame_(trans, "T") && ! 
-	    lsame_(trans, "C")) {
-	info = 1;
-    } else if (*m < 0) {
-	info = 2;
-    } else if (*n < 0) {
-	info = 3;
-    } else if (*lda < max(1,*m)) {
-	info = 6;
-    } else if (*incx == 0) {
-	info = 8;
-    } else if (*incy == 0) {
-	info = 11;
-    }
-    if (info != 0) {
-	xerbla_("CGEMV ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*m == 0 || *n == 0 || alpha->r == 0.f && alpha->i == 0.f && (beta->r 
-	    == 1.f && beta->i == 0.f)) {
-	return 0;
-    }
-
-    noconj = lsame_(trans, "T");
-
-/*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set 
-  
-       up the start points in  X  and  Y. */
-
-    if (lsame_(trans, "N")) {
-	lenx = *n;
-	leny = *m;
-    } else {
-	lenx = *m;
-	leny = *n;
-    }
-    if (*incx > 0) {
-	kx = 1;
-    } else {
-	kx = 1 - (lenx - 1) * *incx;
-    }
-    if (*incy > 0) {
-	ky = 1;
-    } else {
-	ky = 1 - (leny - 1) * *incy;
-    }
-
-/*     Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through A.   
-
-       First form  y := beta*y. */
-
-    if (beta->r != 1.f || beta->i != 0.f) {
-	if (*incy == 1) {
-	    if (beta->r == 0.f && beta->i == 0.f) {
-		i__1 = leny;
-		for (i = 1; i <= leny; ++i) {
-		    i__2 = i;
-		    Y(i).r = 0.f, Y(i).i = 0.f;
-/* L10: */
-		}
-	    } else {
-		i__1 = leny;
-		for (i = 1; i <= leny; ++i) {
-		    i__2 = i;
-		    i__3 = i;
-		    q__1.r = beta->r * Y(i).r - beta->i * Y(i).i, 
-			    q__1.i = beta->r * Y(i).i + beta->i * Y(i)
-			    .r;
-		    Y(i).r = q__1.r, Y(i).i = q__1.i;
-/* L20: */
-		}
-	    }
-	} else {
-	    iy = ky;
-	    if (beta->r == 0.f && beta->i == 0.f) {
-		i__1 = leny;
-		for (i = 1; i <= leny; ++i) {
-		    i__2 = iy;
-		    Y(iy).r = 0.f, Y(iy).i = 0.f;
-		    iy += *incy;
-/* L30: */
-		}
-	    } else {
-		i__1 = leny;
-		for (i = 1; i <= leny; ++i) {
-		    i__2 = iy;
-		    i__3 = iy;
-		    q__1.r = beta->r * Y(iy).r - beta->i * Y(iy).i, 
-			    q__1.i = beta->r * Y(iy).i + beta->i * Y(iy)
-			    .r;
-		    Y(iy).r = q__1.r, Y(iy).i = q__1.i;
-		    iy += *incy;
-/* L40: */
-		}
-	    }
-	}
-    }
-    if (alpha->r == 0.f && alpha->i == 0.f) {
-	return 0;
-    }
-    if (lsame_(trans, "N")) {
-
-/*        Form  y := alpha*A*x + y. */
-
-	jx = kx;
-	if (*incy == 1) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = jx;
-		if (X(jx).r != 0.f || X(jx).i != 0.f) {
-		    i__2 = jx;
-		    q__1.r = alpha->r * X(jx).r - alpha->i * X(jx).i, 
-			    q__1.i = alpha->r * X(jx).i + alpha->i * X(jx)
-			    .r;
-		    temp.r = q__1.r, temp.i = q__1.i;
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			i__3 = i;
-			i__4 = i;
-			i__5 = i + j * a_dim1;
-			q__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, 
-				q__2.i = temp.r * A(i,j).i + temp.i * A(i,j)
-				.r;
-			q__1.r = Y(i).r + q__2.r, q__1.i = Y(i).i + 
-				q__2.i;
-			Y(i).r = q__1.r, Y(i).i = q__1.i;
-/* L50: */
-		    }
-		}
-		jx += *incx;
-/* L60: */
-	    }
-	} else {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = jx;
-		if (X(jx).r != 0.f || X(jx).i != 0.f) {
-		    i__2 = jx;
-		    q__1.r = alpha->r * X(jx).r - alpha->i * X(jx).i, 
-			    q__1.i = alpha->r * X(jx).i + alpha->i * X(jx)
-			    .r;
-		    temp.r = q__1.r, temp.i = q__1.i;
-		    iy = ky;
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			i__3 = iy;
-			i__4 = iy;
-			i__5 = i + j * a_dim1;
-			q__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, 
-				q__2.i = temp.r * A(i,j).i + temp.i * A(i,j)
-				.r;
-			q__1.r = Y(iy).r + q__2.r, q__1.i = Y(iy).i + 
-				q__2.i;
-			Y(iy).r = q__1.r, Y(iy).i = q__1.i;
-			iy += *incy;
-/* L70: */
-		    }
-		}
-		jx += *incx;
-/* L80: */
-	    }
-	}
-    } else {
-
-/*        Form  y := alpha*A'*x + y  or  y := alpha*conjg( A' )*x + y.
- */
-
-	jy = ky;
-	if (*incx == 1) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		temp.r = 0.f, temp.i = 0.f;
-		if (noconj) {
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			i__3 = i + j * a_dim1;
-			i__4 = i;
-			q__2.r = A(i,j).r * X(i).r - A(i,j).i * X(i)
-				.i, q__2.i = A(i,j).r * X(i).i + A(i,j)
-				.i * X(i).r;
-			q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
-			temp.r = q__1.r, temp.i = q__1.i;
-/* L90: */
-		    }
-		} else {
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			r_cnjg(&q__3, &A(i,j));
-			i__3 = i;
-			q__2.r = q__3.r * X(i).r - q__3.i * X(i).i, 
-				q__2.i = q__3.r * X(i).i + q__3.i * X(i)
-				.r;
-			q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
-			temp.r = q__1.r, temp.i = q__1.i;
-/* L100: */
-		    }
-		}
-		i__2 = jy;
-		i__3 = jy;
-		q__2.r = alpha->r * temp.r - alpha->i * temp.i, q__2.i = 
-			alpha->r * temp.i + alpha->i * temp.r;
-		q__1.r = Y(jy).r + q__2.r, q__1.i = Y(jy).i + q__2.i;
-		Y(jy).r = q__1.r, Y(jy).i = q__1.i;
-		jy += *incy;
-/* L110: */
-	    }
-	} else {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		temp.r = 0.f, temp.i = 0.f;
-		ix = kx;
-		if (noconj) {
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			i__3 = i + j * a_dim1;
-			i__4 = ix;
-			q__2.r = A(i,j).r * X(ix).r - A(i,j).i * X(ix)
-				.i, q__2.i = A(i,j).r * X(ix).i + A(i,j)
-				.i * X(ix).r;
-			q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
-			temp.r = q__1.r, temp.i = q__1.i;
-			ix += *incx;
-/* L120: */
-		    }
-		} else {
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			r_cnjg(&q__3, &A(i,j));
-			i__3 = ix;
-			q__2.r = q__3.r * X(ix).r - q__3.i * X(ix).i, 
-				q__2.i = q__3.r * X(ix).i + q__3.i * X(ix)
-				.r;
-			q__1.r = temp.r + q__2.r, q__1.i = temp.i + q__2.i;
-			temp.r = q__1.r, temp.i = q__1.i;
-			ix += *incx;
-/* L130: */
-		    }
-		}
-		i__2 = jy;
-		i__3 = jy;
-		q__2.r = alpha->r * temp.r - alpha->i * temp.i, q__2.i = 
-			alpha->r * temp.i + alpha->i * temp.r;
-		q__1.r = Y(jy).r + q__2.r, q__1.i = Y(jy).i + q__2.i;
-		Y(jy).r = q__1.r, Y(jy).i = q__1.i;
-		jy += *incy;
-/* L140: */
-	    }
-	}
-    }
-
-    return 0;
-
-/*     End of CGEMV . */
-
-} /* cgemv_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/cgerc.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/cgerc.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/cgerc.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/cgerc.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,205 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int cgerc_(integer *m, integer *n, complex *alpha, complex *
-	x, integer *incx, complex *y, integer *incy, complex *a, integer *lda)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
-    complex q__1, q__2;
-
-    /* Builtin functions */
-    void r_cnjg(complex *, complex *);
-
-    /* Local variables */
-    static integer info;
-    static complex temp;
-    static integer i, j, ix, jy, kx;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-
-
-/*  Purpose   
-    =======   
-
-    CGERC  performs the rank 1 operation   
-
-       A := alpha*x*conjg( y' ) + A,   
-
-    where alpha is a scalar, x is an m element vector, y is an n element 
-  
-    vector and A is an m by n matrix.   
-
-    Parameters   
-    ==========   
-
-    M      - INTEGER.   
-             On entry, M specifies the number of rows of the matrix A.   
-             M must be at least zero.   
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry, N specifies the number of columns of the matrix A. 
-  
-             N must be at least zero.   
-             Unchanged on exit.   
-
-    ALPHA  - COMPLEX         .   
-             On entry, ALPHA specifies the scalar alpha.   
-             Unchanged on exit.   
-
-    X      - COMPLEX          array of dimension at least   
-             ( 1 + ( m - 1 )*abs( INCX ) ).   
-             Before entry, the incremented array X must contain the m   
-             element vector x.   
-             Unchanged on exit.   
-
-    INCX   - INTEGER.   
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-             Unchanged on exit.   
-
-    Y      - COMPLEX          array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCY ) ).   
-             Before entry, the incremented array Y must contain the n   
-             element vector y.   
-             Unchanged on exit.   
-
-    INCY   - INTEGER.   
-             On entry, INCY specifies the increment for the elements of   
-             Y. INCY must not be zero.   
-             Unchanged on exit.   
-
-    A      - COMPLEX          array of DIMENSION ( LDA, n ).   
-             Before entry, the leading m by n part of the array A must   
-             contain the matrix of coefficients. On exit, A is   
-             overwritten by the updated matrix.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program. LDA must be at least   
-             max( 1, m ).   
-             Unchanged on exit.   
-
-
-    Level 2 Blas routine.   
-
-    -- Written on 22-October-1986.   
-       Jack Dongarra, Argonne National Lab.   
-       Jeremy Du Croz, Nag Central Office.   
-       Sven Hammarling, Nag Central Office.   
-       Richard Hanson, Sandia National Labs.   
-
-
-
-       Test the input parameters.   
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-#define Y(I) y[(I)-1]
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-
-    info = 0;
-    if (*m < 0) {
-	info = 1;
-    } else if (*n < 0) {
-	info = 2;
-    } else if (*incx == 0) {
-	info = 5;
-    } else if (*incy == 0) {
-	info = 7;
-    } else if (*lda < max(1,*m)) {
-	info = 9;
-    }
-    if (info != 0) {
-	xerbla_("CGERC ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*m == 0 || *n == 0 || alpha->r == 0.f && alpha->i == 0.f) {
-	return 0;
-    }
-
-/*     Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through A. */
-
-    if (*incy > 0) {
-	jy = 1;
-    } else {
-	jy = 1 - (*n - 1) * *incy;
-    }
-    if (*incx == 1) {
-	i__1 = *n;
-	for (j = 1; j <= *n; ++j) {
-	    i__2 = jy;
-	    if (Y(jy).r != 0.f || Y(jy).i != 0.f) {
-		r_cnjg(&q__2, &Y(jy));
-		q__1.r = alpha->r * q__2.r - alpha->i * q__2.i, q__1.i = 
-			alpha->r * q__2.i + alpha->i * q__2.r;
-		temp.r = q__1.r, temp.i = q__1.i;
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    i__3 = i + j * a_dim1;
-		    i__4 = i + j * a_dim1;
-		    i__5 = i;
-		    q__2.r = X(i).r * temp.r - X(i).i * temp.i, q__2.i =
-			     X(i).r * temp.i + X(i).i * temp.r;
-		    q__1.r = A(i,j).r + q__2.r, q__1.i = A(i,j).i + q__2.i;
-		    A(i,j).r = q__1.r, A(i,j).i = q__1.i;
-/* L10: */
-		}
-	    }
-	    jy += *incy;
-/* L20: */
-	}
-    } else {
-	if (*incx > 0) {
-	    kx = 1;
-	} else {
-	    kx = 1 - (*m - 1) * *incx;
-	}
-	i__1 = *n;
-	for (j = 1; j <= *n; ++j) {
-	    i__2 = jy;
-	    if (Y(jy).r != 0.f || Y(jy).i != 0.f) {
-		r_cnjg(&q__2, &Y(jy));
-		q__1.r = alpha->r * q__2.r - alpha->i * q__2.i, q__1.i = 
-			alpha->r * q__2.i + alpha->i * q__2.r;
-		temp.r = q__1.r, temp.i = q__1.i;
-		ix = kx;
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    i__3 = i + j * a_dim1;
-		    i__4 = i + j * a_dim1;
-		    i__5 = ix;
-		    q__2.r = X(ix).r * temp.r - X(ix).i * temp.i, q__2.i =
-			     X(ix).r * temp.i + X(ix).i * temp.r;
-		    q__1.r = A(i,j).r + q__2.r, q__1.i = A(i,j).i + q__2.i;
-		    A(i,j).r = q__1.r, A(i,j).i = q__1.i;
-		    ix += *incx;
-/* L30: */
-		}
-	    }
-	    jy += *incy;
-/* L40: */
-	}
-    }
-
-    return 0;
-
-/*     End of CGERC . */
-
-} /* cgerc_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/cgeru.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/cgeru.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/cgeru.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/cgeru.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,202 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int cgeru_(integer *m, integer *n, complex *alpha, complex *
-	x, integer *incx, complex *y, integer *incy, complex *a, integer *lda)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
-    complex q__1, q__2;
-
-    /* Local variables */
-    static integer info;
-    static complex temp;
-    static integer i, j, ix, jy, kx;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-
-
-/*  Purpose   
-    =======   
-
-    CGERU  performs the rank 1 operation   
-
-       A := alpha*x*y' + A,   
-
-    where alpha is a scalar, x is an m element vector, y is an n element 
-  
-    vector and A is an m by n matrix.   
-
-    Parameters   
-    ==========   
-
-    M      - INTEGER.   
-             On entry, M specifies the number of rows of the matrix A.   
-             M must be at least zero.   
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry, N specifies the number of columns of the matrix A. 
-  
-             N must be at least zero.   
-             Unchanged on exit.   
-
-    ALPHA  - COMPLEX         .   
-             On entry, ALPHA specifies the scalar alpha.   
-             Unchanged on exit.   
-
-    X      - COMPLEX          array of dimension at least   
-             ( 1 + ( m - 1 )*abs( INCX ) ).   
-             Before entry, the incremented array X must contain the m   
-             element vector x.   
-             Unchanged on exit.   
-
-    INCX   - INTEGER.   
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-             Unchanged on exit.   
-
-    Y      - COMPLEX          array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCY ) ).   
-             Before entry, the incremented array Y must contain the n   
-             element vector y.   
-             Unchanged on exit.   
-
-    INCY   - INTEGER.   
-             On entry, INCY specifies the increment for the elements of   
-             Y. INCY must not be zero.   
-             Unchanged on exit.   
-
-    A      - COMPLEX          array of DIMENSION ( LDA, n ).   
-             Before entry, the leading m by n part of the array A must   
-             contain the matrix of coefficients. On exit, A is   
-             overwritten by the updated matrix.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program. LDA must be at least   
-             max( 1, m ).   
-             Unchanged on exit.   
-
-
-    Level 2 Blas routine.   
-
-    -- Written on 22-October-1986.   
-       Jack Dongarra, Argonne National Lab.   
-       Jeremy Du Croz, Nag Central Office.   
-       Sven Hammarling, Nag Central Office.   
-       Richard Hanson, Sandia National Labs.   
-
-
-
-       Test the input parameters.   
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-#define Y(I) y[(I)-1]
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-
-    info = 0;
-    if (*m < 0) {
-	info = 1;
-    } else if (*n < 0) {
-	info = 2;
-    } else if (*incx == 0) {
-	info = 5;
-    } else if (*incy == 0) {
-	info = 7;
-    } else if (*lda < max(1,*m)) {
-	info = 9;
-    }
-    if (info != 0) {
-	xerbla_("CGERU ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*m == 0 || *n == 0 || alpha->r == 0.f && alpha->i == 0.f) {
-	return 0;
-    }
-
-/*     Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through A. */
-
-    if (*incy > 0) {
-	jy = 1;
-    } else {
-	jy = 1 - (*n - 1) * *incy;
-    }
-    if (*incx == 1) {
-	i__1 = *n;
-	for (j = 1; j <= *n; ++j) {
-	    i__2 = jy;
-	    if (Y(jy).r != 0.f || Y(jy).i != 0.f) {
-		i__2 = jy;
-		q__1.r = alpha->r * Y(jy).r - alpha->i * Y(jy).i, q__1.i =
-			 alpha->r * Y(jy).i + alpha->i * Y(jy).r;
-		temp.r = q__1.r, temp.i = q__1.i;
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    i__3 = i + j * a_dim1;
-		    i__4 = i + j * a_dim1;
-		    i__5 = i;
-		    q__2.r = X(i).r * temp.r - X(i).i * temp.i, q__2.i =
-			     X(i).r * temp.i + X(i).i * temp.r;
-		    q__1.r = A(i,j).r + q__2.r, q__1.i = A(i,j).i + q__2.i;
-		    A(i,j).r = q__1.r, A(i,j).i = q__1.i;
-/* L10: */
-		}
-	    }
-	    jy += *incy;
-/* L20: */
-	}
-    } else {
-	if (*incx > 0) {
-	    kx = 1;
-	} else {
-	    kx = 1 - (*m - 1) * *incx;
-	}
-	i__1 = *n;
-	for (j = 1; j <= *n; ++j) {
-	    i__2 = jy;
-	    if (Y(jy).r != 0.f || Y(jy).i != 0.f) {
-		i__2 = jy;
-		q__1.r = alpha->r * Y(jy).r - alpha->i * Y(jy).i, q__1.i =
-			 alpha->r * Y(jy).i + alpha->i * Y(jy).r;
-		temp.r = q__1.r, temp.i = q__1.i;
-		ix = kx;
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    i__3 = i + j * a_dim1;
-		    i__4 = i + j * a_dim1;
-		    i__5 = ix;
-		    q__2.r = X(ix).r * temp.r - X(ix).i * temp.i, q__2.i =
-			     X(ix).r * temp.i + X(ix).i * temp.r;
-		    q__1.r = A(i,j).r + q__2.r, q__1.i = A(i,j).i + q__2.i;
-		    A(i,j).r = q__1.r, A(i,j).i = q__1.i;
-		    ix += *incx;
-/* L30: */
-		}
-	    }
-	    jy += *incy;
-/* L40: */
-	}
-    }
-
-    return 0;
-
-/*     End of CGERU . */
-
-} /* cgeru_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/chemv.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/chemv.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/chemv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/chemv.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,421 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int chemv_(char *uplo, integer *n, complex *alpha, complex *
-	a, integer *lda, complex *x, integer *incx, complex *beta, complex *y,
-	 integer *incy)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
-    doublereal d__1;
-    complex q__1, q__2, q__3, q__4;
-
-    /* Builtin functions */
-    void r_cnjg(complex *, complex *);
-
-    /* Local variables */
-    static integer info;
-    static complex temp1, temp2;
-    static integer i, j;
-    extern logical lsame_(char *, char *);
-    static integer ix, iy, jx, jy, kx, ky;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-
-
-/*  Purpose   
-    =======   
-
-    CHEMV  performs the matrix-vector  operation   
-
-       y := alpha*A*x + beta*y,   
-
-    where alpha and beta are scalars, x and y are n element vectors and   
-    A is an n by n hermitian matrix.   
-
-    Parameters   
-    ==========   
-
-    UPLO   - CHARACTER*1.   
-             On entry, UPLO specifies whether the upper or lower   
-             triangular part of the array A is to be referenced as   
-             follows:   
-
-                UPLO = 'U' or 'u'   Only the upper triangular part of A   
-                                    is to be referenced.   
-
-                UPLO = 'L' or 'l'   Only the lower triangular part of A   
-                                    is to be referenced.   
-
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry, N specifies the order of the matrix A.   
-             N must be at least zero.   
-             Unchanged on exit.   
-
-    ALPHA  - COMPLEX         .   
-             On entry, ALPHA specifies the scalar alpha.   
-             Unchanged on exit.   
-
-    A      - COMPLEX          array of DIMENSION ( LDA, n ).   
-             Before entry with  UPLO = 'U' or 'u', the leading n by n   
-             upper triangular part of the array A must contain the upper 
-  
-             triangular part of the hermitian matrix and the strictly   
-             lower triangular part of A is not referenced.   
-             Before entry with UPLO = 'L' or 'l', the leading n by n   
-             lower triangular part of the array A must contain the lower 
-  
-             triangular part of the hermitian matrix and the strictly   
-             upper triangular part of A is not referenced.   
-             Note that the imaginary parts of the diagonal elements need 
-  
-             not be set and are assumed to be zero.   
-             Unchanged on exit.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program. LDA must be at least   
-             max( 1, n ).   
-             Unchanged on exit.   
-
-    X      - COMPLEX          array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCX ) ).   
-             Before entry, the incremented array X must contain the n   
-             element vector x.   
-             Unchanged on exit.   
-
-    INCX   - INTEGER.   
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-             Unchanged on exit.   
-
-    BETA   - COMPLEX         .   
-             On entry, BETA specifies the scalar beta. When BETA is   
-             supplied as zero then Y need not be set on input.   
-             Unchanged on exit.   
-
-    Y      - COMPLEX          array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCY ) ).   
-             Before entry, the incremented array Y must contain the n   
-             element vector y. On exit, Y is overwritten by the updated   
-             vector y.   
-
-    INCY   - INTEGER.   
-             On entry, INCY specifies the increment for the elements of   
-             Y. INCY must not be zero.   
-             Unchanged on exit.   
-
-
-    Level 2 Blas routine.   
-
-    -- Written on 22-October-1986.   
-       Jack Dongarra, Argonne National Lab.   
-       Jeremy Du Croz, Nag Central Office.   
-       Sven Hammarling, Nag Central Office.   
-       Richard Hanson, Sandia National Labs.   
-
-
-
-       Test the input parameters.   
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-#define Y(I) y[(I)-1]
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-
-    info = 0;
-    if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
-	info = 1;
-    } else if (*n < 0) {
-	info = 2;
-    } else if (*lda < max(1,*n)) {
-	info = 5;
-    } else if (*incx == 0) {
-	info = 7;
-    } else if (*incy == 0) {
-	info = 10;
-    }
-    if (info != 0) {
-	xerbla_("CHEMV ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*n == 0 || alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f && 
-	    beta->i == 0.f)) {
-	return 0;
-    }
-
-/*     Set up the start points in  X  and  Y. */
-
-    if (*incx > 0) {
-	kx = 1;
-    } else {
-	kx = 1 - (*n - 1) * *incx;
-    }
-    if (*incy > 0) {
-	ky = 1;
-    } else {
-	ky = 1 - (*n - 1) * *incy;
-    }
-
-/*     Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through the triangular part   
-       of A.   
-
-       First form  y := beta*y. */
-
-    if (beta->r != 1.f || beta->i != 0.f) {
-	if (*incy == 1) {
-	    if (beta->r == 0.f && beta->i == 0.f) {
-		i__1 = *n;
-		for (i = 1; i <= *n; ++i) {
-		    i__2 = i;
-		    Y(i).r = 0.f, Y(i).i = 0.f;
-/* L10: */
-		}
-	    } else {
-		i__1 = *n;
-		for (i = 1; i <= *n; ++i) {
-		    i__2 = i;
-		    i__3 = i;
-		    q__1.r = beta->r * Y(i).r - beta->i * Y(i).i, 
-			    q__1.i = beta->r * Y(i).i + beta->i * Y(i)
-			    .r;
-		    Y(i).r = q__1.r, Y(i).i = q__1.i;
-/* L20: */
-		}
-	    }
-	} else {
-	    iy = ky;
-	    if (beta->r == 0.f && beta->i == 0.f) {
-		i__1 = *n;
-		for (i = 1; i <= *n; ++i) {
-		    i__2 = iy;
-		    Y(iy).r = 0.f, Y(iy).i = 0.f;
-		    iy += *incy;
-/* L30: */
-		}
-	    } else {
-		i__1 = *n;
-		for (i = 1; i <= *n; ++i) {
-		    i__2 = iy;
-		    i__3 = iy;
-		    q__1.r = beta->r * Y(iy).r - beta->i * Y(iy).i, 
-			    q__1.i = beta->r * Y(iy).i + beta->i * Y(iy)
-			    .r;
-		    Y(iy).r = q__1.r, Y(iy).i = q__1.i;
-		    iy += *incy;
-/* L40: */
-		}
-	    }
-	}
-    }
-    if (alpha->r == 0.f && alpha->i == 0.f) {
-	return 0;
-    }
-    if (lsame_(uplo, "U")) {
-
-/*        Form  y  when A is stored in upper triangle. */
-
-	if (*incx == 1 && *incy == 1) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = j;
-		q__1.r = alpha->r * X(j).r - alpha->i * X(j).i, q__1.i =
-			 alpha->r * X(j).i + alpha->i * X(j).r;
-		temp1.r = q__1.r, temp1.i = q__1.i;
-		temp2.r = 0.f, temp2.i = 0.f;
-		i__2 = j - 1;
-		for (i = 1; i <= j-1; ++i) {
-		    i__3 = i;
-		    i__4 = i;
-		    i__5 = i + j * a_dim1;
-		    q__2.r = temp1.r * A(i,j).r - temp1.i * A(i,j).i, 
-			    q__2.i = temp1.r * A(i,j).i + temp1.i * A(i,j)
-			    .r;
-		    q__1.r = Y(i).r + q__2.r, q__1.i = Y(i).i + q__2.i;
-		    Y(i).r = q__1.r, Y(i).i = q__1.i;
-		    r_cnjg(&q__3, &A(i,j));
-		    i__3 = i;
-		    q__2.r = q__3.r * X(i).r - q__3.i * X(i).i, q__2.i =
-			     q__3.r * X(i).i + q__3.i * X(i).r;
-		    q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
-		    temp2.r = q__1.r, temp2.i = q__1.i;
-/* L50: */
-		}
-		i__2 = j;
-		i__3 = j;
-		i__4 = j + j * a_dim1;
-		d__1 = A(j,j).r;
-		q__3.r = d__1 * temp1.r, q__3.i = d__1 * temp1.i;
-		q__2.r = Y(j).r + q__3.r, q__2.i = Y(j).i + q__3.i;
-		q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = 
-			alpha->r * temp2.i + alpha->i * temp2.r;
-		q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
-		Y(j).r = q__1.r, Y(j).i = q__1.i;
-/* L60: */
-	    }
-	} else {
-	    jx = kx;
-	    jy = ky;
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = jx;
-		q__1.r = alpha->r * X(jx).r - alpha->i * X(jx).i, q__1.i =
-			 alpha->r * X(jx).i + alpha->i * X(jx).r;
-		temp1.r = q__1.r, temp1.i = q__1.i;
-		temp2.r = 0.f, temp2.i = 0.f;
-		ix = kx;
-		iy = ky;
-		i__2 = j - 1;
-		for (i = 1; i <= j-1; ++i) {
-		    i__3 = iy;
-		    i__4 = iy;
-		    i__5 = i + j * a_dim1;
-		    q__2.r = temp1.r * A(i,j).r - temp1.i * A(i,j).i, 
-			    q__2.i = temp1.r * A(i,j).i + temp1.i * A(i,j)
-			    .r;
-		    q__1.r = Y(iy).r + q__2.r, q__1.i = Y(iy).i + q__2.i;
-		    Y(iy).r = q__1.r, Y(iy).i = q__1.i;
-		    r_cnjg(&q__3, &A(i,j));
-		    i__3 = ix;
-		    q__2.r = q__3.r * X(ix).r - q__3.i * X(ix).i, q__2.i =
-			     q__3.r * X(ix).i + q__3.i * X(ix).r;
-		    q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
-		    temp2.r = q__1.r, temp2.i = q__1.i;
-		    ix += *incx;
-		    iy += *incy;
-/* L70: */
-		}
-		i__2 = jy;
-		i__3 = jy;
-		i__4 = j + j * a_dim1;
-		d__1 = A(j,j).r;
-		q__3.r = d__1 * temp1.r, q__3.i = d__1 * temp1.i;
-		q__2.r = Y(jy).r + q__3.r, q__2.i = Y(jy).i + q__3.i;
-		q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = 
-			alpha->r * temp2.i + alpha->i * temp2.r;
-		q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
-		Y(jy).r = q__1.r, Y(jy).i = q__1.i;
-		jx += *incx;
-		jy += *incy;
-/* L80: */
-	    }
-	}
-    } else {
-
-/*        Form  y  when A is stored in lower triangle. */
-
-	if (*incx == 1 && *incy == 1) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = j;
-		q__1.r = alpha->r * X(j).r - alpha->i * X(j).i, q__1.i =
-			 alpha->r * X(j).i + alpha->i * X(j).r;
-		temp1.r = q__1.r, temp1.i = q__1.i;
-		temp2.r = 0.f, temp2.i = 0.f;
-		i__2 = j;
-		i__3 = j;
-		i__4 = j + j * a_dim1;
-		d__1 = A(j,j).r;
-		q__2.r = d__1 * temp1.r, q__2.i = d__1 * temp1.i;
-		q__1.r = Y(j).r + q__2.r, q__1.i = Y(j).i + q__2.i;
-		Y(j).r = q__1.r, Y(j).i = q__1.i;
-		i__2 = *n;
-		for (i = j + 1; i <= *n; ++i) {
-		    i__3 = i;
-		    i__4 = i;
-		    i__5 = i + j * a_dim1;
-		    q__2.r = temp1.r * A(i,j).r - temp1.i * A(i,j).i, 
-			    q__2.i = temp1.r * A(i,j).i + temp1.i * A(i,j)
-			    .r;
-		    q__1.r = Y(i).r + q__2.r, q__1.i = Y(i).i + q__2.i;
-		    Y(i).r = q__1.r, Y(i).i = q__1.i;
-		    r_cnjg(&q__3, &A(i,j));
-		    i__3 = i;
-		    q__2.r = q__3.r * X(i).r - q__3.i * X(i).i, q__2.i =
-			     q__3.r * X(i).i + q__3.i * X(i).r;
-		    q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
-		    temp2.r = q__1.r, temp2.i = q__1.i;
-/* L90: */
-		}
-		i__2 = j;
-		i__3 = j;
-		q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = 
-			alpha->r * temp2.i + alpha->i * temp2.r;
-		q__1.r = Y(j).r + q__2.r, q__1.i = Y(j).i + q__2.i;
-		Y(j).r = q__1.r, Y(j).i = q__1.i;
-/* L100: */
-	    }
-	} else {
-	    jx = kx;
-	    jy = ky;
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = jx;
-		q__1.r = alpha->r * X(jx).r - alpha->i * X(jx).i, q__1.i =
-			 alpha->r * X(jx).i + alpha->i * X(jx).r;
-		temp1.r = q__1.r, temp1.i = q__1.i;
-		temp2.r = 0.f, temp2.i = 0.f;
-		i__2 = jy;
-		i__3 = jy;
-		i__4 = j + j * a_dim1;
-		d__1 = A(j,j).r;
-		q__2.r = d__1 * temp1.r, q__2.i = d__1 * temp1.i;
-		q__1.r = Y(jy).r + q__2.r, q__1.i = Y(jy).i + q__2.i;
-		Y(jy).r = q__1.r, Y(jy).i = q__1.i;
-		ix = jx;
-		iy = jy;
-		i__2 = *n;
-		for (i = j + 1; i <= *n; ++i) {
-		    ix += *incx;
-		    iy += *incy;
-		    i__3 = iy;
-		    i__4 = iy;
-		    i__5 = i + j * a_dim1;
-		    q__2.r = temp1.r * A(i,j).r - temp1.i * A(i,j).i, 
-			    q__2.i = temp1.r * A(i,j).i + temp1.i * A(i,j)
-			    .r;
-		    q__1.r = Y(iy).r + q__2.r, q__1.i = Y(iy).i + q__2.i;
-		    Y(iy).r = q__1.r, Y(iy).i = q__1.i;
-		    r_cnjg(&q__3, &A(i,j));
-		    i__3 = ix;
-		    q__2.r = q__3.r * X(ix).r - q__3.i * X(ix).i, q__2.i =
-			     q__3.r * X(ix).i + q__3.i * X(ix).r;
-		    q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
-		    temp2.r = q__1.r, temp2.i = q__1.i;
-/* L110: */
-		}
-		i__2 = jy;
-		i__3 = jy;
-		q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = 
-			alpha->r * temp2.i + alpha->i * temp2.r;
-		q__1.r = Y(jy).r + q__2.r, q__1.i = Y(jy).i + q__2.i;
-		Y(jy).r = q__1.r, Y(jy).i = q__1.i;
-		jx += *incx;
-		jy += *incy;
-/* L120: */
-	    }
-	}
-    }
-
-    return 0;
-
-/*     End of CHEMV . */
-
-} /* chemv_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/cher2.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/cher2.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/cher2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/cher2.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,436 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int cher2_(char *uplo, integer *n, complex *alpha, complex *
-	x, integer *incx, complex *y, integer *incy, complex *a, integer *lda)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6;
-    doublereal d__1;
-    complex q__1, q__2, q__3, q__4;
-
-    /* Builtin functions */
-    void r_cnjg(complex *, complex *);
-
-    /* Local variables */
-    static integer info;
-    static complex temp1, temp2;
-    static integer i, j;
-    extern logical lsame_(char *, char *);
-    static integer ix, iy, jx, jy, kx, ky;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-
-
-/*  Purpose   
-    =======   
-
-    CHER2  performs the hermitian rank 2 operation   
-
-       A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A,   
-
-    where alpha is a scalar, x and y are n element vectors and A is an n 
-  
-    by n hermitian matrix.   
-
-    Parameters   
-    ==========   
-
-    UPLO   - CHARACTER*1.   
-             On entry, UPLO specifies whether the upper or lower   
-             triangular part of the array A is to be referenced as   
-             follows:   
-
-                UPLO = 'U' or 'u'   Only the upper triangular part of A   
-                                    is to be referenced.   
-
-                UPLO = 'L' or 'l'   Only the lower triangular part of A   
-                                    is to be referenced.   
-
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry, N specifies the order of the matrix A.   
-             N must be at least zero.   
-             Unchanged on exit.   
-
-    ALPHA  - COMPLEX         .   
-             On entry, ALPHA specifies the scalar alpha.   
-             Unchanged on exit.   
-
-    X      - COMPLEX          array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCX ) ).   
-             Before entry, the incremented array X must contain the n   
-             element vector x.   
-             Unchanged on exit.   
-
-    INCX   - INTEGER.   
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-             Unchanged on exit.   
-
-    Y      - COMPLEX          array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCY ) ).   
-             Before entry, the incremented array Y must contain the n   
-             element vector y.   
-             Unchanged on exit.   
-
-    INCY   - INTEGER.   
-             On entry, INCY specifies the increment for the elements of   
-             Y. INCY must not be zero.   
-             Unchanged on exit.   
-
-    A      - COMPLEX          array of DIMENSION ( LDA, n ).   
-             Before entry with  UPLO = 'U' or 'u', the leading n by n   
-             upper triangular part of the array A must contain the upper 
-  
-             triangular part of the hermitian matrix and the strictly   
-             lower triangular part of A is not referenced. On exit, the   
-             upper triangular part of the array A is overwritten by the   
-             upper triangular part of the updated matrix.   
-             Before entry with UPLO = 'L' or 'l', the leading n by n   
-             lower triangular part of the array A must contain the lower 
-  
-             triangular part of the hermitian matrix and the strictly   
-             upper triangular part of A is not referenced. On exit, the   
-             lower triangular part of the array A is overwritten by the   
-             lower triangular part of the updated matrix.   
-             Note that the imaginary parts of the diagonal elements need 
-  
-             not be set, they are assumed to be zero, and on exit they   
-             are set to zero.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program. LDA must be at least   
-             max( 1, n ).   
-             Unchanged on exit.   
-
-
-    Level 2 Blas routine.   
-
-    -- Written on 22-October-1986.   
-       Jack Dongarra, Argonne National Lab.   
-       Jeremy Du Croz, Nag Central Office.   
-       Sven Hammarling, Nag Central Office.   
-       Richard Hanson, Sandia National Labs.   
-
-
-
-       Test the input parameters.   
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-#define Y(I) y[(I)-1]
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-
-    info = 0;
-    if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
-	info = 1;
-    } else if (*n < 0) {
-	info = 2;
-    } else if (*incx == 0) {
-	info = 5;
-    } else if (*incy == 0) {
-	info = 7;
-    } else if (*lda < max(1,*n)) {
-	info = 9;
-    }
-    if (info != 0) {
-	xerbla_("CHER2 ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*n == 0 || alpha->r == 0.f && alpha->i == 0.f) {
-	return 0;
-    }
-
-/*     Set up the start points in X and Y if the increments are not both 
-  
-       unity. */
-
-    if (*incx != 1 || *incy != 1) {
-	if (*incx > 0) {
-	    kx = 1;
-	} else {
-	    kx = 1 - (*n - 1) * *incx;
-	}
-	if (*incy > 0) {
-	    ky = 1;
-	} else {
-	    ky = 1 - (*n - 1) * *incy;
-	}
-	jx = kx;
-	jy = ky;
-    }
-
-/*     Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through the triangular part   
-       of A. */
-
-    if (lsame_(uplo, "U")) {
-
-/*        Form  A  when A is stored in the upper triangle. */
-
-	if (*incx == 1 && *incy == 1) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = j;
-		i__3 = j;
-		if (X(j).r != 0.f || X(j).i != 0.f || (Y(j).r != 0.f 
-			|| Y(j).i != 0.f)) {
-		    r_cnjg(&q__2, &Y(j));
-		    q__1.r = alpha->r * q__2.r - alpha->i * q__2.i, q__1.i = 
-			    alpha->r * q__2.i + alpha->i * q__2.r;
-		    temp1.r = q__1.r, temp1.i = q__1.i;
-		    i__2 = j;
-		    q__2.r = alpha->r * X(j).r - alpha->i * X(j).i, 
-			    q__2.i = alpha->r * X(j).i + alpha->i * X(j)
-			    .r;
-		    r_cnjg(&q__1, &q__2);
-		    temp2.r = q__1.r, temp2.i = q__1.i;
-		    i__2 = j - 1;
-		    for (i = 1; i <= j-1; ++i) {
-			i__3 = i + j * a_dim1;
-			i__4 = i + j * a_dim1;
-			i__5 = i;
-			q__3.r = X(i).r * temp1.r - X(i).i * temp1.i, 
-				q__3.i = X(i).r * temp1.i + X(i).i * 
-				temp1.r;
-			q__2.r = A(i,j).r + q__3.r, q__2.i = A(i,j).i + 
-				q__3.i;
-			i__6 = i;
-			q__4.r = Y(i).r * temp2.r - Y(i).i * temp2.i, 
-				q__4.i = Y(i).r * temp2.i + Y(i).i * 
-				temp2.r;
-			q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
-			A(i,j).r = q__1.r, A(i,j).i = q__1.i;
-/* L10: */
-		    }
-		    i__2 = j + j * a_dim1;
-		    i__3 = j + j * a_dim1;
-		    i__4 = j;
-		    q__2.r = X(j).r * temp1.r - X(j).i * temp1.i, 
-			    q__2.i = X(j).r * temp1.i + X(j).i * 
-			    temp1.r;
-		    i__5 = j;
-		    q__3.r = Y(j).r * temp2.r - Y(j).i * temp2.i, 
-			    q__3.i = Y(j).r * temp2.i + Y(j).i * 
-			    temp2.r;
-		    q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
-		    d__1 = A(j,j).r + q__1.r;
-		    A(j,j).r = d__1, A(j,j).i = 0.f;
-		} else {
-		    i__2 = j + j * a_dim1;
-		    i__3 = j + j * a_dim1;
-		    d__1 = A(j,j).r;
-		    A(j,j).r = d__1, A(j,j).i = 0.f;
-		}
-/* L20: */
-	    }
-	} else {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = jx;
-		i__3 = jy;
-		if (X(jx).r != 0.f || X(jx).i != 0.f || (Y(jy).r != 0.f 
-			|| Y(jy).i != 0.f)) {
-		    r_cnjg(&q__2, &Y(jy));
-		    q__1.r = alpha->r * q__2.r - alpha->i * q__2.i, q__1.i = 
-			    alpha->r * q__2.i + alpha->i * q__2.r;
-		    temp1.r = q__1.r, temp1.i = q__1.i;
-		    i__2 = jx;
-		    q__2.r = alpha->r * X(jx).r - alpha->i * X(jx).i, 
-			    q__2.i = alpha->r * X(jx).i + alpha->i * X(jx)
-			    .r;
-		    r_cnjg(&q__1, &q__2);
-		    temp2.r = q__1.r, temp2.i = q__1.i;
-		    ix = kx;
-		    iy = ky;
-		    i__2 = j - 1;
-		    for (i = 1; i <= j-1; ++i) {
-			i__3 = i + j * a_dim1;
-			i__4 = i + j * a_dim1;
-			i__5 = ix;
-			q__3.r = X(ix).r * temp1.r - X(ix).i * temp1.i, 
-				q__3.i = X(ix).r * temp1.i + X(ix).i * 
-				temp1.r;
-			q__2.r = A(i,j).r + q__3.r, q__2.i = A(i,j).i + 
-				q__3.i;
-			i__6 = iy;
-			q__4.r = Y(iy).r * temp2.r - Y(iy).i * temp2.i, 
-				q__4.i = Y(iy).r * temp2.i + Y(iy).i * 
-				temp2.r;
-			q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
-			A(i,j).r = q__1.r, A(i,j).i = q__1.i;
-			ix += *incx;
-			iy += *incy;
-/* L30: */
-		    }
-		    i__2 = j + j * a_dim1;
-		    i__3 = j + j * a_dim1;
-		    i__4 = jx;
-		    q__2.r = X(jx).r * temp1.r - X(jx).i * temp1.i, 
-			    q__2.i = X(jx).r * temp1.i + X(jx).i * 
-			    temp1.r;
-		    i__5 = jy;
-		    q__3.r = Y(jy).r * temp2.r - Y(jy).i * temp2.i, 
-			    q__3.i = Y(jy).r * temp2.i + Y(jy).i * 
-			    temp2.r;
-		    q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
-		    d__1 = A(j,j).r + q__1.r;
-		    A(j,j).r = d__1, A(j,j).i = 0.f;
-		} else {
-		    i__2 = j + j * a_dim1;
-		    i__3 = j + j * a_dim1;
-		    d__1 = A(j,j).r;
-		    A(j,j).r = d__1, A(j,j).i = 0.f;
-		}
-		jx += *incx;
-		jy += *incy;
-/* L40: */
-	    }
-	}
-    } else {
-
-/*        Form  A  when A is stored in the lower triangle. */
-
-	if (*incx == 1 && *incy == 1) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = j;
-		i__3 = j;
-		if (X(j).r != 0.f || X(j).i != 0.f || (Y(j).r != 0.f 
-			|| Y(j).i != 0.f)) {
-		    r_cnjg(&q__2, &Y(j));
-		    q__1.r = alpha->r * q__2.r - alpha->i * q__2.i, q__1.i = 
-			    alpha->r * q__2.i + alpha->i * q__2.r;
-		    temp1.r = q__1.r, temp1.i = q__1.i;
-		    i__2 = j;
-		    q__2.r = alpha->r * X(j).r - alpha->i * X(j).i, 
-			    q__2.i = alpha->r * X(j).i + alpha->i * X(j)
-			    .r;
-		    r_cnjg(&q__1, &q__2);
-		    temp2.r = q__1.r, temp2.i = q__1.i;
-		    i__2 = j + j * a_dim1;
-		    i__3 = j + j * a_dim1;
-		    i__4 = j;
-		    q__2.r = X(j).r * temp1.r - X(j).i * temp1.i, 
-			    q__2.i = X(j).r * temp1.i + X(j).i * 
-			    temp1.r;
-		    i__5 = j;
-		    q__3.r = Y(j).r * temp2.r - Y(j).i * temp2.i, 
-			    q__3.i = Y(j).r * temp2.i + Y(j).i * 
-			    temp2.r;
-		    q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
-		    d__1 = A(j,j).r + q__1.r;
-		    A(j,j).r = d__1, A(j,j).i = 0.f;
-		    i__2 = *n;
-		    for (i = j + 1; i <= *n; ++i) {
-			i__3 = i + j * a_dim1;
-			i__4 = i + j * a_dim1;
-			i__5 = i;
-			q__3.r = X(i).r * temp1.r - X(i).i * temp1.i, 
-				q__3.i = X(i).r * temp1.i + X(i).i * 
-				temp1.r;
-			q__2.r = A(i,j).r + q__3.r, q__2.i = A(i,j).i + 
-				q__3.i;
-			i__6 = i;
-			q__4.r = Y(i).r * temp2.r - Y(i).i * temp2.i, 
-				q__4.i = Y(i).r * temp2.i + Y(i).i * 
-				temp2.r;
-			q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
-			A(i,j).r = q__1.r, A(i,j).i = q__1.i;
-/* L50: */
-		    }
-		} else {
-		    i__2 = j + j * a_dim1;
-		    i__3 = j + j * a_dim1;
-		    d__1 = A(j,j).r;
-		    A(j,j).r = d__1, A(j,j).i = 0.f;
-		}
-/* L60: */
-	    }
-	} else {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = jx;
-		i__3 = jy;
-		if (X(jx).r != 0.f || X(jx).i != 0.f || (Y(jy).r != 0.f 
-			|| Y(jy).i != 0.f)) {
-		    r_cnjg(&q__2, &Y(jy));
-		    q__1.r = alpha->r * q__2.r - alpha->i * q__2.i, q__1.i = 
-			    alpha->r * q__2.i + alpha->i * q__2.r;
-		    temp1.r = q__1.r, temp1.i = q__1.i;
-		    i__2 = jx;
-		    q__2.r = alpha->r * X(jx).r - alpha->i * X(jx).i, 
-			    q__2.i = alpha->r * X(jx).i + alpha->i * X(jx)
-			    .r;
-		    r_cnjg(&q__1, &q__2);
-		    temp2.r = q__1.r, temp2.i = q__1.i;
-		    i__2 = j + j * a_dim1;
-		    i__3 = j + j * a_dim1;
-		    i__4 = jx;
-		    q__2.r = X(jx).r * temp1.r - X(jx).i * temp1.i, 
-			    q__2.i = X(jx).r * temp1.i + X(jx).i * 
-			    temp1.r;
-		    i__5 = jy;
-		    q__3.r = Y(jy).r * temp2.r - Y(jy).i * temp2.i, 
-			    q__3.i = Y(jy).r * temp2.i + Y(jy).i * 
-			    temp2.r;
-		    q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
-		    d__1 = A(j,j).r + q__1.r;
-		    A(j,j).r = d__1, A(j,j).i = 0.f;
-		    ix = jx;
-		    iy = jy;
-		    i__2 = *n;
-		    for (i = j + 1; i <= *n; ++i) {
-			ix += *incx;
-			iy += *incy;
-			i__3 = i + j * a_dim1;
-			i__4 = i + j * a_dim1;
-			i__5 = ix;
-			q__3.r = X(ix).r * temp1.r - X(ix).i * temp1.i, 
-				q__3.i = X(ix).r * temp1.i + X(ix).i * 
-				temp1.r;
-			q__2.r = A(i,j).r + q__3.r, q__2.i = A(i,j).i + 
-				q__3.i;
-			i__6 = iy;
-			q__4.r = Y(iy).r * temp2.r - Y(iy).i * temp2.i, 
-				q__4.i = Y(iy).r * temp2.i + Y(iy).i * 
-				temp2.r;
-			q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
-			A(i,j).r = q__1.r, A(i,j).i = q__1.i;
-/* L70: */
-		    }
-		} else {
-		    i__2 = j + j * a_dim1;
-		    i__3 = j + j * a_dim1;
-		    d__1 = A(j,j).r;
-		    A(j,j).r = d__1, A(j,j).i = 0.f;
-		}
-		jx += *incx;
-		jy += *incy;
-/* L80: */
-	    }
-	}
-    }
-
-    return 0;
-
-/*     End of CHER2 . */
-
-} /* cher2_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/Cnames.h hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/Cnames.h
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/Cnames.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/Cnames.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,346 +0,0 @@
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-
-#ifndef __SUPERLU_CNAMES /* allow multiple inclusions */
-#define __SUPERLU_CNAMES
-
-/*
- * These macros define how C routines will be called.  ADD_ assumes that
- * they will be called by fortran, which expects C routines to have an
- * underscore postfixed to the name (Suns, and the Intel expect this).
- * NOCHANGE indicates that fortran will be calling, and that it expects
- * the name called by fortran to be identical to that compiled by the C
- * (RS6K's do this).  UPCASE says it expects C routines called by fortran
- * to be in all upcase (CRAY wants this). 
- */
-
-#define ADD_       0
-#define NOCHANGE   1
-#define UPCASE     2
-#define C_CALL     3
-
-#ifdef UpCase
-#define F77_CALL_C UPCASE
-#endif
-
-#ifdef NoChange
-#define F77_CALL_C NOCHANGE
-#endif
-
-#ifdef Add_
-#define F77_CALL_C ADD_
-#endif
-
-#ifndef F77_CALL_C
-#define F77_CALL_C ADD_
-#endif
-
-#if (F77_CALL_C == ADD_)
-/*
- * These defines set up the naming scheme required to have a fortran 77
- * routine call a C routine
- * No redefinition necessary to have following Fortran to C interface:
- *           FORTRAN CALL               C DECLARATION
- *           call dgemm(...)           void dgemm_(...)
- *
- * This is the default.
- */
-/* These are the functions defined in F90 wraper */
-#define f_create_gridinfo_handle         f_create_gridinfo_handle_
-#define f_create_options_handle          f_create_options_handle_
-#define f_create_ScalePerm_handle        f_create_scaleperm_handle_
-#define f_create_LUstruct_handle         f_create_lustruct_handle_
-#define f_create_SOLVEstruct_handle      f_create_solvestruct_handle_
-#define f_create_SuperMatrix_handle      f_create_supermatrix_handle_
-#define f_destroy_gridinfo_handle        f_destroy_gridinfo_handle_
-#define f_destroy_options_handle         f_destroy_options_handle_
-#define f_destroy_ScalePerm_handle       f_destroy_scaleperm_handle_
-#define f_destroy_LUstruct_handle        f_destroy_lustruct_handle_
-#define f_destroy_SOLVEstruct_handle     f_destroy_solvestruct_handle_
-#define f_destroy_SuperMatrix_handle     f_destroy_supermatrix_handle_
-#define f_create_SuperLUStat_handle      f_create_superlustat_handle_
-#define f_destroy_SuperLUStat_handle     f_destroy_superlustat_handle_
-#define f_get_gridinfo                   f_get_gridinfo_
-#define f_get_SuperMatrix                f_get_supermatrix_
-#define f_set_SuperMatrix                f_set_supermatrix_
-#define f_get_CompRowLoc_Matrix          f_get_comprowloc_matrix_ 
-#define f_set_CompRowLoc_Matrix          f_set_comprowloc_matrix_
-#define f_get_superlu_options            f_get_superlu_options_
-#define f_set_superlu_options            f_set_superlu_options_
-#define f_set_default_options            f_set_default_options_
-#define f_superlu_gridinit               f_superlu_gridinit_
-#define f_superlu_gridexit               f_superlu_gridexit_
-#define f_ScalePermstructInit            f_scalepermstructinit_
-#define f_ScalePermstructFree            f_scalepermstructfree_
-#define f_PStatInit                      f_pstatinit_
-#define f_PStatFree                      f_pstatfree_
-#define f_LUstructInit                   f_lustructinit_
-#define f_LUstructFree                   f_lustructfree_
-#define f_Destroy_LU                     f_destroy_lu_
-#define f_dCreate_CompRowLoc_Mat_dist    f_dcreate_comprowloc_mat_dist_
-#define f_Destroy_CompRowLoc_Mat_dist    f_destroy_comprowloc_mat_dist_
-#define f_Destroy_SuperMat_Store_dist    f_destroy_supermat_store_dist_
-#define f_dSolveFinalize                 f_dsolvefinalize_
-#define f_pdgssvx                        f_pdgssvx_
-#define f_dcreate_dist_matrix            f_dcreate_dist_matrix_
-#define f_check_malloc                   f_check_malloc_
-#endif
-
-#if (F77_CALL_C == UPCASE)
-/*
- * These defines set up the naming scheme required to have a fortran 77
- * routine call a C routine 
- * following Fortran to C interface:
- *           FORTRAN CALL               C DECLARATION
- *           call dgemm(...)           void DGEMM(...)
- */
-/* BLAS */
-#define sasum_    SASUM
-#define isamax_   ISAMAX
-#define scopy_    SCOPY
-#define sscal_    SSCAL
-#define sger_     SGER
-#define snrm2_    SNRM2
-#define ssymv_    SSYMV
-#define sdot_     SDOT
-#define saxpy_    SAXPY
-#define ssyr2_    SSYR2
-#define srot_     SROT
-#define sgemv_    SGEMV
-#define strsv_    STRSV
-#define sgemm_    SGEMM
-#define strsm_    STRSM
-
-#define dasum_    DASUM
-#define idamax_   IDAMAX
-#define dcopy_    DCOPY
-#define dscal_    DSCAL
-#define dger_     DGER
-#define dnrm2_    DNRM2
-#define dsymv_    DSYMV
-#define ddot_     DDOT
-#define daxpy_    DAXPY
-#define dsyr2_    DSYR2
-#define drot_     DROT
-#define dgemv_    DGEMV
-#define dtrsv_    DTRSV
-#define dgemm_    DGEMM
-#define dtrsm_    DTRSM
-
-#define scasum_   SCASUM
-#define icamax_   ICAMAX
-#define ccopy_    CCOPY
-#define cscal_    CSCAL
-#define scnrm2_   SCNRM2
-#define caxpy_    CAXPY
-#define cgemv_    CGEMV
-#define ctrsv_    CTRSV
-#define cgemm_    CGEMM
-#define ctrsm_    CTRSM
-#define cgerc_    CGERC
-#define chemv_    CHEMV
-#define cher2_    CHER2
-
-#define dzasum_   DZASUM
-#define izamax_   IZAMAX
-#define zcopy_    ZCOPY
-#define zscal_    ZSCAL
-#define dznrm2_   DZNRM2
-#define zaxpy_    ZAXPY
-#define zgemv_    ZGEMV
-#define ztrsv_    ZTRSV
-#define zgemm_    ZGEMM
-#define ztrsm_    ZTRSM
-#define zgerc_    ZGERC
-#define zhemv_    ZHEMV
-#define zher2_    ZHER2
-#define zgeru_    ZGERU
-
-/* LAPACK */
-#define dlamch_         DLAMCH
-#define slamch_         SLAMCH
-#define xerbla_         XERBLA
-#define lsame_          LSAME
-
-#define mc64id_         MC64ID
-#define mc64ad_         MC64AD
-#define c_bridge_dgssv_               C_BRIDGE_DGSSV
-#define c_fortran_slugrid_            C_FORTRAN_SLUGRID
-#define c_fortran_pdgssvx_            C_FORTRAN_PDGSSVX
-#define c_fortran_pdgssvx_ABglobal_   C_FORTRAN_PDGSSVX_ABGLOBAL
-#define c_fortran_pzgssvx_            C_FORTRAN_PZGSSVX
-#define c_fortran_pzgssvx_ABglobal_   C_FORTRAN_PZGSSVX_ABGLOBAL
-
-/* These are the functions defined in F90 wraper */
-#define f_create_gridinfo_handle         F_CREATE_GRIDINFO_HANDLE
-#define f_create_options_handle          F_CREATE_OPTIONS_HANDLE
-#define f_create_ScalePerm_handle        F_CREATE_SCALEPERM_HANDLE
-#define f_create_LUstruct_handle         F_CREATE_LUSTRUCT_HANDLE
-#define f_create_SOLVEstruct_handle      F_CREATE_SOLVESTRUCT_HANDLE
-#define f_create_SuperMatrix_handle      F_CREATE_SUPERMATRIX_HANDLE
-#define f_destroy_gridinfo_handle        F_DESTROY_GRIDINFO_HANDLE
-#define f_destroy_options_handle         F_DESTROY_OPTIONS_HANDLE
-#define f_destroy_ScalePerm_handle       F_DESTROY_SCALEPERM_HANDLE
-#define f_destroy_LUstruct_handle        F_DESTROY_LUSTRUCT_HANDLE
-#define f_destroy_SOLVEstruct_handle     F_DESTROY_SOLVESTRUCT_HANDLE
-#define f_destroy_SuperMatrix_handle     F_DESTROY_SUPERMATRIX_HANDLE
-#define f_create_SuperLUStat_handle      F_CREATE_SUPERLUSTAT_HANDLE
-#define f_destroy_SuperLUStat_handle     F_DESTROY_SUPERLUSTAT_HANDLE
-#define f_get_gridinfo                   F_GET_GRIDINFO
-#define f_get_SuperMatrix                F_GET_SUPERMATRIX
-#define f_set_SuperMatrix                F_SET_SUPERMATRIX
-#define f_get_CompRowLoc_Matrix          F_GET_COMPROWLOC_MATRIX
-#define f_set_CompRowLoc_Matrix          F_SET_COMPROWLOC_MATRIX
-#define f_get_superlu_options            F_GET_SUPERLU_OPTIONS
-#define f_set_superlu_options            F_SET_SUPERLU_OPTIONS
-#define f_set_default_options            F_SET_DEFAULT_OPTIONS
-#define f_superlu_gridinit               F_SUPERLU_GRIDINIT
-#define f_superlu_gridexit               F_SUPERLU_GRIDEXIT
-#define f_ScalePermstructInit            F_SCALEPERMSTRUCTINIT
-#define f_ScalePermstructFree            F_SCALEPERMSTRUCTFREE
-#define f_PStatInit                      F_PSTATINIT
-#define f_PStatFree                      F_PSTATFREE
-#define f_LUstructInit                   F_LUSTRUCTINIT
-#define f_LUstructFree                   F_LUSTRUCTFREE
-#define f_Destroy_LU                     F_DESTROY_LU
-#define f_dCreate_CompRowLoc_Mat_dist    F_DCREATE_COMPROWLOC_MAT_DIST
-#define f_Destroy_CompRowLoc_Mat_dist    F_DESTROY_COMPROWLOC_MAT_DIST
-#define f_Destroy_SuperMat_Store_dist    F_DESTROY_SUPERMAT_STORE_DIST
-#define f_dSolveFinalize                 F_DSOLVEFINALIZE
-#define f_pdgssvx                        F_PDGSSVX
-#define f_dcreate_dist_matrix            F_DCREATE_DIST_MATRIX
-#define f_check_malloc                   F_CHECK_MALLOC
-#endif
-
-#if (F77_CALL_C == NOCHANGE)
-/*
- * These defines set up the naming scheme required to have a fortran 77
- * routine call a C routine 
- * for following Fortran to C interface:
- *           FORTRAN CALL               C DECLARATION
- *           call dgemm(...)           void dgemm(...)
- */
-/* BLAS */
-#define sasum_    sasum
-#define isamax_   isamax
-#define scopy_    scopy
-#define sscal_    sscal
-#define sger_     sger
-#define snrm2_    snrm2
-#define ssymv_    ssymv
-#define sdot_     sdot
-#define saxpy_    saxpy
-#define ssyr2_    ssyr2
-#define srot_     srot
-#define sgemv_    sgemv
-#define strsv_    strsv
-#define sgemm_    sgemm
-#define strsm_    strsm
-
-#define dasum_    dasum
-#define idamax_   idamax
-#define dcopy_    dcopy
-#define dscal_    dscal
-#define dger_     dger
-#define dnrm2_    dnrm2
-#define dsymv_    dsymv
-#define ddot_     ddot
-#define daxpy_    daxpy
-#define dsyr2_    dsyr2
-#define drot_     drot
-#define dgemv_    dgemv
-#define dtrsv_    dtrsv
-#define dgemm_    dgemm
-#define dtrsm_    dtrsm
-
-#define scasum_   scasum
-#define icamax_   icamax
-#define ccopy_    ccopy
-#define cscal_    cscal
-#define scnrm2_   scnrm2
-#define caxpy_    caxpy
-#define cgemv_    cgemv
-#define ctrsv_    ctrsv
-#define cgemm_    cgemm
-#define ctrsm_    ctrsm
-#define cgerc_    cgerc
-#define chemv_    chemv
-#define cher2_    cher2
-
-#define dzasum_   dzasum
-#define izamax_   izamax
-#define zcopy_    zcopy
-#define zscal_    zscal
-#define dznrm2_   dznrm2
-#define zaxpy_    zaxpy
-#define zgemv_    zgemv
-#define ztrsv_    ztrsv
-#define zgemm_    zgemm
-#define ztrsm_    ztrsm
-#define zgerc_    zgerc
-#define zhemv_    zhemv
-#define zher2_    zher2
-#define zgeru_    zgeru
-
-/* LAPACK */
-#define dlamch_         dlamch
-#define slamch_         slamch
-#define xerbla_         xerbla
-#define lsame_          lsame
-
-#define mc64id_         mc64id
-#define mc64ad_         mc64ad
-
-#define c_bridge_dgssv_               c_bridge_dgssv
-#define c_fortran_slugrid_            c_fortran_slugrid
-#define c_fortran_pdgssvx_            c_fortran_pdgssvx
-#define c_fortran_pdgssvx_ABglobal_   c_fortran_pdgssvx_abglobal
-#define c_fortran_pzgssvx_            c_fortran_pzgssvx
-#define c_fortran_pzgssvx_ABglobal_   c_fortran_pzgssvx_abglobal
-
-/* These are the functions defined in F90 wraper */
-#define f_create_gridinfo_handle         f_create_gridinfo_handle
-#define f_create_options_handle          f_create_options_handle
-#define f_create_ScalePerm_handle        f_create_scaleperm_handle
-#define f_create_LUstruct_handle         f_create_lustruct_handle
-#define f_create_SOLVEstruct_handle      f_create_solvestruct_handle
-#define f_create_SuperMatrix_handle      f_create_supermatrix_handle
-#define f_destroy_gridinfo_handle        f_destroy_gridinfo_handle
-#define f_destroy_options_handle         f_destroy_options_handle
-#define f_destroy_ScalePerm_handle       f_destroy_scaleperm_handle
-#define f_destroy_LUstruct_handle        f_destroy_lustruct_handle
-#define f_destroy_SOLVEstruct_handle     f_destroy_solvestruct_handle
-#define f_destroy_SuperMatrix_handle     f_destroy_supermatrix_handle
-#define f_create_SuperLUStat_handle      f_create_superlustat_handle
-#define f_destroy_SuperLUStat_handle     f_destroy_superlustat_handle
-#define f_get_gridinfo                   f_get_gridinfo
-#define f_get_SuperMatrix                f_get_supermatrix
-#define f_set_SuperMatrix                f_set_supermatrix
-#define f_get_CompRowLoc_Matrix          f_get_comprowloc_matrix 
-#define f_set_CompRowLoc_Matrix          f_set_comprowloc_matrix
-#define f_get_superlu_options            f_get_superlu_options
-#define f_set_superlu_options            f_set_superlu_options
-#define f_set_default_options            f_set_default_options
-#define f_superlu_gridinit               f_superlu_gridinit
-#define f_superlu_gridexit               f_superlu_gridexit
-#define f_ScalePermstructInit            f_scalepermstructinit
-#define f_ScalePermstructFree            f_scalepermstructfree
-#define f_PStatInit                      f_pstatinit
-#define f_PStatFree                      f_pstatfree
-#define f_LUstructInit                   f_lustructinit
-#define f_LUstructFree                   f_lustructfree
-#define f_Destroy_LU                     f_destroy_lu
-#define f_dCreate_CompRowLoc_Mat_dist    f_dcreate_comprowloc_mat_dist
-#define f_Destroy_CompRowLoc_Mat_dist    f_destroy_comprowloc_mat_dist
-#define f_Destroy_SuperMat_Store_dist    f_destroy_supermat_store_dist
-#define f_dSolveFinalize                 f_dsolvefinalize
-#define f_pdgssvx                        f_pdgssvx
-#define f_dcreate_dist_matrix            f_dcreate_dist_matrix
-#define f_check_malloc                   f_check_malloc
-#endif
-
-#endif /* __SUPERLU_CNAMES */
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/cscal.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/cscal.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/cscal.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/cscal.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,70 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int cscal_(integer *n, complex *ca, complex *cx, integer *
-	incx)
-{
-
-
-    /* System generated locals */
-    integer i__1, i__2, i__3, i__4;
-    complex q__1;
-
-    /* Local variables */
-    static integer i, nincx;
-
-
-/*     scales a vector by a constant.   
-       jack dongarra, linpack,  3/11/78.   
-       modified 3/93 to return if incx .le. 0.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define CX(I) cx[(I)-1]
-
-
-    if (*n <= 0 || *incx <= 0) {
-	return 0;
-    }
-    if (*incx == 1) {
-	goto L20;
-    }
-
-/*        code for increment not equal to 1 */
-
-    nincx = *n * *incx;
-    i__1 = nincx;
-    i__2 = *incx;
-    for (i = 1; *incx < 0 ? i >= nincx : i <= nincx; i += *incx) {
-	i__3 = i;
-	i__4 = i;
-	q__1.r = ca->r * CX(i).r - ca->i * CX(i).i, q__1.i = ca->r * CX(
-		i).i + ca->i * CX(i).r;
-	CX(i).r = q__1.r, CX(i).i = q__1.i;
-/* L10: */
-    }
-    return 0;
-
-/*        code for increment equal to 1 */
-
-L20:
-    i__2 = *n;
-    for (i = 1; i <= *n; ++i) {
-	i__1 = i;
-	i__3 = i;
-	q__1.r = ca->r * CX(i).r - ca->i * CX(i).i, q__1.i = ca->r * CX(
-		i).i + ca->i * CX(i).r;
-	CX(i).r = q__1.r, CX(i).i = q__1.i;
-/* L30: */
-    }
-    return 0;
-} /* cscal_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ctrsv.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ctrsv.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ctrsv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ctrsv.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,509 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int ctrsv_(char *uplo, char *trans, char *diag, integer *n, 
-	complex *a, integer *lda, complex *x, integer *incx)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
-    complex q__1, q__2, q__3;
-
-    /* Builtin functions */
-    void c_div(complex *, complex *, complex *), r_cnjg(complex *, complex *);
-
-    /* Local variables */
-    static integer info;
-    static complex temp;
-    static integer i, j;
-    extern logical lsame_(char *, char *);
-    static integer ix, jx, kx;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-    static logical noconj, nounit;
-
-
-/*  Purpose   
-    =======   
-
-    CTRSV  solves one of the systems of equations   
-
-       A*x = b,   or   A'*x = b,   or   conjg( A' )*x = b,   
-
-    where b and x are n element vectors and A is an n by n unit, or   
-    non-unit, upper or lower triangular matrix.   
-
-    No test for singularity or near-singularity is included in this   
-    routine. Such tests must be performed before calling this routine.   
-
-    Parameters   
-    ==========   
-
-    UPLO   - CHARACTER*1.   
-             On entry, UPLO specifies whether the matrix is an upper or   
-             lower triangular matrix as follows:   
-
-                UPLO = 'U' or 'u'   A is an upper triangular matrix.   
-
-                UPLO = 'L' or 'l'   A is a lower triangular matrix.   
-
-             Unchanged on exit.   
-
-    TRANS  - CHARACTER*1.   
-             On entry, TRANS specifies the equations to be solved as   
-             follows:   
-
-                TRANS = 'N' or 'n'   A*x = b.   
-
-                TRANS = 'T' or 't'   A'*x = b.   
-
-                TRANS = 'C' or 'c'   conjg( A' )*x = b.   
-
-             Unchanged on exit.   
-
-    DIAG   - CHARACTER*1.   
-             On entry, DIAG specifies whether or not A is unit   
-             triangular as follows:   
-
-                DIAG = 'U' or 'u'   A is assumed to be unit triangular.   
-
-                DIAG = 'N' or 'n'   A is not assumed to be unit   
-                                    triangular.   
-
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry, N specifies the order of the matrix A.   
-             N must be at least zero.   
-             Unchanged on exit.   
-
-    A      - COMPLEX          array of DIMENSION ( LDA, n ).   
-             Before entry with  UPLO = 'U' or 'u', the leading n by n   
-             upper triangular part of the array A must contain the upper 
-  
-             triangular matrix and the strictly lower triangular part of 
-  
-             A is not referenced.   
-             Before entry with UPLO = 'L' or 'l', the leading n by n   
-             lower triangular part of the array A must contain the lower 
-  
-             triangular matrix and the strictly upper triangular part of 
-  
-             A is not referenced.   
-             Note that when  DIAG = 'U' or 'u', the diagonal elements of 
-  
-             A are not referenced either, but are assumed to be unity.   
-             Unchanged on exit.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program. LDA must be at least   
-             max( 1, n ).   
-             Unchanged on exit.   
-
-    X      - COMPLEX          array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCX ) ).   
-             Before entry, the incremented array X must contain the n   
-             element right-hand side vector b. On exit, X is overwritten 
-  
-             with the solution vector x.   
-
-    INCX   - INTEGER.   
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-             Unchanged on exit.   
-
-
-    Level 2 Blas routine.   
-
-    -- Written on 22-October-1986.   
-       Jack Dongarra, Argonne National Lab.   
-       Jeremy Du Croz, Nag Central Office.   
-       Sven Hammarling, Nag Central Office.   
-       Richard Hanson, Sandia National Labs.   
-
-
-
-       Test the input parameters.   
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-
-    info = 0;
-    if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
-	info = 1;
-    } else if (! lsame_(trans, "N") && ! lsame_(trans, "T") &&
-	     ! lsame_(trans, "C")) {
-	info = 2;
-    } else if (! lsame_(diag, "U") && ! lsame_(diag, "N")) {
-	info = 3;
-    } else if (*n < 0) {
-	info = 4;
-    } else if (*lda < max(1,*n)) {
-	info = 6;
-    } else if (*incx == 0) {
-	info = 8;
-    }
-    if (info != 0) {
-	xerbla_("CTRSV ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*n == 0) {
-	return 0;
-    }
-
-    noconj = lsame_(trans, "T");
-    nounit = lsame_(diag, "N");
-
-/*     Set up the start point in X if the increment is not unity. This   
-       will be  ( N - 1 )*INCX  too small for descending loops. */
-
-    if (*incx <= 0) {
-	kx = 1 - (*n - 1) * *incx;
-    } else if (*incx != 1) {
-	kx = 1;
-    }
-
-/*     Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through A. */
-
-    if (lsame_(trans, "N")) {
-
-/*        Form  x := inv( A )*x. */
-
-	if (lsame_(uplo, "U")) {
-	    if (*incx == 1) {
-		for (j = *n; j >= 1; --j) {
-		    i__1 = j;
-		    if (X(j).r != 0.f || X(j).i != 0.f) {
-			if (nounit) {
-			    i__1 = j;
-			    c_div(&q__1, &X(j), &A(j,j));
-			    X(j).r = q__1.r, X(j).i = q__1.i;
-			}
-			i__1 = j;
-			temp.r = X(j).r, temp.i = X(j).i;
-			for (i = j - 1; i >= 1; --i) {
-			    i__1 = i;
-			    i__2 = i;
-			    i__3 = i + j * a_dim1;
-			    q__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, 
-				    q__2.i = temp.r * A(i,j).i + temp.i * A(i,j).r;
-			    q__1.r = X(i).r - q__2.r, q__1.i = X(i).i - 
-				    q__2.i;
-			    X(i).r = q__1.r, X(i).i = q__1.i;
-/* L10: */
-			}
-		    }
-/* L20: */
-		}
-	    } else {
-		jx = kx + (*n - 1) * *incx;
-		for (j = *n; j >= 1; --j) {
-		    i__1 = jx;
-		    if (X(jx).r != 0.f || X(jx).i != 0.f) {
-			if (nounit) {
-			    i__1 = jx;
-			    c_div(&q__1, &X(jx), &A(j,j));
-			    X(jx).r = q__1.r, X(jx).i = q__1.i;
-			}
-			i__1 = jx;
-			temp.r = X(jx).r, temp.i = X(jx).i;
-			ix = jx;
-			for (i = j - 1; i >= 1; --i) {
-			    ix -= *incx;
-			    i__1 = ix;
-			    i__2 = ix;
-			    i__3 = i + j * a_dim1;
-			    q__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, 
-				    q__2.i = temp.r * A(i,j).i + temp.i * A(i,j).r;
-			    q__1.r = X(ix).r - q__2.r, q__1.i = X(ix).i - 
-				    q__2.i;
-			    X(ix).r = q__1.r, X(ix).i = q__1.i;
-/* L30: */
-			}
-		    }
-		    jx -= *incx;
-/* L40: */
-		}
-	    }
-	} else {
-	    if (*incx == 1) {
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    i__2 = j;
-		    if (X(j).r != 0.f || X(j).i != 0.f) {
-			if (nounit) {
-			    i__2 = j;
-			    c_div(&q__1, &X(j), &A(j,j));
-			    X(j).r = q__1.r, X(j).i = q__1.i;
-			}
-			i__2 = j;
-			temp.r = X(j).r, temp.i = X(j).i;
-			i__2 = *n;
-			for (i = j + 1; i <= *n; ++i) {
-			    i__3 = i;
-			    i__4 = i;
-			    i__5 = i + j * a_dim1;
-			    q__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, 
-				    q__2.i = temp.r * A(i,j).i + temp.i * A(i,j).r;
-			    q__1.r = X(i).r - q__2.r, q__1.i = X(i).i - 
-				    q__2.i;
-			    X(i).r = q__1.r, X(i).i = q__1.i;
-/* L50: */
-			}
-		    }
-/* L60: */
-		}
-	    } else {
-		jx = kx;
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    i__2 = jx;
-		    if (X(jx).r != 0.f || X(jx).i != 0.f) {
-			if (nounit) {
-			    i__2 = jx;
-			    c_div(&q__1, &X(jx), &A(j,j));
-			    X(jx).r = q__1.r, X(jx).i = q__1.i;
-			}
-			i__2 = jx;
-			temp.r = X(jx).r, temp.i = X(jx).i;
-			ix = jx;
-			i__2 = *n;
-			for (i = j + 1; i <= *n; ++i) {
-			    ix += *incx;
-			    i__3 = ix;
-			    i__4 = ix;
-			    i__5 = i + j * a_dim1;
-			    q__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, 
-				    q__2.i = temp.r * A(i,j).i + temp.i * A(i,j).r;
-			    q__1.r = X(ix).r - q__2.r, q__1.i = X(ix).i - 
-				    q__2.i;
-			    X(ix).r = q__1.r, X(ix).i = q__1.i;
-/* L70: */
-			}
-		    }
-		    jx += *incx;
-/* L80: */
-		}
-	    }
-	}
-    } else {
-
-/*        Form  x := inv( A' )*x  or  x := inv( conjg( A' ) )*x. */
-
-	if (lsame_(uplo, "U")) {
-	    if (*incx == 1) {
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    i__2 = j;
-		    temp.r = X(j).r, temp.i = X(j).i;
-		    if (noconj) {
-			i__2 = j - 1;
-			for (i = 1; i <= j-1; ++i) {
-			    i__3 = i + j * a_dim1;
-			    i__4 = i;
-			    q__2.r = A(i,j).r * X(i).r - A(i,j).i * X(
-				    i).i, q__2.i = A(i,j).r * X(i).i + 
-				    A(i,j).i * X(i).r;
-			    q__1.r = temp.r - q__2.r, q__1.i = temp.i - 
-				    q__2.i;
-			    temp.r = q__1.r, temp.i = q__1.i;
-/* L90: */
-			}
-			if (nounit) {
-			    c_div(&q__1, &temp, &A(j,j));
-			    temp.r = q__1.r, temp.i = q__1.i;
-			}
-		    } else {
-			i__2 = j - 1;
-			for (i = 1; i <= j-1; ++i) {
-			    r_cnjg(&q__3, &A(i,j));
-			    i__3 = i;
-			    q__2.r = q__3.r * X(i).r - q__3.i * X(i).i, 
-				    q__2.i = q__3.r * X(i).i + q__3.i * X(
-				    i).r;
-			    q__1.r = temp.r - q__2.r, q__1.i = temp.i - 
-				    q__2.i;
-			    temp.r = q__1.r, temp.i = q__1.i;
-/* L100: */
-			}
-			if (nounit) {
-			    r_cnjg(&q__2, &A(j,j));
-			    c_div(&q__1, &temp, &q__2);
-			    temp.r = q__1.r, temp.i = q__1.i;
-			}
-		    }
-		    i__2 = j;
-		    X(j).r = temp.r, X(j).i = temp.i;
-/* L110: */
-		}
-	    } else {
-		jx = kx;
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    ix = kx;
-		    i__2 = jx;
-		    temp.r = X(jx).r, temp.i = X(jx).i;
-		    if (noconj) {
-			i__2 = j - 1;
-			for (i = 1; i <= j-1; ++i) {
-			    i__3 = i + j * a_dim1;
-			    i__4 = ix;
-			    q__2.r = A(i,j).r * X(ix).r - A(i,j).i * X(
-				    ix).i, q__2.i = A(i,j).r * X(ix).i + 
-				    A(i,j).i * X(ix).r;
-			    q__1.r = temp.r - q__2.r, q__1.i = temp.i - 
-				    q__2.i;
-			    temp.r = q__1.r, temp.i = q__1.i;
-			    ix += *incx;
-/* L120: */
-			}
-			if (nounit) {
-			    c_div(&q__1, &temp, &A(j,j));
-			    temp.r = q__1.r, temp.i = q__1.i;
-			}
-		    } else {
-			i__2 = j - 1;
-			for (i = 1; i <= j-1; ++i) {
-			    r_cnjg(&q__3, &A(i,j));
-			    i__3 = ix;
-			    q__2.r = q__3.r * X(ix).r - q__3.i * X(ix).i, 
-				    q__2.i = q__3.r * X(ix).i + q__3.i * X(
-				    ix).r;
-			    q__1.r = temp.r - q__2.r, q__1.i = temp.i - 
-				    q__2.i;
-			    temp.r = q__1.r, temp.i = q__1.i;
-			    ix += *incx;
-/* L130: */
-			}
-			if (nounit) {
-			    r_cnjg(&q__2, &A(j,j));
-			    c_div(&q__1, &temp, &q__2);
-			    temp.r = q__1.r, temp.i = q__1.i;
-			}
-		    }
-		    i__2 = jx;
-		    X(jx).r = temp.r, X(jx).i = temp.i;
-		    jx += *incx;
-/* L140: */
-		}
-	    }
-	} else {
-	    if (*incx == 1) {
-		for (j = *n; j >= 1; --j) {
-		    i__1 = j;
-		    temp.r = X(j).r, temp.i = X(j).i;
-		    if (noconj) {
-			i__1 = j + 1;
-			for (i = *n; i >= j+1; --i) {
-			    i__2 = i + j * a_dim1;
-			    i__3 = i;
-			    q__2.r = A(i,j).r * X(i).r - A(i,j).i * X(
-				    i).i, q__2.i = A(i,j).r * X(i).i + 
-				    A(i,j).i * X(i).r;
-			    q__1.r = temp.r - q__2.r, q__1.i = temp.i - 
-				    q__2.i;
-			    temp.r = q__1.r, temp.i = q__1.i;
-/* L150: */
-			}
-			if (nounit) {
-			    c_div(&q__1, &temp, &A(j,j));
-			    temp.r = q__1.r, temp.i = q__1.i;
-			}
-		    } else {
-			i__1 = j + 1;
-			for (i = *n; i >= j+1; --i) {
-			    r_cnjg(&q__3, &A(i,j));
-			    i__2 = i;
-			    q__2.r = q__3.r * X(i).r - q__3.i * X(i).i, 
-				    q__2.i = q__3.r * X(i).i + q__3.i * X(
-				    i).r;
-			    q__1.r = temp.r - q__2.r, q__1.i = temp.i - 
-				    q__2.i;
-			    temp.r = q__1.r, temp.i = q__1.i;
-/* L160: */
-			}
-			if (nounit) {
-			    r_cnjg(&q__2, &A(j,j));
-			    c_div(&q__1, &temp, &q__2);
-			    temp.r = q__1.r, temp.i = q__1.i;
-			}
-		    }
-		    i__1 = j;
-		    X(j).r = temp.r, X(j).i = temp.i;
-/* L170: */
-		}
-	    } else {
-		kx += (*n - 1) * *incx;
-		jx = kx;
-		for (j = *n; j >= 1; --j) {
-		    ix = kx;
-		    i__1 = jx;
-		    temp.r = X(jx).r, temp.i = X(jx).i;
-		    if (noconj) {
-			i__1 = j + 1;
-			for (i = *n; i >= j+1; --i) {
-			    i__2 = i + j * a_dim1;
-			    i__3 = ix;
-			    q__2.r = A(i,j).r * X(ix).r - A(i,j).i * X(
-				    ix).i, q__2.i = A(i,j).r * X(ix).i + 
-				    A(i,j).i * X(ix).r;
-			    q__1.r = temp.r - q__2.r, q__1.i = temp.i - 
-				    q__2.i;
-			    temp.r = q__1.r, temp.i = q__1.i;
-			    ix -= *incx;
-/* L180: */
-			}
-			if (nounit) {
-			    c_div(&q__1, &temp, &A(j,j));
-			    temp.r = q__1.r, temp.i = q__1.i;
-			}
-		    } else {
-			i__1 = j + 1;
-			for (i = *n; i >= j+1; --i) {
-			    r_cnjg(&q__3, &A(i,j));
-			    i__2 = ix;
-			    q__2.r = q__3.r * X(ix).r - q__3.i * X(ix).i, 
-				    q__2.i = q__3.r * X(ix).i + q__3.i * X(
-				    ix).r;
-			    q__1.r = temp.r - q__2.r, q__1.i = temp.i - 
-				    q__2.i;
-			    temp.r = q__1.r, temp.i = q__1.i;
-			    ix -= *incx;
-/* L190: */
-			}
-			if (nounit) {
-			    r_cnjg(&q__2, &A(j,j));
-			    c_div(&q__1, &temp, &q__2);
-			    temp.r = q__1.r, temp.i = q__1.i;
-			}
-		    }
-		    i__1 = jx;
-		    X(jx).r = temp.r, X(jx).i = temp.i;
-		    jx -= *incx;
-/* L200: */
-		}
-	    }
-	}
-    }
-
-    return 0;
-
-/*     End of CTRSV . */
-
-} /* ctrsv_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dasum.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dasum.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dasum.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dasum.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,88 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-doublereal dasum_(integer *n, doublereal *dx, integer *incx)
-{
-
-
-    /* System generated locals */
-    integer i__1, i__2;
-    doublereal ret_val, d__1, d__2, d__3, d__4, d__5, d__6;
-
-    /* Local variables */
-    static integer i, m;
-    static doublereal dtemp;
-    static integer nincx, mp1;
-
-
-/*     takes the sum of the absolute values.   
-       jack dongarra, linpack, 3/11/78.   
-       modified 3/93 to return if incx .le. 0.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define DX(I) dx[(I)-1]
-
-
-    ret_val = 0.;
-    dtemp = 0.;
-    if (*n <= 0 || *incx <= 0) {
-	return ret_val;
-    }
-    if (*incx == 1) {
-	goto L20;
-    }
-
-/*        code for increment not equal to 1 */
-
-    nincx = *n * *incx;
-    i__1 = nincx;
-    i__2 = *incx;
-    for (i = 1; *incx < 0 ? i >= nincx : i <= nincx; i += *incx) {
-	dtemp += (d__1 = DX(i), abs(d__1));
-/* L10: */
-    }
-    ret_val = dtemp;
-    return ret_val;
-
-/*        code for increment equal to 1   
-
-
-          clean-up loop */
-
-L20:
-    m = *n % 6;
-    if (m == 0) {
-	goto L40;
-    }
-    i__2 = m;
-    for (i = 1; i <= m; ++i) {
-	dtemp += (d__1 = DX(i), abs(d__1));
-/* L30: */
-    }
-    if (*n < 6) {
-	goto L60;
-    }
-L40:
-    mp1 = m + 1;
-    i__2 = *n;
-    for (i = mp1; i <= *n; i += 6) {
-	dtemp = dtemp + (d__1 = DX(i), abs(d__1)) + (d__2 = DX(i + 1), abs(
-		d__2)) + (d__3 = DX(i + 2), abs(d__3)) + (d__4 = DX(i + 3), 
-		abs(d__4)) + (d__5 = DX(i + 4), abs(d__5)) + (d__6 = DX(i + 5)
-		, abs(d__6));
-/* L50: */
-    }
-L60:
-    ret_val = dtemp;
-    return ret_val;
-} /* dasum_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/daxpy.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/daxpy.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/daxpy.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/daxpy.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,94 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int daxpy_(integer *n, doublereal *da, doublereal *dx, 
-	integer *incx, doublereal *dy, integer *incy)
-{
-
-
-    /* System generated locals */
-    integer i__1;
-
-    /* Local variables */
-    static integer i, m, ix, iy, mp1;
-
-
-/*     constant times a vector plus a vector.   
-       uses unrolled loops for increments equal to one.   
-       jack dongarra, linpack, 3/11/78.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define DY(I) dy[(I)-1]
-#define DX(I) dx[(I)-1]
-
-
-    if (*n <= 0) {
-	return 0;
-    }
-    if (*da == 0.) {
-	return 0;
-    }
-    if (*incx == 1 && *incy == 1) {
-	goto L20;
-    }
-
-/*        code for unequal increments or equal increments   
-            not equal to 1 */
-
-    ix = 1;
-    iy = 1;
-    if (*incx < 0) {
-	ix = (-(*n) + 1) * *incx + 1;
-    }
-    if (*incy < 0) {
-	iy = (-(*n) + 1) * *incy + 1;
-    }
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	DY(iy) += *da * DX(ix);
-	ix += *incx;
-	iy += *incy;
-/* L10: */
-    }
-    return 0;
-
-/*        code for both increments equal to 1   
-
-
-          clean-up loop */
-
-L20:
-    m = *n % 4;
-    if (m == 0) {
-	goto L40;
-    }
-    i__1 = m;
-    for (i = 1; i <= m; ++i) {
-	DY(i) += *da * DX(i);
-/* L30: */
-    }
-    if (*n < 4) {
-	return 0;
-    }
-L40:
-    mp1 = m + 1;
-    i__1 = *n;
-    for (i = mp1; i <= *n; i += 4) {
-	DY(i) += *da * DX(i);
-	DY(i + 1) += *da * DX(i + 1);
-	DY(i + 2) += *da * DX(i + 2);
-	DY(i + 3) += *da * DX(i + 3);
-/* L50: */
-    }
-    return 0;
-} /* daxpy_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dcabs1.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dcabs1.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dcabs1.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dcabs1.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,28 +0,0 @@
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-doublereal dcabs1_(doublecomplex *z)
-{
-/* >>Start of File<<   
-
-       System generated locals */
-    doublereal ret_val;
-    static doublecomplex equiv_0[1];
-
-    /* Local variables */
-#define t ((doublereal *)equiv_0)
-#define zz (equiv_0)
-
-    zz->r = z->r, zz->i = z->i;
-    ret_val = abs(t[0]) + abs(t[1]);
-    return ret_val;
-} /* dcabs1_ */
-
-#undef zz
-#undef t
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dcopy.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dcopy.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dcopy.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dcopy.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,94 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int dcopy_(integer *n, doublereal *dx, integer *incx, 
-	doublereal *dy, integer *incy)
-{
-
-
-    /* System generated locals */
-    integer i__1;
-
-    /* Local variables */
-    static integer i, m, ix, iy, mp1;
-
-
-/*     copies a vector, x, to a vector, y.   
-       uses unrolled loops for increments equal to one.   
-       jack dongarra, linpack, 3/11/78.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define DY(I) dy[(I)-1]
-#define DX(I) dx[(I)-1]
-
-
-    if (*n <= 0) {
-	return 0;
-    }
-    if (*incx == 1 && *incy == 1) {
-	goto L20;
-    }
-
-/*        code for unequal increments or equal increments   
-            not equal to 1 */
-
-    ix = 1;
-    iy = 1;
-    if (*incx < 0) {
-	ix = (-(*n) + 1) * *incx + 1;
-    }
-    if (*incy < 0) {
-	iy = (-(*n) + 1) * *incy + 1;
-    }
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	DY(iy) = DX(ix);
-	ix += *incx;
-	iy += *incy;
-/* L10: */
-    }
-    return 0;
-
-/*        code for both increments equal to 1   
-
-
-          clean-up loop */
-
-L20:
-    m = *n % 7;
-    if (m == 0) {
-	goto L40;
-    }
-    i__1 = m;
-    for (i = 1; i <= m; ++i) {
-	DY(i) = DX(i);
-/* L30: */
-    }
-    if (*n < 7) {
-	return 0;
-    }
-L40:
-    mp1 = m + 1;
-    i__1 = *n;
-    for (i = mp1; i <= *n; i += 7) {
-	DY(i) = DX(i);
-	DY(i + 1) = DX(i + 1);
-	DY(i + 2) = DX(i + 2);
-	DY(i + 3) = DX(i + 3);
-	DY(i + 4) = DX(i + 4);
-	DY(i + 5) = DX(i + 5);
-	DY(i + 6) = DX(i + 6);
-/* L50: */
-    }
-    return 0;
-} /* dcopy_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ddot.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ddot.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ddot.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ddot.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,97 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-doublereal ddot_(integer *n, doublereal *dx, integer *incx, doublereal *dy, 
-	integer *incy)
-{
-
-
-    /* System generated locals */
-    integer i__1;
-    doublereal ret_val;
-
-    /* Local variables */
-    static integer i, m;
-    static doublereal dtemp;
-    static integer ix, iy, mp1;
-
-
-/*     forms the dot product of two vectors.   
-       uses unrolled loops for increments equal to one.   
-       jack dongarra, linpack, 3/11/78.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define DY(I) dy[(I)-1]
-#define DX(I) dx[(I)-1]
-
-
-    ret_val = 0.;
-    dtemp = 0.;
-    if (*n <= 0) {
-	return ret_val;
-    }
-    if (*incx == 1 && *incy == 1) {
-	goto L20;
-    }
-
-/*        code for unequal increments or equal increments   
-            not equal to 1 */
-
-    ix = 1;
-    iy = 1;
-    if (*incx < 0) {
-	ix = (-(*n) + 1) * *incx + 1;
-    }
-    if (*incy < 0) {
-	iy = (-(*n) + 1) * *incy + 1;
-    }
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	dtemp += DX(ix) * DY(iy);
-	ix += *incx;
-	iy += *incy;
-/* L10: */
-    }
-    ret_val = dtemp;
-    return ret_val;
-
-/*        code for both increments equal to 1   
-
-
-          clean-up loop */
-
-L20:
-    m = *n % 5;
-    if (m == 0) {
-	goto L40;
-    }
-    i__1 = m;
-    for (i = 1; i <= m; ++i) {
-	dtemp += DX(i) * DY(i);
-/* L30: */
-    }
-    if (*n < 5) {
-	goto L60;
-    }
-L40:
-    mp1 = m + 1;
-    i__1 = *n;
-    for (i = mp1; i <= *n; i += 5) {
-	dtemp = dtemp + DX(i) * DY(i) + DX(i + 1) * DY(i + 1) + DX(i + 2) * 
-		DY(i + 2) + DX(i + 3) * DY(i + 3) + DX(i + 4) * DY(i + 4);
-/* L50: */
-    }
-L60:
-    ret_val = dtemp;
-    return ret_val;
-} /* ddot_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dgemm.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dgemm.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dgemm.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dgemm.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,395 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int dgemm_(char *transa, char *transb, integer *m, integer *
-	n, integer *k, doublereal *alpha, doublereal *a, integer *lda, 
-	doublereal *b, integer *ldb, doublereal *beta, doublereal *c, integer 
-	*ldc)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
-	    i__3;
-
-    /* Local variables */
-    static integer info;
-    static logical nota, notb;
-    static doublereal temp;
-    static integer i, j, l, ncola;
-    extern logical lsame_(char *, char *);
-    static integer nrowa, nrowb;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-
-
-/*  Purpose   
-    =======   
-
-    DGEMM  performs one of the matrix-matrix operations   
-
-       C := alpha*op( A )*op( B ) + beta*C,   
-
-    where  op( X ) is one of   
-
-       op( X ) = X   or   op( X ) = X',   
-
-    alpha and beta are scalars, and A, B and C are matrices, with op( A ) 
-  
-    an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix. 
-  
-
-    Parameters   
-    ==========   
-
-    TRANSA - CHARACTER*1.   
-             On entry, TRANSA specifies the form of op( A ) to be used in 
-  
-             the matrix multiplication as follows:   
-
-                TRANSA = 'N' or 'n',  op( A ) = A.   
-
-                TRANSA = 'T' or 't',  op( A ) = A'.   
-
-                TRANSA = 'C' or 'c',  op( A ) = A'.   
-
-             Unchanged on exit.   
-
-    TRANSB - CHARACTER*1.   
-             On entry, TRANSB specifies the form of op( B ) to be used in 
-  
-             the matrix multiplication as follows:   
-
-                TRANSB = 'N' or 'n',  op( B ) = B.   
-
-                TRANSB = 'T' or 't',  op( B ) = B'.   
-
-                TRANSB = 'C' or 'c',  op( B ) = B'.   
-
-             Unchanged on exit.   
-
-    M      - INTEGER.   
-             On entry,  M  specifies  the number  of rows  of the  matrix 
-  
-             op( A )  and of the  matrix  C.  M  must  be at least  zero. 
-  
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry,  N  specifies the number  of columns of the matrix 
-  
-             op( B ) and the number of columns of the matrix C. N must be 
-  
-             at least zero.   
-             Unchanged on exit.   
-
-    K      - INTEGER.   
-             On entry,  K  specifies  the number of columns of the matrix 
-  
-             op( A ) and the number of rows of the matrix op( B ). K must 
-  
-             be at least  zero.   
-             Unchanged on exit.   
-
-    ALPHA  - DOUBLE PRECISION.   
-             On entry, ALPHA specifies the scalar alpha.   
-             Unchanged on exit.   
-
-    A      - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is 
-  
-             k  when  TRANSA = 'N' or 'n',  and is  m  otherwise.   
-             Before entry with  TRANSA = 'N' or 'n',  the leading  m by k 
-  
-             part of the array  A  must contain the matrix  A,  otherwise 
-  
-             the leading  k by m  part of the array  A  must contain  the 
-  
-             matrix A.   
-             Unchanged on exit.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program. When  TRANSA = 'N' or 'n' then 
-  
-             LDA must be at least  max( 1, m ), otherwise  LDA must be at 
-  
-             least  max( 1, k ).   
-             Unchanged on exit.   
-
-    B      - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is 
-  
-             n  when  TRANSB = 'N' or 'n',  and is  k  otherwise.   
-             Before entry with  TRANSB = 'N' or 'n',  the leading  k by n 
-  
-             part of the array  B  must contain the matrix  B,  otherwise 
-  
-             the leading  n by k  part of the array  B  must contain  the 
-  
-             matrix B.   
-             Unchanged on exit.   
-
-    LDB    - INTEGER.   
-             On entry, LDB specifies the first dimension of B as declared 
-  
-             in the calling (sub) program. When  TRANSB = 'N' or 'n' then 
-  
-             LDB must be at least  max( 1, k ), otherwise  LDB must be at 
-  
-             least  max( 1, n ).   
-             Unchanged on exit.   
-
-    BETA   - DOUBLE PRECISION.   
-             On entry,  BETA  specifies the scalar  beta.  When  BETA  is 
-  
-             supplied as zero then C need not be set on input.   
-             Unchanged on exit.   
-
-    C      - DOUBLE PRECISION array of DIMENSION ( LDC, n ).   
-             Before entry, the leading  m by n  part of the array  C must 
-  
-             contain the matrix  C,  except when  beta  is zero, in which 
-  
-             case C need not be set on entry.   
-             On exit, the array  C  is overwritten by the  m by n  matrix 
-  
-             ( alpha*op( A )*op( B ) + beta*C ).   
-
-    LDC    - INTEGER.   
-             On entry, LDC specifies the first dimension of C as declared 
-  
-             in  the  calling  (sub)  program.   LDC  must  be  at  least 
-  
-             max( 1, m ).   
-             Unchanged on exit.   
-
-
-    Level 3 Blas routine.   
-
-    -- Written on 8-February-1989.   
-       Jack Dongarra, Argonne National Laboratory.   
-       Iain Duff, AERE Harwell.   
-       Jeremy Du Croz, Numerical Algorithms Group Ltd.   
-       Sven Hammarling, Numerical Algorithms Group Ltd.   
-
-
-
-       Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not 
-  
-       transposed and set  NROWA, NCOLA and  NROWB  as the number of rows 
-  
-       and  columns of  A  and the  number of  rows  of  B  respectively. 
-  
-
-    
-   Parameter adjustments   
-       Function Body */
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-#define B(I,J) b[(I)-1 + ((J)-1)* ( *ldb)]
-#define C(I,J) c[(I)-1 + ((J)-1)* ( *ldc)]
-
-    nota = lsame_(transa, "N");
-    notb = lsame_(transb, "N");
-    if (nota) {
-	nrowa = *m;
-	ncola = *k;
-    } else {
-	nrowa = *k;
-	ncola = *m;
-    }
-    if (notb) {
-	nrowb = *k;
-    } else {
-	nrowb = *n;
-    }
-
-/*     Test the input parameters. */
-
-    info = 0;
-    if (! nota && ! lsame_(transa, "C") && ! lsame_(transa, "T")) {
-	info = 1;
-    } else if (! notb && ! lsame_(transb, "C") && ! lsame_(transb, 
-	    "T")) {
-	info = 2;
-    } else if (*m < 0) {
-	info = 3;
-    } else if (*n < 0) {
-	info = 4;
-    } else if (*k < 0) {
-	info = 5;
-    } else if (*lda < max(1,nrowa)) {
-	info = 8;
-    } else if (*ldb < max(1,nrowb)) {
-	info = 10;
-    } else if (*ldc < max(1,*m)) {
-	info = 13;
-    }
-    if (info != 0) {
-	xerbla_("DGEMM ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*m == 0 || *n == 0 || (*alpha == 0. || *k == 0) && *beta == 1.) {
-	return 0;
-    }
-
-/*     And if  alpha.eq.zero. */
-
-    if (*alpha == 0.) {
-	if (*beta == 0.) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    C(i,j) = 0.;
-/* L10: */
-		}
-/* L20: */
-	    }
-	} else {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    C(i,j) = *beta * C(i,j);
-/* L30: */
-		}
-/* L40: */
-	    }
-	}
-	return 0;
-    }
-
-/*     Start the operations. */
-
-    if (notb) {
-	if (nota) {
-
-/*           Form  C := alpha*A*B + beta*C. */
-
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		if (*beta == 0.) {
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			C(i,j) = 0.;
-/* L50: */
-		    }
-		} else if (*beta != 1.) {
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			C(i,j) = *beta * C(i,j);
-/* L60: */
-		    }
-		}
-		i__2 = *k;
-		for (l = 1; l <= *k; ++l) {
-		    if (B(l,j) != 0.) {
-			temp = *alpha * B(l,j);
-			i__3 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    C(i,j) += temp * A(i,l);
-/* L70: */
-			}
-		    }
-/* L80: */
-		}
-/* L90: */
-	    }
-	} else {
-
-/*           Form  C := alpha*A'*B + beta*C */
-
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    temp = 0.;
-		    i__3 = *k;
-		    for (l = 1; l <= *k; ++l) {
-			temp += A(l,i) * B(l,j);
-/* L100: */
-		    }
-		    if (*beta == 0.) {
-			C(i,j) = *alpha * temp;
-		    } else {
-			C(i,j) = *alpha * temp + *beta * C(i,j);
-		    }
-/* L110: */
-		}
-/* L120: */
-	    }
-	}
-    } else {
-	if (nota) {
-
-/*           Form  C := alpha*A*B' + beta*C */
-
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		if (*beta == 0.) {
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			C(i,j) = 0.;
-/* L130: */
-		    }
-		} else if (*beta != 1.) {
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			C(i,j) = *beta * C(i,j);
-/* L140: */
-		    }
-		}
-		i__2 = *k;
-		for (l = 1; l <= *k; ++l) {
-		    if (B(j,l) != 0.) {
-			temp = *alpha * B(j,l);
-			i__3 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    C(i,j) += temp * A(i,l);
-/* L150: */
-			}
-		    }
-/* L160: */
-		}
-/* L170: */
-	    }
-	} else {
-
-/*           Form  C := alpha*A'*B' + beta*C */
-
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    temp = 0.;
-		    i__3 = *k;
-		    for (l = 1; l <= *k; ++l) {
-			temp += A(l,i) * B(j,l);
-/* L180: */
-		    }
-		    if (*beta == 0.) {
-			C(i,j) = *alpha * temp;
-		    } else {
-			C(i,j) = *alpha * temp + *beta * C(i,j);
-		    }
-/* L190: */
-		}
-/* L200: */
-	    }
-	}
-    }
-
-    return 0;
-
-/*     End of DGEMM . */
-
-} /* dgemm_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dgemv.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dgemv.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dgemv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dgemv.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,299 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int dgemv_(char *trans, integer *m, integer *n, doublereal *
-	alpha, doublereal *a, integer *lda, doublereal *x, integer *incx, 
-	doublereal *beta, doublereal *y, integer *incy)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2;
-
-    /* Local variables */
-    static integer info;
-    static doublereal temp;
-    static integer lenx, leny, i, j;
-    extern logical lsame_(char *, char *);
-    static integer ix, iy, jx, jy, kx, ky;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-
-
-/*  Purpose   
-    =======   
-
-    DGEMV  performs one of the matrix-vector operations   
-
-       y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   
-
-    where alpha and beta are scalars, x and y are vectors and A is an   
-    m by n matrix.   
-
-    Parameters   
-    ==========   
-
-    TRANS  - CHARACTER*1.   
-             On entry, TRANS specifies the operation to be performed as   
-             follows:   
-
-                TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.   
-
-                TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.   
-
-                TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.   
-
-             Unchanged on exit.   
-
-    M      - INTEGER.   
-             On entry, M specifies the number of rows of the matrix A.   
-             M must be at least zero.   
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry, N specifies the number of columns of the matrix A. 
-  
-             N must be at least zero.   
-             Unchanged on exit.   
-
-    ALPHA  - DOUBLE PRECISION.   
-             On entry, ALPHA specifies the scalar alpha.   
-             Unchanged on exit.   
-
-    A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).   
-             Before entry, the leading m by n part of the array A must   
-             contain the matrix of coefficients.   
-             Unchanged on exit.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program. LDA must be at least   
-             max( 1, m ).   
-             Unchanged on exit.   
-
-    X      - DOUBLE PRECISION array of DIMENSION at least   
-             ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'   
-             and at least   
-             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.   
-             Before entry, the incremented array X must contain the   
-             vector x.   
-             Unchanged on exit.   
-
-    INCX   - INTEGER.   
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-             Unchanged on exit.   
-
-    BETA   - DOUBLE PRECISION.   
-             On entry, BETA specifies the scalar beta. When BETA is   
-             supplied as zero then Y need not be set on input.   
-             Unchanged on exit.   
-
-    Y      - DOUBLE PRECISION array of DIMENSION at least   
-             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'   
-             and at least   
-             ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.   
-             Before entry with BETA non-zero, the incremented array Y   
-             must contain the vector y. On exit, Y is overwritten by the 
-  
-             updated vector y.   
-
-    INCY   - INTEGER.   
-             On entry, INCY specifies the increment for the elements of   
-             Y. INCY must not be zero.   
-             Unchanged on exit.   
-
-
-    Level 2 Blas routine.   
-
-    -- Written on 22-October-1986.   
-       Jack Dongarra, Argonne National Lab.   
-       Jeremy Du Croz, Nag Central Office.   
-       Sven Hammarling, Nag Central Office.   
-       Richard Hanson, Sandia National Labs.   
-
-
-
-       Test the input parameters.   
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-#define Y(I) y[(I)-1]
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-
-    info = 0;
-    if (! lsame_(trans, "N") && ! lsame_(trans, "T") && ! 
-	    lsame_(trans, "C")) {
-	info = 1;
-    } else if (*m < 0) {
-	info = 2;
-    } else if (*n < 0) {
-	info = 3;
-    } else if (*lda < max(1,*m)) {
-	info = 6;
-    } else if (*incx == 0) {
-	info = 8;
-    } else if (*incy == 0) {
-	info = 11;
-    }
-    if (info != 0) {
-	xerbla_("DGEMV ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*m == 0 || *n == 0 || *alpha == 0. && *beta == 1.) {
-	return 0;
-    }
-
-/*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set 
-  
-       up the start points in  X  and  Y. */
-
-    if (lsame_(trans, "N")) {
-	lenx = *n;
-	leny = *m;
-    } else {
-	lenx = *m;
-	leny = *n;
-    }
-    if (*incx > 0) {
-	kx = 1;
-    } else {
-	kx = 1 - (lenx - 1) * *incx;
-    }
-    if (*incy > 0) {
-	ky = 1;
-    } else {
-	ky = 1 - (leny - 1) * *incy;
-    }
-
-/*     Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through A.   
-
-       First form  y := beta*y. */
-
-    if (*beta != 1.) {
-	if (*incy == 1) {
-	    if (*beta == 0.) {
-		i__1 = leny;
-		for (i = 1; i <= leny; ++i) {
-		    Y(i) = 0.;
-/* L10: */
-		}
-	    } else {
-		i__1 = leny;
-		for (i = 1; i <= leny; ++i) {
-		    Y(i) = *beta * Y(i);
-/* L20: */
-		}
-	    }
-	} else {
-	    iy = ky;
-	    if (*beta == 0.) {
-		i__1 = leny;
-		for (i = 1; i <= leny; ++i) {
-		    Y(iy) = 0.;
-		    iy += *incy;
-/* L30: */
-		}
-	    } else {
-		i__1 = leny;
-		for (i = 1; i <= leny; ++i) {
-		    Y(iy) = *beta * Y(iy);
-		    iy += *incy;
-/* L40: */
-		}
-	    }
-	}
-    }
-    if (*alpha == 0.) {
-	return 0;
-    }
-    if (lsame_(trans, "N")) {
-
-/*        Form  y := alpha*A*x + y. */
-
-	jx = kx;
-	if (*incy == 1) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		if (X(jx) != 0.) {
-		    temp = *alpha * X(jx);
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			Y(i) += temp * A(i,j);
-/* L50: */
-		    }
-		}
-		jx += *incx;
-/* L60: */
-	    }
-	} else {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		if (X(jx) != 0.) {
-		    temp = *alpha * X(jx);
-		    iy = ky;
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			Y(iy) += temp * A(i,j);
-			iy += *incy;
-/* L70: */
-		    }
-		}
-		jx += *incx;
-/* L80: */
-	    }
-	}
-    } else {
-
-/*        Form  y := alpha*A'*x + y. */
-
-	jy = ky;
-	if (*incx == 1) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		temp = 0.;
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    temp += A(i,j) * X(i);
-/* L90: */
-		}
-		Y(jy) += *alpha * temp;
-		jy += *incy;
-/* L100: */
-	    }
-	} else {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		temp = 0.;
-		ix = kx;
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    temp += A(i,j) * X(ix);
-		    ix += *incx;
-/* L110: */
-		}
-		Y(jy) += *alpha * temp;
-		jy += *incy;
-/* L120: */
-	    }
-	}
-    }
-
-    return 0;
-
-/*     End of DGEMV . */
-
-} /* dgemv_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dger.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dger.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dger.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dger.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,182 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int dger_(integer *m, integer *n, doublereal *alpha, 
-	doublereal *x, integer *incx, doublereal *y, integer *incy, 
-	doublereal *a, integer *lda)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2;
-
-    /* Local variables */
-    static integer info;
-    static doublereal temp;
-    static integer i, j, ix, jy, kx;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-
-
-/*  Purpose   
-    =======   
-
-    DGER   performs the rank 1 operation   
-
-       A := alpha*x*y' + A,   
-
-    where alpha is a scalar, x is an m element vector, y is an n element 
-  
-    vector and A is an m by n matrix.   
-
-    Parameters   
-    ==========   
-
-    M      - INTEGER.   
-             On entry, M specifies the number of rows of the matrix A.   
-             M must be at least zero.   
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry, N specifies the number of columns of the matrix A. 
-  
-             N must be at least zero.   
-             Unchanged on exit.   
-
-    ALPHA  - DOUBLE PRECISION.   
-             On entry, ALPHA specifies the scalar alpha.   
-             Unchanged on exit.   
-
-    X      - DOUBLE PRECISION array of dimension at least   
-             ( 1 + ( m - 1 )*abs( INCX ) ).   
-             Before entry, the incremented array X must contain the m   
-             element vector x.   
-             Unchanged on exit.   
-
-    INCX   - INTEGER.   
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-             Unchanged on exit.   
-
-    Y      - DOUBLE PRECISION array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCY ) ).   
-             Before entry, the incremented array Y must contain the n   
-             element vector y.   
-             Unchanged on exit.   
-
-    INCY   - INTEGER.   
-             On entry, INCY specifies the increment for the elements of   
-             Y. INCY must not be zero.   
-             Unchanged on exit.   
-
-    A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).   
-             Before entry, the leading m by n part of the array A must   
-             contain the matrix of coefficients. On exit, A is   
-             overwritten by the updated matrix.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program. LDA must be at least   
-             max( 1, m ).   
-             Unchanged on exit.   
-
-
-    Level 2 Blas routine.   
-
-    -- Written on 22-October-1986.   
-       Jack Dongarra, Argonne National Lab.   
-       Jeremy Du Croz, Nag Central Office.   
-       Sven Hammarling, Nag Central Office.   
-       Richard Hanson, Sandia National Labs.   
-
-
-
-       Test the input parameters.   
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-#define Y(I) y[(I)-1]
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-
-    info = 0;
-    if (*m < 0) {
-	info = 1;
-    } else if (*n < 0) {
-	info = 2;
-    } else if (*incx == 0) {
-	info = 5;
-    } else if (*incy == 0) {
-	info = 7;
-    } else if (*lda < max(1,*m)) {
-	info = 9;
-    }
-    if (info != 0) {
-	xerbla_("DGER  ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*m == 0 || *n == 0 || *alpha == 0.) {
-	return 0;
-    }
-
-/*     Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through A. */
-
-    if (*incy > 0) {
-	jy = 1;
-    } else {
-	jy = 1 - (*n - 1) * *incy;
-    }
-    if (*incx == 1) {
-	i__1 = *n;
-	for (j = 1; j <= *n; ++j) {
-	    if (Y(jy) != 0.) {
-		temp = *alpha * Y(jy);
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    A(i,j) += X(i) * temp;
-/* L10: */
-		}
-	    }
-	    jy += *incy;
-/* L20: */
-	}
-    } else {
-	if (*incx > 0) {
-	    kx = 1;
-	} else {
-	    kx = 1 - (*m - 1) * *incx;
-	}
-	i__1 = *n;
-	for (j = 1; j <= *n; ++j) {
-	    if (Y(jy) != 0.) {
-		temp = *alpha * Y(jy);
-		ix = kx;
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    A(i,j) += X(ix) * temp;
-		    ix += *incx;
-/* L30: */
-		}
-	    }
-	    jy += *incy;
-/* L40: */
-	}
-    }
-
-    return 0;
-
-/*     End of DGER  . */
-
-} /* dger_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dnrm2.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dnrm2.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dnrm2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dnrm2.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,83 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-doublereal dnrm2_(integer *n, doublereal *x, integer *incx)
-{
-
-
-    /* System generated locals */
-    integer i__1, i__2;
-    doublereal ret_val, d__1;
-
-    /* Builtin functions */
-    double sqrt(doublereal);
-
-    /* Local variables */
-    static doublereal norm, scale, absxi;
-    static integer ix;
-    static doublereal ssq;
-
-
-/*  DNRM2 returns the euclidean norm of a vector via the function   
-    name, so that   
-
-       DNRM2 := sqrt( x'*x )   
-
-
-
-    -- This version written on 25-October-1982.   
-       Modified on 14-October-1993 to inline the call to DLASSQ.   
-       Sven Hammarling, Nag Ltd.   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-
-
-    if (*n < 1 || *incx < 1) {
-	norm = 0.;
-    } else if (*n == 1) {
-	norm = abs(X(1));
-    } else {
-	scale = 0.;
-	ssq = 1.;
-/*        The following loop is equivalent to this call to the LAPACK 
-  
-          auxiliary routine:   
-          CALL DLASSQ( N, X, INCX, SCALE, SSQ ) */
-
-	i__1 = (*n - 1) * *incx + 1;
-	i__2 = *incx;
-	for (ix = 1; *incx < 0 ? ix >= (*n-1)**incx+1 : ix <= (*n-1)**incx+1; ix += *incx) {
-	    if (X(ix) != 0.) {
-		absxi = (d__1 = X(ix), abs(d__1));
-		if (scale < absxi) {
-/* Computing 2nd power */
-		    d__1 = scale / absxi;
-		    ssq = ssq * (d__1 * d__1) + 1.;
-		    scale = absxi;
-		} else {
-/* Computing 2nd power */
-		    d__1 = absxi / scale;
-		    ssq += d__1 * d__1;
-		}
-	    }
-/* L10: */
-	}
-	norm = scale * sqrt(ssq);
-    }
-
-    ret_val = norm;
-    return ret_val;
-
-/*     End of DNRM2. */
-
-} /* dnrm2_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/drot.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/drot.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/drot.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/drot.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,76 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int drot_(integer *n, doublereal *dx, integer *incx, 
-	doublereal *dy, integer *incy, doublereal *c, doublereal *s)
-{
-
-
-    /* System generated locals */
-    integer i__1;
-
-    /* Local variables */
-    static integer i;
-    static doublereal dtemp;
-    static integer ix, iy;
-
-
-/*     applies a plane rotation.   
-       jack dongarra, linpack, 3/11/78.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define DY(I) dy[(I)-1]
-#define DX(I) dx[(I)-1]
-
-
-    if (*n <= 0) {
-	return 0;
-    }
-    if (*incx == 1 && *incy == 1) {
-	goto L20;
-    }
-
-/*       code for unequal increments or equal increments not equal   
-           to 1 */
-
-    ix = 1;
-    iy = 1;
-    if (*incx < 0) {
-	ix = (-(*n) + 1) * *incx + 1;
-    }
-    if (*incy < 0) {
-	iy = (-(*n) + 1) * *incy + 1;
-    }
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	dtemp = *c * DX(ix) + *s * DY(iy);
-	DY(iy) = *c * DY(iy) - *s * DX(ix);
-	DX(ix) = dtemp;
-	ix += *incx;
-	iy += *incy;
-/* L10: */
-    }
-    return 0;
-
-/*       code for both increments equal to 1 */
-
-L20:
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	dtemp = *c * DX(i) + *s * DY(i);
-	DY(i) = *c * DY(i) - *s * DX(i);
-	DX(i) = dtemp;
-/* L30: */
-    }
-    return 0;
-} /* drot_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dscal.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dscal.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dscal.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dscal.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,83 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int dscal_(integer *n, doublereal *da, doublereal *dx, 
-	integer *incx)
-{
-
-
-    /* System generated locals */
-    integer i__1, i__2;
-
-    /* Local variables */
-    static integer i, m, nincx, mp1;
-
-
-/*     scales a vector by a constant.   
-       uses unrolled loops for increment equal to one.   
-       jack dongarra, linpack, 3/11/78.   
-       modified 3/93 to return if incx .le. 0.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define DX(I) dx[(I)-1]
-
-
-    if (*n <= 0 || *incx <= 0) {
-	return 0;
-    }
-    if (*incx == 1) {
-	goto L20;
-    }
-
-/*        code for increment not equal to 1 */
-
-    nincx = *n * *incx;
-    i__1 = nincx;
-    i__2 = *incx;
-    for (i = 1; *incx < 0 ? i >= nincx : i <= nincx; i += *incx) {
-	DX(i) = *da * DX(i);
-/* L10: */
-    }
-    return 0;
-
-/*        code for increment equal to 1   
-
-
-          clean-up loop */
-
-L20:
-    m = *n % 5;
-    if (m == 0) {
-	goto L40;
-    }
-    i__2 = m;
-    for (i = 1; i <= m; ++i) {
-	DX(i) = *da * DX(i);
-/* L30: */
-    }
-    if (*n < 5) {
-	return 0;
-    }
-L40:
-    mp1 = m + 1;
-    i__2 = *n;
-    for (i = mp1; i <= *n; i += 5) {
-	DX(i) = *da * DX(i);
-	DX(i + 1) = *da * DX(i + 1);
-	DX(i + 2) = *da * DX(i + 2);
-	DX(i + 3) = *da * DX(i + 3);
-	DX(i + 4) = *da * DX(i + 4);
-/* L50: */
-    }
-    return 0;
-} /* dscal_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dsymv.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dsymv.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dsymv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dsymv.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,300 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int dsymv_(char *uplo, integer *n, doublereal *alpha, 
-	doublereal *a, integer *lda, doublereal *x, integer *incx, doublereal 
-	*beta, doublereal *y, integer *incy)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2;
-
-    /* Local variables */
-    static integer info;
-    static doublereal temp1, temp2;
-    static integer i, j;
-    extern logical lsame_(char *, char *);
-    static integer ix, iy, jx, jy, kx, ky;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-
-
-/*  Purpose   
-    =======   
-
-    DSYMV  performs the matrix-vector  operation   
-
-       y := alpha*A*x + beta*y,   
-
-    where alpha and beta are scalars, x and y are n element vectors and   
-    A is an n by n symmetric matrix.   
-
-    Parameters   
-    ==========   
-
-    UPLO   - CHARACTER*1.   
-             On entry, UPLO specifies whether the upper or lower   
-             triangular part of the array A is to be referenced as   
-             follows:   
-
-                UPLO = 'U' or 'u'   Only the upper triangular part of A   
-                                    is to be referenced.   
-
-                UPLO = 'L' or 'l'   Only the lower triangular part of A   
-                                    is to be referenced.   
-
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry, N specifies the order of the matrix A.   
-             N must be at least zero.   
-             Unchanged on exit.   
-
-    ALPHA  - DOUBLE PRECISION.   
-             On entry, ALPHA specifies the scalar alpha.   
-             Unchanged on exit.   
-
-    A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).   
-             Before entry with  UPLO = 'U' or 'u', the leading n by n   
-             upper triangular part of the array A must contain the upper 
-  
-             triangular part of the symmetric matrix and the strictly   
-             lower triangular part of A is not referenced.   
-             Before entry with UPLO = 'L' or 'l', the leading n by n   
-             lower triangular part of the array A must contain the lower 
-  
-             triangular part of the symmetric matrix and the strictly   
-             upper triangular part of A is not referenced.   
-             Unchanged on exit.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program. LDA must be at least   
-             max( 1, n ).   
-             Unchanged on exit.   
-
-    X      - DOUBLE PRECISION array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCX ) ).   
-             Before entry, the incremented array X must contain the n   
-             element vector x.   
-             Unchanged on exit.   
-
-    INCX   - INTEGER.   
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-             Unchanged on exit.   
-
-    BETA   - DOUBLE PRECISION.   
-             On entry, BETA specifies the scalar beta. When BETA is   
-             supplied as zero then Y need not be set on input.   
-             Unchanged on exit.   
-
-    Y      - DOUBLE PRECISION array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCY ) ).   
-             Before entry, the incremented array Y must contain the n   
-             element vector y. On exit, Y is overwritten by the updated   
-             vector y.   
-
-    INCY   - INTEGER.   
-             On entry, INCY specifies the increment for the elements of   
-             Y. INCY must not be zero.   
-             Unchanged on exit.   
-
-
-    Level 2 Blas routine.   
-
-    -- Written on 22-October-1986.   
-       Jack Dongarra, Argonne National Lab.   
-       Jeremy Du Croz, Nag Central Office.   
-       Sven Hammarling, Nag Central Office.   
-       Richard Hanson, Sandia National Labs.   
-
-
-
-       Test the input parameters.   
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-#define Y(I) y[(I)-1]
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-
-    info = 0;
-    if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
-	info = 1;
-    } else if (*n < 0) {
-	info = 2;
-    } else if (*lda < max(1,*n)) {
-	info = 5;
-    } else if (*incx == 0) {
-	info = 7;
-    } else if (*incy == 0) {
-	info = 10;
-    }
-    if (info != 0) {
-	xerbla_("DSYMV ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*n == 0 || *alpha == 0. && *beta == 1.) {
-	return 0;
-    }
-
-/*     Set up the start points in  X  and  Y. */
-
-    if (*incx > 0) {
-	kx = 1;
-    } else {
-	kx = 1 - (*n - 1) * *incx;
-    }
-    if (*incy > 0) {
-	ky = 1;
-    } else {
-	ky = 1 - (*n - 1) * *incy;
-    }
-
-/*     Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through the triangular part   
-       of A.   
-
-       First form  y := beta*y. */
-
-    if (*beta != 1.) {
-	if (*incy == 1) {
-	    if (*beta == 0.) {
-		i__1 = *n;
-		for (i = 1; i <= *n; ++i) {
-		    Y(i) = 0.;
-/* L10: */
-		}
-	    } else {
-		i__1 = *n;
-		for (i = 1; i <= *n; ++i) {
-		    Y(i) = *beta * Y(i);
-/* L20: */
-		}
-	    }
-	} else {
-	    iy = ky;
-	    if (*beta == 0.) {
-		i__1 = *n;
-		for (i = 1; i <= *n; ++i) {
-		    Y(iy) = 0.;
-		    iy += *incy;
-/* L30: */
-		}
-	    } else {
-		i__1 = *n;
-		for (i = 1; i <= *n; ++i) {
-		    Y(iy) = *beta * Y(iy);
-		    iy += *incy;
-/* L40: */
-		}
-	    }
-	}
-    }
-    if (*alpha == 0.) {
-	return 0;
-    }
-    if (lsame_(uplo, "U")) {
-
-/*        Form  y  when A is stored in upper triangle. */
-
-	if (*incx == 1 && *incy == 1) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		temp1 = *alpha * X(j);
-		temp2 = 0.;
-		i__2 = j - 1;
-		for (i = 1; i <= j-1; ++i) {
-		    Y(i) += temp1 * A(i,j);
-		    temp2 += A(i,j) * X(i);
-/* L50: */
-		}
-		Y(j) = Y(j) + temp1 * A(j,j) + *alpha * temp2;
-/* L60: */
-	    }
-	} else {
-	    jx = kx;
-	    jy = ky;
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		temp1 = *alpha * X(jx);
-		temp2 = 0.;
-		ix = kx;
-		iy = ky;
-		i__2 = j - 1;
-		for (i = 1; i <= j-1; ++i) {
-		    Y(iy) += temp1 * A(i,j);
-		    temp2 += A(i,j) * X(ix);
-		    ix += *incx;
-		    iy += *incy;
-/* L70: */
-		}
-		Y(jy) = Y(jy) + temp1 * A(j,j) + *alpha * temp2;
-		jx += *incx;
-		jy += *incy;
-/* L80: */
-	    }
-	}
-    } else {
-
-/*        Form  y  when A is stored in lower triangle. */
-
-	if (*incx == 1 && *incy == 1) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		temp1 = *alpha * X(j);
-		temp2 = 0.;
-		Y(j) += temp1 * A(j,j);
-		i__2 = *n;
-		for (i = j + 1; i <= *n; ++i) {
-		    Y(i) += temp1 * A(i,j);
-		    temp2 += A(i,j) * X(i);
-/* L90: */
-		}
-		Y(j) += *alpha * temp2;
-/* L100: */
-	    }
-	} else {
-	    jx = kx;
-	    jy = ky;
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		temp1 = *alpha * X(jx);
-		temp2 = 0.;
-		Y(jy) += temp1 * A(j,j);
-		ix = jx;
-		iy = jy;
-		i__2 = *n;
-		for (i = j + 1; i <= *n; ++i) {
-		    ix += *incx;
-		    iy += *incy;
-		    Y(iy) += temp1 * A(i,j);
-		    temp2 += A(i,j) * X(ix);
-/* L110: */
-		}
-		Y(jy) += *alpha * temp2;
-		jx += *incx;
-		jy += *incy;
-/* L120: */
-	    }
-	}
-    }
-
-    return 0;
-
-/*     End of DSYMV . */
-
-} /* dsymv_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dsyr2.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dsyr2.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dsyr2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dsyr2.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,264 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int dsyr2_(char *uplo, integer *n, doublereal *alpha, 
-	doublereal *x, integer *incx, doublereal *y, integer *incy, 
-	doublereal *a, integer *lda)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2;
-
-    /* Local variables */
-    static integer info;
-    static doublereal temp1, temp2;
-    static integer i, j;
-    extern logical lsame_(char *, char *);
-    static integer ix, iy, jx, jy, kx, ky;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-
-
-/*  Purpose   
-    =======   
-
-    DSYR2  performs the symmetric rank 2 operation   
-
-       A := alpha*x*y' + alpha*y*x' + A,   
-
-    where alpha is a scalar, x and y are n element vectors and A is an n 
-  
-    by n symmetric matrix.   
-
-    Parameters   
-    ==========   
-
-    UPLO   - CHARACTER*1.   
-             On entry, UPLO specifies whether the upper or lower   
-             triangular part of the array A is to be referenced as   
-             follows:   
-
-                UPLO = 'U' or 'u'   Only the upper triangular part of A   
-                                    is to be referenced.   
-
-                UPLO = 'L' or 'l'   Only the lower triangular part of A   
-                                    is to be referenced.   
-
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry, N specifies the order of the matrix A.   
-             N must be at least zero.   
-             Unchanged on exit.   
-
-    ALPHA  - DOUBLE PRECISION.   
-             On entry, ALPHA specifies the scalar alpha.   
-             Unchanged on exit.   
-
-    X      - DOUBLE PRECISION array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCX ) ).   
-             Before entry, the incremented array X must contain the n   
-             element vector x.   
-             Unchanged on exit.   
-
-    INCX   - INTEGER.   
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-             Unchanged on exit.   
-
-    Y      - DOUBLE PRECISION array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCY ) ).   
-             Before entry, the incremented array Y must contain the n   
-             element vector y.   
-             Unchanged on exit.   
-
-    INCY   - INTEGER.   
-             On entry, INCY specifies the increment for the elements of   
-             Y. INCY must not be zero.   
-             Unchanged on exit.   
-
-    A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).   
-             Before entry with  UPLO = 'U' or 'u', the leading n by n   
-             upper triangular part of the array A must contain the upper 
-  
-             triangular part of the symmetric matrix and the strictly   
-             lower triangular part of A is not referenced. On exit, the   
-             upper triangular part of the array A is overwritten by the   
-             upper triangular part of the updated matrix.   
-             Before entry with UPLO = 'L' or 'l', the leading n by n   
-             lower triangular part of the array A must contain the lower 
-  
-             triangular part of the symmetric matrix and the strictly   
-             upper triangular part of A is not referenced. On exit, the   
-             lower triangular part of the array A is overwritten by the   
-             lower triangular part of the updated matrix.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program. LDA must be at least   
-             max( 1, n ).   
-             Unchanged on exit.   
-
-
-    Level 2 Blas routine.   
-
-    -- Written on 22-October-1986.   
-       Jack Dongarra, Argonne National Lab.   
-       Jeremy Du Croz, Nag Central Office.   
-       Sven Hammarling, Nag Central Office.   
-       Richard Hanson, Sandia National Labs.   
-
-
-
-       Test the input parameters.   
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-#define Y(I) y[(I)-1]
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-
-    info = 0;
-    if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
-	info = 1;
-    } else if (*n < 0) {
-	info = 2;
-    } else if (*incx == 0) {
-	info = 5;
-    } else if (*incy == 0) {
-	info = 7;
-    } else if (*lda < max(1,*n)) {
-	info = 9;
-    }
-    if (info != 0) {
-	xerbla_("DSYR2 ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*n == 0 || *alpha == 0.) {
-	return 0;
-    }
-
-/*     Set up the start points in X and Y if the increments are not both 
-  
-       unity. */
-
-    if (*incx != 1 || *incy != 1) {
-	if (*incx > 0) {
-	    kx = 1;
-	} else {
-	    kx = 1 - (*n - 1) * *incx;
-	}
-	if (*incy > 0) {
-	    ky = 1;
-	} else {
-	    ky = 1 - (*n - 1) * *incy;
-	}
-	jx = kx;
-	jy = ky;
-    }
-
-/*     Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through the triangular part   
-       of A. */
-
-    if (lsame_(uplo, "U")) {
-
-/*        Form  A  when A is stored in the upper triangle. */
-
-	if (*incx == 1 && *incy == 1) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		if (X(j) != 0. || Y(j) != 0.) {
-		    temp1 = *alpha * Y(j);
-		    temp2 = *alpha * X(j);
-		    i__2 = j;
-		    for (i = 1; i <= j; ++i) {
-			A(i,j) = A(i,j) + X(i) * temp1 
-				+ Y(i) * temp2;
-/* L10: */
-		    }
-		}
-/* L20: */
-	    }
-	} else {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		if (X(jx) != 0. || Y(jy) != 0.) {
-		    temp1 = *alpha * Y(jy);
-		    temp2 = *alpha * X(jx);
-		    ix = kx;
-		    iy = ky;
-		    i__2 = j;
-		    for (i = 1; i <= j; ++i) {
-			A(i,j) = A(i,j) + X(ix) * temp1 
-				+ Y(iy) * temp2;
-			ix += *incx;
-			iy += *incy;
-/* L30: */
-		    }
-		}
-		jx += *incx;
-		jy += *incy;
-/* L40: */
-	    }
-	}
-    } else {
-
-/*        Form  A  when A is stored in the lower triangle. */
-
-	if (*incx == 1 && *incy == 1) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		if (X(j) != 0. || Y(j) != 0.) {
-		    temp1 = *alpha * Y(j);
-		    temp2 = *alpha * X(j);
-		    i__2 = *n;
-		    for (i = j; i <= *n; ++i) {
-			A(i,j) = A(i,j) + X(i) * temp1 
-				+ Y(i) * temp2;
-/* L50: */
-		    }
-		}
-/* L60: */
-	    }
-	} else {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		if (X(jx) != 0. || Y(jy) != 0.) {
-		    temp1 = *alpha * Y(jy);
-		    temp2 = *alpha * X(jx);
-		    ix = jx;
-		    iy = jy;
-		    i__2 = *n;
-		    for (i = j; i <= *n; ++i) {
-			A(i,j) = A(i,j) + X(ix) * temp1 
-				+ Y(iy) * temp2;
-			ix += *incx;
-			iy += *incy;
-/* L70: */
-		    }
-		}
-		jx += *incx;
-		jy += *incy;
-/* L80: */
-	    }
-	}
-    }
-
-    return 0;
-
-/*     End of DSYR2 . */
-
-} /* dsyr2_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dtrsm.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dtrsm.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dtrsm.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dtrsm.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,485 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int dtrsm_(char *side, char *uplo, char *transa, char *diag, 
-	integer *m, integer *n, doublereal *alpha, doublereal *a, integer *
-	lda, doublereal *b, integer *ldb)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3;
-
-    /* Local variables */
-    static integer info;
-    static doublereal temp;
-    static integer i, j, k;
-    static logical lside;
-    extern logical lsame_(char *, char *);
-    static integer nrowa;
-    static logical upper;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-    static logical nounit;
-
-
-/*  Purpose   
-    =======   
-
-    DTRSM  solves one of the matrix equations   
-
-       op( A )*X = alpha*B,   or   X*op( A ) = alpha*B,   
-
-    where alpha is a scalar, X and B are m by n matrices, A is a unit, or 
-  
-    non-unit,  upper or lower triangular matrix  and  op( A )  is one  of 
-  
-
-       op( A ) = A   or   op( A ) = A'.   
-
-    The matrix X is overwritten on B.   
-
-    Parameters   
-    ==========   
-
-    SIDE   - CHARACTER*1.   
-             On entry, SIDE specifies whether op( A ) appears on the left 
-  
-             or right of X as follows:   
-
-                SIDE = 'L' or 'l'   op( A )*X = alpha*B.   
-
-                SIDE = 'R' or 'r'   X*op( A ) = alpha*B.   
-
-             Unchanged on exit.   
-
-    UPLO   - CHARACTER*1.   
-             On entry, UPLO specifies whether the matrix A is an upper or 
-  
-             lower triangular matrix as follows:   
-
-                UPLO = 'U' or 'u'   A is an upper triangular matrix.   
-
-                UPLO = 'L' or 'l'   A is a lower triangular matrix.   
-
-             Unchanged on exit.   
-
-    TRANSA - CHARACTER*1.   
-             On entry, TRANSA specifies the form of op( A ) to be used in 
-  
-             the matrix multiplication as follows:   
-
-                TRANSA = 'N' or 'n'   op( A ) = A.   
-
-                TRANSA = 'T' or 't'   op( A ) = A'.   
-
-                TRANSA = 'C' or 'c'   op( A ) = A'.   
-
-             Unchanged on exit.   
-
-    DIAG   - CHARACTER*1.   
-             On entry, DIAG specifies whether or not A is unit triangular 
-  
-             as follows:   
-
-                DIAG = 'U' or 'u'   A is assumed to be unit triangular.   
-
-                DIAG = 'N' or 'n'   A is not assumed to be unit   
-                                    triangular.   
-
-             Unchanged on exit.   
-
-    M      - INTEGER.   
-             On entry, M specifies the number of rows of B. M must be at 
-  
-             least zero.   
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry, N specifies the number of columns of B.  N must be 
-  
-             at least zero.   
-             Unchanged on exit.   
-
-    ALPHA  - DOUBLE PRECISION.   
-             On entry,  ALPHA specifies the scalar  alpha. When  alpha is 
-  
-             zero then  A is not referenced and  B need not be set before 
-  
-             entry.   
-             Unchanged on exit.   
-
-    A      - DOUBLE PRECISION array of DIMENSION ( LDA, k ), where k is m 
-  
-             when  SIDE = 'L' or 'l'  and is  n  when  SIDE = 'R' or 'r'. 
-  
-             Before entry  with  UPLO = 'U' or 'u',  the  leading  k by k 
-  
-             upper triangular part of the array  A must contain the upper 
-  
-             triangular matrix  and the strictly lower triangular part of 
-  
-             A is not referenced.   
-             Before entry  with  UPLO = 'L' or 'l',  the  leading  k by k 
-  
-             lower triangular part of the array  A must contain the lower 
-  
-             triangular matrix  and the strictly upper triangular part of 
-  
-             A is not referenced.   
-             Note that when  DIAG = 'U' or 'u',  the diagonal elements of 
-  
-             A  are not referenced either,  but are assumed to be  unity. 
-  
-             Unchanged on exit.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program.  When  SIDE = 'L' or 'l'  then 
-  
-             LDA  must be at least  max( 1, m ),  when  SIDE = 'R' or 'r' 
-  
-             then LDA must be at least max( 1, n ).   
-             Unchanged on exit.   
-
-    B      - DOUBLE PRECISION array of DIMENSION ( LDB, n ).   
-             Before entry,  the leading  m by n part of the array  B must 
-  
-             contain  the  right-hand  side  matrix  B,  and  on exit  is 
-  
-             overwritten by the solution matrix  X.   
-
-    LDB    - INTEGER.   
-             On entry, LDB specifies the first dimension of B as declared 
-  
-             in  the  calling  (sub)  program.   LDB  must  be  at  least 
-  
-             max( 1, m ).   
-             Unchanged on exit.   
-
-
-    Level 3 Blas routine.   
-
-
-    -- Written on 8-February-1989.   
-       Jack Dongarra, Argonne National Laboratory.   
-       Iain Duff, AERE Harwell.   
-       Jeremy Du Croz, Numerical Algorithms Group Ltd.   
-       Sven Hammarling, Numerical Algorithms Group Ltd.   
-
-
-
-       Test the input parameters.   
-
-    
-   Parameter adjustments   
-       Function Body */
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-#define B(I,J) b[(I)-1 + ((J)-1)* ( *ldb)]
-
-    lside = lsame_(side, "L");
-    if (lside) {
-	nrowa = *m;
-    } else {
-	nrowa = *n;
-    }
-    nounit = lsame_(diag, "N");
-    upper = lsame_(uplo, "U");
-
-    info = 0;
-    if (! lside && ! lsame_(side, "R")) {
-	info = 1;
-    } else if (! upper && ! lsame_(uplo, "L")) {
-	info = 2;
-    } else if (! lsame_(transa, "N") && ! lsame_(transa, "T") 
-	    && ! lsame_(transa, "C")) {
-	info = 3;
-    } else if (! lsame_(diag, "U") && ! lsame_(diag, "N")) {
-	info = 4;
-    } else if (*m < 0) {
-	info = 5;
-    } else if (*n < 0) {
-	info = 6;
-    } else if (*lda < max(1,nrowa)) {
-	info = 9;
-    } else if (*ldb < max(1,*m)) {
-	info = 11;
-    }
-    if (info != 0) {
-	xerbla_("DTRSM ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*n == 0) {
-	return 0;
-    }
-
-/*     And when  alpha.eq.zero. */
-
-    if (*alpha == 0.) {
-	i__1 = *n;
-	for (j = 1; j <= *n; ++j) {
-	    i__2 = *m;
-	    for (i = 1; i <= *m; ++i) {
-		B(i,j) = 0.;
-/* L10: */
-	    }
-/* L20: */
-	}
-	return 0;
-    }
-
-/*     Start the operations. */
-
-    if (lside) {
-	if (lsame_(transa, "N")) {
-
-/*           Form  B := alpha*inv( A )*B. */
-
-	    if (upper) {
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    if (*alpha != 1.) {
-			i__2 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    B(i,j) = *alpha * B(i,j);
-/* L30: */
-			}
-		    }
-		    for (k = *m; k >= 1; --k) {
-			if (B(k,j) != 0.) {
-			    if (nounit) {
-				B(k,j) /= A(k,k);
-			    }
-			    i__2 = k - 1;
-			    for (i = 1; i <= k-1; ++i) {
-				B(i,j) -= B(k,j) * A(i,k);
-/* L40: */
-			    }
-			}
-/* L50: */
-		    }
-/* L60: */
-		}
-	    } else {
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    if (*alpha != 1.) {
-			i__2 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    B(i,j) = *alpha * B(i,j);
-/* L70: */
-			}
-		    }
-		    i__2 = *m;
-		    for (k = 1; k <= *m; ++k) {
-			if (B(k,j) != 0.) {
-			    if (nounit) {
-				B(k,j) /= A(k,k);
-			    }
-			    i__3 = *m;
-			    for (i = k + 1; i <= *m; ++i) {
-				B(i,j) -= B(k,j) * A(i,k);
-/* L80: */
-			    }
-			}
-/* L90: */
-		    }
-/* L100: */
-		}
-	    }
-	} else {
-
-/*           Form  B := alpha*inv( A' )*B. */
-
-	    if (upper) {
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			temp = *alpha * B(i,j);
-			i__3 = i - 1;
-			for (k = 1; k <= i-1; ++k) {
-			    temp -= A(k,i) * B(k,j);
-/* L110: */
-			}
-			if (nounit) {
-			    temp /= A(i,i);
-			}
-			B(i,j) = temp;
-/* L120: */
-		    }
-/* L130: */
-		}
-	    } else {
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    for (i = *m; i >= 1; --i) {
-			temp = *alpha * B(i,j);
-			i__2 = *m;
-			for (k = i + 1; k <= *m; ++k) {
-			    temp -= A(k,i) * B(k,j);
-/* L140: */
-			}
-			if (nounit) {
-			    temp /= A(i,i);
-			}
-			B(i,j) = temp;
-/* L150: */
-		    }
-/* L160: */
-		}
-	    }
-	}
-    } else {
-	if (lsame_(transa, "N")) {
-
-/*           Form  B := alpha*B*inv( A ). */
-
-	    if (upper) {
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    if (*alpha != 1.) {
-			i__2 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    B(i,j) = *alpha * B(i,j);
-/* L170: */
-			}
-		    }
-		    i__2 = j - 1;
-		    for (k = 1; k <= j-1; ++k) {
-			if (A(k,j) != 0.) {
-			    i__3 = *m;
-			    for (i = 1; i <= *m; ++i) {
-				B(i,j) -= A(k,j) * B(i,k);
-/* L180: */
-			    }
-			}
-/* L190: */
-		    }
-		    if (nounit) {
-			temp = 1. / A(j,j);
-			i__2 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    B(i,j) = temp * B(i,j);
-/* L200: */
-			}
-		    }
-/* L210: */
-		}
-	    } else {
-		for (j = *n; j >= 1; --j) {
-		    if (*alpha != 1.) {
-			i__1 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    B(i,j) = *alpha * B(i,j);
-/* L220: */
-			}
-		    }
-		    i__1 = *n;
-		    for (k = j + 1; k <= *n; ++k) {
-			if (A(k,j) != 0.) {
-			    i__2 = *m;
-			    for (i = 1; i <= *m; ++i) {
-				B(i,j) -= A(k,j) * B(i,k);
-/* L230: */
-			    }
-			}
-/* L240: */
-		    }
-		    if (nounit) {
-			temp = 1. / A(j,j);
-			i__1 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    B(i,j) = temp * B(i,j);
-/* L250: */
-			}
-		    }
-/* L260: */
-		}
-	    }
-	} else {
-
-/*           Form  B := alpha*B*inv( A' ). */
-
-	    if (upper) {
-		for (k = *n; k >= 1; --k) {
-		    if (nounit) {
-			temp = 1. / A(k,k);
-			i__1 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    B(i,k) = temp * B(i,k);
-/* L270: */
-			}
-		    }
-		    i__1 = k - 1;
-		    for (j = 1; j <= k-1; ++j) {
-			if (A(j,k) != 0.) {
-			    temp = A(j,k);
-			    i__2 = *m;
-			    for (i = 1; i <= *m; ++i) {
-				B(i,j) -= temp * B(i,k);
-/* L280: */
-			    }
-			}
-/* L290: */
-		    }
-		    if (*alpha != 1.) {
-			i__1 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    B(i,k) = *alpha * B(i,k);
-/* L300: */
-			}
-		    }
-/* L310: */
-		}
-	    } else {
-		i__1 = *n;
-		for (k = 1; k <= *n; ++k) {
-		    if (nounit) {
-			temp = 1. / A(k,k);
-			i__2 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    B(i,k) = temp * B(i,k);
-/* L320: */
-			}
-		    }
-		    i__2 = *n;
-		    for (j = k + 1; j <= *n; ++j) {
-			if (A(j,k) != 0.) {
-			    temp = A(j,k);
-			    i__3 = *m;
-			    for (i = 1; i <= *m; ++i) {
-				B(i,j) -= temp * B(i,k);
-/* L330: */
-			    }
-			}
-/* L340: */
-		    }
-		    if (*alpha != 1.) {
-			i__2 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    B(i,k) = *alpha * B(i,k);
-/* L350: */
-			}
-		    }
-/* L360: */
-		}
-	    }
-	}
-    }
-
-    return 0;
-
-/*     End of DTRSM . */
-
-} /* dtrsm_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dtrsv.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dtrsv.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dtrsv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dtrsv.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,338 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int dtrsv_(char *uplo, char *trans, char *diag, integer *n, 
-	doublereal *a, integer *lda, doublereal *x, integer *incx)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2;
-
-    /* Local variables */
-    static integer info;
-    static doublereal temp;
-    static integer i, j;
-    extern logical lsame_(char *, char *);
-    static integer ix, jx, kx;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-    static logical nounit;
-
-
-/*  Purpose   
-    =======   
-
-    DTRSV  solves one of the systems of equations   
-
-       A*x = b,   or   A'*x = b,   
-
-    where b and x are n element vectors and A is an n by n unit, or   
-    non-unit, upper or lower triangular matrix.   
-
-    No test for singularity or near-singularity is included in this   
-    routine. Such tests must be performed before calling this routine.   
-
-    Parameters   
-    ==========   
-
-    UPLO   - CHARACTER*1.   
-             On entry, UPLO specifies whether the matrix is an upper or   
-             lower triangular matrix as follows:   
-
-                UPLO = 'U' or 'u'   A is an upper triangular matrix.   
-
-                UPLO = 'L' or 'l'   A is a lower triangular matrix.   
-
-             Unchanged on exit.   
-
-    TRANS  - CHARACTER*1.   
-             On entry, TRANS specifies the equations to be solved as   
-             follows:   
-
-                TRANS = 'N' or 'n'   A*x = b.   
-
-                TRANS = 'T' or 't'   A'*x = b.   
-
-                TRANS = 'C' or 'c'   A'*x = b.   
-
-             Unchanged on exit.   
-
-    DIAG   - CHARACTER*1.   
-             On entry, DIAG specifies whether or not A is unit   
-             triangular as follows:   
-
-                DIAG = 'U' or 'u'   A is assumed to be unit triangular.   
-
-                DIAG = 'N' or 'n'   A is not assumed to be unit   
-                                    triangular.   
-
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry, N specifies the order of the matrix A.   
-             N must be at least zero.   
-             Unchanged on exit.   
-
-    A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).   
-             Before entry with  UPLO = 'U' or 'u', the leading n by n   
-             upper triangular part of the array A must contain the upper 
-  
-             triangular matrix and the strictly lower triangular part of 
-  
-             A is not referenced.   
-             Before entry with UPLO = 'L' or 'l', the leading n by n   
-             lower triangular part of the array A must contain the lower 
-  
-             triangular matrix and the strictly upper triangular part of 
-  
-             A is not referenced.   
-             Note that when  DIAG = 'U' or 'u', the diagonal elements of 
-  
-             A are not referenced either, but are assumed to be unity.   
-             Unchanged on exit.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program. LDA must be at least   
-             max( 1, n ).   
-             Unchanged on exit.   
-
-    X      - DOUBLE PRECISION array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCX ) ).   
-             Before entry, the incremented array X must contain the n   
-             element right-hand side vector b. On exit, X is overwritten 
-  
-             with the solution vector x.   
-
-    INCX   - INTEGER.   
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-             Unchanged on exit.   
-
-
-    Level 2 Blas routine.   
-
-    -- Written on 22-October-1986.   
-       Jack Dongarra, Argonne National Lab.   
-       Jeremy Du Croz, Nag Central Office.   
-       Sven Hammarling, Nag Central Office.   
-       Richard Hanson, Sandia National Labs.   
-
-
-
-       Test the input parameters.   
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-
-    info = 0;
-    if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
-	info = 1;
-    } else if (! lsame_(trans, "N") && ! lsame_(trans, "T") &&
-	     ! lsame_(trans, "C")) {
-	info = 2;
-    } else if (! lsame_(diag, "U") && ! lsame_(diag, "N")) {
-	info = 3;
-    } else if (*n < 0) {
-	info = 4;
-    } else if (*lda < max(1,*n)) {
-	info = 6;
-    } else if (*incx == 0) {
-	info = 8;
-    }
-    if (info != 0) {
-	xerbla_("DTRSV ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*n == 0) {
-	return 0;
-    }
-
-    nounit = lsame_(diag, "N");
-
-/*     Set up the start point in X if the increment is not unity. This   
-       will be  ( N - 1 )*INCX  too small for descending loops. */
-
-    if (*incx <= 0) {
-	kx = 1 - (*n - 1) * *incx;
-    } else if (*incx != 1) {
-	kx = 1;
-    }
-
-/*     Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through A. */
-
-    if (lsame_(trans, "N")) {
-
-/*        Form  x := inv( A )*x. */
-
-	if (lsame_(uplo, "U")) {
-	    if (*incx == 1) {
-		for (j = *n; j >= 1; --j) {
-		    if (X(j) != 0.) {
-			if (nounit) {
-			    X(j) /= A(j,j);
-			}
-			temp = X(j);
-			for (i = j - 1; i >= 1; --i) {
-			    X(i) -= temp * A(i,j);
-/* L10: */
-			}
-		    }
-/* L20: */
-		}
-	    } else {
-		jx = kx + (*n - 1) * *incx;
-		for (j = *n; j >= 1; --j) {
-		    if (X(jx) != 0.) {
-			if (nounit) {
-			    X(jx) /= A(j,j);
-			}
-			temp = X(jx);
-			ix = jx;
-			for (i = j - 1; i >= 1; --i) {
-			    ix -= *incx;
-			    X(ix) -= temp * A(i,j);
-/* L30: */
-			}
-		    }
-		    jx -= *incx;
-/* L40: */
-		}
-	    }
-	} else {
-	    if (*incx == 1) {
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    if (X(j) != 0.) {
-			if (nounit) {
-			    X(j) /= A(j,j);
-			}
-			temp = X(j);
-			i__2 = *n;
-			for (i = j + 1; i <= *n; ++i) {
-			    X(i) -= temp * A(i,j);
-/* L50: */
-			}
-		    }
-/* L60: */
-		}
-	    } else {
-		jx = kx;
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    if (X(jx) != 0.) {
-			if (nounit) {
-			    X(jx) /= A(j,j);
-			}
-			temp = X(jx);
-			ix = jx;
-			i__2 = *n;
-			for (i = j + 1; i <= *n; ++i) {
-			    ix += *incx;
-			    X(ix) -= temp * A(i,j);
-/* L70: */
-			}
-		    }
-		    jx += *incx;
-/* L80: */
-		}
-	    }
-	}
-    } else {
-
-/*        Form  x := inv( A' )*x. */
-
-	if (lsame_(uplo, "U")) {
-	    if (*incx == 1) {
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    temp = X(j);
-		    i__2 = j - 1;
-		    for (i = 1; i <= j-1; ++i) {
-			temp -= A(i,j) * X(i);
-/* L90: */
-		    }
-		    if (nounit) {
-			temp /= A(j,j);
-		    }
-		    X(j) = temp;
-/* L100: */
-		}
-	    } else {
-		jx = kx;
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    temp = X(jx);
-		    ix = kx;
-		    i__2 = j - 1;
-		    for (i = 1; i <= j-1; ++i) {
-			temp -= A(i,j) * X(ix);
-			ix += *incx;
-/* L110: */
-		    }
-		    if (nounit) {
-			temp /= A(j,j);
-		    }
-		    X(jx) = temp;
-		    jx += *incx;
-/* L120: */
-		}
-	    }
-	} else {
-	    if (*incx == 1) {
-		for (j = *n; j >= 1; --j) {
-		    temp = X(j);
-		    i__1 = j + 1;
-		    for (i = *n; i >= j+1; --i) {
-			temp -= A(i,j) * X(i);
-/* L130: */
-		    }
-		    if (nounit) {
-			temp /= A(j,j);
-		    }
-		    X(j) = temp;
-/* L140: */
-		}
-	    } else {
-		kx += (*n - 1) * *incx;
-		jx = kx;
-		for (j = *n; j >= 1; --j) {
-		    temp = X(jx);
-		    ix = kx;
-		    i__1 = j + 1;
-		    for (i = *n; i >= j+1; --i) {
-			temp -= A(i,j) * X(ix);
-			ix -= *incx;
-/* L150: */
-		    }
-		    if (nounit) {
-			temp /= A(j,j);
-		    }
-		    X(jx) = temp;
-		    jx -= *incx;
-/* L160: */
-		}
-	    }
-	}
-    }
-
-    return 0;
-
-/*     End of DTRSV . */
-
-} /* dtrsv_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dzasum.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dzasum.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dzasum.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dzasum.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,68 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-doublereal dzasum_(integer *n, doublecomplex *zx, integer *incx)
-{
-
-
-    /* System generated locals */
-    integer i__1;
-    doublereal ret_val;
-
-    /* Local variables */
-    static integer i;
-    static doublereal stemp;
-    extern doublereal dcabs1_(doublecomplex *);
-    static integer ix;
-
-
-/*     takes the sum of the absolute values.   
-       jack dongarra, 3/11/78.   
-       modified 3/93 to return if incx .le. 0.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define ZX(I) zx[(I)-1]
-
-
-    ret_val = 0.;
-    stemp = 0.;
-    if (*n <= 0 || *incx <= 0) {
-	return ret_val;
-    }
-    if (*incx == 1) {
-	goto L20;
-    }
-
-/*        code for increment not equal to 1 */
-
-    ix = 1;
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	stemp += dcabs1_(&ZX(ix));
-	ix += *incx;
-/* L10: */
-    }
-    ret_val = stemp;
-    return ret_val;
-
-/*        code for increment equal to 1 */
-
-L20:
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	stemp += dcabs1_(&ZX(i));
-/* L30: */
-    }
-    ret_val = stemp;
-    return ret_val;
-} /* dzasum_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dznrm2.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dznrm2.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/dznrm2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/dznrm2.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,96 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-doublereal dznrm2_(integer *n, doublecomplex *x, integer *incx)
-{
-
-
-    /* System generated locals */
-    integer i__1, i__2, i__3;
-    doublereal ret_val, d__1;
-
-    /* Builtin functions */
-    double d_imag(doublecomplex *), sqrt(doublereal);
-
-    /* Local variables */
-    static doublereal temp, norm, scale;
-    static integer ix;
-    static doublereal ssq;
-
-
-/*  DZNRM2 returns the euclidean norm of a vector via the function   
-    name, so that   
-
-       DZNRM2 := sqrt( conjg( x' )*x )   
-
-
-
-    -- This version written on 25-October-1982.   
-       Modified on 14-October-1993 to inline the call to ZLASSQ.   
-       Sven Hammarling, Nag Ltd.   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-
-
-    if (*n < 1 || *incx < 1) {
-	norm = 0.;
-    } else {
-	scale = 0.;
-	ssq = 1.;
-/*        The following loop is equivalent to this call to the LAPACK 
-  
-          auxiliary routine:   
-          CALL ZLASSQ( N, X, INCX, SCALE, SSQ ) */
-
-	i__1 = (*n - 1) * *incx + 1;
-	i__2 = *incx;
-	for (ix = 1; *incx < 0 ? ix >= (*n-1)**incx+1 : ix <= (*n-1)**incx+1; ix += *incx) {
-	    i__3 = ix;
-	    if (X(ix).r != 0.) {
-		i__3 = ix;
-		temp = (d__1 = X(ix).r, abs(d__1));
-		if (scale < temp) {
-/* Computing 2nd power */
-		    d__1 = scale / temp;
-		    ssq = ssq * (d__1 * d__1) + 1.;
-		    scale = temp;
-		} else {
-/* Computing 2nd power */
-		    d__1 = temp / scale;
-		    ssq += d__1 * d__1;
-		}
-	    }
-	    if (d_imag(&X(ix)) != 0.) {
-		temp = (d__1 = d_imag(&X(ix)), abs(d__1));
-		if (scale < temp) {
-/* Computing 2nd power */
-		    d__1 = scale / temp;
-		    ssq = ssq * (d__1 * d__1) + 1.;
-		    scale = temp;
-		} else {
-/* Computing 2nd power */
-		    d__1 = temp / scale;
-		    ssq += d__1 * d__1;
-		}
-	    }
-/* L10: */
-	}
-	norm = scale * sqrt(ssq);
-    }
-
-    ret_val = norm;
-    return ret_val;
-
-/*     End of DZNRM2. */
-
-} /* dznrm2_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/f2c.h hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/f2c.h
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/f2c.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/f2c.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,43 +0,0 @@
-/* f2c.h  --  Standard Fortran to C header file */
-
-/**  barf  [ba:rf]  2.  "He suggested using FORTRAN, and everybody barfed."
-
-	- From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */
-
-#include "Cnames.h"
-
-#ifndef F2C_INCLUDE
-#define F2C_INCLUDE
-
-typedef int integer;
-typedef int logical;
-
-typedef char *address;
-typedef short int shortint;
-typedef float real;
-typedef double doublereal;
-typedef struct { real r, i; } complex;
-typedef struct { doublereal r, i; } doublecomplex;
-typedef short int shortlogical;
-typedef char logical1;
-typedef char integer1;
-/* typedef long long longint; */ /* system-dependent */
-
-#define TRUE_ (1)
-#define FALSE_ (0)
-
-/* Extern is for use with -E */
-#ifndef Extern
-#define Extern extern
-#endif
-
-#define abs(x) ((x) >= 0 ? (x) : -(x))
-#define dabs(x) (doublereal)abs(x)
-#define min(a,b) ((a) <= (b) ? (a) : (b))
-#define max(a,b) ((a) >= (b) ? (a) : (b))
-#define dmin(a,b) (doublereal)min(a,b)
-#define dmax(a,b) (doublereal)max(a,b)
-
-#define VOID void
-
-#endif
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/icamax.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/icamax.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/icamax.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/icamax.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,72 +0,0 @@
-#include "f2c.h"
-
-integer icamax_(integer *n, complex *cx, integer *incx)
-{
-    /* System generated locals */
-    integer ret_val, i__1, i__2;
-    real r__1, r__2;
-    /* Builtin functions */
-    double r_imag(complex *);
-    /* Local variables */
-    static real smax;
-    static integer i, ix;
-/*     finds the index of element having max. absolute value.   
-       jack dongarra, linpack, 3/11/78.   
-       modified 3/93 to return if incx .le. 0.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-    
-   Parameter adjustments   
-       Function Body */
-#define CX(I) cx[(I)-1]
-    ret_val = 0;
-    if (*n < 1 || *incx <= 0) {
-	return ret_val;
-    }
-    ret_val = 1;
-    if (*n == 1) {
-	return ret_val;
-    }
-    if (*incx == 1) {
-	goto L20;
-    }
-/*        code for increment not equal to 1 */
-    ix = 1;
-    smax = (r__1 = CX(1).r, dabs(r__1)) + (r__2 = r_imag(&CX(1)), dabs(r__2));
-    ix += *incx;
-    i__1 = *n;
-    for (i = 2; i <= *n; ++i) {
-	i__2 = ix;
-	if ((r__1 = CX(ix).r, dabs(r__1)) + (r__2 = r_imag(&CX(ix)), dabs(
-		r__2)) <= smax) {
-	    goto L5;
-	}
-	ret_val = i;
-	i__2 = ix;
-	smax = (r__1 = CX(ix).r, dabs(r__1)) + (r__2 = r_imag(&CX(ix)), 
-		dabs(r__2));
-L5:
-	ix += *incx;
-/* L10: */
-    }
-    return ret_val;
-/*        code for increment equal to 1 */
-L20:
-    smax = (r__1 = CX(1).r, dabs(r__1)) + (r__2 = r_imag(&CX(1)), dabs(r__2));
-    i__1 = *n;
-    for (i = 2; i <= *n; ++i) {
-	i__2 = i;
-	if ((r__1 = CX(i).r, dabs(r__1)) + (r__2 = r_imag(&CX(i)), dabs(
-		r__2)) <= smax) {
-	    goto L30;
-	}
-	ret_val = i;
-	i__2 = i;
-	smax = (r__1 = CX(i).r, dabs(r__1)) + (r__2 = r_imag(&CX(i)), dabs(
-		r__2));
-L30:
-	;
-    }
-    return ret_val;
-} /* icamax_ */
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/idamax.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/idamax.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/idamax.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/idamax.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,80 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-integer idamax_(integer *n, doublereal *dx, integer *incx)
-{
-
-
-    /* System generated locals */
-    integer ret_val, i__1;
-    doublereal d__1;
-
-    /* Local variables */
-    static doublereal dmax__;
-    static integer i, ix;
-
-
-/*     finds the index of element having max. absolute value.   
-       jack dongarra, linpack, 3/11/78.   
-       modified 3/93 to return if incx .le. 0.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define DX(I) dx[(I)-1]
-
-
-    ret_val = 0;
-    if (*n < 1 || *incx <= 0) {
-	return ret_val;
-    }
-    ret_val = 1;
-    if (*n == 1) {
-	return ret_val;
-    }
-    if (*incx == 1) {
-	goto L20;
-    }
-
-/*        code for increment not equal to 1 */
-
-    ix = 1;
-    dmax__ = abs(DX(1));
-    ix += *incx;
-    i__1 = *n;
-    for (i = 2; i <= *n; ++i) {
-	if ((d__1 = DX(ix), abs(d__1)) <= dmax__) {
-	    goto L5;
-	}
-	ret_val = i;
-	dmax__ = (d__1 = DX(ix), abs(d__1));
-L5:
-	ix += *incx;
-/* L10: */
-    }
-    return ret_val;
-
-/*        code for increment equal to 1 */
-
-L20:
-    dmax__ = abs(DX(1));
-    i__1 = *n;
-    for (i = 2; i <= *n; ++i) {
-	if ((d__1 = DX(i), abs(d__1)) <= dmax__) {
-	    goto L30;
-	}
-	ret_val = i;
-	dmax__ = (d__1 = DX(i), abs(d__1));
-L30:
-	;
-    }
-    return ret_val;
-} /* idamax_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/isamax.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/isamax.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/isamax.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/isamax.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,80 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-integer isamax_(integer *n, real *sx, integer *incx)
-{
-
-
-    /* System generated locals */
-    integer ret_val, i__1;
-    real r__1;
-
-    /* Local variables */
-    static real smax;
-    static integer i, ix;
-
-
-/*     finds the index of element having max. absolute value.   
-       jack dongarra, linpack, 3/11/78.   
-       modified 3/93 to return if incx .le. 0.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define SX(I) sx[(I)-1]
-
-
-    ret_val = 0;
-    if (*n < 1 || *incx <= 0) {
-	return ret_val;
-    }
-    ret_val = 1;
-    if (*n == 1) {
-	return ret_val;
-    }
-    if (*incx == 1) {
-	goto L20;
-    }
-
-/*        code for increment not equal to 1 */
-
-    ix = 1;
-    smax = dabs(SX(1));
-    ix += *incx;
-    i__1 = *n;
-    for (i = 2; i <= *n; ++i) {
-	if ((r__1 = SX(ix), dabs(r__1)) <= smax) {
-	    goto L5;
-	}
-	ret_val = i;
-	smax = (r__1 = SX(ix), dabs(r__1));
-L5:
-	ix += *incx;
-/* L10: */
-    }
-    return ret_val;
-
-/*        code for increment equal to 1 */
-
-L20:
-    smax = dabs(SX(1));
-    i__1 = *n;
-    for (i = 2; i <= *n; ++i) {
-	if ((r__1 = SX(i), dabs(r__1)) <= smax) {
-	    goto L30;
-	}
-	ret_val = i;
-	smax = (r__1 = SX(i), dabs(r__1));
-L30:
-	;
-    }
-    return ret_val;
-} /* isamax_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/izamax.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/izamax.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/izamax.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/izamax.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,81 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-integer izamax_(integer *n, doublecomplex *zx, integer *incx)
-{
-
-
-    /* System generated locals */
-    integer ret_val, i__1;
-
-    /* Local variables */
-    static doublereal smax;
-    static integer i;
-    extern doublereal dcabs1_(doublecomplex *);
-    static integer ix;
-
-
-/*     finds the index of element having max. absolute value.   
-       jack dongarra, 1/15/85.   
-       modified 3/93 to return if incx .le. 0.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define ZX(I) zx[(I)-1]
-
-
-    ret_val = 0;
-    if (*n < 1 || *incx <= 0) {
-	return ret_val;
-    }
-    ret_val = 1;
-    if (*n == 1) {
-	return ret_val;
-    }
-    if (*incx == 1) {
-	goto L20;
-    }
-
-/*        code for increment not equal to 1 */
-
-    ix = 1;
-    smax = dcabs1_(&ZX(1));
-    ix += *incx;
-    i__1 = *n;
-    for (i = 2; i <= *n; ++i) {
-	if (dcabs1_(&ZX(ix)) <= smax) {
-	    goto L5;
-	}
-	ret_val = i;
-	smax = dcabs1_(&ZX(ix));
-L5:
-	ix += *incx;
-/* L10: */
-    }
-    return ret_val;
-
-/*        code for increment equal to 1 */
-
-L20:
-    smax = dcabs1_(&ZX(1));
-    i__1 = *n;
-    for (i = 2; i <= *n; ++i) {
-	if (dcabs1_(&ZX(i)) <= smax) {
-	    goto L30;
-	}
-	ret_val = i;
-	smax = dcabs1_(&ZX(i));
-L30:
-	;
-    }
-    return ret_val;
-} /* izamax_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/Makefile hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/Makefile
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/Makefile	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/Makefile	1970-01-01 00:00:00.000000000 +0000
@@ -1,92 +0,0 @@
-include ../make.inc
-HEADER = ../SRC
-
-#######################################################################
-#  This is the makefile to create a library for C-BLAS.
-#  The files are organized as follows:
-#
-#       SBLAS1 -- Single precision real BLAS routines
-#       CBLAS1 -- Single precision complex BLAS routines
-#       DBLAS1 -- Double precision real BLAS routines
-#       ZBLAS1 -- Double precision complex BLAS routines
-#
-#       CB1AUX -- Real BLAS routines called by complex routines
-#       ZB1AUX -- D.P. real BLAS routines called by d.p. complex
-#                 routines
-#
-#      ALLBLAS -- Auxiliary routines for Level 2 and 3 BLAS
-#
-#       SBLAS2 -- Single precision real BLAS2 routines
-#       CBLAS2 -- Single precision complex BLAS2 routines
-#       DBLAS2 -- Double precision real BLAS2 routines
-#       ZBLAS2 -- Double precision complex BLAS2 routines
-#
-#       SBLAS3 -- Single precision real BLAS3 routines
-#       CBLAS3 -- Single precision complex BLAS3 routines
-#       DBLAS3 -- Double precision real BLAS3 routines
-#       ZBLAS3 -- Double precision complex BLAS3 routines
-#
-#  The library can be set up to include routines for any combination
-#  of the four precisions.  To create or add to the library, enter make
-#  followed by one or more of the precisions desired.  Some examples:
-#       make single
-#       make single complex
-#       make single double complex complex16
-#  Alternatively, the command
-#       make
-#  without any arguments creates a library of all four precisions.
-#  The library is called
-#       blas.a
-#  and is created at the next higher directory level.
-#
-#  To remove the object files after the library is created, enter
-#       make clean
-#
-#######################################################################
-
-SBLAS1 = isamax.o sasum.o saxpy.o scopy.o sdot.o snrm2.o \
-	 srot.o sscal.o
-SBLAS2 = sgemv.o ssymv.o strsv.o sger.o ssyr2.o
-
-DBLAS1 = idamax.o dasum.o daxpy.o dcopy.o ddot.o dnrm2.o \
-	 drot.o dscal.o
-DBLAS2 = dgemv.o dsymv.o dtrsv.o dger.o dsyr2.o
-DBLAS3 = dgemm.o dtrsm.o
-
-CBLAS1 = icamax.o scasum.o caxpy.o ccopy.o scnrm2.o \
-	 cscal.o
-CBLAS2 = cgemv.o chemv.o ctrsv.o cgerc.o cgeru.o cher2.o
-
-ZBLAS1 = izamax.o dzasum.o zaxpy.o zcopy.o dznrm2.o \
-	 zscal.o dcabs1.o
-ZBLAS2 = zgemv.o zhemv.o ztrsv.o zgerc.o zgeru.o zher2.o
-ZBLAS3 = zgemm.o ztrsm.o
-
-
-all: single double complex complex16
-
-single: $(SBLAS1) $(SBLAS2) $(SBLAS3)
-	$(ARCH) $(ARCHFLAGS) $(BLASLIB) $(SBLAS1) $(ALLBLAS) \
-	$(SBLAS2) $(SBLAS3)
-	$(RANLIB) $(BLASLIB)
-
-double: $(DBLAS1) $(DBLAS2) $(DBLAS3)
-	$(ARCH) $(ARCHFLAGS) $(BLASLIB) $(DBLAS1) $(ALLBLAS) \
-	$(DBLAS2) $(DBLAS3)
-	$(RANLIB) $(BLASLIB)
-
-complex: $(CBLAS1) $(CBLAS2) $(CBLAS3)
-	$(ARCH) $(ARCHFLAGS) $(BLASLIB) $(CBLAS1) $(ALLBLAS) \
-	$(CBLAS2) $(CBLAS3)
-	$(RANLIB) $(BLASLIB)
-
-complex16: $(ZBLAS1) $(ZBLAS2) $(ZBLAS3)
-	$(ARCH) $(ARCHFLAGS) $(BLASLIB) $(ZBLAS1) $(ALLBLAS) \
-	$(ZBLAS2) $(ZBLAS3)
-	$(RANLIB) $(BLASLIB)
-
-.c.o:
-	$(CC) $(CFLAGS) $(CDEFS) -I$(HEADER) -c $< $(VERBOSE)
-
-clean:	
-	rm -f *.o
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/sasum.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/sasum.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/sasum.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/sasum.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,89 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-real sasum_(integer *n, real *sx, integer *incx)
-{
-
-
-    /* System generated locals */
-    integer i__1, i__2;
-    real ret_val, r__1, r__2, r__3, r__4, r__5, r__6;
-
-    /* Local variables */
-    static integer i, m, nincx;
-    static real stemp;
-    static integer mp1;
-
-
-/*     takes the sum of the absolute values.   
-       uses unrolled loops for increment equal to one.   
-       jack dongarra, linpack, 3/11/78.   
-       modified 3/93 to return if incx .le. 0.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define SX(I) sx[(I)-1]
-
-
-    ret_val = 0.f;
-    stemp = 0.f;
-    if (*n <= 0 || *incx <= 0) {
-	return ret_val;
-    }
-    if (*incx == 1) {
-	goto L20;
-    }
-
-/*        code for increment not equal to 1 */
-
-    nincx = *n * *incx;
-    i__1 = nincx;
-    i__2 = *incx;
-    for (i = 1; *incx < 0 ? i >= nincx : i <= nincx; i += *incx) {
-	stemp += (r__1 = SX(i), dabs(r__1));
-/* L10: */
-    }
-    ret_val = stemp;
-    return ret_val;
-
-/*        code for increment equal to 1   
-
-
-          clean-up loop */
-
-L20:
-    m = *n % 6;
-    if (m == 0) {
-	goto L40;
-    }
-    i__2 = m;
-    for (i = 1; i <= m; ++i) {
-	stemp += (r__1 = SX(i), dabs(r__1));
-/* L30: */
-    }
-    if (*n < 6) {
-	goto L60;
-    }
-L40:
-    mp1 = m + 1;
-    i__2 = *n;
-    for (i = mp1; i <= *n; i += 6) {
-	stemp = stemp + (r__1 = SX(i), dabs(r__1)) + (r__2 = SX(i + 1), dabs(
-		r__2)) + (r__3 = SX(i + 2), dabs(r__3)) + (r__4 = SX(i + 3), 
-		dabs(r__4)) + (r__5 = SX(i + 4), dabs(r__5)) + (r__6 = SX(i + 
-		5), dabs(r__6));
-/* L50: */
-    }
-L60:
-    ret_val = stemp;
-    return ret_val;
-} /* sasum_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/saxpy.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/saxpy.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/saxpy.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/saxpy.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,94 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int saxpy_(integer *n, real *sa, real *sx, integer *incx, 
-	real *sy, integer *incy)
-{
-
-
-    /* System generated locals */
-    integer i__1;
-
-    /* Local variables */
-    static integer i, m, ix, iy, mp1;
-
-
-/*     constant times a vector plus a vector.   
-       uses unrolled loop for increments equal to one.   
-       jack dongarra, linpack, 3/11/78.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define SY(I) sy[(I)-1]
-#define SX(I) sx[(I)-1]
-
-
-    if (*n <= 0) {
-	return 0;
-    }
-    if (*sa == 0.f) {
-	return 0;
-    }
-    if (*incx == 1 && *incy == 1) {
-	goto L20;
-    }
-
-/*        code for unequal increments or equal increments   
-            not equal to 1 */
-
-    ix = 1;
-    iy = 1;
-    if (*incx < 0) {
-	ix = (-(*n) + 1) * *incx + 1;
-    }
-    if (*incy < 0) {
-	iy = (-(*n) + 1) * *incy + 1;
-    }
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	SY(iy) += *sa * SX(ix);
-	ix += *incx;
-	iy += *incy;
-/* L10: */
-    }
-    return 0;
-
-/*        code for both increments equal to 1   
-
-
-          clean-up loop */
-
-L20:
-    m = *n % 4;
-    if (m == 0) {
-	goto L40;
-    }
-    i__1 = m;
-    for (i = 1; i <= m; ++i) {
-	SY(i) += *sa * SX(i);
-/* L30: */
-    }
-    if (*n < 4) {
-	return 0;
-    }
-L40:
-    mp1 = m + 1;
-    i__1 = *n;
-    for (i = mp1; i <= *n; i += 4) {
-	SY(i) += *sa * SX(i);
-	SY(i + 1) += *sa * SX(i + 1);
-	SY(i + 2) += *sa * SX(i + 2);
-	SY(i + 3) += *sa * SX(i + 3);
-/* L50: */
-    }
-    return 0;
-} /* saxpy_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/scasum.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/scasum.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/scasum.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/scasum.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,74 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-real scasum_(integer *n, complex *cx, integer *incx)
-{
-
-
-    /* System generated locals */
-    integer i__1, i__2, i__3;
-    real ret_val, r__1, r__2;
-
-    /* Builtin functions */
-    double r_imag(complex *);
-
-    /* Local variables */
-    static integer i, nincx;
-    static real stemp;
-
-
-/*     takes the sum of the absolute values of a complex vector and   
-       returns a single precision result.   
-       jack dongarra, linpack, 3/11/78.   
-       modified 3/93 to return if incx .le. 0.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define CX(I) cx[(I)-1]
-
-
-    ret_val = 0.f;
-    stemp = 0.f;
-    if (*n <= 0 || *incx <= 0) {
-	return ret_val;
-    }
-    if (*incx == 1) {
-	goto L20;
-    }
-
-/*        code for increment not equal to 1 */
-
-    nincx = *n * *incx;
-    i__1 = nincx;
-    i__2 = *incx;
-    for (i = 1; *incx < 0 ? i >= nincx : i <= nincx; i += *incx) {
-	i__3 = i;
-	stemp = stemp + (r__1 = CX(i).r, dabs(r__1)) + (r__2 = r_imag(&CX(
-		i)), dabs(r__2));
-/* L10: */
-    }
-    ret_val = stemp;
-    return ret_val;
-
-/*        code for increment equal to 1 */
-
-L20:
-    i__2 = *n;
-    for (i = 1; i <= *n; ++i) {
-	i__1 = i;
-	stemp = stemp + (r__1 = CX(i).r, dabs(r__1)) + (r__2 = r_imag(&CX(
-		i)), dabs(r__2));
-/* L30: */
-    }
-    ret_val = stemp;
-    return ret_val;
-} /* scasum_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/scnrm2.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/scnrm2.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/scnrm2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/scnrm2.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,96 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-real scnrm2_(integer *n, complex *x, integer *incx)
-{
-
-
-    /* System generated locals */
-    integer i__1, i__2, i__3;
-    real ret_val, r__1;
-
-    /* Builtin functions */
-    double r_imag(complex *), sqrt(doublereal);
-
-    /* Local variables */
-    static real temp, norm, scale;
-    static integer ix;
-    static real ssq;
-
-
-/*  SCNRM2 returns the euclidean norm of a vector via the function   
-    name, so that   
-
-       SCNRM2 := sqrt( conjg( x' )*x )   
-
-
-
-    -- This version written on 25-October-1982.   
-       Modified on 14-October-1993 to inline the call to CLASSQ.   
-       Sven Hammarling, Nag Ltd.   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-
-
-    if (*n < 1 || *incx < 1) {
-	norm = 0.f;
-    } else {
-	scale = 0.f;
-	ssq = 1.f;
-/*        The following loop is equivalent to this call to the LAPACK 
-  
-          auxiliary routine:   
-          CALL CLASSQ( N, X, INCX, SCALE, SSQ ) */
-
-	i__1 = (*n - 1) * *incx + 1;
-	i__2 = *incx;
-	for (ix = 1; *incx < 0 ? ix >= (*n-1)**incx+1 : ix <= (*n-1)**incx+1; ix += *incx) {
-	    i__3 = ix;
-	    if (X(ix).r != 0.f) {
-		i__3 = ix;
-		temp = (r__1 = X(ix).r, dabs(r__1));
-		if (scale < temp) {
-/* Computing 2nd power */
-		    r__1 = scale / temp;
-		    ssq = ssq * (r__1 * r__1) + 1.f;
-		    scale = temp;
-		} else {
-/* Computing 2nd power */
-		    r__1 = temp / scale;
-		    ssq += r__1 * r__1;
-		}
-	    }
-	    if (r_imag(&X(ix)) != 0.f) {
-		temp = (r__1 = r_imag(&X(ix)), dabs(r__1));
-		if (scale < temp) {
-/* Computing 2nd power */
-		    r__1 = scale / temp;
-		    ssq = ssq * (r__1 * r__1) + 1.f;
-		    scale = temp;
-		} else {
-/* Computing 2nd power */
-		    r__1 = temp / scale;
-		    ssq += r__1 * r__1;
-		}
-	    }
-/* L10: */
-	}
-	norm = scale * sqrt(ssq);
-    }
-
-    ret_val = norm;
-    return ret_val;
-
-/*     End of SCNRM2. */
-
-} /* scnrm2_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/scopy.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/scopy.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/scopy.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/scopy.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,94 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int scopy_(integer *n, real *sx, integer *incx, real *sy, 
-	integer *incy)
-{
-
-
-    /* System generated locals */
-    integer i__1;
-
-    /* Local variables */
-    static integer i, m, ix, iy, mp1;
-
-
-/*     copies a vector, x, to a vector, y.   
-       uses unrolled loops for increments equal to 1.   
-       jack dongarra, linpack, 3/11/78.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define SY(I) sy[(I)-1]
-#define SX(I) sx[(I)-1]
-
-
-    if (*n <= 0) {
-	return 0;
-    }
-    if (*incx == 1 && *incy == 1) {
-	goto L20;
-    }
-
-/*        code for unequal increments or equal increments   
-            not equal to 1 */
-
-    ix = 1;
-    iy = 1;
-    if (*incx < 0) {
-	ix = (-(*n) + 1) * *incx + 1;
-    }
-    if (*incy < 0) {
-	iy = (-(*n) + 1) * *incy + 1;
-    }
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	SY(iy) = SX(ix);
-	ix += *incx;
-	iy += *incy;
-/* L10: */
-    }
-    return 0;
-
-/*        code for both increments equal to 1   
-
-
-          clean-up loop */
-
-L20:
-    m = *n % 7;
-    if (m == 0) {
-	goto L40;
-    }
-    i__1 = m;
-    for (i = 1; i <= m; ++i) {
-	SY(i) = SX(i);
-/* L30: */
-    }
-    if (*n < 7) {
-	return 0;
-    }
-L40:
-    mp1 = m + 1;
-    i__1 = *n;
-    for (i = mp1; i <= *n; i += 7) {
-	SY(i) = SX(i);
-	SY(i + 1) = SX(i + 1);
-	SY(i + 2) = SX(i + 2);
-	SY(i + 3) = SX(i + 3);
-	SY(i + 4) = SX(i + 4);
-	SY(i + 5) = SX(i + 5);
-	SY(i + 6) = SX(i + 6);
-/* L50: */
-    }
-    return 0;
-} /* scopy_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/sdot.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/sdot.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/sdot.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/sdot.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,96 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-real sdot_(integer *n, real *sx, integer *incx, real *sy, integer *incy)
-{
-
-
-    /* System generated locals */
-    integer i__1;
-    real ret_val;
-
-    /* Local variables */
-    static integer i, m;
-    static real stemp;
-    static integer ix, iy, mp1;
-
-
-/*     forms the dot product of two vectors.   
-       uses unrolled loops for increments equal to one.   
-       jack dongarra, linpack, 3/11/78.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define SY(I) sy[(I)-1]
-#define SX(I) sx[(I)-1]
-
-
-    stemp = 0.f;
-    ret_val = 0.f;
-    if (*n <= 0) {
-	return ret_val;
-    }
-    if (*incx == 1 && *incy == 1) {
-	goto L20;
-    }
-
-/*        code for unequal increments or equal increments   
-            not equal to 1 */
-
-    ix = 1;
-    iy = 1;
-    if (*incx < 0) {
-	ix = (-(*n) + 1) * *incx + 1;
-    }
-    if (*incy < 0) {
-	iy = (-(*n) + 1) * *incy + 1;
-    }
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	stemp += SX(ix) * SY(iy);
-	ix += *incx;
-	iy += *incy;
-/* L10: */
-    }
-    ret_val = stemp;
-    return ret_val;
-
-/*        code for both increments equal to 1   
-
-
-          clean-up loop */
-
-L20:
-    m = *n % 5;
-    if (m == 0) {
-	goto L40;
-    }
-    i__1 = m;
-    for (i = 1; i <= m; ++i) {
-	stemp += SX(i) * SY(i);
-/* L30: */
-    }
-    if (*n < 5) {
-	goto L60;
-    }
-L40:
-    mp1 = m + 1;
-    i__1 = *n;
-    for (i = mp1; i <= *n; i += 5) {
-	stemp = stemp + SX(i) * SY(i) + SX(i + 1) * SY(i + 1) + SX(i + 2) * 
-		SY(i + 2) + SX(i + 3) * SY(i + 3) + SX(i + 4) * SY(i + 4);
-/* L50: */
-    }
-L60:
-    ret_val = stemp;
-    return ret_val;
-} /* sdot_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/sgemv.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/sgemv.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/sgemv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/sgemv.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,299 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int sgemv_(char *trans, integer *m, integer *n, real *alpha, 
-	real *a, integer *lda, real *x, integer *incx, real *beta, real *y, 
-	integer *incy)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2;
-
-    /* Local variables */
-    static integer info;
-    static real temp;
-    static integer lenx, leny, i, j;
-    extern logical lsame_(char *, char *);
-    static integer ix, iy, jx, jy, kx, ky;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-
-
-/*  Purpose   
-    =======   
-
-    SGEMV  performs one of the matrix-vector operations   
-
-       y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   
-
-    where alpha and beta are scalars, x and y are vectors and A is an   
-    m by n matrix.   
-
-    Parameters   
-    ==========   
-
-    TRANS  - CHARACTER*1.   
-             On entry, TRANS specifies the operation to be performed as   
-             follows:   
-
-                TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.   
-
-                TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.   
-
-                TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.   
-
-             Unchanged on exit.   
-
-    M      - INTEGER.   
-             On entry, M specifies the number of rows of the matrix A.   
-             M must be at least zero.   
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry, N specifies the number of columns of the matrix A. 
-  
-             N must be at least zero.   
-             Unchanged on exit.   
-
-    ALPHA  - REAL            .   
-             On entry, ALPHA specifies the scalar alpha.   
-             Unchanged on exit.   
-
-    A      - REAL             array of DIMENSION ( LDA, n ).   
-             Before entry, the leading m by n part of the array A must   
-             contain the matrix of coefficients.   
-             Unchanged on exit.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program. LDA must be at least   
-             max( 1, m ).   
-             Unchanged on exit.   
-
-    X      - REAL             array of DIMENSION at least   
-             ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'   
-             and at least   
-             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.   
-             Before entry, the incremented array X must contain the   
-             vector x.   
-             Unchanged on exit.   
-
-    INCX   - INTEGER.   
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-             Unchanged on exit.   
-
-    BETA   - REAL            .   
-             On entry, BETA specifies the scalar beta. When BETA is   
-             supplied as zero then Y need not be set on input.   
-             Unchanged on exit.   
-
-    Y      - REAL             array of DIMENSION at least   
-             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'   
-             and at least   
-             ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.   
-             Before entry with BETA non-zero, the incremented array Y   
-             must contain the vector y. On exit, Y is overwritten by the 
-  
-             updated vector y.   
-
-    INCY   - INTEGER.   
-             On entry, INCY specifies the increment for the elements of   
-             Y. INCY must not be zero.   
-             Unchanged on exit.   
-
-
-    Level 2 Blas routine.   
-
-    -- Written on 22-October-1986.   
-       Jack Dongarra, Argonne National Lab.   
-       Jeremy Du Croz, Nag Central Office.   
-       Sven Hammarling, Nag Central Office.   
-       Richard Hanson, Sandia National Labs.   
-
-
-
-       Test the input parameters.   
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-#define Y(I) y[(I)-1]
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-
-    info = 0;
-    if (! lsame_(trans, "N") && ! lsame_(trans, "T") && ! 
-	    lsame_(trans, "C")) {
-	info = 1;
-    } else if (*m < 0) {
-	info = 2;
-    } else if (*n < 0) {
-	info = 3;
-    } else if (*lda < max(1,*m)) {
-	info = 6;
-    } else if (*incx == 0) {
-	info = 8;
-    } else if (*incy == 0) {
-	info = 11;
-    }
-    if (info != 0) {
-	xerbla_("SGEMV ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*m == 0 || *n == 0 || *alpha == 0.f && *beta == 1.f) {
-	return 0;
-    }
-
-/*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set 
-  
-       up the start points in  X  and  Y. */
-
-    if (lsame_(trans, "N")) {
-	lenx = *n;
-	leny = *m;
-    } else {
-	lenx = *m;
-	leny = *n;
-    }
-    if (*incx > 0) {
-	kx = 1;
-    } else {
-	kx = 1 - (lenx - 1) * *incx;
-    }
-    if (*incy > 0) {
-	ky = 1;
-    } else {
-	ky = 1 - (leny - 1) * *incy;
-    }
-
-/*     Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through A.   
-
-       First form  y := beta*y. */
-
-    if (*beta != 1.f) {
-	if (*incy == 1) {
-	    if (*beta == 0.f) {
-		i__1 = leny;
-		for (i = 1; i <= leny; ++i) {
-		    Y(i) = 0.f;
-/* L10: */
-		}
-	    } else {
-		i__1 = leny;
-		for (i = 1; i <= leny; ++i) {
-		    Y(i) = *beta * Y(i);
-/* L20: */
-		}
-	    }
-	} else {
-	    iy = ky;
-	    if (*beta == 0.f) {
-		i__1 = leny;
-		for (i = 1; i <= leny; ++i) {
-		    Y(iy) = 0.f;
-		    iy += *incy;
-/* L30: */
-		}
-	    } else {
-		i__1 = leny;
-		for (i = 1; i <= leny; ++i) {
-		    Y(iy) = *beta * Y(iy);
-		    iy += *incy;
-/* L40: */
-		}
-	    }
-	}
-    }
-    if (*alpha == 0.f) {
-	return 0;
-    }
-    if (lsame_(trans, "N")) {
-
-/*        Form  y := alpha*A*x + y. */
-
-	jx = kx;
-	if (*incy == 1) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		if (X(jx) != 0.f) {
-		    temp = *alpha * X(jx);
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			Y(i) += temp * A(i,j);
-/* L50: */
-		    }
-		}
-		jx += *incx;
-/* L60: */
-	    }
-	} else {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		if (X(jx) != 0.f) {
-		    temp = *alpha * X(jx);
-		    iy = ky;
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			Y(iy) += temp * A(i,j);
-			iy += *incy;
-/* L70: */
-		    }
-		}
-		jx += *incx;
-/* L80: */
-	    }
-	}
-    } else {
-
-/*        Form  y := alpha*A'*x + y. */
-
-	jy = ky;
-	if (*incx == 1) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		temp = 0.f;
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    temp += A(i,j) * X(i);
-/* L90: */
-		}
-		Y(jy) += *alpha * temp;
-		jy += *incy;
-/* L100: */
-	    }
-	} else {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		temp = 0.f;
-		ix = kx;
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    temp += A(i,j) * X(ix);
-		    ix += *incx;
-/* L110: */
-		}
-		Y(jy) += *alpha * temp;
-		jy += *incy;
-/* L120: */
-	    }
-	}
-    }
-
-    return 0;
-
-/*     End of SGEMV . */
-
-} /* sgemv_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/sger.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/sger.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/sger.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/sger.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,181 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int sger_(integer *m, integer *n, real *alpha, real *x, 
-	integer *incx, real *y, integer *incy, real *a, integer *lda)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2;
-
-    /* Local variables */
-    static integer info;
-    static real temp;
-    static integer i, j, ix, jy, kx;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-
-
-/*  Purpose   
-    =======   
-
-    SGER   performs the rank 1 operation   
-
-       A := alpha*x*y' + A,   
-
-    where alpha is a scalar, x is an m element vector, y is an n element 
-  
-    vector and A is an m by n matrix.   
-
-    Parameters   
-    ==========   
-
-    M      - INTEGER.   
-             On entry, M specifies the number of rows of the matrix A.   
-             M must be at least zero.   
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry, N specifies the number of columns of the matrix A. 
-  
-             N must be at least zero.   
-             Unchanged on exit.   
-
-    ALPHA  - REAL            .   
-             On entry, ALPHA specifies the scalar alpha.   
-             Unchanged on exit.   
-
-    X      - REAL             array of dimension at least   
-             ( 1 + ( m - 1 )*abs( INCX ) ).   
-             Before entry, the incremented array X must contain the m   
-             element vector x.   
-             Unchanged on exit.   
-
-    INCX   - INTEGER.   
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-             Unchanged on exit.   
-
-    Y      - REAL             array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCY ) ).   
-             Before entry, the incremented array Y must contain the n   
-             element vector y.   
-             Unchanged on exit.   
-
-    INCY   - INTEGER.   
-             On entry, INCY specifies the increment for the elements of   
-             Y. INCY must not be zero.   
-             Unchanged on exit.   
-
-    A      - REAL             array of DIMENSION ( LDA, n ).   
-             Before entry, the leading m by n part of the array A must   
-             contain the matrix of coefficients. On exit, A is   
-             overwritten by the updated matrix.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program. LDA must be at least   
-             max( 1, m ).   
-             Unchanged on exit.   
-
-
-    Level 2 Blas routine.   
-
-    -- Written on 22-October-1986.   
-       Jack Dongarra, Argonne National Lab.   
-       Jeremy Du Croz, Nag Central Office.   
-       Sven Hammarling, Nag Central Office.   
-       Richard Hanson, Sandia National Labs.   
-
-
-
-       Test the input parameters.   
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-#define Y(I) y[(I)-1]
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-
-    info = 0;
-    if (*m < 0) {
-	info = 1;
-    } else if (*n < 0) {
-	info = 2;
-    } else if (*incx == 0) {
-	info = 5;
-    } else if (*incy == 0) {
-	info = 7;
-    } else if (*lda < max(1,*m)) {
-	info = 9;
-    }
-    if (info != 0) {
-	xerbla_("SGER  ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*m == 0 || *n == 0 || *alpha == 0.f) {
-	return 0;
-    }
-
-/*     Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through A. */
-
-    if (*incy > 0) {
-	jy = 1;
-    } else {
-	jy = 1 - (*n - 1) * *incy;
-    }
-    if (*incx == 1) {
-	i__1 = *n;
-	for (j = 1; j <= *n; ++j) {
-	    if (Y(jy) != 0.f) {
-		temp = *alpha * Y(jy);
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    A(i,j) += X(i) * temp;
-/* L10: */
-		}
-	    }
-	    jy += *incy;
-/* L20: */
-	}
-    } else {
-	if (*incx > 0) {
-	    kx = 1;
-	} else {
-	    kx = 1 - (*m - 1) * *incx;
-	}
-	i__1 = *n;
-	for (j = 1; j <= *n; ++j) {
-	    if (Y(jy) != 0.f) {
-		temp = *alpha * Y(jy);
-		ix = kx;
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    A(i,j) += X(ix) * temp;
-		    ix += *incx;
-/* L30: */
-		}
-	    }
-	    jy += *incy;
-/* L40: */
-	}
-    }
-
-    return 0;
-
-/*     End of SGER  . */
-
-} /* sger_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/snrm2.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/snrm2.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/snrm2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/snrm2.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,83 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-real snrm2_(integer *n, real *x, integer *incx)
-{
-
-
-    /* System generated locals */
-    integer i__1, i__2;
-    real ret_val, r__1;
-
-    /* Builtin functions */
-    double sqrt(doublereal);
-
-    /* Local variables */
-    static real norm, scale, absxi;
-    static integer ix;
-    static real ssq;
-
-
-/*  SNRM2 returns the euclidean norm of a vector via the function   
-    name, so that   
-
-       SNRM2 := sqrt( x'*x )   
-
-
-
-    -- This version written on 25-October-1982.   
-       Modified on 14-October-1993 to inline the call to SLASSQ.   
-       Sven Hammarling, Nag Ltd.   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-
-
-    if (*n < 1 || *incx < 1) {
-	norm = 0.f;
-    } else if (*n == 1) {
-	norm = dabs(X(1));
-    } else {
-	scale = 0.f;
-	ssq = 1.f;
-/*        The following loop is equivalent to this call to the LAPACK 
-  
-          auxiliary routine:   
-          CALL SLASSQ( N, X, INCX, SCALE, SSQ ) */
-
-	i__1 = (*n - 1) * *incx + 1;
-	i__2 = *incx;
-	for (ix = 1; *incx < 0 ? ix >= (*n-1)**incx+1 : ix <= (*n-1)**incx+1; ix += *incx) {
-	    if (X(ix) != 0.f) {
-		absxi = (r__1 = X(ix), dabs(r__1));
-		if (scale < absxi) {
-/* Computing 2nd power */
-		    r__1 = scale / absxi;
-		    ssq = ssq * (r__1 * r__1) + 1.f;
-		    scale = absxi;
-		} else {
-/* Computing 2nd power */
-		    r__1 = absxi / scale;
-		    ssq += r__1 * r__1;
-		}
-	    }
-/* L10: */
-	}
-	norm = scale * sqrt(ssq);
-    }
-
-    ret_val = norm;
-    return ret_val;
-
-/*     End of SNRM2. */
-
-} /* snrm2_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/srot.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/srot.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/srot.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/srot.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,76 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int srot_(integer *n, real *sx, integer *incx, real *sy, 
-	integer *incy, real *c, real *s)
-{
-
-
-    /* System generated locals */
-    integer i__1;
-
-    /* Local variables */
-    static integer i;
-    static real stemp;
-    static integer ix, iy;
-
-
-/*     applies a plane rotation.   
-       jack dongarra, linpack, 3/11/78.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define SY(I) sy[(I)-1]
-#define SX(I) sx[(I)-1]
-
-
-    if (*n <= 0) {
-	return 0;
-    }
-    if (*incx == 1 && *incy == 1) {
-	goto L20;
-    }
-
-/*       code for unequal increments or equal increments not equal   
-           to 1 */
-
-    ix = 1;
-    iy = 1;
-    if (*incx < 0) {
-	ix = (-(*n) + 1) * *incx + 1;
-    }
-    if (*incy < 0) {
-	iy = (-(*n) + 1) * *incy + 1;
-    }
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	stemp = *c * SX(ix) + *s * SY(iy);
-	SY(iy) = *c * SY(iy) - *s * SX(ix);
-	SX(ix) = stemp;
-	ix += *incx;
-	iy += *incy;
-/* L10: */
-    }
-    return 0;
-
-/*       code for both increments equal to 1 */
-
-L20:
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	stemp = *c * SX(i) + *s * SY(i);
-	SY(i) = *c * SY(i) - *s * SX(i);
-	SX(i) = stemp;
-/* L30: */
-    }
-    return 0;
-} /* srot_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/sscal.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/sscal.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/sscal.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/sscal.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,82 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int sscal_(integer *n, real *sa, real *sx, integer *incx)
-{
-
-
-    /* System generated locals */
-    integer i__1, i__2;
-
-    /* Local variables */
-    static integer i, m, nincx, mp1;
-
-
-/*     scales a vector by a constant.   
-       uses unrolled loops for increment equal to 1.   
-       jack dongarra, linpack, 3/11/78.   
-       modified 3/93 to return if incx .le. 0.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define SX(I) sx[(I)-1]
-
-
-    if (*n <= 0 || *incx <= 0) {
-	return 0;
-    }
-    if (*incx == 1) {
-	goto L20;
-    }
-
-/*        code for increment not equal to 1 */
-
-    nincx = *n * *incx;
-    i__1 = nincx;
-    i__2 = *incx;
-    for (i = 1; *incx < 0 ? i >= nincx : i <= nincx; i += *incx) {
-	SX(i) = *sa * SX(i);
-/* L10: */
-    }
-    return 0;
-
-/*        code for increment equal to 1   
-
-
-          clean-up loop */
-
-L20:
-    m = *n % 5;
-    if (m == 0) {
-	goto L40;
-    }
-    i__2 = m;
-    for (i = 1; i <= m; ++i) {
-	SX(i) = *sa * SX(i);
-/* L30: */
-    }
-    if (*n < 5) {
-	return 0;
-    }
-L40:
-    mp1 = m + 1;
-    i__2 = *n;
-    for (i = mp1; i <= *n; i += 5) {
-	SX(i) = *sa * SX(i);
-	SX(i + 1) = *sa * SX(i + 1);
-	SX(i + 2) = *sa * SX(i + 2);
-	SX(i + 3) = *sa * SX(i + 3);
-	SX(i + 4) = *sa * SX(i + 4);
-/* L50: */
-    }
-    return 0;
-} /* sscal_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ssymv.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ssymv.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ssymv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ssymv.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,300 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int ssymv_(char *uplo, integer *n, real *alpha, real *a, 
-	integer *lda, real *x, integer *incx, real *beta, real *y, integer *
-	incy)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2;
-
-    /* Local variables */
-    static integer info;
-    static real temp1, temp2;
-    static integer i, j;
-    extern logical lsame_(char *, char *);
-    static integer ix, iy, jx, jy, kx, ky;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-
-
-/*  Purpose   
-    =======   
-
-    SSYMV  performs the matrix-vector  operation   
-
-       y := alpha*A*x + beta*y,   
-
-    where alpha and beta are scalars, x and y are n element vectors and   
-    A is an n by n symmetric matrix.   
-
-    Parameters   
-    ==========   
-
-    UPLO   - CHARACTER*1.   
-             On entry, UPLO specifies whether the upper or lower   
-             triangular part of the array A is to be referenced as   
-             follows:   
-
-                UPLO = 'U' or 'u'   Only the upper triangular part of A   
-                                    is to be referenced.   
-
-                UPLO = 'L' or 'l'   Only the lower triangular part of A   
-                                    is to be referenced.   
-
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry, N specifies the order of the matrix A.   
-             N must be at least zero.   
-             Unchanged on exit.   
-
-    ALPHA  - REAL            .   
-             On entry, ALPHA specifies the scalar alpha.   
-             Unchanged on exit.   
-
-    A      - REAL             array of DIMENSION ( LDA, n ).   
-             Before entry with  UPLO = 'U' or 'u', the leading n by n   
-             upper triangular part of the array A must contain the upper 
-  
-             triangular part of the symmetric matrix and the strictly   
-             lower triangular part of A is not referenced.   
-             Before entry with UPLO = 'L' or 'l', the leading n by n   
-             lower triangular part of the array A must contain the lower 
-  
-             triangular part of the symmetric matrix and the strictly   
-             upper triangular part of A is not referenced.   
-             Unchanged on exit.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program. LDA must be at least   
-             max( 1, n ).   
-             Unchanged on exit.   
-
-    X      - REAL             array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCX ) ).   
-             Before entry, the incremented array X must contain the n   
-             element vector x.   
-             Unchanged on exit.   
-
-    INCX   - INTEGER.   
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-             Unchanged on exit.   
-
-    BETA   - REAL            .   
-             On entry, BETA specifies the scalar beta. When BETA is   
-             supplied as zero then Y need not be set on input.   
-             Unchanged on exit.   
-
-    Y      - REAL             array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCY ) ).   
-             Before entry, the incremented array Y must contain the n   
-             element vector y. On exit, Y is overwritten by the updated   
-             vector y.   
-
-    INCY   - INTEGER.   
-             On entry, INCY specifies the increment for the elements of   
-             Y. INCY must not be zero.   
-             Unchanged on exit.   
-
-
-    Level 2 Blas routine.   
-
-    -- Written on 22-October-1986.   
-       Jack Dongarra, Argonne National Lab.   
-       Jeremy Du Croz, Nag Central Office.   
-       Sven Hammarling, Nag Central Office.   
-       Richard Hanson, Sandia National Labs.   
-
-
-
-       Test the input parameters.   
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-#define Y(I) y[(I)-1]
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-
-    info = 0;
-    if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
-	info = 1;
-    } else if (*n < 0) {
-	info = 2;
-    } else if (*lda < max(1,*n)) {
-	info = 5;
-    } else if (*incx == 0) {
-	info = 7;
-    } else if (*incy == 0) {
-	info = 10;
-    }
-    if (info != 0) {
-	xerbla_("SSYMV ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*n == 0 || *alpha == 0.f && *beta == 1.f) {
-	return 0;
-    }
-
-/*     Set up the start points in  X  and  Y. */
-
-    if (*incx > 0) {
-	kx = 1;
-    } else {
-	kx = 1 - (*n - 1) * *incx;
-    }
-    if (*incy > 0) {
-	ky = 1;
-    } else {
-	ky = 1 - (*n - 1) * *incy;
-    }
-
-/*     Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through the triangular part   
-       of A.   
-
-       First form  y := beta*y. */
-
-    if (*beta != 1.f) {
-	if (*incy == 1) {
-	    if (*beta == 0.f) {
-		i__1 = *n;
-		for (i = 1; i <= *n; ++i) {
-		    Y(i) = 0.f;
-/* L10: */
-		}
-	    } else {
-		i__1 = *n;
-		for (i = 1; i <= *n; ++i) {
-		    Y(i) = *beta * Y(i);
-/* L20: */
-		}
-	    }
-	} else {
-	    iy = ky;
-	    if (*beta == 0.f) {
-		i__1 = *n;
-		for (i = 1; i <= *n; ++i) {
-		    Y(iy) = 0.f;
-		    iy += *incy;
-/* L30: */
-		}
-	    } else {
-		i__1 = *n;
-		for (i = 1; i <= *n; ++i) {
-		    Y(iy) = *beta * Y(iy);
-		    iy += *incy;
-/* L40: */
-		}
-	    }
-	}
-    }
-    if (*alpha == 0.f) {
-	return 0;
-    }
-    if (lsame_(uplo, "U")) {
-
-/*        Form  y  when A is stored in upper triangle. */
-
-	if (*incx == 1 && *incy == 1) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		temp1 = *alpha * X(j);
-		temp2 = 0.f;
-		i__2 = j - 1;
-		for (i = 1; i <= j-1; ++i) {
-		    Y(i) += temp1 * A(i,j);
-		    temp2 += A(i,j) * X(i);
-/* L50: */
-		}
-		Y(j) = Y(j) + temp1 * A(j,j) + *alpha * temp2;
-/* L60: */
-	    }
-	} else {
-	    jx = kx;
-	    jy = ky;
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		temp1 = *alpha * X(jx);
-		temp2 = 0.f;
-		ix = kx;
-		iy = ky;
-		i__2 = j - 1;
-		for (i = 1; i <= j-1; ++i) {
-		    Y(iy) += temp1 * A(i,j);
-		    temp2 += A(i,j) * X(ix);
-		    ix += *incx;
-		    iy += *incy;
-/* L70: */
-		}
-		Y(jy) = Y(jy) + temp1 * A(j,j) + *alpha * temp2;
-		jx += *incx;
-		jy += *incy;
-/* L80: */
-	    }
-	}
-    } else {
-
-/*        Form  y  when A is stored in lower triangle. */
-
-	if (*incx == 1 && *incy == 1) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		temp1 = *alpha * X(j);
-		temp2 = 0.f;
-		Y(j) += temp1 * A(j,j);
-		i__2 = *n;
-		for (i = j + 1; i <= *n; ++i) {
-		    Y(i) += temp1 * A(i,j);
-		    temp2 += A(i,j) * X(i);
-/* L90: */
-		}
-		Y(j) += *alpha * temp2;
-/* L100: */
-	    }
-	} else {
-	    jx = kx;
-	    jy = ky;
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		temp1 = *alpha * X(jx);
-		temp2 = 0.f;
-		Y(jy) += temp1 * A(j,j);
-		ix = jx;
-		iy = jy;
-		i__2 = *n;
-		for (i = j + 1; i <= *n; ++i) {
-		    ix += *incx;
-		    iy += *incy;
-		    Y(iy) += temp1 * A(i,j);
-		    temp2 += A(i,j) * X(ix);
-/* L110: */
-		}
-		Y(jy) += *alpha * temp2;
-		jx += *incx;
-		jy += *incy;
-/* L120: */
-	    }
-	}
-    }
-
-    return 0;
-
-/*     End of SSYMV . */
-
-} /* ssymv_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ssyr2.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ssyr2.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ssyr2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ssyr2.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,263 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int ssyr2_(char *uplo, integer *n, real *alpha, real *x, 
-	integer *incx, real *y, integer *incy, real *a, integer *lda)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2;
-
-    /* Local variables */
-    static integer info;
-    static real temp1, temp2;
-    static integer i, j;
-    extern logical lsame_(char *, char *);
-    static integer ix, iy, jx, jy, kx, ky;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-
-
-/*  Purpose   
-    =======   
-
-    SSYR2  performs the symmetric rank 2 operation   
-
-       A := alpha*x*y' + alpha*y*x' + A,   
-
-    where alpha is a scalar, x and y are n element vectors and A is an n 
-  
-    by n symmetric matrix.   
-
-    Parameters   
-    ==========   
-
-    UPLO   - CHARACTER*1.   
-             On entry, UPLO specifies whether the upper or lower   
-             triangular part of the array A is to be referenced as   
-             follows:   
-
-                UPLO = 'U' or 'u'   Only the upper triangular part of A   
-                                    is to be referenced.   
-
-                UPLO = 'L' or 'l'   Only the lower triangular part of A   
-                                    is to be referenced.   
-
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry, N specifies the order of the matrix A.   
-             N must be at least zero.   
-             Unchanged on exit.   
-
-    ALPHA  - REAL            .   
-             On entry, ALPHA specifies the scalar alpha.   
-             Unchanged on exit.   
-
-    X      - REAL             array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCX ) ).   
-             Before entry, the incremented array X must contain the n   
-             element vector x.   
-             Unchanged on exit.   
-
-    INCX   - INTEGER.   
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-             Unchanged on exit.   
-
-    Y      - REAL             array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCY ) ).   
-             Before entry, the incremented array Y must contain the n   
-             element vector y.   
-             Unchanged on exit.   
-
-    INCY   - INTEGER.   
-             On entry, INCY specifies the increment for the elements of   
-             Y. INCY must not be zero.   
-             Unchanged on exit.   
-
-    A      - REAL             array of DIMENSION ( LDA, n ).   
-             Before entry with  UPLO = 'U' or 'u', the leading n by n   
-             upper triangular part of the array A must contain the upper 
-  
-             triangular part of the symmetric matrix and the strictly   
-             lower triangular part of A is not referenced. On exit, the   
-             upper triangular part of the array A is overwritten by the   
-             upper triangular part of the updated matrix.   
-             Before entry with UPLO = 'L' or 'l', the leading n by n   
-             lower triangular part of the array A must contain the lower 
-  
-             triangular part of the symmetric matrix and the strictly   
-             upper triangular part of A is not referenced. On exit, the   
-             lower triangular part of the array A is overwritten by the   
-             lower triangular part of the updated matrix.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program. LDA must be at least   
-             max( 1, n ).   
-             Unchanged on exit.   
-
-
-    Level 2 Blas routine.   
-
-    -- Written on 22-October-1986.   
-       Jack Dongarra, Argonne National Lab.   
-       Jeremy Du Croz, Nag Central Office.   
-       Sven Hammarling, Nag Central Office.   
-       Richard Hanson, Sandia National Labs.   
-
-
-
-       Test the input parameters.   
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-#define Y(I) y[(I)-1]
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-
-    info = 0;
-    if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
-	info = 1;
-    } else if (*n < 0) {
-	info = 2;
-    } else if (*incx == 0) {
-	info = 5;
-    } else if (*incy == 0) {
-	info = 7;
-    } else if (*lda < max(1,*n)) {
-	info = 9;
-    }
-    if (info != 0) {
-	xerbla_("SSYR2 ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*n == 0 || *alpha == 0.f) {
-	return 0;
-    }
-
-/*     Set up the start points in X and Y if the increments are not both 
-  
-       unity. */
-
-    if (*incx != 1 || *incy != 1) {
-	if (*incx > 0) {
-	    kx = 1;
-	} else {
-	    kx = 1 - (*n - 1) * *incx;
-	}
-	if (*incy > 0) {
-	    ky = 1;
-	} else {
-	    ky = 1 - (*n - 1) * *incy;
-	}
-	jx = kx;
-	jy = ky;
-    }
-
-/*     Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through the triangular part   
-       of A. */
-
-    if (lsame_(uplo, "U")) {
-
-/*        Form  A  when A is stored in the upper triangle. */
-
-	if (*incx == 1 && *incy == 1) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		if (X(j) != 0.f || Y(j) != 0.f) {
-		    temp1 = *alpha * Y(j);
-		    temp2 = *alpha * X(j);
-		    i__2 = j;
-		    for (i = 1; i <= j; ++i) {
-			A(i,j) = A(i,j) + X(i) * temp1 
-				+ Y(i) * temp2;
-/* L10: */
-		    }
-		}
-/* L20: */
-	    }
-	} else {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		if (X(jx) != 0.f || Y(jy) != 0.f) {
-		    temp1 = *alpha * Y(jy);
-		    temp2 = *alpha * X(jx);
-		    ix = kx;
-		    iy = ky;
-		    i__2 = j;
-		    for (i = 1; i <= j; ++i) {
-			A(i,j) = A(i,j) + X(ix) * temp1 
-				+ Y(iy) * temp2;
-			ix += *incx;
-			iy += *incy;
-/* L30: */
-		    }
-		}
-		jx += *incx;
-		jy += *incy;
-/* L40: */
-	    }
-	}
-    } else {
-
-/*        Form  A  when A is stored in the lower triangle. */
-
-	if (*incx == 1 && *incy == 1) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		if (X(j) != 0.f || Y(j) != 0.f) {
-		    temp1 = *alpha * Y(j);
-		    temp2 = *alpha * X(j);
-		    i__2 = *n;
-		    for (i = j; i <= *n; ++i) {
-			A(i,j) = A(i,j) + X(i) * temp1 
-				+ Y(i) * temp2;
-/* L50: */
-		    }
-		}
-/* L60: */
-	    }
-	} else {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		if (X(jx) != 0.f || Y(jy) != 0.f) {
-		    temp1 = *alpha * Y(jy);
-		    temp2 = *alpha * X(jx);
-		    ix = jx;
-		    iy = jy;
-		    i__2 = *n;
-		    for (i = j; i <= *n; ++i) {
-			A(i,j) = A(i,j) + X(ix) * temp1 
-				+ Y(iy) * temp2;
-			ix += *incx;
-			iy += *incy;
-/* L70: */
-		    }
-		}
-		jx += *incx;
-		jy += *incy;
-/* L80: */
-	    }
-	}
-    }
-
-    return 0;
-
-/*     End of SSYR2 . */
-
-} /* ssyr2_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/strsv.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/strsv.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/strsv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/strsv.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,338 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int strsv_(char *uplo, char *trans, char *diag, integer *n, 
-	real *a, integer *lda, real *x, integer *incx)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2;
-
-    /* Local variables */
-    static integer info;
-    static real temp;
-    static integer i, j;
-    extern logical lsame_(char *, char *);
-    static integer ix, jx, kx;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-    static logical nounit;
-
-
-/*  Purpose   
-    =======   
-
-    STRSV  solves one of the systems of equations   
-
-       A*x = b,   or   A'*x = b,   
-
-    where b and x are n element vectors and A is an n by n unit, or   
-    non-unit, upper or lower triangular matrix.   
-
-    No test for singularity or near-singularity is included in this   
-    routine. Such tests must be performed before calling this routine.   
-
-    Parameters   
-    ==========   
-
-    UPLO   - CHARACTER*1.   
-             On entry, UPLO specifies whether the matrix is an upper or   
-             lower triangular matrix as follows:   
-
-                UPLO = 'U' or 'u'   A is an upper triangular matrix.   
-
-                UPLO = 'L' or 'l'   A is a lower triangular matrix.   
-
-             Unchanged on exit.   
-
-    TRANS  - CHARACTER*1.   
-             On entry, TRANS specifies the equations to be solved as   
-             follows:   
-
-                TRANS = 'N' or 'n'   A*x = b.   
-
-                TRANS = 'T' or 't'   A'*x = b.   
-
-                TRANS = 'C' or 'c'   A'*x = b.   
-
-             Unchanged on exit.   
-
-    DIAG   - CHARACTER*1.   
-             On entry, DIAG specifies whether or not A is unit   
-             triangular as follows:   
-
-                DIAG = 'U' or 'u'   A is assumed to be unit triangular.   
-
-                DIAG = 'N' or 'n'   A is not assumed to be unit   
-                                    triangular.   
-
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry, N specifies the order of the matrix A.   
-             N must be at least zero.   
-             Unchanged on exit.   
-
-    A      - REAL             array of DIMENSION ( LDA, n ).   
-             Before entry with  UPLO = 'U' or 'u', the leading n by n   
-             upper triangular part of the array A must contain the upper 
-  
-             triangular matrix and the strictly lower triangular part of 
-  
-             A is not referenced.   
-             Before entry with UPLO = 'L' or 'l', the leading n by n   
-             lower triangular part of the array A must contain the lower 
-  
-             triangular matrix and the strictly upper triangular part of 
-  
-             A is not referenced.   
-             Note that when  DIAG = 'U' or 'u', the diagonal elements of 
-  
-             A are not referenced either, but are assumed to be unity.   
-             Unchanged on exit.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program. LDA must be at least   
-             max( 1, n ).   
-             Unchanged on exit.   
-
-    X      - REAL             array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCX ) ).   
-             Before entry, the incremented array X must contain the n   
-             element right-hand side vector b. On exit, X is overwritten 
-  
-             with the solution vector x.   
-
-    INCX   - INTEGER.   
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-             Unchanged on exit.   
-
-
-    Level 2 Blas routine.   
-
-    -- Written on 22-October-1986.   
-       Jack Dongarra, Argonne National Lab.   
-       Jeremy Du Croz, Nag Central Office.   
-       Sven Hammarling, Nag Central Office.   
-       Richard Hanson, Sandia National Labs.   
-
-
-
-       Test the input parameters.   
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-
-    info = 0;
-    if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
-	info = 1;
-    } else if (! lsame_(trans, "N") && ! lsame_(trans, "T") &&
-	     ! lsame_(trans, "C")) {
-	info = 2;
-    } else if (! lsame_(diag, "U") && ! lsame_(diag, "N")) {
-	info = 3;
-    } else if (*n < 0) {
-	info = 4;
-    } else if (*lda < max(1,*n)) {
-	info = 6;
-    } else if (*incx == 0) {
-	info = 8;
-    }
-    if (info != 0) {
-	xerbla_("STRSV ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*n == 0) {
-	return 0;
-    }
-
-    nounit = lsame_(diag, "N");
-
-/*     Set up the start point in X if the increment is not unity. This   
-       will be  ( N - 1 )*INCX  too small for descending loops. */
-
-    if (*incx <= 0) {
-	kx = 1 - (*n - 1) * *incx;
-    } else if (*incx != 1) {
-	kx = 1;
-    }
-
-/*     Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through A. */
-
-    if (lsame_(trans, "N")) {
-
-/*        Form  x := inv( A )*x. */
-
-	if (lsame_(uplo, "U")) {
-	    if (*incx == 1) {
-		for (j = *n; j >= 1; --j) {
-		    if (X(j) != 0.f) {
-			if (nounit) {
-			    X(j) /= A(j,j);
-			}
-			temp = X(j);
-			for (i = j - 1; i >= 1; --i) {
-			    X(i) -= temp * A(i,j);
-/* L10: */
-			}
-		    }
-/* L20: */
-		}
-	    } else {
-		jx = kx + (*n - 1) * *incx;
-		for (j = *n; j >= 1; --j) {
-		    if (X(jx) != 0.f) {
-			if (nounit) {
-			    X(jx) /= A(j,j);
-			}
-			temp = X(jx);
-			ix = jx;
-			for (i = j - 1; i >= 1; --i) {
-			    ix -= *incx;
-			    X(ix) -= temp * A(i,j);
-/* L30: */
-			}
-		    }
-		    jx -= *incx;
-/* L40: */
-		}
-	    }
-	} else {
-	    if (*incx == 1) {
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    if (X(j) != 0.f) {
-			if (nounit) {
-			    X(j) /= A(j,j);
-			}
-			temp = X(j);
-			i__2 = *n;
-			for (i = j + 1; i <= *n; ++i) {
-			    X(i) -= temp * A(i,j);
-/* L50: */
-			}
-		    }
-/* L60: */
-		}
-	    } else {
-		jx = kx;
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    if (X(jx) != 0.f) {
-			if (nounit) {
-			    X(jx) /= A(j,j);
-			}
-			temp = X(jx);
-			ix = jx;
-			i__2 = *n;
-			for (i = j + 1; i <= *n; ++i) {
-			    ix += *incx;
-			    X(ix) -= temp * A(i,j);
-/* L70: */
-			}
-		    }
-		    jx += *incx;
-/* L80: */
-		}
-	    }
-	}
-    } else {
-
-/*        Form  x := inv( A' )*x. */
-
-	if (lsame_(uplo, "U")) {
-	    if (*incx == 1) {
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    temp = X(j);
-		    i__2 = j - 1;
-		    for (i = 1; i <= j-1; ++i) {
-			temp -= A(i,j) * X(i);
-/* L90: */
-		    }
-		    if (nounit) {
-			temp /= A(j,j);
-		    }
-		    X(j) = temp;
-/* L100: */
-		}
-	    } else {
-		jx = kx;
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    temp = X(jx);
-		    ix = kx;
-		    i__2 = j - 1;
-		    for (i = 1; i <= j-1; ++i) {
-			temp -= A(i,j) * X(ix);
-			ix += *incx;
-/* L110: */
-		    }
-		    if (nounit) {
-			temp /= A(j,j);
-		    }
-		    X(jx) = temp;
-		    jx += *incx;
-/* L120: */
-		}
-	    }
-	} else {
-	    if (*incx == 1) {
-		for (j = *n; j >= 1; --j) {
-		    temp = X(j);
-		    i__1 = j + 1;
-		    for (i = *n; i >= j+1; --i) {
-			temp -= A(i,j) * X(i);
-/* L130: */
-		    }
-		    if (nounit) {
-			temp /= A(j,j);
-		    }
-		    X(j) = temp;
-/* L140: */
-		}
-	    } else {
-		kx += (*n - 1) * *incx;
-		jx = kx;
-		for (j = *n; j >= 1; --j) {
-		    temp = X(jx);
-		    ix = kx;
-		    i__1 = j + 1;
-		    for (i = *n; i >= j+1; --i) {
-			temp -= A(i,j) * X(ix);
-			ix -= *incx;
-/* L150: */
-		    }
-		    if (nounit) {
-			temp /= A(j,j);
-		    }
-		    X(jx) = temp;
-		    jx -= *incx;
-/* L160: */
-		}
-	    }
-	}
-    }
-
-    return 0;
-
-/*     End of STRSV . */
-
-} /* strsv_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/superlu_f2c.h hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/superlu_f2c.h
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/superlu_f2c.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/superlu_f2c.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,43 +0,0 @@
-/* f2c.h  --  Standard Fortran to C header file */
-
-/**  barf  [ba:rf]  2.  "He suggested using FORTRAN, and everybody barfed."
-
-	- From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */
-
-#include "Cnames.h"
-
-#ifndef F2C_INCLUDE
-#define F2C_INCLUDE
-
-typedef int integer;
-typedef int logical;
-
-typedef char *address;
-typedef short int shortint;
-typedef float real;
-typedef double doublereal;
-typedef struct { real r, i; } complex;
-typedef struct { doublereal r, i; } doublecomplex;
-typedef short int shortlogical;
-typedef char logical1;
-typedef char integer1;
-/* typedef long long longint; */ /* system-dependent */
-
-#define TRUE_ (1)
-#define FALSE_ (0)
-
-/* Extern is for use with -E */
-#ifndef Extern
-#define Extern extern
-#endif
-
-#define abs(x) ((x) >= 0 ? (x) : -(x))
-#define dabs(x) (doublereal)abs(x)
-#define min(a,b) ((a) <= (b) ? (a) : (b))
-#define max(a,b) ((a) >= (b) ? (a) : (b))
-#define dmin(a,b) (doublereal)min(a,b)
-#define dmax(a,b) (doublereal)max(a,b)
-
-#define VOID void
-
-#endif
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zaxpy.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zaxpy.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zaxpy.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zaxpy.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,87 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int zaxpy_(integer *n, doublecomplex *za, doublecomplex *zx, 
-	integer *incx, doublecomplex *zy, integer *incy)
-{
-
-
-    /* System generated locals */
-    integer i__1, i__2, i__3, i__4;
-    doublecomplex z__1, z__2;
-
-    /* Local variables */
-    static integer i;
-    extern doublereal dcabs1_(doublecomplex *);
-    static integer ix, iy;
-
-
-/*     constant times a vector plus a vector.   
-       jack dongarra, 3/11/78.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-    
-   Parameter adjustments   
-       Function Body */
-#define ZY(I) zy[(I)-1]
-#define ZX(I) zx[(I)-1]
-
-
-    if (*n <= 0) {
-	return 0;
-    }
-    if (dcabs1_(za) == 0.) {
-	return 0;
-    }
-    if (*incx == 1 && *incy == 1) {
-	goto L20;
-    }
-
-/*        code for unequal increments or equal increments   
-            not equal to 1 */
-
-    ix = 1;
-    iy = 1;
-    if (*incx < 0) {
-	ix = (-(*n) + 1) * *incx + 1;
-    }
-    if (*incy < 0) {
-	iy = (-(*n) + 1) * *incy + 1;
-    }
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	i__2 = iy;
-	i__3 = iy;
-	i__4 = ix;
-	z__2.r = za->r * ZX(ix).r - za->i * ZX(ix).i, z__2.i = za->r * ZX(
-		ix).i + za->i * ZX(ix).r;
-	z__1.r = ZY(iy).r + z__2.r, z__1.i = ZY(iy).i + z__2.i;
-	ZY(iy).r = z__1.r, ZY(iy).i = z__1.i;
-	ix += *incx;
-	iy += *incy;
-/* L10: */
-    }
-    return 0;
-
-/*        code for both increments equal to 1 */
-
-L20:
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	i__2 = i;
-	i__3 = i;
-	i__4 = i;
-	z__2.r = za->r * ZX(i).r - za->i * ZX(i).i, z__2.i = za->r * ZX(
-		i).i + za->i * ZX(i).r;
-	z__1.r = ZY(i).r + z__2.r, z__1.i = ZY(i).i + z__2.i;
-	ZY(i).r = z__1.r, ZY(i).i = z__1.i;
-/* L30: */
-    }
-    return 0;
-} /* zaxpy_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zcopy.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zcopy.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zcopy.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zcopy.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,74 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int zcopy_(integer *n, doublecomplex *zx, integer *incx, 
-	doublecomplex *zy, integer *incy)
-{
-
-
-    /* System generated locals */
-    integer i__1, i__2, i__3;
-
-    /* Local variables */
-    static integer i, ix, iy;
-
-
-/*     copies a vector, x, to a vector, y.   
-       jack dongarra, linpack, 4/11/78.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define ZY(I) zy[(I)-1]
-#define ZX(I) zx[(I)-1]
-
-
-    if (*n <= 0) {
-	return 0;
-    }
-    if (*incx == 1 && *incy == 1) {
-	goto L20;
-    }
-
-/*        code for unequal increments or equal increments   
-            not equal to 1 */
-
-    ix = 1;
-    iy = 1;
-    if (*incx < 0) {
-	ix = (-(*n) + 1) * *incx + 1;
-    }
-    if (*incy < 0) {
-	iy = (-(*n) + 1) * *incy + 1;
-    }
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	i__2 = iy;
-	i__3 = ix;
-	ZY(iy).r = ZX(ix).r, ZY(iy).i = ZX(ix).i;
-	ix += *incx;
-	iy += *incy;
-/* L10: */
-    }
-    return 0;
-
-/*        code for both increments equal to 1 */
-
-L20:
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	i__2 = i;
-	i__3 = i;
-	ZY(i).r = ZX(i).r, ZY(i).i = ZX(i).i;
-/* L30: */
-    }
-    return 0;
-} /* zcopy_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zdotc.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zdotc.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zdotc.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zdotc.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,85 +0,0 @@
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Double Complex */ VOID zdotc_(doublecomplex * ret_val, integer *n, 
-	doublecomplex *zx, integer *incx, doublecomplex *zy, integer *incy)
-{
-    /* System generated locals */
-    integer i__1, i__2;
-    doublecomplex z__1, z__2, z__3;
-
-    /* Builtin functions */
-    void d_cnjg(doublecomplex *, doublecomplex *);
-
-    /* Local variables */
-    static integer i;
-    static doublecomplex ztemp;
-    static integer ix, iy;
-
-
-/*     forms the dot product of a vector.   
-       jack dongarra, 3/11/78.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-    
-   Parameter adjustments */
-    --zy;
-    --zx;
-
-    /* Function Body */
-    ztemp.r = 0., ztemp.i = 0.;
-     ret_val->r = 0.,  ret_val->i = 0.;
-    if (*n <= 0) {
-	return ;
-    }
-    if (*incx == 1 && *incy == 1) {
-	goto L20;
-    }
-
-/*        code for unequal increments or equal increments   
-            not equal to 1 */
-
-    ix = 1;
-    iy = 1;
-    if (*incx < 0) {
-	ix = (-(*n) + 1) * *incx + 1;
-    }
-    if (*incy < 0) {
-	iy = (-(*n) + 1) * *incy + 1;
-    }
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	d_cnjg(&z__3, &zx[ix]);
-	i__2 = iy;
-	z__2.r = z__3.r * zy[iy].r - z__3.i * zy[iy].i, z__2.i = z__3.r * 
-		zy[iy].i + z__3.i * zy[iy].r;
-	z__1.r = ztemp.r + z__2.r, z__1.i = ztemp.i + z__2.i;
-	ztemp.r = z__1.r, ztemp.i = z__1.i;
-	ix += *incx;
-	iy += *incy;
-/* L10: */
-    }
-     ret_val->r = ztemp.r,  ret_val->i = ztemp.i;
-    return ;
-
-/*        code for both increments equal to 1 */
-
-L20:
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	d_cnjg(&z__3, &zx[i]);
-	i__2 = i;
-	z__2.r = z__3.r * zy[i].r - z__3.i * zy[i].i, z__2.i = z__3.r * 
-		zy[i].i + z__3.i * zy[i].r;
-	z__1.r = ztemp.r + z__2.r, z__1.i = ztemp.i + z__2.i;
-	ztemp.r = z__1.r, ztemp.i = z__1.i;
-/* L30: */
-    }
-     ret_val->r = ztemp.r,  ret_val->i = ztemp.i;
-    return ;
-} /* zdotc_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zgemm.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zgemm.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zgemm.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zgemm.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,697 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int zgemm_(char *transa, char *transb, integer *m, integer *
-	n, integer *k, doublecomplex *alpha, doublecomplex *a, integer *lda, 
-	doublecomplex *b, integer *ldb, doublecomplex *beta, doublecomplex *c,
-	 integer *ldc)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
-	    i__3, i__4, i__5, i__6;
-    doublecomplex z__1, z__2, z__3, z__4;
-
-    /* Builtin functions */
-    void d_cnjg(doublecomplex *, doublecomplex *);
-
-    /* Local variables */
-    static integer info;
-    static logical nota, notb;
-    static doublecomplex temp;
-    static integer i, j, l;
-    static logical conja, conjb;
-    static integer ncola;
-    extern logical lsame_(char *, char *);
-    static integer nrowa, nrowb;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-
-
-/*  Purpose   
-    =======   
-
-    ZGEMM  performs one of the matrix-matrix operations   
-
-       C := alpha*op( A )*op( B ) + beta*C,   
-
-    where  op( X ) is one of   
-
-       op( X ) = X   or   op( X ) = X'   or   op( X ) = conjg( X' ),   
-
-    alpha and beta are scalars, and A, B and C are matrices, with op( A ) 
-  
-    an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix. 
-  
-
-    Parameters   
-    ==========   
-
-    TRANSA - CHARACTER*1.   
-             On entry, TRANSA specifies the form of op( A ) to be used in 
-  
-             the matrix multiplication as follows:   
-
-                TRANSA = 'N' or 'n',  op( A ) = A.   
-
-                TRANSA = 'T' or 't',  op( A ) = A'.   
-
-                TRANSA = 'C' or 'c',  op( A ) = conjg( A' ).   
-
-             Unchanged on exit.   
-
-    TRANSB - CHARACTER*1.   
-             On entry, TRANSB specifies the form of op( B ) to be used in 
-  
-             the matrix multiplication as follows:   
-
-                TRANSB = 'N' or 'n',  op( B ) = B.   
-
-                TRANSB = 'T' or 't',  op( B ) = B'.   
-
-                TRANSB = 'C' or 'c',  op( B ) = conjg( B' ).   
-
-             Unchanged on exit.   
-
-    M      - INTEGER.   
-             On entry,  M  specifies  the number  of rows  of the  matrix 
-  
-             op( A )  and of the  matrix  C.  M  must  be at least  zero. 
-  
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry,  N  specifies the number  of columns of the matrix 
-  
-             op( B ) and the number of columns of the matrix C. N must be 
-  
-             at least zero.   
-             Unchanged on exit.   
-
-    K      - INTEGER.   
-             On entry,  K  specifies  the number of columns of the matrix 
-  
-             op( A ) and the number of rows of the matrix op( B ). K must 
-  
-             be at least  zero.   
-             Unchanged on exit.   
-
-    ALPHA  - COMPLEX*16      .   
-             On entry, ALPHA specifies the scalar alpha.   
-             Unchanged on exit.   
-
-    A      - COMPLEX*16       array of DIMENSION ( LDA, ka ), where ka is 
-  
-             k  when  TRANSA = 'N' or 'n',  and is  m  otherwise.   
-             Before entry with  TRANSA = 'N' or 'n',  the leading  m by k 
-  
-             part of the array  A  must contain the matrix  A,  otherwise 
-  
-             the leading  k by m  part of the array  A  must contain  the 
-  
-             matrix A.   
-             Unchanged on exit.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program. When  TRANSA = 'N' or 'n' then 
-  
-             LDA must be at least  max( 1, m ), otherwise  LDA must be at 
-  
-             least  max( 1, k ).   
-             Unchanged on exit.   
-
-    B      - COMPLEX*16       array of DIMENSION ( LDB, kb ), where kb is 
-  
-             n  when  TRANSB = 'N' or 'n',  and is  k  otherwise.   
-             Before entry with  TRANSB = 'N' or 'n',  the leading  k by n 
-  
-             part of the array  B  must contain the matrix  B,  otherwise 
-  
-             the leading  n by k  part of the array  B  must contain  the 
-  
-             matrix B.   
-             Unchanged on exit.   
-
-    LDB    - INTEGER.   
-             On entry, LDB specifies the first dimension of B as declared 
-  
-             in the calling (sub) program. When  TRANSB = 'N' or 'n' then 
-  
-             LDB must be at least  max( 1, k ), otherwise  LDB must be at 
-  
-             least  max( 1, n ).   
-             Unchanged on exit.   
-
-    BETA   - COMPLEX*16      .   
-             On entry,  BETA  specifies the scalar  beta.  When  BETA  is 
-  
-             supplied as zero then C need not be set on input.   
-             Unchanged on exit.   
-
-    C      - COMPLEX*16       array of DIMENSION ( LDC, n ).   
-             Before entry, the leading  m by n  part of the array  C must 
-  
-             contain the matrix  C,  except when  beta  is zero, in which 
-  
-             case C need not be set on entry.   
-             On exit, the array  C  is overwritten by the  m by n  matrix 
-  
-             ( alpha*op( A )*op( B ) + beta*C ).   
-
-    LDC    - INTEGER.   
-             On entry, LDC specifies the first dimension of C as declared 
-  
-             in  the  calling  (sub)  program.   LDC  must  be  at  least 
-  
-             max( 1, m ).   
-             Unchanged on exit.   
-
-
-    Level 3 Blas routine.   
-
-    -- Written on 8-February-1989.   
-       Jack Dongarra, Argonne National Laboratory.   
-       Iain Duff, AERE Harwell.   
-       Jeremy Du Croz, Numerical Algorithms Group Ltd.   
-       Sven Hammarling, Numerical Algorithms Group Ltd.   
-
-
-
-       Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not 
-  
-       conjugated or transposed, set  CONJA and CONJB  as true if  A  and 
-  
-       B  respectively are to be  transposed but  not conjugated  and set 
-  
-       NROWA, NCOLA and  NROWB  as the number of rows and  columns  of  A 
-  
-       and the number of rows of  B  respectively.   
-
-    
-   Parameter adjustments   
-       Function Body */
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-#define B(I,J) b[(I)-1 + ((J)-1)* ( *ldb)]
-#define C(I,J) c[(I)-1 + ((J)-1)* ( *ldc)]
-
-    nota = lsame_(transa, "N");
-    notb = lsame_(transb, "N");
-    conja = lsame_(transa, "C");
-    conjb = lsame_(transb, "C");
-    if (nota) {
-	nrowa = *m;
-	ncola = *k;
-    } else {
-	nrowa = *k;
-	ncola = *m;
-    }
-    if (notb) {
-	nrowb = *k;
-    } else {
-	nrowb = *n;
-    }
-
-/*     Test the input parameters. */
-
-    info = 0;
-    if (! nota && ! conja && ! lsame_(transa, "T")) {
-	info = 1;
-    } else if (! notb && ! conjb && ! lsame_(transb, "T")) {
-	info = 2;
-    } else if (*m < 0) {
-	info = 3;
-    } else if (*n < 0) {
-	info = 4;
-    } else if (*k < 0) {
-	info = 5;
-    } else if (*lda < max(1,nrowa)) {
-	info = 8;
-    } else if (*ldb < max(1,nrowb)) {
-	info = 10;
-    } else if (*ldc < max(1,*m)) {
-	info = 13;
-    }
-    if (info != 0) {
-	xerbla_("ZGEMM ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*m == 0 || *n == 0 || (alpha->r == 0. && alpha->i == 0. || *k == 0) &&
-	     (beta->r == 1. && beta->i == 0.)) {
-	return 0;
-    }
-
-/*     And when  alpha.eq.zero. */
-
-    if (alpha->r == 0. && alpha->i == 0.) {
-	if (beta->r == 0. && beta->i == 0.) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    i__3 = i + j * c_dim1;
-		    C(i,j).r = 0., C(i,j).i = 0.;
-/* L10: */
-		}
-/* L20: */
-	    }
-	} else {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    i__3 = i + j * c_dim1;
-		    i__4 = i + j * c_dim1;
-		    z__1.r = beta->r * C(i,j).r - beta->i * C(i,j).i, 
-			    z__1.i = beta->r * C(i,j).i + beta->i * C(i,j)
-			    .r;
-		    C(i,j).r = z__1.r, C(i,j).i = z__1.i;
-/* L30: */
-		}
-/* L40: */
-	    }
-	}
-	return 0;
-    }
-
-/*     Start the operations. */
-
-    if (notb) {
-	if (nota) {
-
-/*           Form  C := alpha*A*B + beta*C. */
-
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		if (beta->r == 0. && beta->i == 0.) {
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			i__3 = i + j * c_dim1;
-			C(i,j).r = 0., C(i,j).i = 0.;
-/* L50: */
-		    }
-		} else if (beta->r != 1. || beta->i != 0.) {
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			i__3 = i + j * c_dim1;
-			i__4 = i + j * c_dim1;
-			z__1.r = beta->r * C(i,j).r - beta->i * C(i,j).i, 
-				z__1.i = beta->r * C(i,j).i + beta->i * C(i,j).r;
-			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
-/* L60: */
-		    }
-		}
-		i__2 = *k;
-		for (l = 1; l <= *k; ++l) {
-		    i__3 = l + j * b_dim1;
-		    if (B(l,j).r != 0. || B(l,j).i != 0.) {
-			i__3 = l + j * b_dim1;
-			z__1.r = alpha->r * B(l,j).r - alpha->i * B(l,j).i, 
-				z__1.i = alpha->r * B(l,j).i + alpha->i * B(l,j).r;
-			temp.r = z__1.r, temp.i = z__1.i;
-			i__3 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    i__4 = i + j * c_dim1;
-			    i__5 = i + j * c_dim1;
-			    i__6 = i + l * a_dim1;
-			    z__2.r = temp.r * A(i,l).r - temp.i * A(i,l).i, 
-				    z__2.i = temp.r * A(i,l).i + temp.i * A(i,l).r;
-			    z__1.r = C(i,j).r + z__2.r, z__1.i = C(i,j).i + 
-				    z__2.i;
-			    C(i,j).r = z__1.r, C(i,j).i = z__1.i;
-/* L70: */
-			}
-		    }
-/* L80: */
-		}
-/* L90: */
-	    }
-	} else if (conja) {
-
-/*           Form  C := alpha*conjg( A' )*B + beta*C. */
-
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    temp.r = 0., temp.i = 0.;
-		    i__3 = *k;
-		    for (l = 1; l <= *k; ++l) {
-			d_cnjg(&z__3, &A(l,i));
-			i__4 = l + j * b_dim1;
-			z__2.r = z__3.r * B(l,j).r - z__3.i * B(l,j).i, 
-				z__2.i = z__3.r * B(l,j).i + z__3.i * B(l,j)
-				.r;
-			z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
-			temp.r = z__1.r, temp.i = z__1.i;
-/* L100: */
-		    }
-		    if (beta->r == 0. && beta->i == 0.) {
-			i__3 = i + j * c_dim1;
-			z__1.r = alpha->r * temp.r - alpha->i * temp.i, 
-				z__1.i = alpha->r * temp.i + alpha->i * 
-				temp.r;
-			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
-		    } else {
-			i__3 = i + j * c_dim1;
-			z__2.r = alpha->r * temp.r - alpha->i * temp.i, 
-				z__2.i = alpha->r * temp.i + alpha->i * 
-				temp.r;
-			i__4 = i + j * c_dim1;
-			z__3.r = beta->r * C(i,j).r - beta->i * C(i,j).i, 
-				z__3.i = beta->r * C(i,j).i + beta->i * C(i,j).r;
-			z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
-			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
-		    }
-/* L110: */
-		}
-/* L120: */
-	    }
-	} else {
-
-/*           Form  C := alpha*A'*B + beta*C */
-
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    temp.r = 0., temp.i = 0.;
-		    i__3 = *k;
-		    for (l = 1; l <= *k; ++l) {
-			i__4 = l + i * a_dim1;
-			i__5 = l + j * b_dim1;
-			z__2.r = A(l,i).r * B(l,j).r - A(l,i).i * B(l,j)
-				.i, z__2.i = A(l,i).r * B(l,j).i + A(l,i)
-				.i * B(l,j).r;
-			z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
-			temp.r = z__1.r, temp.i = z__1.i;
-/* L130: */
-		    }
-		    if (beta->r == 0. && beta->i == 0.) {
-			i__3 = i + j * c_dim1;
-			z__1.r = alpha->r * temp.r - alpha->i * temp.i, 
-				z__1.i = alpha->r * temp.i + alpha->i * 
-				temp.r;
-			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
-		    } else {
-			i__3 = i + j * c_dim1;
-			z__2.r = alpha->r * temp.r - alpha->i * temp.i, 
-				z__2.i = alpha->r * temp.i + alpha->i * 
-				temp.r;
-			i__4 = i + j * c_dim1;
-			z__3.r = beta->r * C(i,j).r - beta->i * C(i,j).i, 
-				z__3.i = beta->r * C(i,j).i + beta->i * C(i,j).r;
-			z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
-			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
-		    }
-/* L140: */
-		}
-/* L150: */
-	    }
-	}
-    } else if (nota) {
-	if (conjb) {
-
-/*           Form  C := alpha*A*conjg( B' ) + beta*C. */
-
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		if (beta->r == 0. && beta->i == 0.) {
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			i__3 = i + j * c_dim1;
-			C(i,j).r = 0., C(i,j).i = 0.;
-/* L160: */
-		    }
-		} else if (beta->r != 1. || beta->i != 0.) {
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			i__3 = i + j * c_dim1;
-			i__4 = i + j * c_dim1;
-			z__1.r = beta->r * C(i,j).r - beta->i * C(i,j).i, 
-				z__1.i = beta->r * C(i,j).i + beta->i * C(i,j).r;
-			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
-/* L170: */
-		    }
-		}
-		i__2 = *k;
-		for (l = 1; l <= *k; ++l) {
-		    i__3 = j + l * b_dim1;
-		    if (B(j,l).r != 0. || B(j,l).i != 0.) {
-			d_cnjg(&z__2, &B(j,l));
-			z__1.r = alpha->r * z__2.r - alpha->i * z__2.i, 
-				z__1.i = alpha->r * z__2.i + alpha->i * 
-				z__2.r;
-			temp.r = z__1.r, temp.i = z__1.i;
-			i__3 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    i__4 = i + j * c_dim1;
-			    i__5 = i + j * c_dim1;
-			    i__6 = i + l * a_dim1;
-			    z__2.r = temp.r * A(i,l).r - temp.i * A(i,l).i, 
-				    z__2.i = temp.r * A(i,l).i + temp.i * A(i,l).r;
-			    z__1.r = C(i,j).r + z__2.r, z__1.i = C(i,j).i + 
-				    z__2.i;
-			    C(i,j).r = z__1.r, C(i,j).i = z__1.i;
-/* L180: */
-			}
-		    }
-/* L190: */
-		}
-/* L200: */
-	    }
-	} else {
-
-/*           Form  C := alpha*A*B'          + beta*C */
-
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		if (beta->r == 0. && beta->i == 0.) {
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			i__3 = i + j * c_dim1;
-			C(i,j).r = 0., C(i,j).i = 0.;
-/* L210: */
-		    }
-		} else if (beta->r != 1. || beta->i != 0.) {
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			i__3 = i + j * c_dim1;
-			i__4 = i + j * c_dim1;
-			z__1.r = beta->r * C(i,j).r - beta->i * C(i,j).i, 
-				z__1.i = beta->r * C(i,j).i + beta->i * C(i,j).r;
-			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
-/* L220: */
-		    }
-		}
-		i__2 = *k;
-		for (l = 1; l <= *k; ++l) {
-		    i__3 = j + l * b_dim1;
-		    if (B(j,l).r != 0. || B(j,l).i != 0.) {
-			i__3 = j + l * b_dim1;
-			z__1.r = alpha->r * B(j,l).r - alpha->i * B(j,l).i, 
-				z__1.i = alpha->r * B(j,l).i + alpha->i * B(j,l).r;
-			temp.r = z__1.r, temp.i = z__1.i;
-			i__3 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    i__4 = i + j * c_dim1;
-			    i__5 = i + j * c_dim1;
-			    i__6 = i + l * a_dim1;
-			    z__2.r = temp.r * A(i,l).r - temp.i * A(i,l).i, 
-				    z__2.i = temp.r * A(i,l).i + temp.i * A(i,l).r;
-			    z__1.r = C(i,j).r + z__2.r, z__1.i = C(i,j).i + 
-				    z__2.i;
-			    C(i,j).r = z__1.r, C(i,j).i = z__1.i;
-/* L230: */
-			}
-		    }
-/* L240: */
-		}
-/* L250: */
-	    }
-	}
-    } else if (conja) {
-	if (conjb) {
-
-/*           Form  C := alpha*conjg( A' )*conjg( B' ) + beta*C. */
-
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    temp.r = 0., temp.i = 0.;
-		    i__3 = *k;
-		    for (l = 1; l <= *k; ++l) {
-			d_cnjg(&z__3, &A(l,i));
-			d_cnjg(&z__4, &B(j,l));
-			z__2.r = z__3.r * z__4.r - z__3.i * z__4.i, z__2.i = 
-				z__3.r * z__4.i + z__3.i * z__4.r;
-			z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
-			temp.r = z__1.r, temp.i = z__1.i;
-/* L260: */
-		    }
-		    if (beta->r == 0. && beta->i == 0.) {
-			i__3 = i + j * c_dim1;
-			z__1.r = alpha->r * temp.r - alpha->i * temp.i, 
-				z__1.i = alpha->r * temp.i + alpha->i * 
-				temp.r;
-			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
-		    } else {
-			i__3 = i + j * c_dim1;
-			z__2.r = alpha->r * temp.r - alpha->i * temp.i, 
-				z__2.i = alpha->r * temp.i + alpha->i * 
-				temp.r;
-			i__4 = i + j * c_dim1;
-			z__3.r = beta->r * C(i,j).r - beta->i * C(i,j).i, 
-				z__3.i = beta->r * C(i,j).i + beta->i * C(i,j).r;
-			z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
-			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
-		    }
-/* L270: */
-		}
-/* L280: */
-	    }
-	} else {
-
-/*           Form  C := alpha*conjg( A' )*B' + beta*C */
-
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    temp.r = 0., temp.i = 0.;
-		    i__3 = *k;
-		    for (l = 1; l <= *k; ++l) {
-			d_cnjg(&z__3, &A(l,i));
-			i__4 = j + l * b_dim1;
-			z__2.r = z__3.r * B(j,l).r - z__3.i * B(j,l).i, 
-				z__2.i = z__3.r * B(j,l).i + z__3.i * B(j,l)
-				.r;
-			z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
-			temp.r = z__1.r, temp.i = z__1.i;
-/* L290: */
-		    }
-		    if (beta->r == 0. && beta->i == 0.) {
-			i__3 = i + j * c_dim1;
-			z__1.r = alpha->r * temp.r - alpha->i * temp.i, 
-				z__1.i = alpha->r * temp.i + alpha->i * 
-				temp.r;
-			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
-		    } else {
-			i__3 = i + j * c_dim1;
-			z__2.r = alpha->r * temp.r - alpha->i * temp.i, 
-				z__2.i = alpha->r * temp.i + alpha->i * 
-				temp.r;
-			i__4 = i + j * c_dim1;
-			z__3.r = beta->r * C(i,j).r - beta->i * C(i,j).i, 
-				z__3.i = beta->r * C(i,j).i + beta->i * C(i,j).r;
-			z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
-			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
-		    }
-/* L300: */
-		}
-/* L310: */
-	    }
-	}
-    } else {
-	if (conjb) {
-
-/*           Form  C := alpha*A'*conjg( B' ) + beta*C */
-
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    temp.r = 0., temp.i = 0.;
-		    i__3 = *k;
-		    for (l = 1; l <= *k; ++l) {
-			i__4 = l + i * a_dim1;
-			d_cnjg(&z__3, &B(j,l));
-			z__2.r = A(l,i).r * z__3.r - A(l,i).i * z__3.i, 
-				z__2.i = A(l,i).r * z__3.i + A(l,i).i * 
-				z__3.r;
-			z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
-			temp.r = z__1.r, temp.i = z__1.i;
-/* L320: */
-		    }
-		    if (beta->r == 0. && beta->i == 0.) {
-			i__3 = i + j * c_dim1;
-			z__1.r = alpha->r * temp.r - alpha->i * temp.i, 
-				z__1.i = alpha->r * temp.i + alpha->i * 
-				temp.r;
-			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
-		    } else {
-			i__3 = i + j * c_dim1;
-			z__2.r = alpha->r * temp.r - alpha->i * temp.i, 
-				z__2.i = alpha->r * temp.i + alpha->i * 
-				temp.r;
-			i__4 = i + j * c_dim1;
-			z__3.r = beta->r * C(i,j).r - beta->i * C(i,j).i, 
-				z__3.i = beta->r * C(i,j).i + beta->i * C(i,j).r;
-			z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
-			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
-		    }
-/* L330: */
-		}
-/* L340: */
-	    }
-	} else {
-
-/*           Form  C := alpha*A'*B' + beta*C */
-
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    temp.r = 0., temp.i = 0.;
-		    i__3 = *k;
-		    for (l = 1; l <= *k; ++l) {
-			i__4 = l + i * a_dim1;
-			i__5 = j + l * b_dim1;
-			z__2.r = A(l,i).r * B(j,l).r - A(l,i).i * B(j,l)
-				.i, z__2.i = A(l,i).r * B(j,l).i + A(l,i)
-				.i * B(j,l).r;
-			z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
-			temp.r = z__1.r, temp.i = z__1.i;
-/* L350: */
-		    }
-		    if (beta->r == 0. && beta->i == 0.) {
-			i__3 = i + j * c_dim1;
-			z__1.r = alpha->r * temp.r - alpha->i * temp.i, 
-				z__1.i = alpha->r * temp.i + alpha->i * 
-				temp.r;
-			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
-		    } else {
-			i__3 = i + j * c_dim1;
-			z__2.r = alpha->r * temp.r - alpha->i * temp.i, 
-				z__2.i = alpha->r * temp.i + alpha->i * 
-				temp.r;
-			i__4 = i + j * c_dim1;
-			z__3.r = beta->r * C(i,j).r - beta->i * C(i,j).i, 
-				z__3.i = beta->r * C(i,j).i + beta->i * C(i,j).r;
-			z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
-			C(i,j).r = z__1.r, C(i,j).i = z__1.i;
-		    }
-/* L360: */
-		}
-/* L370: */
-	    }
-	}
-    }
-
-    return 0;
-
-/*     End of ZGEMM . */
-
-} /* zgemm_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zgemv.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zgemv.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zgemv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zgemv.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,400 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int zgemv_(char *trans, integer *m, integer *n, 
-	doublecomplex *alpha, doublecomplex *a, integer *lda, doublecomplex *
-	x, integer *incx, doublecomplex *beta, doublecomplex *y, integer *
-	incy)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
-    doublecomplex z__1, z__2, z__3;
-
-    /* Builtin functions */
-    void d_cnjg(doublecomplex *, doublecomplex *);
-
-    /* Local variables */
-    static integer info;
-    static doublecomplex temp;
-    static integer lenx, leny, i, j;
-    extern logical lsame_(char *, char *);
-    static integer ix, iy, jx, jy, kx, ky;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-    static logical noconj;
-
-
-/*  Purpose   
-    =======   
-
-    ZGEMV  performs one of the matrix-vector operations   
-
-       y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   or   
-
-       y := alpha*conjg( A' )*x + beta*y,   
-
-    where alpha and beta are scalars, x and y are vectors and A is an   
-    m by n matrix.   
-
-    Parameters   
-    ==========   
-
-    TRANS  - CHARACTER*1.   
-             On entry, TRANS specifies the operation to be performed as   
-             follows:   
-
-                TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.   
-
-                TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.   
-
-                TRANS = 'C' or 'c'   y := alpha*conjg( A' )*x + beta*y.   
-
-             Unchanged on exit.   
-
-    M      - INTEGER.   
-             On entry, M specifies the number of rows of the matrix A.   
-             M must be at least zero.   
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry, N specifies the number of columns of the matrix A. 
-  
-             N must be at least zero.   
-             Unchanged on exit.   
-
-    ALPHA  - COMPLEX*16      .   
-             On entry, ALPHA specifies the scalar alpha.   
-             Unchanged on exit.   
-
-    A      - COMPLEX*16       array of DIMENSION ( LDA, n ).   
-             Before entry, the leading m by n part of the array A must   
-             contain the matrix of coefficients.   
-             Unchanged on exit.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program. LDA must be at least   
-             max( 1, m ).   
-             Unchanged on exit.   
-
-    X      - COMPLEX*16       array of DIMENSION at least   
-             ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'   
-             and at least   
-             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.   
-             Before entry, the incremented array X must contain the   
-             vector x.   
-             Unchanged on exit.   
-
-    INCX   - INTEGER.   
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-             Unchanged on exit.   
-
-    BETA   - COMPLEX*16      .   
-             On entry, BETA specifies the scalar beta. When BETA is   
-             supplied as zero then Y need not be set on input.   
-             Unchanged on exit.   
-
-    Y      - COMPLEX*16       array of DIMENSION at least   
-             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'   
-             and at least   
-             ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.   
-             Before entry with BETA non-zero, the incremented array Y   
-             must contain the vector y. On exit, Y is overwritten by the 
-  
-             updated vector y.   
-
-    INCY   - INTEGER.   
-             On entry, INCY specifies the increment for the elements of   
-             Y. INCY must not be zero.   
-             Unchanged on exit.   
-
-
-    Level 2 Blas routine.   
-
-    -- Written on 22-October-1986.   
-       Jack Dongarra, Argonne National Lab.   
-       Jeremy Du Croz, Nag Central Office.   
-       Sven Hammarling, Nag Central Office.   
-       Richard Hanson, Sandia National Labs.   
-
-
-
-       Test the input parameters.   
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-#define Y(I) y[(I)-1]
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-
-    info = 0;
-    if (! lsame_(trans, "N") && ! lsame_(trans, "T") && ! 
-	    lsame_(trans, "C")) {
-	info = 1;
-    } else if (*m < 0) {
-	info = 2;
-    } else if (*n < 0) {
-	info = 3;
-    } else if (*lda < max(1,*m)) {
-	info = 6;
-    } else if (*incx == 0) {
-	info = 8;
-    } else if (*incy == 0) {
-	info = 11;
-    }
-    if (info != 0) {
-	xerbla_("ZGEMV ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*m == 0 || *n == 0 || alpha->r == 0. && alpha->i == 0. && (beta->r == 
-	    1. && beta->i == 0.)) {
-	return 0;
-    }
-
-    noconj = lsame_(trans, "T");
-
-/*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set 
-  
-       up the start points in  X  and  Y. */
-
-    if (lsame_(trans, "N")) {
-	lenx = *n;
-	leny = *m;
-    } else {
-	lenx = *m;
-	leny = *n;
-    }
-    if (*incx > 0) {
-	kx = 1;
-    } else {
-	kx = 1 - (lenx - 1) * *incx;
-    }
-    if (*incy > 0) {
-	ky = 1;
-    } else {
-	ky = 1 - (leny - 1) * *incy;
-    }
-
-/*     Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through A.   
-
-       First form  y := beta*y. */
-
-    if (beta->r != 1. || beta->i != 0.) {
-	if (*incy == 1) {
-	    if (beta->r == 0. && beta->i == 0.) {
-		i__1 = leny;
-		for (i = 1; i <= leny; ++i) {
-		    i__2 = i;
-		    Y(i).r = 0., Y(i).i = 0.;
-/* L10: */
-		}
-	    } else {
-		i__1 = leny;
-		for (i = 1; i <= leny; ++i) {
-		    i__2 = i;
-		    i__3 = i;
-		    z__1.r = beta->r * Y(i).r - beta->i * Y(i).i, 
-			    z__1.i = beta->r * Y(i).i + beta->i * Y(i)
-			    .r;
-		    Y(i).r = z__1.r, Y(i).i = z__1.i;
-/* L20: */
-		}
-	    }
-	} else {
-	    iy = ky;
-	    if (beta->r == 0. && beta->i == 0.) {
-		i__1 = leny;
-		for (i = 1; i <= leny; ++i) {
-		    i__2 = iy;
-		    Y(iy).r = 0., Y(iy).i = 0.;
-		    iy += *incy;
-/* L30: */
-		}
-	    } else {
-		i__1 = leny;
-		for (i = 1; i <= leny; ++i) {
-		    i__2 = iy;
-		    i__3 = iy;
-		    z__1.r = beta->r * Y(iy).r - beta->i * Y(iy).i, 
-			    z__1.i = beta->r * Y(iy).i + beta->i * Y(iy)
-			    .r;
-		    Y(iy).r = z__1.r, Y(iy).i = z__1.i;
-		    iy += *incy;
-/* L40: */
-		}
-	    }
-	}
-    }
-    if (alpha->r == 0. && alpha->i == 0.) {
-	return 0;
-    }
-    if (lsame_(trans, "N")) {
-
-/*        Form  y := alpha*A*x + y. */
-
-	jx = kx;
-	if (*incy == 1) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = jx;
-		if (X(jx).r != 0. || X(jx).i != 0.) {
-		    i__2 = jx;
-		    z__1.r = alpha->r * X(jx).r - alpha->i * X(jx).i, 
-			    z__1.i = alpha->r * X(jx).i + alpha->i * X(jx)
-			    .r;
-		    temp.r = z__1.r, temp.i = z__1.i;
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			i__3 = i;
-			i__4 = i;
-			i__5 = i + j * a_dim1;
-			z__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, 
-				z__2.i = temp.r * A(i,j).i + temp.i * A(i,j)
-				.r;
-			z__1.r = Y(i).r + z__2.r, z__1.i = Y(i).i + 
-				z__2.i;
-			Y(i).r = z__1.r, Y(i).i = z__1.i;
-/* L50: */
-		    }
-		}
-		jx += *incx;
-/* L60: */
-	    }
-	} else {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = jx;
-		if (X(jx).r != 0. || X(jx).i != 0.) {
-		    i__2 = jx;
-		    z__1.r = alpha->r * X(jx).r - alpha->i * X(jx).i, 
-			    z__1.i = alpha->r * X(jx).i + alpha->i * X(jx)
-			    .r;
-		    temp.r = z__1.r, temp.i = z__1.i;
-		    iy = ky;
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			i__3 = iy;
-			i__4 = iy;
-			i__5 = i + j * a_dim1;
-			z__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, 
-				z__2.i = temp.r * A(i,j).i + temp.i * A(i,j)
-				.r;
-			z__1.r = Y(iy).r + z__2.r, z__1.i = Y(iy).i + 
-				z__2.i;
-			Y(iy).r = z__1.r, Y(iy).i = z__1.i;
-			iy += *incy;
-/* L70: */
-		    }
-		}
-		jx += *incx;
-/* L80: */
-	    }
-	}
-    } else {
-
-/*        Form  y := alpha*A'*x + y  or  y := alpha*conjg( A' )*x + y.
- */
-
-	jy = ky;
-	if (*incx == 1) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		temp.r = 0., temp.i = 0.;
-		if (noconj) {
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			i__3 = i + j * a_dim1;
-			i__4 = i;
-			z__2.r = A(i,j).r * X(i).r - A(i,j).i * X(i)
-				.i, z__2.i = A(i,j).r * X(i).i + A(i,j)
-				.i * X(i).r;
-			z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
-			temp.r = z__1.r, temp.i = z__1.i;
-/* L90: */
-		    }
-		} else {
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			d_cnjg(&z__3, &A(i,j));
-			i__3 = i;
-			z__2.r = z__3.r * X(i).r - z__3.i * X(i).i, 
-				z__2.i = z__3.r * X(i).i + z__3.i * X(i)
-				.r;
-			z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
-			temp.r = z__1.r, temp.i = z__1.i;
-/* L100: */
-		    }
-		}
-		i__2 = jy;
-		i__3 = jy;
-		z__2.r = alpha->r * temp.r - alpha->i * temp.i, z__2.i = 
-			alpha->r * temp.i + alpha->i * temp.r;
-		z__1.r = Y(jy).r + z__2.r, z__1.i = Y(jy).i + z__2.i;
-		Y(jy).r = z__1.r, Y(jy).i = z__1.i;
-		jy += *incy;
-/* L110: */
-	    }
-	} else {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		temp.r = 0., temp.i = 0.;
-		ix = kx;
-		if (noconj) {
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			i__3 = i + j * a_dim1;
-			i__4 = ix;
-			z__2.r = A(i,j).r * X(ix).r - A(i,j).i * X(ix)
-				.i, z__2.i = A(i,j).r * X(ix).i + A(i,j)
-				.i * X(ix).r;
-			z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
-			temp.r = z__1.r, temp.i = z__1.i;
-			ix += *incx;
-/* L120: */
-		    }
-		} else {
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			d_cnjg(&z__3, &A(i,j));
-			i__3 = ix;
-			z__2.r = z__3.r * X(ix).r - z__3.i * X(ix).i, 
-				z__2.i = z__3.r * X(ix).i + z__3.i * X(ix)
-				.r;
-			z__1.r = temp.r + z__2.r, z__1.i = temp.i + z__2.i;
-			temp.r = z__1.r, temp.i = z__1.i;
-			ix += *incx;
-/* L130: */
-		    }
-		}
-		i__2 = jy;
-		i__3 = jy;
-		z__2.r = alpha->r * temp.r - alpha->i * temp.i, z__2.i = 
-			alpha->r * temp.i + alpha->i * temp.r;
-		z__1.r = Y(jy).r + z__2.r, z__1.i = Y(jy).i + z__2.i;
-		Y(jy).r = z__1.r, Y(jy).i = z__1.i;
-		jy += *incy;
-/* L140: */
-	    }
-	}
-    }
-
-    return 0;
-
-/*     End of ZGEMV . */
-
-} /* zgemv_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zgerc.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zgerc.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zgerc.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zgerc.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,206 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int zgerc_(integer *m, integer *n, doublecomplex *alpha, 
-	doublecomplex *x, integer *incx, doublecomplex *y, integer *incy, 
-	doublecomplex *a, integer *lda)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
-    doublecomplex z__1, z__2;
-
-    /* Builtin functions */
-    void d_cnjg(doublecomplex *, doublecomplex *);
-
-    /* Local variables */
-    static integer info;
-    static doublecomplex temp;
-    static integer i, j, ix, jy, kx;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-
-
-/*  Purpose   
-    =======   
-
-    ZGERC  performs the rank 1 operation   
-
-       A := alpha*x*conjg( y' ) + A,   
-
-    where alpha is a scalar, x is an m element vector, y is an n element 
-  
-    vector and A is an m by n matrix.   
-
-    Parameters   
-    ==========   
-
-    M      - INTEGER.   
-             On entry, M specifies the number of rows of the matrix A.   
-             M must be at least zero.   
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry, N specifies the number of columns of the matrix A. 
-  
-             N must be at least zero.   
-             Unchanged on exit.   
-
-    ALPHA  - COMPLEX*16      .   
-             On entry, ALPHA specifies the scalar alpha.   
-             Unchanged on exit.   
-
-    X      - COMPLEX*16       array of dimension at least   
-             ( 1 + ( m - 1 )*abs( INCX ) ).   
-             Before entry, the incremented array X must contain the m   
-             element vector x.   
-             Unchanged on exit.   
-
-    INCX   - INTEGER.   
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-             Unchanged on exit.   
-
-    Y      - COMPLEX*16       array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCY ) ).   
-             Before entry, the incremented array Y must contain the n   
-             element vector y.   
-             Unchanged on exit.   
-
-    INCY   - INTEGER.   
-             On entry, INCY specifies the increment for the elements of   
-             Y. INCY must not be zero.   
-             Unchanged on exit.   
-
-    A      - COMPLEX*16       array of DIMENSION ( LDA, n ).   
-             Before entry, the leading m by n part of the array A must   
-             contain the matrix of coefficients. On exit, A is   
-             overwritten by the updated matrix.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program. LDA must be at least   
-             max( 1, m ).   
-             Unchanged on exit.   
-
-
-    Level 2 Blas routine.   
-
-    -- Written on 22-October-1986.   
-       Jack Dongarra, Argonne National Lab.   
-       Jeremy Du Croz, Nag Central Office.   
-       Sven Hammarling, Nag Central Office.   
-       Richard Hanson, Sandia National Labs.   
-
-
-
-       Test the input parameters.   
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-#define Y(I) y[(I)-1]
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-
-    info = 0;
-    if (*m < 0) {
-	info = 1;
-    } else if (*n < 0) {
-	info = 2;
-    } else if (*incx == 0) {
-	info = 5;
-    } else if (*incy == 0) {
-	info = 7;
-    } else if (*lda < max(1,*m)) {
-	info = 9;
-    }
-    if (info != 0) {
-	xerbla_("ZGERC ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*m == 0 || *n == 0 || alpha->r == 0. && alpha->i == 0.) {
-	return 0;
-    }
-
-/*     Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through A. */
-
-    if (*incy > 0) {
-	jy = 1;
-    } else {
-	jy = 1 - (*n - 1) * *incy;
-    }
-    if (*incx == 1) {
-	i__1 = *n;
-	for (j = 1; j <= *n; ++j) {
-	    i__2 = jy;
-	    if (Y(jy).r != 0. || Y(jy).i != 0.) {
-		d_cnjg(&z__2, &Y(jy));
-		z__1.r = alpha->r * z__2.r - alpha->i * z__2.i, z__1.i = 
-			alpha->r * z__2.i + alpha->i * z__2.r;
-		temp.r = z__1.r, temp.i = z__1.i;
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    i__3 = i + j * a_dim1;
-		    i__4 = i + j * a_dim1;
-		    i__5 = i;
-		    z__2.r = X(i).r * temp.r - X(i).i * temp.i, z__2.i =
-			     X(i).r * temp.i + X(i).i * temp.r;
-		    z__1.r = A(i,j).r + z__2.r, z__1.i = A(i,j).i + z__2.i;
-		    A(i,j).r = z__1.r, A(i,j).i = z__1.i;
-/* L10: */
-		}
-	    }
-	    jy += *incy;
-/* L20: */
-	}
-    } else {
-	if (*incx > 0) {
-	    kx = 1;
-	} else {
-	    kx = 1 - (*m - 1) * *incx;
-	}
-	i__1 = *n;
-	for (j = 1; j <= *n; ++j) {
-	    i__2 = jy;
-	    if (Y(jy).r != 0. || Y(jy).i != 0.) {
-		d_cnjg(&z__2, &Y(jy));
-		z__1.r = alpha->r * z__2.r - alpha->i * z__2.i, z__1.i = 
-			alpha->r * z__2.i + alpha->i * z__2.r;
-		temp.r = z__1.r, temp.i = z__1.i;
-		ix = kx;
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    i__3 = i + j * a_dim1;
-		    i__4 = i + j * a_dim1;
-		    i__5 = ix;
-		    z__2.r = X(ix).r * temp.r - X(ix).i * temp.i, z__2.i =
-			     X(ix).r * temp.i + X(ix).i * temp.r;
-		    z__1.r = A(i,j).r + z__2.r, z__1.i = A(i,j).i + z__2.i;
-		    A(i,j).r = z__1.r, A(i,j).i = z__1.i;
-		    ix += *incx;
-/* L30: */
-		}
-	    }
-	    jy += *incy;
-/* L40: */
-	}
-    }
-
-    return 0;
-
-/*     End of ZGERC . */
-
-} /* zgerc_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zgeru.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zgeru.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zgeru.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zgeru.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,203 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int zgeru_(integer *m, integer *n, doublecomplex *alpha, 
-	doublecomplex *x, integer *incx, doublecomplex *y, integer *incy, 
-	doublecomplex *a, integer *lda)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
-    doublecomplex z__1, z__2;
-
-    /* Local variables */
-    static integer info;
-    static doublecomplex temp;
-    static integer i, j, ix, jy, kx;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-
-
-/*  Purpose   
-    =======   
-
-    ZGERU  performs the rank 1 operation   
-
-       A := alpha*x*y' + A,   
-
-    where alpha is a scalar, x is an m element vector, y is an n element 
-  
-    vector and A is an m by n matrix.   
-
-    Parameters   
-    ==========   
-
-    M      - INTEGER.   
-             On entry, M specifies the number of rows of the matrix A.   
-             M must be at least zero.   
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry, N specifies the number of columns of the matrix A. 
-  
-             N must be at least zero.   
-             Unchanged on exit.   
-
-    ALPHA  - COMPLEX*16      .   
-             On entry, ALPHA specifies the scalar alpha.   
-             Unchanged on exit.   
-
-    X      - COMPLEX*16       array of dimension at least   
-             ( 1 + ( m - 1 )*abs( INCX ) ).   
-             Before entry, the incremented array X must contain the m   
-             element vector x.   
-             Unchanged on exit.   
-
-    INCX   - INTEGER.   
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-             Unchanged on exit.   
-
-    Y      - COMPLEX*16       array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCY ) ).   
-             Before entry, the incremented array Y must contain the n   
-             element vector y.   
-             Unchanged on exit.   
-
-    INCY   - INTEGER.   
-             On entry, INCY specifies the increment for the elements of   
-             Y. INCY must not be zero.   
-             Unchanged on exit.   
-
-    A      - COMPLEX*16       array of DIMENSION ( LDA, n ).   
-             Before entry, the leading m by n part of the array A must   
-             contain the matrix of coefficients. On exit, A is   
-             overwritten by the updated matrix.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program. LDA must be at least   
-             max( 1, m ).   
-             Unchanged on exit.   
-
-
-    Level 2 Blas routine.   
-
-    -- Written on 22-October-1986.   
-       Jack Dongarra, Argonne National Lab.   
-       Jeremy Du Croz, Nag Central Office.   
-       Sven Hammarling, Nag Central Office.   
-       Richard Hanson, Sandia National Labs.   
-
-
-
-       Test the input parameters.   
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-#define Y(I) y[(I)-1]
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-
-    info = 0;
-    if (*m < 0) {
-	info = 1;
-    } else if (*n < 0) {
-	info = 2;
-    } else if (*incx == 0) {
-	info = 5;
-    } else if (*incy == 0) {
-	info = 7;
-    } else if (*lda < max(1,*m)) {
-	info = 9;
-    }
-    if (info != 0) {
-	xerbla_("ZGERU ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*m == 0 || *n == 0 || alpha->r == 0. && alpha->i == 0.) {
-	return 0;
-    }
-
-/*     Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through A. */
-
-    if (*incy > 0) {
-	jy = 1;
-    } else {
-	jy = 1 - (*n - 1) * *incy;
-    }
-    if (*incx == 1) {
-	i__1 = *n;
-	for (j = 1; j <= *n; ++j) {
-	    i__2 = jy;
-	    if (Y(jy).r != 0. || Y(jy).i != 0.) {
-		i__2 = jy;
-		z__1.r = alpha->r * Y(jy).r - alpha->i * Y(jy).i, z__1.i =
-			 alpha->r * Y(jy).i + alpha->i * Y(jy).r;
-		temp.r = z__1.r, temp.i = z__1.i;
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    i__3 = i + j * a_dim1;
-		    i__4 = i + j * a_dim1;
-		    i__5 = i;
-		    z__2.r = X(i).r * temp.r - X(i).i * temp.i, z__2.i =
-			     X(i).r * temp.i + X(i).i * temp.r;
-		    z__1.r = A(i,j).r + z__2.r, z__1.i = A(i,j).i + z__2.i;
-		    A(i,j).r = z__1.r, A(i,j).i = z__1.i;
-/* L10: */
-		}
-	    }
-	    jy += *incy;
-/* L20: */
-	}
-    } else {
-	if (*incx > 0) {
-	    kx = 1;
-	} else {
-	    kx = 1 - (*m - 1) * *incx;
-	}
-	i__1 = *n;
-	for (j = 1; j <= *n; ++j) {
-	    i__2 = jy;
-	    if (Y(jy).r != 0. || Y(jy).i != 0.) {
-		i__2 = jy;
-		z__1.r = alpha->r * Y(jy).r - alpha->i * Y(jy).i, z__1.i =
-			 alpha->r * Y(jy).i + alpha->i * Y(jy).r;
-		temp.r = z__1.r, temp.i = z__1.i;
-		ix = kx;
-		i__2 = *m;
-		for (i = 1; i <= *m; ++i) {
-		    i__3 = i + j * a_dim1;
-		    i__4 = i + j * a_dim1;
-		    i__5 = ix;
-		    z__2.r = X(ix).r * temp.r - X(ix).i * temp.i, z__2.i =
-			     X(ix).r * temp.i + X(ix).i * temp.r;
-		    z__1.r = A(i,j).r + z__2.r, z__1.i = A(i,j).i + z__2.i;
-		    A(i,j).r = z__1.r, A(i,j).i = z__1.i;
-		    ix += *incx;
-/* L30: */
-		}
-	    }
-	    jy += *incy;
-/* L40: */
-	}
-    }
-
-    return 0;
-
-/*     End of ZGERU . */
-
-} /* zgeru_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zhemv.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zhemv.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zhemv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zhemv.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,421 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int zhemv_(char *uplo, integer *n, doublecomplex *alpha, 
-	doublecomplex *a, integer *lda, doublecomplex *x, integer *incx, 
-	doublecomplex *beta, doublecomplex *y, integer *incy)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
-    doublereal d__1;
-    doublecomplex z__1, z__2, z__3, z__4;
-
-    /* Builtin functions */
-    void d_cnjg(doublecomplex *, doublecomplex *);
-
-    /* Local variables */
-    static integer info;
-    static doublecomplex temp1, temp2;
-    static integer i, j;
-    extern logical lsame_(char *, char *);
-    static integer ix, iy, jx, jy, kx, ky;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-
-
-/*  Purpose   
-    =======   
-
-    ZHEMV  performs the matrix-vector  operation   
-
-       y := alpha*A*x + beta*y,   
-
-    where alpha and beta are scalars, x and y are n element vectors and   
-    A is an n by n hermitian matrix.   
-
-    Parameters   
-    ==========   
-
-    UPLO   - CHARACTER*1.   
-             On entry, UPLO specifies whether the upper or lower   
-             triangular part of the array A is to be referenced as   
-             follows:   
-
-                UPLO = 'U' or 'u'   Only the upper triangular part of A   
-                                    is to be referenced.   
-
-                UPLO = 'L' or 'l'   Only the lower triangular part of A   
-                                    is to be referenced.   
-
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry, N specifies the order of the matrix A.   
-             N must be at least zero.   
-             Unchanged on exit.   
-
-    ALPHA  - COMPLEX*16      .   
-             On entry, ALPHA specifies the scalar alpha.   
-             Unchanged on exit.   
-
-    A      - COMPLEX*16       array of DIMENSION ( LDA, n ).   
-             Before entry with  UPLO = 'U' or 'u', the leading n by n   
-             upper triangular part of the array A must contain the upper 
-  
-             triangular part of the hermitian matrix and the strictly   
-             lower triangular part of A is not referenced.   
-             Before entry with UPLO = 'L' or 'l', the leading n by n   
-             lower triangular part of the array A must contain the lower 
-  
-             triangular part of the hermitian matrix and the strictly   
-             upper triangular part of A is not referenced.   
-             Note that the imaginary parts of the diagonal elements need 
-  
-             not be set and are assumed to be zero.   
-             Unchanged on exit.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program. LDA must be at least   
-             max( 1, n ).   
-             Unchanged on exit.   
-
-    X      - COMPLEX*16       array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCX ) ).   
-             Before entry, the incremented array X must contain the n   
-             element vector x.   
-             Unchanged on exit.   
-
-    INCX   - INTEGER.   
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-             Unchanged on exit.   
-
-    BETA   - COMPLEX*16      .   
-             On entry, BETA specifies the scalar beta. When BETA is   
-             supplied as zero then Y need not be set on input.   
-             Unchanged on exit.   
-
-    Y      - COMPLEX*16       array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCY ) ).   
-             Before entry, the incremented array Y must contain the n   
-             element vector y. On exit, Y is overwritten by the updated   
-             vector y.   
-
-    INCY   - INTEGER.   
-             On entry, INCY specifies the increment for the elements of   
-             Y. INCY must not be zero.   
-             Unchanged on exit.   
-
-
-    Level 2 Blas routine.   
-
-    -- Written on 22-October-1986.   
-       Jack Dongarra, Argonne National Lab.   
-       Jeremy Du Croz, Nag Central Office.   
-       Sven Hammarling, Nag Central Office.   
-       Richard Hanson, Sandia National Labs.   
-
-
-
-       Test the input parameters.   
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-#define Y(I) y[(I)-1]
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-
-    info = 0;
-    if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
-	info = 1;
-    } else if (*n < 0) {
-	info = 2;
-    } else if (*lda < max(1,*n)) {
-	info = 5;
-    } else if (*incx == 0) {
-	info = 7;
-    } else if (*incy == 0) {
-	info = 10;
-    }
-    if (info != 0) {
-	xerbla_("ZHEMV ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*n == 0 || alpha->r == 0. && alpha->i == 0. && (beta->r == 1. && 
-	    beta->i == 0.)) {
-	return 0;
-    }
-
-/*     Set up the start points in  X  and  Y. */
-
-    if (*incx > 0) {
-	kx = 1;
-    } else {
-	kx = 1 - (*n - 1) * *incx;
-    }
-    if (*incy > 0) {
-	ky = 1;
-    } else {
-	ky = 1 - (*n - 1) * *incy;
-    }
-
-/*     Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through the triangular part   
-       of A.   
-
-       First form  y := beta*y. */
-
-    if (beta->r != 1. || beta->i != 0.) {
-	if (*incy == 1) {
-	    if (beta->r == 0. && beta->i == 0.) {
-		i__1 = *n;
-		for (i = 1; i <= *n; ++i) {
-		    i__2 = i;
-		    Y(i).r = 0., Y(i).i = 0.;
-/* L10: */
-		}
-	    } else {
-		i__1 = *n;
-		for (i = 1; i <= *n; ++i) {
-		    i__2 = i;
-		    i__3 = i;
-		    z__1.r = beta->r * Y(i).r - beta->i * Y(i).i, 
-			    z__1.i = beta->r * Y(i).i + beta->i * Y(i)
-			    .r;
-		    Y(i).r = z__1.r, Y(i).i = z__1.i;
-/* L20: */
-		}
-	    }
-	} else {
-	    iy = ky;
-	    if (beta->r == 0. && beta->i == 0.) {
-		i__1 = *n;
-		for (i = 1; i <= *n; ++i) {
-		    i__2 = iy;
-		    Y(iy).r = 0., Y(iy).i = 0.;
-		    iy += *incy;
-/* L30: */
-		}
-	    } else {
-		i__1 = *n;
-		for (i = 1; i <= *n; ++i) {
-		    i__2 = iy;
-		    i__3 = iy;
-		    z__1.r = beta->r * Y(iy).r - beta->i * Y(iy).i, 
-			    z__1.i = beta->r * Y(iy).i + beta->i * Y(iy)
-			    .r;
-		    Y(iy).r = z__1.r, Y(iy).i = z__1.i;
-		    iy += *incy;
-/* L40: */
-		}
-	    }
-	}
-    }
-    if (alpha->r == 0. && alpha->i == 0.) {
-	return 0;
-    }
-    if (lsame_(uplo, "U")) {
-
-/*        Form  y  when A is stored in upper triangle. */
-
-	if (*incx == 1 && *incy == 1) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = j;
-		z__1.r = alpha->r * X(j).r - alpha->i * X(j).i, z__1.i =
-			 alpha->r * X(j).i + alpha->i * X(j).r;
-		temp1.r = z__1.r, temp1.i = z__1.i;
-		temp2.r = 0., temp2.i = 0.;
-		i__2 = j - 1;
-		for (i = 1; i <= j-1; ++i) {
-		    i__3 = i;
-		    i__4 = i;
-		    i__5 = i + j * a_dim1;
-		    z__2.r = temp1.r * A(i,j).r - temp1.i * A(i,j).i, 
-			    z__2.i = temp1.r * A(i,j).i + temp1.i * A(i,j)
-			    .r;
-		    z__1.r = Y(i).r + z__2.r, z__1.i = Y(i).i + z__2.i;
-		    Y(i).r = z__1.r, Y(i).i = z__1.i;
-		    d_cnjg(&z__3, &A(i,j));
-		    i__3 = i;
-		    z__2.r = z__3.r * X(i).r - z__3.i * X(i).i, z__2.i =
-			     z__3.r * X(i).i + z__3.i * X(i).r;
-		    z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
-		    temp2.r = z__1.r, temp2.i = z__1.i;
-/* L50: */
-		}
-		i__2 = j;
-		i__3 = j;
-		i__4 = j + j * a_dim1;
-		d__1 = A(j,j).r;
-		z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
-		z__2.r = Y(j).r + z__3.r, z__2.i = Y(j).i + z__3.i;
-		z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = 
-			alpha->r * temp2.i + alpha->i * temp2.r;
-		z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
-		Y(j).r = z__1.r, Y(j).i = z__1.i;
-/* L60: */
-	    }
-	} else {
-	    jx = kx;
-	    jy = ky;
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = jx;
-		z__1.r = alpha->r * X(jx).r - alpha->i * X(jx).i, z__1.i =
-			 alpha->r * X(jx).i + alpha->i * X(jx).r;
-		temp1.r = z__1.r, temp1.i = z__1.i;
-		temp2.r = 0., temp2.i = 0.;
-		ix = kx;
-		iy = ky;
-		i__2 = j - 1;
-		for (i = 1; i <= j-1; ++i) {
-		    i__3 = iy;
-		    i__4 = iy;
-		    i__5 = i + j * a_dim1;
-		    z__2.r = temp1.r * A(i,j).r - temp1.i * A(i,j).i, 
-			    z__2.i = temp1.r * A(i,j).i + temp1.i * A(i,j)
-			    .r;
-		    z__1.r = Y(iy).r + z__2.r, z__1.i = Y(iy).i + z__2.i;
-		    Y(iy).r = z__1.r, Y(iy).i = z__1.i;
-		    d_cnjg(&z__3, &A(i,j));
-		    i__3 = ix;
-		    z__2.r = z__3.r * X(ix).r - z__3.i * X(ix).i, z__2.i =
-			     z__3.r * X(ix).i + z__3.i * X(ix).r;
-		    z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
-		    temp2.r = z__1.r, temp2.i = z__1.i;
-		    ix += *incx;
-		    iy += *incy;
-/* L70: */
-		}
-		i__2 = jy;
-		i__3 = jy;
-		i__4 = j + j * a_dim1;
-		d__1 = A(j,j).r;
-		z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
-		z__2.r = Y(jy).r + z__3.r, z__2.i = Y(jy).i + z__3.i;
-		z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = 
-			alpha->r * temp2.i + alpha->i * temp2.r;
-		z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
-		Y(jy).r = z__1.r, Y(jy).i = z__1.i;
-		jx += *incx;
-		jy += *incy;
-/* L80: */
-	    }
-	}
-    } else {
-
-/*        Form  y  when A is stored in lower triangle. */
-
-	if (*incx == 1 && *incy == 1) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = j;
-		z__1.r = alpha->r * X(j).r - alpha->i * X(j).i, z__1.i =
-			 alpha->r * X(j).i + alpha->i * X(j).r;
-		temp1.r = z__1.r, temp1.i = z__1.i;
-		temp2.r = 0., temp2.i = 0.;
-		i__2 = j;
-		i__3 = j;
-		i__4 = j + j * a_dim1;
-		d__1 = A(j,j).r;
-		z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
-		z__1.r = Y(j).r + z__2.r, z__1.i = Y(j).i + z__2.i;
-		Y(j).r = z__1.r, Y(j).i = z__1.i;
-		i__2 = *n;
-		for (i = j + 1; i <= *n; ++i) {
-		    i__3 = i;
-		    i__4 = i;
-		    i__5 = i + j * a_dim1;
-		    z__2.r = temp1.r * A(i,j).r - temp1.i * A(i,j).i, 
-			    z__2.i = temp1.r * A(i,j).i + temp1.i * A(i,j)
-			    .r;
-		    z__1.r = Y(i).r + z__2.r, z__1.i = Y(i).i + z__2.i;
-		    Y(i).r = z__1.r, Y(i).i = z__1.i;
-		    d_cnjg(&z__3, &A(i,j));
-		    i__3 = i;
-		    z__2.r = z__3.r * X(i).r - z__3.i * X(i).i, z__2.i =
-			     z__3.r * X(i).i + z__3.i * X(i).r;
-		    z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
-		    temp2.r = z__1.r, temp2.i = z__1.i;
-/* L90: */
-		}
-		i__2 = j;
-		i__3 = j;
-		z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = 
-			alpha->r * temp2.i + alpha->i * temp2.r;
-		z__1.r = Y(j).r + z__2.r, z__1.i = Y(j).i + z__2.i;
-		Y(j).r = z__1.r, Y(j).i = z__1.i;
-/* L100: */
-	    }
-	} else {
-	    jx = kx;
-	    jy = ky;
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = jx;
-		z__1.r = alpha->r * X(jx).r - alpha->i * X(jx).i, z__1.i =
-			 alpha->r * X(jx).i + alpha->i * X(jx).r;
-		temp1.r = z__1.r, temp1.i = z__1.i;
-		temp2.r = 0., temp2.i = 0.;
-		i__2 = jy;
-		i__3 = jy;
-		i__4 = j + j * a_dim1;
-		d__1 = A(j,j).r;
-		z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
-		z__1.r = Y(jy).r + z__2.r, z__1.i = Y(jy).i + z__2.i;
-		Y(jy).r = z__1.r, Y(jy).i = z__1.i;
-		ix = jx;
-		iy = jy;
-		i__2 = *n;
-		for (i = j + 1; i <= *n; ++i) {
-		    ix += *incx;
-		    iy += *incy;
-		    i__3 = iy;
-		    i__4 = iy;
-		    i__5 = i + j * a_dim1;
-		    z__2.r = temp1.r * A(i,j).r - temp1.i * A(i,j).i, 
-			    z__2.i = temp1.r * A(i,j).i + temp1.i * A(i,j)
-			    .r;
-		    z__1.r = Y(iy).r + z__2.r, z__1.i = Y(iy).i + z__2.i;
-		    Y(iy).r = z__1.r, Y(iy).i = z__1.i;
-		    d_cnjg(&z__3, &A(i,j));
-		    i__3 = ix;
-		    z__2.r = z__3.r * X(ix).r - z__3.i * X(ix).i, z__2.i =
-			     z__3.r * X(ix).i + z__3.i * X(ix).r;
-		    z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
-		    temp2.r = z__1.r, temp2.i = z__1.i;
-/* L110: */
-		}
-		i__2 = jy;
-		i__3 = jy;
-		z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = 
-			alpha->r * temp2.i + alpha->i * temp2.r;
-		z__1.r = Y(jy).r + z__2.r, z__1.i = Y(jy).i + z__2.i;
-		Y(jy).r = z__1.r, Y(jy).i = z__1.i;
-		jx += *incx;
-		jy += *incy;
-/* L120: */
-	    }
-	}
-    }
-
-    return 0;
-
-/*     End of ZHEMV . */
-
-} /* zhemv_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zher2.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zher2.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zher2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zher2.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,437 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int zher2_(char *uplo, integer *n, doublecomplex *alpha, 
-	doublecomplex *x, integer *incx, doublecomplex *y, integer *incy, 
-	doublecomplex *a, integer *lda)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6;
-    doublereal d__1;
-    doublecomplex z__1, z__2, z__3, z__4;
-
-    /* Builtin functions */
-    void d_cnjg(doublecomplex *, doublecomplex *);
-
-    /* Local variables */
-    static integer info;
-    static doublecomplex temp1, temp2;
-    static integer i, j;
-    extern logical lsame_(char *, char *);
-    static integer ix, iy, jx, jy, kx, ky;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-
-
-/*  Purpose   
-    =======   
-
-    ZHER2  performs the hermitian rank 2 operation   
-
-       A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A,   
-
-    where alpha is a scalar, x and y are n element vectors and A is an n 
-  
-    by n hermitian matrix.   
-
-    Parameters   
-    ==========   
-
-    UPLO   - CHARACTER*1.   
-             On entry, UPLO specifies whether the upper or lower   
-             triangular part of the array A is to be referenced as   
-             follows:   
-
-                UPLO = 'U' or 'u'   Only the upper triangular part of A   
-                                    is to be referenced.   
-
-                UPLO = 'L' or 'l'   Only the lower triangular part of A   
-                                    is to be referenced.   
-
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry, N specifies the order of the matrix A.   
-             N must be at least zero.   
-             Unchanged on exit.   
-
-    ALPHA  - COMPLEX*16      .   
-             On entry, ALPHA specifies the scalar alpha.   
-             Unchanged on exit.   
-
-    X      - COMPLEX*16       array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCX ) ).   
-             Before entry, the incremented array X must contain the n   
-             element vector x.   
-             Unchanged on exit.   
-
-    INCX   - INTEGER.   
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-             Unchanged on exit.   
-
-    Y      - COMPLEX*16       array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCY ) ).   
-             Before entry, the incremented array Y must contain the n   
-             element vector y.   
-             Unchanged on exit.   
-
-    INCY   - INTEGER.   
-             On entry, INCY specifies the increment for the elements of   
-             Y. INCY must not be zero.   
-             Unchanged on exit.   
-
-    A      - COMPLEX*16       array of DIMENSION ( LDA, n ).   
-             Before entry with  UPLO = 'U' or 'u', the leading n by n   
-             upper triangular part of the array A must contain the upper 
-  
-             triangular part of the hermitian matrix and the strictly   
-             lower triangular part of A is not referenced. On exit, the   
-             upper triangular part of the array A is overwritten by the   
-             upper triangular part of the updated matrix.   
-             Before entry with UPLO = 'L' or 'l', the leading n by n   
-             lower triangular part of the array A must contain the lower 
-  
-             triangular part of the hermitian matrix and the strictly   
-             upper triangular part of A is not referenced. On exit, the   
-             lower triangular part of the array A is overwritten by the   
-             lower triangular part of the updated matrix.   
-             Note that the imaginary parts of the diagonal elements need 
-  
-             not be set, they are assumed to be zero, and on exit they   
-             are set to zero.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program. LDA must be at least   
-             max( 1, n ).   
-             Unchanged on exit.   
-
-
-    Level 2 Blas routine.   
-
-    -- Written on 22-October-1986.   
-       Jack Dongarra, Argonne National Lab.   
-       Jeremy Du Croz, Nag Central Office.   
-       Sven Hammarling, Nag Central Office.   
-       Richard Hanson, Sandia National Labs.   
-
-
-
-       Test the input parameters.   
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-#define Y(I) y[(I)-1]
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-
-    info = 0;
-    if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
-	info = 1;
-    } else if (*n < 0) {
-	info = 2;
-    } else if (*incx == 0) {
-	info = 5;
-    } else if (*incy == 0) {
-	info = 7;
-    } else if (*lda < max(1,*n)) {
-	info = 9;
-    }
-    if (info != 0) {
-	xerbla_("ZHER2 ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*n == 0 || alpha->r == 0. && alpha->i == 0.) {
-	return 0;
-    }
-
-/*     Set up the start points in X and Y if the increments are not both 
-  
-       unity. */
-
-    if (*incx != 1 || *incy != 1) {
-	if (*incx > 0) {
-	    kx = 1;
-	} else {
-	    kx = 1 - (*n - 1) * *incx;
-	}
-	if (*incy > 0) {
-	    ky = 1;
-	} else {
-	    ky = 1 - (*n - 1) * *incy;
-	}
-	jx = kx;
-	jy = ky;
-    }
-
-/*     Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through the triangular part   
-       of A. */
-
-    if (lsame_(uplo, "U")) {
-
-/*        Form  A  when A is stored in the upper triangle. */
-
-	if (*incx == 1 && *incy == 1) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = j;
-		i__3 = j;
-		if (X(j).r != 0. || X(j).i != 0. || (Y(j).r != 0. || 
-			Y(j).i != 0.)) {
-		    d_cnjg(&z__2, &Y(j));
-		    z__1.r = alpha->r * z__2.r - alpha->i * z__2.i, z__1.i = 
-			    alpha->r * z__2.i + alpha->i * z__2.r;
-		    temp1.r = z__1.r, temp1.i = z__1.i;
-		    i__2 = j;
-		    z__2.r = alpha->r * X(j).r - alpha->i * X(j).i, 
-			    z__2.i = alpha->r * X(j).i + alpha->i * X(j)
-			    .r;
-		    d_cnjg(&z__1, &z__2);
-		    temp2.r = z__1.r, temp2.i = z__1.i;
-		    i__2 = j - 1;
-		    for (i = 1; i <= j-1; ++i) {
-			i__3 = i + j * a_dim1;
-			i__4 = i + j * a_dim1;
-			i__5 = i;
-			z__3.r = X(i).r * temp1.r - X(i).i * temp1.i, 
-				z__3.i = X(i).r * temp1.i + X(i).i * 
-				temp1.r;
-			z__2.r = A(i,j).r + z__3.r, z__2.i = A(i,j).i + 
-				z__3.i;
-			i__6 = i;
-			z__4.r = Y(i).r * temp2.r - Y(i).i * temp2.i, 
-				z__4.i = Y(i).r * temp2.i + Y(i).i * 
-				temp2.r;
-			z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
-			A(i,j).r = z__1.r, A(i,j).i = z__1.i;
-/* L10: */
-		    }
-		    i__2 = j + j * a_dim1;
-		    i__3 = j + j * a_dim1;
-		    i__4 = j;
-		    z__2.r = X(j).r * temp1.r - X(j).i * temp1.i, 
-			    z__2.i = X(j).r * temp1.i + X(j).i * 
-			    temp1.r;
-		    i__5 = j;
-		    z__3.r = Y(j).r * temp2.r - Y(j).i * temp2.i, 
-			    z__3.i = Y(j).r * temp2.i + Y(j).i * 
-			    temp2.r;
-		    z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
-		    d__1 = A(j,j).r + z__1.r;
-		    A(j,j).r = d__1, A(j,j).i = 0.;
-		} else {
-		    i__2 = j + j * a_dim1;
-		    i__3 = j + j * a_dim1;
-		    d__1 = A(j,j).r;
-		    A(j,j).r = d__1, A(j,j).i = 0.;
-		}
-/* L20: */
-	    }
-	} else {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = jx;
-		i__3 = jy;
-		if (X(jx).r != 0. || X(jx).i != 0. || (Y(jy).r != 0. || 
-			Y(jy).i != 0.)) {
-		    d_cnjg(&z__2, &Y(jy));
-		    z__1.r = alpha->r * z__2.r - alpha->i * z__2.i, z__1.i = 
-			    alpha->r * z__2.i + alpha->i * z__2.r;
-		    temp1.r = z__1.r, temp1.i = z__1.i;
-		    i__2 = jx;
-		    z__2.r = alpha->r * X(jx).r - alpha->i * X(jx).i, 
-			    z__2.i = alpha->r * X(jx).i + alpha->i * X(jx)
-			    .r;
-		    d_cnjg(&z__1, &z__2);
-		    temp2.r = z__1.r, temp2.i = z__1.i;
-		    ix = kx;
-		    iy = ky;
-		    i__2 = j - 1;
-		    for (i = 1; i <= j-1; ++i) {
-			i__3 = i + j * a_dim1;
-			i__4 = i + j * a_dim1;
-			i__5 = ix;
-			z__3.r = X(ix).r * temp1.r - X(ix).i * temp1.i, 
-				z__3.i = X(ix).r * temp1.i + X(ix).i * 
-				temp1.r;
-			z__2.r = A(i,j).r + z__3.r, z__2.i = A(i,j).i + 
-				z__3.i;
-			i__6 = iy;
-			z__4.r = Y(iy).r * temp2.r - Y(iy).i * temp2.i, 
-				z__4.i = Y(iy).r * temp2.i + Y(iy).i * 
-				temp2.r;
-			z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
-			A(i,j).r = z__1.r, A(i,j).i = z__1.i;
-			ix += *incx;
-			iy += *incy;
-/* L30: */
-		    }
-		    i__2 = j + j * a_dim1;
-		    i__3 = j + j * a_dim1;
-		    i__4 = jx;
-		    z__2.r = X(jx).r * temp1.r - X(jx).i * temp1.i, 
-			    z__2.i = X(jx).r * temp1.i + X(jx).i * 
-			    temp1.r;
-		    i__5 = jy;
-		    z__3.r = Y(jy).r * temp2.r - Y(jy).i * temp2.i, 
-			    z__3.i = Y(jy).r * temp2.i + Y(jy).i * 
-			    temp2.r;
-		    z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
-		    d__1 = A(j,j).r + z__1.r;
-		    A(j,j).r = d__1, A(j,j).i = 0.;
-		} else {
-		    i__2 = j + j * a_dim1;
-		    i__3 = j + j * a_dim1;
-		    d__1 = A(j,j).r;
-		    A(j,j).r = d__1, A(j,j).i = 0.;
-		}
-		jx += *incx;
-		jy += *incy;
-/* L40: */
-	    }
-	}
-    } else {
-
-/*        Form  A  when A is stored in the lower triangle. */
-
-	if (*incx == 1 && *incy == 1) {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = j;
-		i__3 = j;
-		if (X(j).r != 0. || X(j).i != 0. || (Y(j).r != 0. || 
-			Y(j).i != 0.)) {
-		    d_cnjg(&z__2, &Y(j));
-		    z__1.r = alpha->r * z__2.r - alpha->i * z__2.i, z__1.i = 
-			    alpha->r * z__2.i + alpha->i * z__2.r;
-		    temp1.r = z__1.r, temp1.i = z__1.i;
-		    i__2 = j;
-		    z__2.r = alpha->r * X(j).r - alpha->i * X(j).i, 
-			    z__2.i = alpha->r * X(j).i + alpha->i * X(j)
-			    .r;
-		    d_cnjg(&z__1, &z__2);
-		    temp2.r = z__1.r, temp2.i = z__1.i;
-		    i__2 = j + j * a_dim1;
-		    i__3 = j + j * a_dim1;
-		    i__4 = j;
-		    z__2.r = X(j).r * temp1.r - X(j).i * temp1.i, 
-			    z__2.i = X(j).r * temp1.i + X(j).i * 
-			    temp1.r;
-		    i__5 = j;
-		    z__3.r = Y(j).r * temp2.r - Y(j).i * temp2.i, 
-			    z__3.i = Y(j).r * temp2.i + Y(j).i * 
-			    temp2.r;
-		    z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
-		    d__1 = A(j,j).r + z__1.r;
-		    A(j,j).r = d__1, A(j,j).i = 0.;
-		    i__2 = *n;
-		    for (i = j + 1; i <= *n; ++i) {
-			i__3 = i + j * a_dim1;
-			i__4 = i + j * a_dim1;
-			i__5 = i;
-			z__3.r = X(i).r * temp1.r - X(i).i * temp1.i, 
-				z__3.i = X(i).r * temp1.i + X(i).i * 
-				temp1.r;
-			z__2.r = A(i,j).r + z__3.r, z__2.i = A(i,j).i + 
-				z__3.i;
-			i__6 = i;
-			z__4.r = Y(i).r * temp2.r - Y(i).i * temp2.i, 
-				z__4.i = Y(i).r * temp2.i + Y(i).i * 
-				temp2.r;
-			z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
-			A(i,j).r = z__1.r, A(i,j).i = z__1.i;
-/* L50: */
-		    }
-		} else {
-		    i__2 = j + j * a_dim1;
-		    i__3 = j + j * a_dim1;
-		    d__1 = A(j,j).r;
-		    A(j,j).r = d__1, A(j,j).i = 0.;
-		}
-/* L60: */
-	    }
-	} else {
-	    i__1 = *n;
-	    for (j = 1; j <= *n; ++j) {
-		i__2 = jx;
-		i__3 = jy;
-		if (X(jx).r != 0. || X(jx).i != 0. || (Y(jy).r != 0. || 
-			Y(jy).i != 0.)) {
-		    d_cnjg(&z__2, &Y(jy));
-		    z__1.r = alpha->r * z__2.r - alpha->i * z__2.i, z__1.i = 
-			    alpha->r * z__2.i + alpha->i * z__2.r;
-		    temp1.r = z__1.r, temp1.i = z__1.i;
-		    i__2 = jx;
-		    z__2.r = alpha->r * X(jx).r - alpha->i * X(jx).i, 
-			    z__2.i = alpha->r * X(jx).i + alpha->i * X(jx)
-			    .r;
-		    d_cnjg(&z__1, &z__2);
-		    temp2.r = z__1.r, temp2.i = z__1.i;
-		    i__2 = j + j * a_dim1;
-		    i__3 = j + j * a_dim1;
-		    i__4 = jx;
-		    z__2.r = X(jx).r * temp1.r - X(jx).i * temp1.i, 
-			    z__2.i = X(jx).r * temp1.i + X(jx).i * 
-			    temp1.r;
-		    i__5 = jy;
-		    z__3.r = Y(jy).r * temp2.r - Y(jy).i * temp2.i, 
-			    z__3.i = Y(jy).r * temp2.i + Y(jy).i * 
-			    temp2.r;
-		    z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
-		    d__1 = A(j,j).r + z__1.r;
-		    A(j,j).r = d__1, A(j,j).i = 0.;
-		    ix = jx;
-		    iy = jy;
-		    i__2 = *n;
-		    for (i = j + 1; i <= *n; ++i) {
-			ix += *incx;
-			iy += *incy;
-			i__3 = i + j * a_dim1;
-			i__4 = i + j * a_dim1;
-			i__5 = ix;
-			z__3.r = X(ix).r * temp1.r - X(ix).i * temp1.i, 
-				z__3.i = X(ix).r * temp1.i + X(ix).i * 
-				temp1.r;
-			z__2.r = A(i,j).r + z__3.r, z__2.i = A(i,j).i + 
-				z__3.i;
-			i__6 = iy;
-			z__4.r = Y(iy).r * temp2.r - Y(iy).i * temp2.i, 
-				z__4.i = Y(iy).r * temp2.i + Y(iy).i * 
-				temp2.r;
-			z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
-			A(i,j).r = z__1.r, A(i,j).i = z__1.i;
-/* L70: */
-		    }
-		} else {
-		    i__2 = j + j * a_dim1;
-		    i__3 = j + j * a_dim1;
-		    d__1 = A(j,j).r;
-		    A(j,j).r = d__1, A(j,j).i = 0.;
-		}
-		jx += *incx;
-		jy += *incy;
-/* L80: */
-	    }
-	}
-    }
-
-    return 0;
-
-/*     End of ZHER2 . */
-
-} /* zher2_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zscal.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zscal.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/zscal.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/zscal.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,70 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int zscal_(integer *n, doublecomplex *za, doublecomplex *zx, 
-	integer *incx)
-{
-
-
-    /* System generated locals */
-    integer i__1, i__2, i__3;
-    doublecomplex z__1;
-
-    /* Local variables */
-    static integer i, ix;
-
-
-/*     scales a vector by a constant.   
-       jack dongarra, 3/11/78.   
-       modified 3/93 to return if incx .le. 0.   
-       modified 12/3/93, array(1) declarations changed to array(*)   
-
-
-    
-   Parameter adjustments   
-       Function Body */
-#define ZX(I) zx[(I)-1]
-
-
-    if (*n <= 0 || *incx <= 0) {
-	return 0;
-    }
-    if (*incx == 1) {
-	goto L20;
-    }
-
-/*        code for increment not equal to 1 */
-
-    ix = 1;
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	i__2 = ix;
-	i__3 = ix;
-	z__1.r = za->r * ZX(ix).r - za->i * ZX(ix).i, z__1.i = za->r * ZX(
-		ix).i + za->i * ZX(ix).r;
-	ZX(ix).r = z__1.r, ZX(ix).i = z__1.i;
-	ix += *incx;
-/* L10: */
-    }
-    return 0;
-
-/*        code for increment equal to 1 */
-
-L20:
-    i__1 = *n;
-    for (i = 1; i <= *n; ++i) {
-	i__2 = i;
-	i__3 = i;
-	z__1.r = za->r * ZX(i).r - za->i * ZX(i).i, z__1.i = za->r * ZX(
-		i).i + za->i * ZX(i).r;
-	ZX(i).r = z__1.r, ZX(i).i = z__1.i;
-/* L30: */
-    }
-    return 0;
-} /* zscal_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ztrsm.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ztrsm.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ztrsm.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ztrsm.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,692 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Table of constant values */
-
-static doublecomplex c_b1 = {1.,0.};
-
-/* Subroutine */ int ztrsm_(char *side, char *uplo, char *transa, char *diag, 
-	integer *m, integer *n, doublecomplex *alpha, doublecomplex *a, 
-	integer *lda, doublecomplex *b, integer *ldb)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4, i__5, 
-	    i__6, i__7;
-    doublecomplex z__1, z__2, z__3;
-
-    /* Builtin functions */
-    void z_div(doublecomplex *, doublecomplex *, doublecomplex *), d_cnjg(
-	    doublecomplex *, doublecomplex *);
-
-    /* Local variables */
-    static integer info;
-    static doublecomplex temp;
-    static integer i, j, k;
-    static logical lside;
-    extern logical lsame_(char *, char *);
-    static integer nrowa;
-    static logical upper;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-    static logical noconj, nounit;
-
-
-/*  Purpose   
-    =======   
-
-    ZTRSM  solves one of the matrix equations   
-
-       op( A )*X = alpha*B,   or   X*op( A ) = alpha*B,   
-
-    where alpha is a scalar, X and B are m by n matrices, A is a unit, or 
-  
-    non-unit,  upper or lower triangular matrix  and  op( A )  is one  of 
-  
-
-       op( A ) = A   or   op( A ) = A'   or   op( A ) = conjg( A' ).   
-
-    The matrix X is overwritten on B.   
-
-    Parameters   
-    ==========   
-
-    SIDE   - CHARACTER*1.   
-             On entry, SIDE specifies whether op( A ) appears on the left 
-  
-             or right of X as follows:   
-
-                SIDE = 'L' or 'l'   op( A )*X = alpha*B.   
-
-                SIDE = 'R' or 'r'   X*op( A ) = alpha*B.   
-
-             Unchanged on exit.   
-
-    UPLO   - CHARACTER*1.   
-             On entry, UPLO specifies whether the matrix A is an upper or 
-  
-             lower triangular matrix as follows:   
-
-                UPLO = 'U' or 'u'   A is an upper triangular matrix.   
-
-                UPLO = 'L' or 'l'   A is a lower triangular matrix.   
-
-             Unchanged on exit.   
-
-    TRANSA - CHARACTER*1.   
-             On entry, TRANSA specifies the form of op( A ) to be used in 
-  
-             the matrix multiplication as follows:   
-
-                TRANSA = 'N' or 'n'   op( A ) = A.   
-
-                TRANSA = 'T' or 't'   op( A ) = A'.   
-
-                TRANSA = 'C' or 'c'   op( A ) = conjg( A' ).   
-
-             Unchanged on exit.   
-
-    DIAG   - CHARACTER*1.   
-             On entry, DIAG specifies whether or not A is unit triangular 
-  
-             as follows:   
-
-                DIAG = 'U' or 'u'   A is assumed to be unit triangular.   
-
-                DIAG = 'N' or 'n'   A is not assumed to be unit   
-                                    triangular.   
-
-             Unchanged on exit.   
-
-    M      - INTEGER.   
-             On entry, M specifies the number of rows of B. M must be at 
-  
-             least zero.   
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry, N specifies the number of columns of B.  N must be 
-  
-             at least zero.   
-             Unchanged on exit.   
-
-    ALPHA  - COMPLEX*16      .   
-             On entry,  ALPHA specifies the scalar  alpha. When  alpha is 
-  
-             zero then  A is not referenced and  B need not be set before 
-  
-             entry.   
-             Unchanged on exit.   
-
-    A      - COMPLEX*16       array of DIMENSION ( LDA, k ), where k is m 
-  
-             when  SIDE = 'L' or 'l'  and is  n  when  SIDE = 'R' or 'r'. 
-  
-             Before entry  with  UPLO = 'U' or 'u',  the  leading  k by k 
-  
-             upper triangular part of the array  A must contain the upper 
-  
-             triangular matrix  and the strictly lower triangular part of 
-  
-             A is not referenced.   
-             Before entry  with  UPLO = 'L' or 'l',  the  leading  k by k 
-  
-             lower triangular part of the array  A must contain the lower 
-  
-             triangular matrix  and the strictly upper triangular part of 
-  
-             A is not referenced.   
-             Note that when  DIAG = 'U' or 'u',  the diagonal elements of 
-  
-             A  are not referenced either,  but are assumed to be  unity. 
-  
-             Unchanged on exit.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program.  When  SIDE = 'L' or 'l'  then 
-  
-             LDA  must be at least  max( 1, m ),  when  SIDE = 'R' or 'r' 
-  
-             then LDA must be at least max( 1, n ).   
-             Unchanged on exit.   
-
-    B      - COMPLEX*16       array of DIMENSION ( LDB, n ).   
-             Before entry,  the leading  m by n part of the array  B must 
-  
-             contain  the  right-hand  side  matrix  B,  and  on exit  is 
-  
-             overwritten by the solution matrix  X.   
-
-    LDB    - INTEGER.   
-             On entry, LDB specifies the first dimension of B as declared 
-  
-             in  the  calling  (sub)  program.   LDB  must  be  at  least 
-  
-             max( 1, m ).   
-             Unchanged on exit.   
-
-
-    Level 3 Blas routine.   
-
-    -- Written on 8-February-1989.   
-       Jack Dongarra, Argonne National Laboratory.   
-       Iain Duff, AERE Harwell.   
-       Jeremy Du Croz, Numerical Algorithms Group Ltd.   
-       Sven Hammarling, Numerical Algorithms Group Ltd.   
-
-
-
-       Test the input parameters.   
-
-    
-   Parameter adjustments   
-       Function Body */
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-#define B(I,J) b[(I)-1 + ((J)-1)* ( *ldb)]
-
-    lside = lsame_(side, "L");
-    if (lside) {
-	nrowa = *m;
-    } else {
-	nrowa = *n;
-    }
-    noconj = lsame_(transa, "T");
-    nounit = lsame_(diag, "N");
-    upper = lsame_(uplo, "U");
-
-    info = 0;
-    if (! lside && ! lsame_(side, "R")) {
-	info = 1;
-    } else if (! upper && ! lsame_(uplo, "L")) {
-	info = 2;
-    } else if (! lsame_(transa, "N") && ! lsame_(transa, "T") 
-	    && ! lsame_(transa, "C")) {
-	info = 3;
-    } else if (! lsame_(diag, "U") && ! lsame_(diag, "N")) {
-	info = 4;
-    } else if (*m < 0) {
-	info = 5;
-    } else if (*n < 0) {
-	info = 6;
-    } else if (*lda < max(1,nrowa)) {
-	info = 9;
-    } else if (*ldb < max(1,*m)) {
-	info = 11;
-    }
-    if (info != 0) {
-	xerbla_("ZTRSM ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*n == 0) {
-	return 0;
-    }
-
-/*     And when  alpha.eq.zero. */
-
-    if (alpha->r == 0. && alpha->i == 0.) {
-	i__1 = *n;
-	for (j = 1; j <= *n; ++j) {
-	    i__2 = *m;
-	    for (i = 1; i <= *m; ++i) {
-		i__3 = i + j * b_dim1;
-		B(i,j).r = 0., B(i,j).i = 0.;
-/* L10: */
-	    }
-/* L20: */
-	}
-	return 0;
-    }
-
-/*     Start the operations. */
-
-    if (lside) {
-	if (lsame_(transa, "N")) {
-
-/*           Form  B := alpha*inv( A )*B. */
-
-	    if (upper) {
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    if (alpha->r != 1. || alpha->i != 0.) {
-			i__2 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    i__3 = i + j * b_dim1;
-			    i__4 = i + j * b_dim1;
-			    z__1.r = alpha->r * B(i,j).r - alpha->i * B(i,j)
-				    .i, z__1.i = alpha->r * B(i,j).i + 
-				    alpha->i * B(i,j).r;
-			    B(i,j).r = z__1.r, B(i,j).i = z__1.i;
-/* L30: */
-			}
-		    }
-		    for (k = *m; k >= 1; --k) {
-			i__2 = k + j * b_dim1;
-			if (B(k,j).r != 0. || B(k,j).i != 0.) {
-			    if (nounit) {
-				i__2 = k + j * b_dim1;
-				z_div(&z__1, &B(k,j), &A(k,k));
-				B(k,j).r = z__1.r, B(k,j).i = z__1.i;
-			    }
-			    i__2 = k - 1;
-			    for (i = 1; i <= k-1; ++i) {
-				i__3 = i + j * b_dim1;
-				i__4 = i + j * b_dim1;
-				i__5 = k + j * b_dim1;
-				i__6 = i + k * a_dim1;
-				z__2.r = B(k,j).r * A(i,k).r - B(k,j).i * 
-					A(i,k).i, z__2.i = B(k,j).r * A(i,k).i + B(k,j).i * A(i,k).r;
-				z__1.r = B(i,j).r - z__2.r, z__1.i = B(i,j)
-					.i - z__2.i;
-				B(i,j).r = z__1.r, B(i,j).i = z__1.i;
-/* L40: */
-			    }
-			}
-/* L50: */
-		    }
-/* L60: */
-		}
-	    } else {
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    if (alpha->r != 1. || alpha->i != 0.) {
-			i__2 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    i__3 = i + j * b_dim1;
-			    i__4 = i + j * b_dim1;
-			    z__1.r = alpha->r * B(i,j).r - alpha->i * B(i,j)
-				    .i, z__1.i = alpha->r * B(i,j).i + 
-				    alpha->i * B(i,j).r;
-			    B(i,j).r = z__1.r, B(i,j).i = z__1.i;
-/* L70: */
-			}
-		    }
-		    i__2 = *m;
-		    for (k = 1; k <= *m; ++k) {
-			i__3 = k + j * b_dim1;
-			if (B(k,j).r != 0. || B(k,j).i != 0.) {
-			    if (nounit) {
-				i__3 = k + j * b_dim1;
-				z_div(&z__1, &B(k,j), &A(k,k));
-				B(k,j).r = z__1.r, B(k,j).i = z__1.i;
-			    }
-			    i__3 = *m;
-			    for (i = k + 1; i <= *m; ++i) {
-				i__4 = i + j * b_dim1;
-				i__5 = i + j * b_dim1;
-				i__6 = k + j * b_dim1;
-				i__7 = i + k * a_dim1;
-				z__2.r = B(k,j).r * A(i,k).r - B(k,j).i * 
-					A(i,k).i, z__2.i = B(k,j).r * A(i,k).i + B(k,j).i * A(i,k).r;
-				z__1.r = B(i,j).r - z__2.r, z__1.i = B(i,j)
-					.i - z__2.i;
-				B(i,j).r = z__1.r, B(i,j).i = z__1.i;
-/* L80: */
-			    }
-			}
-/* L90: */
-		    }
-/* L100: */
-		}
-	    }
-	} else {
-
-/*           Form  B := alpha*inv( A' )*B   
-             or    B := alpha*inv( conjg( A' ) )*B. */
-
-	    if (upper) {
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    i__2 = *m;
-		    for (i = 1; i <= *m; ++i) {
-			i__3 = i + j * b_dim1;
-			z__1.r = alpha->r * B(i,j).r - alpha->i * B(i,j).i, 
-				z__1.i = alpha->r * B(i,j).i + alpha->i * B(i,j).r;
-			temp.r = z__1.r, temp.i = z__1.i;
-			if (noconj) {
-			    i__3 = i - 1;
-			    for (k = 1; k <= i-1; ++k) {
-				i__4 = k + i * a_dim1;
-				i__5 = k + j * b_dim1;
-				z__2.r = A(k,i).r * B(k,j).r - A(k,i).i * 
-					B(k,j).i, z__2.i = A(k,i).r * B(k,j).i + A(k,i).i * B(k,j).r;
-				z__1.r = temp.r - z__2.r, z__1.i = temp.i - 
-					z__2.i;
-				temp.r = z__1.r, temp.i = z__1.i;
-/* L110: */
-			    }
-			    if (nounit) {
-				z_div(&z__1, &temp, &A(i,i));
-				temp.r = z__1.r, temp.i = z__1.i;
-			    }
-			} else {
-			    i__3 = i - 1;
-			    for (k = 1; k <= i-1; ++k) {
-				d_cnjg(&z__3, &A(k,i));
-				i__4 = k + j * b_dim1;
-				z__2.r = z__3.r * B(k,j).r - z__3.i * B(k,j)
-					.i, z__2.i = z__3.r * B(k,j).i + 
-					z__3.i * B(k,j).r;
-				z__1.r = temp.r - z__2.r, z__1.i = temp.i - 
-					z__2.i;
-				temp.r = z__1.r, temp.i = z__1.i;
-/* L120: */
-			    }
-			    if (nounit) {
-				d_cnjg(&z__2, &A(i,i));
-				z_div(&z__1, &temp, &z__2);
-				temp.r = z__1.r, temp.i = z__1.i;
-			    }
-			}
-			i__3 = i + j * b_dim1;
-			B(i,j).r = temp.r, B(i,j).i = temp.i;
-/* L130: */
-		    }
-/* L140: */
-		}
-	    } else {
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    for (i = *m; i >= 1; --i) {
-			i__2 = i + j * b_dim1;
-			z__1.r = alpha->r * B(i,j).r - alpha->i * B(i,j).i, 
-				z__1.i = alpha->r * B(i,j).i + alpha->i * B(i,j).r;
-			temp.r = z__1.r, temp.i = z__1.i;
-			if (noconj) {
-			    i__2 = *m;
-			    for (k = i + 1; k <= *m; ++k) {
-				i__3 = k + i * a_dim1;
-				i__4 = k + j * b_dim1;
-				z__2.r = A(k,i).r * B(k,j).r - A(k,i).i * 
-					B(k,j).i, z__2.i = A(k,i).r * B(k,j).i + A(k,i).i * B(k,j).r;
-				z__1.r = temp.r - z__2.r, z__1.i = temp.i - 
-					z__2.i;
-				temp.r = z__1.r, temp.i = z__1.i;
-/* L150: */
-			    }
-			    if (nounit) {
-				z_div(&z__1, &temp, &A(i,i));
-				temp.r = z__1.r, temp.i = z__1.i;
-			    }
-			} else {
-			    i__2 = *m;
-			    for (k = i + 1; k <= *m; ++k) {
-				d_cnjg(&z__3, &A(k,i));
-				i__3 = k + j * b_dim1;
-				z__2.r = z__3.r * B(k,j).r - z__3.i * B(k,j)
-					.i, z__2.i = z__3.r * B(k,j).i + 
-					z__3.i * B(k,j).r;
-				z__1.r = temp.r - z__2.r, z__1.i = temp.i - 
-					z__2.i;
-				temp.r = z__1.r, temp.i = z__1.i;
-/* L160: */
-			    }
-			    if (nounit) {
-				d_cnjg(&z__2, &A(i,i));
-				z_div(&z__1, &temp, &z__2);
-				temp.r = z__1.r, temp.i = z__1.i;
-			    }
-			}
-			i__2 = i + j * b_dim1;
-			B(i,j).r = temp.r, B(i,j).i = temp.i;
-/* L170: */
-		    }
-/* L180: */
-		}
-	    }
-	}
-    } else {
-	if (lsame_(transa, "N")) {
-
-/*           Form  B := alpha*B*inv( A ). */
-
-	    if (upper) {
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    if (alpha->r != 1. || alpha->i != 0.) {
-			i__2 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    i__3 = i + j * b_dim1;
-			    i__4 = i + j * b_dim1;
-			    z__1.r = alpha->r * B(i,j).r - alpha->i * B(i,j)
-				    .i, z__1.i = alpha->r * B(i,j).i + 
-				    alpha->i * B(i,j).r;
-			    B(i,j).r = z__1.r, B(i,j).i = z__1.i;
-/* L190: */
-			}
-		    }
-		    i__2 = j - 1;
-		    for (k = 1; k <= j-1; ++k) {
-			i__3 = k + j * a_dim1;
-			if (A(k,j).r != 0. || A(k,j).i != 0.) {
-			    i__3 = *m;
-			    for (i = 1; i <= *m; ++i) {
-				i__4 = i + j * b_dim1;
-				i__5 = i + j * b_dim1;
-				i__6 = k + j * a_dim1;
-				i__7 = i + k * b_dim1;
-				z__2.r = A(k,j).r * B(i,k).r - A(k,j).i * 
-					B(i,k).i, z__2.i = A(k,j).r * B(i,k).i + A(k,j).i * B(i,k).r;
-				z__1.r = B(i,j).r - z__2.r, z__1.i = B(i,j)
-					.i - z__2.i;
-				B(i,j).r = z__1.r, B(i,j).i = z__1.i;
-/* L200: */
-			    }
-			}
-/* L210: */
-		    }
-		    if (nounit) {
-			z_div(&z__1, &c_b1, &A(j,j));
-			temp.r = z__1.r, temp.i = z__1.i;
-			i__2 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    i__3 = i + j * b_dim1;
-			    i__4 = i + j * b_dim1;
-			    z__1.r = temp.r * B(i,j).r - temp.i * B(i,j).i, 
-				    z__1.i = temp.r * B(i,j).i + temp.i * B(i,j).r;
-			    B(i,j).r = z__1.r, B(i,j).i = z__1.i;
-/* L220: */
-			}
-		    }
-/* L230: */
-		}
-	    } else {
-		for (j = *n; j >= 1; --j) {
-		    if (alpha->r != 1. || alpha->i != 0.) {
-			i__1 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    i__2 = i + j * b_dim1;
-			    i__3 = i + j * b_dim1;
-			    z__1.r = alpha->r * B(i,j).r - alpha->i * B(i,j)
-				    .i, z__1.i = alpha->r * B(i,j).i + 
-				    alpha->i * B(i,j).r;
-			    B(i,j).r = z__1.r, B(i,j).i = z__1.i;
-/* L240: */
-			}
-		    }
-		    i__1 = *n;
-		    for (k = j + 1; k <= *n; ++k) {
-			i__2 = k + j * a_dim1;
-			if (A(k,j).r != 0. || A(k,j).i != 0.) {
-			    i__2 = *m;
-			    for (i = 1; i <= *m; ++i) {
-				i__3 = i + j * b_dim1;
-				i__4 = i + j * b_dim1;
-				i__5 = k + j * a_dim1;
-				i__6 = i + k * b_dim1;
-				z__2.r = A(k,j).r * B(i,k).r - A(k,j).i * 
-					B(i,k).i, z__2.i = A(k,j).r * B(i,k).i + A(k,j).i * B(i,k).r;
-				z__1.r = B(i,j).r - z__2.r, z__1.i = B(i,j)
-					.i - z__2.i;
-				B(i,j).r = z__1.r, B(i,j).i = z__1.i;
-/* L250: */
-			    }
-			}
-/* L260: */
-		    }
-		    if (nounit) {
-			z_div(&z__1, &c_b1, &A(j,j));
-			temp.r = z__1.r, temp.i = z__1.i;
-			i__1 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    i__2 = i + j * b_dim1;
-			    i__3 = i + j * b_dim1;
-			    z__1.r = temp.r * B(i,j).r - temp.i * B(i,j).i, 
-				    z__1.i = temp.r * B(i,j).i + temp.i * B(i,j).r;
-			    B(i,j).r = z__1.r, B(i,j).i = z__1.i;
-/* L270: */
-			}
-		    }
-/* L280: */
-		}
-	    }
-	} else {
-
-/*           Form  B := alpha*B*inv( A' )   
-             or    B := alpha*B*inv( conjg( A' ) ). */
-
-	    if (upper) {
-		for (k = *n; k >= 1; --k) {
-		    if (nounit) {
-			if (noconj) {
-			    z_div(&z__1, &c_b1, &A(k,k));
-			    temp.r = z__1.r, temp.i = z__1.i;
-			} else {
-			    d_cnjg(&z__2, &A(k,k));
-			    z_div(&z__1, &c_b1, &z__2);
-			    temp.r = z__1.r, temp.i = z__1.i;
-			}
-			i__1 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    i__2 = i + k * b_dim1;
-			    i__3 = i + k * b_dim1;
-			    z__1.r = temp.r * B(i,k).r - temp.i * B(i,k).i, 
-				    z__1.i = temp.r * B(i,k).i + temp.i * B(i,k).r;
-			    B(i,k).r = z__1.r, B(i,k).i = z__1.i;
-/* L290: */
-			}
-		    }
-		    i__1 = k - 1;
-		    for (j = 1; j <= k-1; ++j) {
-			i__2 = j + k * a_dim1;
-			if (A(j,k).r != 0. || A(j,k).i != 0.) {
-			    if (noconj) {
-				i__2 = j + k * a_dim1;
-				temp.r = A(j,k).r, temp.i = A(j,k).i;
-			    } else {
-				d_cnjg(&z__1, &A(j,k));
-				temp.r = z__1.r, temp.i = z__1.i;
-			    }
-			    i__2 = *m;
-			    for (i = 1; i <= *m; ++i) {
-				i__3 = i + j * b_dim1;
-				i__4 = i + j * b_dim1;
-				i__5 = i + k * b_dim1;
-				z__2.r = temp.r * B(i,k).r - temp.i * B(i,k)
-					.i, z__2.i = temp.r * B(i,k).i + 
-					temp.i * B(i,k).r;
-				z__1.r = B(i,j).r - z__2.r, z__1.i = B(i,j)
-					.i - z__2.i;
-				B(i,j).r = z__1.r, B(i,j).i = z__1.i;
-/* L300: */
-			    }
-			}
-/* L310: */
-		    }
-		    if (alpha->r != 1. || alpha->i != 0.) {
-			i__1 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    i__2 = i + k * b_dim1;
-			    i__3 = i + k * b_dim1;
-			    z__1.r = alpha->r * B(i,k).r - alpha->i * B(i,k)
-				    .i, z__1.i = alpha->r * B(i,k).i + 
-				    alpha->i * B(i,k).r;
-			    B(i,k).r = z__1.r, B(i,k).i = z__1.i;
-/* L320: */
-			}
-		    }
-/* L330: */
-		}
-	    } else {
-		i__1 = *n;
-		for (k = 1; k <= *n; ++k) {
-		    if (nounit) {
-			if (noconj) {
-			    z_div(&z__1, &c_b1, &A(k,k));
-			    temp.r = z__1.r, temp.i = z__1.i;
-			} else {
-			    d_cnjg(&z__2, &A(k,k));
-			    z_div(&z__1, &c_b1, &z__2);
-			    temp.r = z__1.r, temp.i = z__1.i;
-			}
-			i__2 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    i__3 = i + k * b_dim1;
-			    i__4 = i + k * b_dim1;
-			    z__1.r = temp.r * B(i,k).r - temp.i * B(i,k).i, 
-				    z__1.i = temp.r * B(i,k).i + temp.i * B(i,k).r;
-			    B(i,k).r = z__1.r, B(i,k).i = z__1.i;
-/* L340: */
-			}
-		    }
-		    i__2 = *n;
-		    for (j = k + 1; j <= *n; ++j) {
-			i__3 = j + k * a_dim1;
-			if (A(j,k).r != 0. || A(j,k).i != 0.) {
-			    if (noconj) {
-				i__3 = j + k * a_dim1;
-				temp.r = A(j,k).r, temp.i = A(j,k).i;
-			    } else {
-				d_cnjg(&z__1, &A(j,k));
-				temp.r = z__1.r, temp.i = z__1.i;
-			    }
-			    i__3 = *m;
-			    for (i = 1; i <= *m; ++i) {
-				i__4 = i + j * b_dim1;
-				i__5 = i + j * b_dim1;
-				i__6 = i + k * b_dim1;
-				z__2.r = temp.r * B(i,k).r - temp.i * B(i,k)
-					.i, z__2.i = temp.r * B(i,k).i + 
-					temp.i * B(i,k).r;
-				z__1.r = B(i,j).r - z__2.r, z__1.i = B(i,j)
-					.i - z__2.i;
-				B(i,j).r = z__1.r, B(i,j).i = z__1.i;
-/* L350: */
-			    }
-			}
-/* L360: */
-		    }
-		    if (alpha->r != 1. || alpha->i != 0.) {
-			i__2 = *m;
-			for (i = 1; i <= *m; ++i) {
-			    i__3 = i + k * b_dim1;
-			    i__4 = i + k * b_dim1;
-			    z__1.r = alpha->r * B(i,k).r - alpha->i * B(i,k)
-				    .i, z__1.i = alpha->r * B(i,k).i + 
-				    alpha->i * B(i,k).r;
-			    B(i,k).r = z__1.r, B(i,k).i = z__1.i;
-/* L370: */
-			}
-		    }
-/* L380: */
-		}
-	    }
-	}
-    }
-
-    return 0;
-
-/*     End of ZTRSM . */
-
-} /* ztrsm_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ztrsv.c hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ztrsv.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/CBLAS/ztrsv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/CBLAS/ztrsv.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,510 +0,0 @@
-
-/*  -- translated by f2c (version 19940927).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-#include "f2c.h"
-
-/* Subroutine */ int ztrsv_(char *uplo, char *trans, char *diag, integer *n, 
-	doublecomplex *a, integer *lda, doublecomplex *x, integer *incx)
-{
-
-
-    /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
-    doublecomplex z__1, z__2, z__3;
-
-    /* Builtin functions */
-    void z_div(doublecomplex *, doublecomplex *, doublecomplex *), d_cnjg(
-	    doublecomplex *, doublecomplex *);
-
-    /* Local variables */
-    static integer info;
-    static doublecomplex temp;
-    static integer i, j;
-    extern logical lsame_(char *, char *);
-    static integer ix, jx, kx;
-    extern /* Subroutine */ int xerbla_(char *, integer *);
-    static logical noconj, nounit;
-
-
-/*  Purpose   
-    =======   
-
-    ZTRSV  solves one of the systems of equations   
-
-       A*x = b,   or   A'*x = b,   or   conjg( A' )*x = b,   
-
-    where b and x are n element vectors and A is an n by n unit, or   
-    non-unit, upper or lower triangular matrix.   
-
-    No test for singularity or near-singularity is included in this   
-    routine. Such tests must be performed before calling this routine.   
-
-    Parameters   
-    ==========   
-
-    UPLO   - CHARACTER*1.   
-             On entry, UPLO specifies whether the matrix is an upper or   
-             lower triangular matrix as follows:   
-
-                UPLO = 'U' or 'u'   A is an upper triangular matrix.   
-
-                UPLO = 'L' or 'l'   A is a lower triangular matrix.   
-
-             Unchanged on exit.   
-
-    TRANS  - CHARACTER*1.   
-             On entry, TRANS specifies the equations to be solved as   
-             follows:   
-
-                TRANS = 'N' or 'n'   A*x = b.   
-
-                TRANS = 'T' or 't'   A'*x = b.   
-
-                TRANS = 'C' or 'c'   conjg( A' )*x = b.   
-
-             Unchanged on exit.   
-
-    DIAG   - CHARACTER*1.   
-             On entry, DIAG specifies whether or not A is unit   
-             triangular as follows:   
-
-                DIAG = 'U' or 'u'   A is assumed to be unit triangular.   
-
-                DIAG = 'N' or 'n'   A is not assumed to be unit   
-                                    triangular.   
-
-             Unchanged on exit.   
-
-    N      - INTEGER.   
-             On entry, N specifies the order of the matrix A.   
-             N must be at least zero.   
-             Unchanged on exit.   
-
-    A      - COMPLEX*16       array of DIMENSION ( LDA, n ).   
-             Before entry with  UPLO = 'U' or 'u', the leading n by n   
-             upper triangular part of the array A must contain the upper 
-  
-             triangular matrix and the strictly lower triangular part of 
-  
-             A is not referenced.   
-             Before entry with UPLO = 'L' or 'l', the leading n by n   
-             lower triangular part of the array A must contain the lower 
-  
-             triangular matrix and the strictly upper triangular part of 
-  
-             A is not referenced.   
-             Note that when  DIAG = 'U' or 'u', the diagonal elements of 
-  
-             A are not referenced either, but are assumed to be unity.   
-             Unchanged on exit.   
-
-    LDA    - INTEGER.   
-             On entry, LDA specifies the first dimension of A as declared 
-  
-             in the calling (sub) program. LDA must be at least   
-             max( 1, n ).   
-             Unchanged on exit.   
-
-    X      - COMPLEX*16       array of dimension at least   
-             ( 1 + ( n - 1 )*abs( INCX ) ).   
-             Before entry, the incremented array X must contain the n   
-             element right-hand side vector b. On exit, X is overwritten 
-  
-             with the solution vector x.   
-
-    INCX   - INTEGER.   
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-             Unchanged on exit.   
-
-
-    Level 2 Blas routine.   
-
-    -- Written on 22-October-1986.   
-       Jack Dongarra, Argonne National Lab.   
-       Jeremy Du Croz, Nag Central Office.   
-       Sven Hammarling, Nag Central Office.   
-       Richard Hanson, Sandia National Labs.   
-
-
-
-       Test the input parameters.   
-
-    
-   Parameter adjustments   
-       Function Body */
-#define X(I) x[(I)-1]
-
-#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
-
-    info = 0;
-    if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
-	info = 1;
-    } else if (! lsame_(trans, "N") && ! lsame_(trans, "T") &&
-	     ! lsame_(trans, "C")) {
-	info = 2;
-    } else if (! lsame_(diag, "U") && ! lsame_(diag, "N")) {
-	info = 3;
-    } else if (*n < 0) {
-	info = 4;
-    } else if (*lda < max(1,*n)) {
-	info = 6;
-    } else if (*incx == 0) {
-	info = 8;
-    }
-    if (info != 0) {
-	xerbla_("ZTRSV ", &info);
-	return 0;
-    }
-
-/*     Quick return if possible. */
-
-    if (*n == 0) {
-	return 0;
-    }
-
-    noconj = lsame_(trans, "T");
-    nounit = lsame_(diag, "N");
-
-/*     Set up the start point in X if the increment is not unity. This   
-       will be  ( N - 1 )*INCX  too small for descending loops. */
-
-    if (*incx <= 0) {
-	kx = 1 - (*n - 1) * *incx;
-    } else if (*incx != 1) {
-	kx = 1;
-    }
-
-/*     Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through A. */
-
-    if (lsame_(trans, "N")) {
-
-/*        Form  x := inv( A )*x. */
-
-	if (lsame_(uplo, "U")) {
-	    if (*incx == 1) {
-		for (j = *n; j >= 1; --j) {
-		    i__1 = j;
-		    if (X(j).r != 0. || X(j).i != 0.) {
-			if (nounit) {
-			    i__1 = j;
-			    z_div(&z__1, &X(j), &A(j,j));
-			    X(j).r = z__1.r, X(j).i = z__1.i;
-			}
-			i__1 = j;
-			temp.r = X(j).r, temp.i = X(j).i;
-			for (i = j - 1; i >= 1; --i) {
-			    i__1 = i;
-			    i__2 = i;
-			    i__3 = i + j * a_dim1;
-			    z__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, 
-				    z__2.i = temp.r * A(i,j).i + temp.i * A(i,j).r;
-			    z__1.r = X(i).r - z__2.r, z__1.i = X(i).i - 
-				    z__2.i;
-			    X(i).r = z__1.r, X(i).i = z__1.i;
-/* L10: */
-			}
-		    }
-/* L20: */
-		}
-	    } else {
-		jx = kx + (*n - 1) * *incx;
-		for (j = *n; j >= 1; --j) {
-		    i__1 = jx;
-		    if (X(jx).r != 0. || X(jx).i != 0.) {
-			if (nounit) {
-			    i__1 = jx;
-			    z_div(&z__1, &X(jx), &A(j,j));
-			    X(jx).r = z__1.r, X(jx).i = z__1.i;
-			}
-			i__1 = jx;
-			temp.r = X(jx).r, temp.i = X(jx).i;
-			ix = jx;
-			for (i = j - 1; i >= 1; --i) {
-			    ix -= *incx;
-			    i__1 = ix;
-			    i__2 = ix;
-			    i__3 = i + j * a_dim1;
-			    z__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, 
-				    z__2.i = temp.r * A(i,j).i + temp.i * A(i,j).r;
-			    z__1.r = X(ix).r - z__2.r, z__1.i = X(ix).i - 
-				    z__2.i;
-			    X(ix).r = z__1.r, X(ix).i = z__1.i;
-/* L30: */
-			}
-		    }
-		    jx -= *incx;
-/* L40: */
-		}
-	    }
-	} else {
-	    if (*incx == 1) {
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    i__2 = j;
-		    if (X(j).r != 0. || X(j).i != 0.) {
-			if (nounit) {
-			    i__2 = j;
-			    z_div(&z__1, &X(j), &A(j,j));
-			    X(j).r = z__1.r, X(j).i = z__1.i;
-			}
-			i__2 = j;
-			temp.r = X(j).r, temp.i = X(j).i;
-			i__2 = *n;
-			for (i = j + 1; i <= *n; ++i) {
-			    i__3 = i;
-			    i__4 = i;
-			    i__5 = i + j * a_dim1;
-			    z__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, 
-				    z__2.i = temp.r * A(i,j).i + temp.i * A(i,j).r;
-			    z__1.r = X(i).r - z__2.r, z__1.i = X(i).i - 
-				    z__2.i;
-			    X(i).r = z__1.r, X(i).i = z__1.i;
-/* L50: */
-			}
-		    }
-/* L60: */
-		}
-	    } else {
-		jx = kx;
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    i__2 = jx;
-		    if (X(jx).r != 0. || X(jx).i != 0.) {
-			if (nounit) {
-			    i__2 = jx;
-			    z_div(&z__1, &X(jx), &A(j,j));
-			    X(jx).r = z__1.r, X(jx).i = z__1.i;
-			}
-			i__2 = jx;
-			temp.r = X(jx).r, temp.i = X(jx).i;
-			ix = jx;
-			i__2 = *n;
-			for (i = j + 1; i <= *n; ++i) {
-			    ix += *incx;
-			    i__3 = ix;
-			    i__4 = ix;
-			    i__5 = i + j * a_dim1;
-			    z__2.r = temp.r * A(i,j).r - temp.i * A(i,j).i, 
-				    z__2.i = temp.r * A(i,j).i + temp.i * A(i,j).r;
-			    z__1.r = X(ix).r - z__2.r, z__1.i = X(ix).i - 
-				    z__2.i;
-			    X(ix).r = z__1.r, X(ix).i = z__1.i;
-/* L70: */
-			}
-		    }
-		    jx += *incx;
-/* L80: */
-		}
-	    }
-	}
-    } else {
-
-/*        Form  x := inv( A' )*x  or  x := inv( conjg( A' ) )*x. */
-
-	if (lsame_(uplo, "U")) {
-	    if (*incx == 1) {
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    i__2 = j;
-		    temp.r = X(j).r, temp.i = X(j).i;
-		    if (noconj) {
-			i__2 = j - 1;
-			for (i = 1; i <= j-1; ++i) {
-			    i__3 = i + j * a_dim1;
-			    i__4 = i;
-			    z__2.r = A(i,j).r * X(i).r - A(i,j).i * X(
-				    i).i, z__2.i = A(i,j).r * X(i).i + 
-				    A(i,j).i * X(i).r;
-			    z__1.r = temp.r - z__2.r, z__1.i = temp.i - 
-				    z__2.i;
-			    temp.r = z__1.r, temp.i = z__1.i;
-/* L90: */
-			}
-			if (nounit) {
-			    z_div(&z__1, &temp, &A(j,j));
-			    temp.r = z__1.r, temp.i = z__1.i;
-			}
-		    } else {
-			i__2 = j - 1;
-			for (i = 1; i <= j-1; ++i) {
-			    d_cnjg(&z__3, &A(i,j));
-			    i__3 = i;
-			    z__2.r = z__3.r * X(i).r - z__3.i * X(i).i, 
-				    z__2.i = z__3.r * X(i).i + z__3.i * X(
-				    i).r;
-			    z__1.r = temp.r - z__2.r, z__1.i = temp.i - 
-				    z__2.i;
-			    temp.r = z__1.r, temp.i = z__1.i;
-/* L100: */
-			}
-			if (nounit) {
-			    d_cnjg(&z__2, &A(j,j));
-			    z_div(&z__1, &temp, &z__2);
-			    temp.r = z__1.r, temp.i = z__1.i;
-			}
-		    }
-		    i__2 = j;
-		    X(j).r = temp.r, X(j).i = temp.i;
-/* L110: */
-		}
-	    } else {
-		jx = kx;
-		i__1 = *n;
-		for (j = 1; j <= *n; ++j) {
-		    ix = kx;
-		    i__2 = jx;
-		    temp.r = X(jx).r, temp.i = X(jx).i;
-		    if (noconj) {
-			i__2 = j - 1;
-			for (i = 1; i <= j-1; ++i) {
-			    i__3 = i + j * a_dim1;
-			    i__4 = ix;
-			    z__2.r = A(i,j).r * X(ix).r - A(i,j).i * X(
-				    ix).i, z__2.i = A(i,j).r * X(ix).i + 
-				    A(i,j).i * X(ix).r;
-			    z__1.r = temp.r - z__2.r, z__1.i = temp.i - 
-				    z__2.i;
-			    temp.r = z__1.r, temp.i = z__1.i;
-			    ix += *incx;
-/* L120: */
-			}
-			if (nounit) {
-			    z_div(&z__1, &temp, &A(j,j));
-			    temp.r = z__1.r, temp.i = z__1.i;
-			}
-		    } else {
-			i__2 = j - 1;
-			for (i = 1; i <= j-1; ++i) {
-			    d_cnjg(&z__3, &A(i,j));
-			    i__3 = ix;
-			    z__2.r = z__3.r * X(ix).r - z__3.i * X(ix).i, 
-				    z__2.i = z__3.r * X(ix).i + z__3.i * X(
-				    ix).r;
-			    z__1.r = temp.r - z__2.r, z__1.i = temp.i - 
-				    z__2.i;
-			    temp.r = z__1.r, temp.i = z__1.i;
-			    ix += *incx;
-/* L130: */
-			}
-			if (nounit) {
-			    d_cnjg(&z__2, &A(j,j));
-			    z_div(&z__1, &temp, &z__2);
-			    temp.r = z__1.r, temp.i = z__1.i;
-			}
-		    }
-		    i__2 = jx;
-		    X(jx).r = temp.r, X(jx).i = temp.i;
-		    jx += *incx;
-/* L140: */
-		}
-	    }
-	} else {
-	    if (*incx == 1) {
-		for (j = *n; j >= 1; --j) {
-		    i__1 = j;
-		    temp.r = X(j).r, temp.i = X(j).i;
-		    if (noconj) {
-			i__1 = j + 1;
-			for (i = *n; i >= j+1; --i) {
-			    i__2 = i + j * a_dim1;
-			    i__3 = i;
-			    z__2.r = A(i,j).r * X(i).r - A(i,j).i * X(
-				    i).i, z__2.i = A(i,j).r * X(i).i + 
-				    A(i,j).i * X(i).r;
-			    z__1.r = temp.r - z__2.r, z__1.i = temp.i - 
-				    z__2.i;
-			    temp.r = z__1.r, temp.i = z__1.i;
-/* L150: */
-			}
-			if (nounit) {
-			    z_div(&z__1, &temp, &A(j,j));
-			    temp.r = z__1.r, temp.i = z__1.i;
-			}
-		    } else {
-			i__1 = j + 1;
-			for (i = *n; i >= j+1; --i) {
-			    d_cnjg(&z__3, &A(i,j));
-			    i__2 = i;
-			    z__2.r = z__3.r * X(i).r - z__3.i * X(i).i, 
-				    z__2.i = z__3.r * X(i).i + z__3.i * X(
-				    i).r;
-			    z__1.r = temp.r - z__2.r, z__1.i = temp.i - 
-				    z__2.i;
-			    temp.r = z__1.r, temp.i = z__1.i;
-/* L160: */
-			}
-			if (nounit) {
-			    d_cnjg(&z__2, &A(j,j));
-			    z_div(&z__1, &temp, &z__2);
-			    temp.r = z__1.r, temp.i = z__1.i;
-			}
-		    }
-		    i__1 = j;
-		    X(j).r = temp.r, X(j).i = temp.i;
-/* L170: */
-		}
-	    } else {
-		kx += (*n - 1) * *incx;
-		jx = kx;
-		for (j = *n; j >= 1; --j) {
-		    ix = kx;
-		    i__1 = jx;
-		    temp.r = X(jx).r, temp.i = X(jx).i;
-		    if (noconj) {
-			i__1 = j + 1;
-			for (i = *n; i >= j+1; --i) {
-			    i__2 = i + j * a_dim1;
-			    i__3 = ix;
-			    z__2.r = A(i,j).r * X(ix).r - A(i,j).i * X(
-				    ix).i, z__2.i = A(i,j).r * X(ix).i + 
-				    A(i,j).i * X(ix).r;
-			    z__1.r = temp.r - z__2.r, z__1.i = temp.i - 
-				    z__2.i;
-			    temp.r = z__1.r, temp.i = z__1.i;
-			    ix -= *incx;
-/* L180: */
-			}
-			if (nounit) {
-			    z_div(&z__1, &temp, &A(j,j));
-			    temp.r = z__1.r, temp.i = z__1.i;
-			}
-		    } else {
-			i__1 = j + 1;
-			for (i = *n; i >= j+1; --i) {
-			    d_cnjg(&z__3, &A(i,j));
-			    i__2 = ix;
-			    z__2.r = z__3.r * X(ix).r - z__3.i * X(ix).i, 
-				    z__2.i = z__3.r * X(ix).i + z__3.i * X(
-				    ix).r;
-			    z__1.r = temp.r - z__2.r, z__1.i = temp.i - 
-				    z__2.i;
-			    temp.r = z__1.r, temp.i = z__1.i;
-			    ix -= *incx;
-/* L190: */
-			}
-			if (nounit) {
-			    d_cnjg(&z__2, &A(j,j));
-			    z_div(&z__1, &temp, &z__2);
-			    temp.r = z__1.r, temp.i = z__1.i;
-			}
-		    }
-		    i__1 = jx;
-		    X(jx).r = temp.r, X(jx).i = temp.i;
-		    jx -= *incx;
-/* L200: */
-		}
-	    }
-	}
-    }
-
-    return 0;
-
-/*     End of ZTRSV . */
-
-} /* ztrsv_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/EXAMPLE/Makefile hypre-2.13.0/src/FEI_mv/DSuperLU/EXAMPLE/Makefile
--- hypre-2.11.2/src/FEI_mv/DSuperLU/EXAMPLE/Makefile	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/EXAMPLE/Makefile	1970-01-01 00:00:00.000000000 +0000
@@ -1,136 +0,0 @@
-#######################################################################
-#
-#  This makefile creates the example programs for the linear equation
-#  routines in SuperLU_DIST.
-#
-#  The command
-#       make
-#  without any arguments creates all the example programs.
-#  The command
-# 	make double
-#  creates double precision real example programs.
-#  The command
-#       make complex16
-#  creates double precision complex example programs.
-#
-#  The executable files are called
-#       double real:	pddrive pddrive_ABglobal pddrive1
-#                       pddrive1_ABglobal pddrive2 pddrive3 pddrive4
-#	double complex: pzdrive pzdrive_ABglobal pzdrive1
-#                       pzdrive1_ABglobal pzdrive2 pzdrive3 pzdrive4 
-#
-#  Alternatively, you can create example programs individually by
-#  typing the command (for example)
-#	make pddrive
-#
-#  To remove the object files after the executable files have been
-#  created, enter
-#       make clean
-#
-#######################################################################
-include ../make.inc
-INCLUDEDIR = -I../SRC
-
-DEXM	= pddrive.o dcreate_matrix.o sp_ienv.o dreadhb.o
-# pdgstrs_lsum_X1.o pdgstrf_X1.o
-DEXM1	= pddrive1.o dcreate_matrix.o dreadhb.o
-DEXM2	= pddrive2.o dcreate_matrix.o dreadhb.o
-DEXM3	= pddrive3.o dcreate_matrix.o dreadhb.o
-DEXM4	= pddrive4.o dcreate_matrix.o dreadhb.o
-DEXMG	= pddrive_ABglobal.o dreadhb.o
-DEXMG1	= pddrive1_ABglobal.o dreadhb.o sp_ienv.o 
-DEXMG2	= pddrive2_ABglobal.o dreadhb.o
-DEXMG3	= pddrive3_ABglobal.o dreadhb.o
-DEXMG4	= pddrive4_ABglobal.o dreadhb.o
-ZEXM	= pzdrive.o zcreate_matrix.o zreadhb.o #pzgstrf_X1.o
-ZEXM1	= pzdrive1.o zcreate_matrix.o zreadhb.o
-ZEXM2	= pzdrive2.o zcreate_matrix.o zreadhb.o
-ZEXM3	= pzdrive3.o zcreate_matrix.o zreadhb.o
-ZEXM4	= pzdrive4.o zcreate_matrix.o zreadhb.o
-ZEXMG	= pzdrive_ABglobal.o zreadhb.o
-ZEXMG1	= pzdrive1_ABglobal.o zreadhb.o
-ZEXMG2	= pzdrive2_ABglobal.o zreadhb.o
-ZEXMG3	= pzdrive3_ABglobal.o zreadhb.o
-ZEXMG4	= pzdrive4_ABglobal.o zreadhb.o
-
-all: double complex16
-
-double:    pddrive pddrive1 pddrive2 pddrive3 pddrive4 \
-	   pddrive_ABglobal pddrive1_ABglobal pddrive2_ABglobal \
-	   pddrive3_ABglobal pddrive4_ABglobal
-
-complex16: pzdrive pzdrive1 pzdrive2 pzdrive3 pzdrive4 \
-	   pzdrive_ABglobal pzdrive1_ABglobal pzdrive2_ABglobal \
-	   pzdrive3_ABglobal pzdrive4_ABglobal
-
-pddrive: $(DEXM) $(DSUPERLULIB)
-	$(LOADER) $(LOADOPTS) $(DEXM) $(LIBS) -lm -o $@
-
-pddrive1: $(DEXM1) $(DSUPERLULIB)
-	$(LOADER) $(LOADOPTS) $(DEXM1) $(LIBS) -lm -o $@
-
-pddrive2: $(DEXM2) $(DSUPERLULIB)
-	$(LOADER) $(LOADOPTS) $(DEXM2) $(LIBS) -lm -o $@
-
-pddrive3: $(DEXM3) $(DSUPERLULIB)
-	$(LOADER) $(LOADOPTS) $(DEXM3) $(LIBS) -lm -o $@
-
-pddrive4: $(DEXM4) $(DSUPERLULIB)
-	$(LOADER) $(LOADOPTS) $(DEXM4) $(LIBS) -lm -o $@
-
-pddrive_ABglobal: $(DEXMG) $(DSUPERLULIB)
-	$(LOADER) $(LOADOPTS) $(DEXMG) $(LIBS) -lm -o $@
-
-pddrive1_ABglobal: $(DEXMG1) $(DSUPERLULIB)
-	$(LOADER) $(LOADOPTS) $(DEXMG1) $(LIBS) -lm -o $@
-
-pddrive2_ABglobal: $(DEXMG2) $(DSUPERLULIB)
-	$(LOADER) $(LOADOPTS) $(DEXMG2) $(LIBS) -lm -o $@
-
-pddrive3_ABglobal: $(DEXMG3) $(DSUPERLULIB)
-	$(LOADER) $(LOADOPTS) $(DEXMG3) $(LIBS) -lm -o $@
-
-pddrive4_ABglobal: $(DEXMG4) $(DSUPERLULIB)
-	$(LOADER) $(LOADOPTS) $(DEXMG4) $(LIBS) -lm -o $@
-
-pzdrive: $(ZEXM) $(DSUPERLULIB)
-	$(LOADER) $(LOADOPTS) $(ZEXM) $(LIBS) -lm -o $@
-
-pzdrive1: $(ZEXM1) $(DSUPERLULIB)
-	$(LOADER) $(LOADOPTS) $(ZEXM1) $(LIBS) -lm -o $@
-
-pzdrive2: $(ZEXM2) $(DSUPERLULIB)
-	$(LOADER) $(LOADOPTS) $(ZEXM2) $(LIBS) -lm -o $@
-
-pzdrive3: $(ZEXM3) $(DSUPERLULIB)
-	$(LOADER) $(LOADOPTS) $(ZEXM3) $(LIBS) -lm -o $@
-
-pzdrive4: $(ZEXM4) $(DSUPERLULIB)
-	$(LOADER) $(LOADOPTS) $(ZEXM4) $(LIBS) -lm -o $@
-
-pzdrive_ABglobal: $(ZEXMG) $(DSUPERLULIB)
-	$(LOADER) $(LOADOPTS) $(ZEXMG) $(LIBS) -lm -o $@
-
-pzdrive1_ABglobal: $(ZEXMG1) $(DSUPERLULIB)
-	$(LOADER) $(LOADOPTS) $(ZEXMG1) $(LIBS) -lm -o $@
-
-pzdrive2_ABglobal: $(ZEXMG2) $(DSUPERLULIB)
-	$(LOADER) $(LOADOPTS) $(ZEXMG2) $(LIBS) -lm -o $@
-
-pzdrive3_ABglobal: $(ZEXMG3) $(DSUPERLULIB)
-	$(LOADER) $(LOADOPTS) $(ZEXMG3) $(LIBS) -lm -o $@
-
-pzdrive4_ABglobal: $(ZEXMG4) $(DSUPERLULIB)
-	$(LOADER) $(LOADOPTS) $(ZEXMG4) $(LIBS) -lm -o $@
-
-.c.o:
-	$(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) $(INCLUDEDIR) -c $< $(VERBOSE)
-
-.f.o:
-	$(FORTRAN) $(FFLAGS) -c $< $(VERBOSE)
-
-clean:	
-	rm -f *.o p[dz]drive p[dz]drive[1-9] \
-	p[dz]drive_ABglobal p[dz]drive[1-9]_ABglobal
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/dlamch.c hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/dlamch.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/dlamch.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/dlamch.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,963 +0,0 @@
-#include <stdio.h>
-#define TRUE_ (1)
-#define FALSE_ (0)
-#define abs(x) ((x) >= 0 ? (x) : -(x))
-#define min(a,b) ((a) <= (b) ? (a) : (b))
-#define max(a,b) ((a) >= (b) ? (a) : (b))
-
-double dlamch_(char *cmach)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-    Purpose   
-    =======   
-
-    DLAMCH determines double precision machine parameters.   
-
-    Arguments   
-    =========   
-
-    CMACH   (input) CHARACTER*1   
-            Specifies the value to be returned by DLAMCH:   
-            = 'E' or 'e',   DLAMCH := eps   
-            = 'S' or 's ,   DLAMCH := sfmin   
-            = 'B' or 'b',   DLAMCH := base   
-            = 'P' or 'p',   DLAMCH := eps*base   
-            = 'N' or 'n',   DLAMCH := t   
-            = 'R' or 'r',   DLAMCH := rnd   
-            = 'M' or 'm',   DLAMCH := emin   
-            = 'U' or 'u',   DLAMCH := rmin   
-            = 'L' or 'l',   DLAMCH := emax   
-            = 'O' or 'o',   DLAMCH := rmax   
-
-            where   
-
-            eps   = relative machine precision   
-            sfmin = safe minimum, such that 1/sfmin does not overflow   
-            base  = base of the machine   
-            prec  = eps*base   
-            t     = number of (base) digits in the mantissa   
-            rnd   = 1.0 when rounding occurs in addition, 0.0 otherwise   
-            emin  = minimum exponent before (gradual) underflow   
-            rmin  = underflow threshold - base**(emin-1)   
-            emax  = largest exponent before overflow   
-            rmax  = overflow threshold  - (base**emax)*(1-eps)   
-
-   ===================================================================== 
-*/
-
-    static int first = TRUE_;
-
-    /* System generated locals */
-    int i__1;
-    double ret_val;
-    /* Builtin functions */
-    double pow_di(double *, int *);
-    /* Local variables */
-    static double base;
-    static int beta;
-    static double emin, prec, emax;
-    static int imin, imax;
-    static int lrnd;
-    static double rmin, rmax, t, rmach;
-    extern int lsame_(char *, char *);
-    static double small, sfmin;
-    extern /* Subroutine */ int dlamc2_(int *, int *, int *, 
-	    double *, int *, double *, int *, double *);
-    static int it;
-    static double rnd, eps;
-
-    if (first) {
-	first = FALSE_;
-	dlamc2_(&beta, &it, &lrnd, &eps, &imin, &rmin, &imax, &rmax);
-	base = (double) beta;
-	t = (double) it;
-	if (lrnd) {
-	    rnd = 1.;
-	    i__1 = 1 - it;
-	    eps = pow_di(&base, &i__1) / 2;
-	} else {
-	    rnd = 0.;
-	    i__1 = 1 - it;
-	    eps = pow_di(&base, &i__1);
-	}
-	prec = eps * base;
-	emin = (double) imin;
-	emax = (double) imax;
-	sfmin = rmin;
-	small = 1. / rmax;
-	if (small >= sfmin) {
-
-	/* Use SMALL plus a bit, to avoid the possibility of rounding   
-             causing overflow when computing  1/sfmin. */
-	    sfmin = small * (eps + 1.);
-	}
-    }
-
-    if (lsame_(cmach, "E")) {
-	rmach = eps;
-    } else if (lsame_(cmach, "S")) {
-	rmach = sfmin;
-    } else if (lsame_(cmach, "B")) {
-	rmach = base;
-    } else if (lsame_(cmach, "P")) {
-	rmach = prec;
-    } else if (lsame_(cmach, "N")) {
-	rmach = t;
-    } else if (lsame_(cmach, "R")) {
-	rmach = rnd;
-    } else if (lsame_(cmach, "M")) {
-	rmach = emin;
-    } else if (lsame_(cmach, "U")) {
-	rmach = rmin;
-    } else if (lsame_(cmach, "L")) {
-	rmach = emax;
-    } else if (lsame_(cmach, "O")) {
-	rmach = rmax;
-    }
-
-    ret_val = rmach;
-    return ret_val;
-
-/*     End of DLAMCH */
-
-} /* dlamch_ */
-
-
-/* Subroutine */ int dlamc1_(int *beta, int *t, int *rnd, int 
-	*ieee1)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    DLAMC1 determines the machine parameters given by BETA, T, RND, and   
-    IEEE1.   
-
-    Arguments   
-    =========   
-
-    BETA    (output) INT   
-            The base of the machine.   
-
-    T       (output) INT   
-            The number of ( BETA ) digits in the mantissa.   
-
-    RND     (output) INT   
-            Specifies whether proper rounding  ( RND = .TRUE. )  or   
-            chopping  ( RND = .FALSE. )  occurs in addition. This may not 
-  
-            be a reliable guide to the way in which the machine performs 
-  
-            its arithmetic.   
-
-    IEEE1   (output) INT   
-            Specifies whether rounding appears to be done in the IEEE   
-            'round to nearest' style.   
-
-    Further Details   
-    ===============   
-
-    The routine is based on the routine  ENVRON  by Malcolm and   
-    incorporates suggestions by Gentleman and Marovich. See   
-
-       Malcolm M. A. (1972) Algorithms to reveal properties of   
-          floating-point arithmetic. Comms. of the ACM, 15, 949-951.   
-
-       Gentleman W. M. and Marovich S. B. (1974) More on algorithms   
-          that reveal properties of floating point arithmetic units.   
-          Comms. of the ACM, 17, 276-277.   
-
-   ===================================================================== 
-*/
-    /* Initialized data */
-    static int first = TRUE_;
-    /* System generated locals */
-    double d__1, d__2;
-    /* Local variables */
-    static int lrnd;
-    static double a, b, c, f;
-    static int lbeta;
-    static double savec;
-    extern double dlamc3_(double *, double *);
-    static int lieee1;
-    static double t1, t2;
-    static int lt;
-    static double one, qtr;
-
-    if (first) {
-	first = FALSE_;
-	one = 1.;
-
-/*        LBETA,  LIEEE1,  LT and  LRND  are the  local values  of  BE
-TA,   
-          IEEE1, T and RND.   
-
-          Throughout this routine  we use the function  DLAMC3  to ens
-ure   
-          that relevant values are  stored and not held in registers, 
- or   
-          are not affected by optimizers.   
-
-          Compute  a = 2.0**m  with the  smallest positive integer m s
-uch   
-          that   
-
-             fl( a + 1.0 ) = a. */
-
-	a = 1.;
-	c = 1.;
-
-/* +       WHILE( C.EQ.ONE )LOOP */
-L10:
-	if (c == one) {
-	    a *= 2;
-	    c = dlamc3_(&a, &one);
-	    d__1 = -a;
-	    c = dlamc3_(&c, &d__1);
-	    goto L10;
-	}
-/* +       END WHILE   
-
-          Now compute  b = 2.0**m  with the smallest positive integer 
-m   
-          such that   
-
-             fl( a + b ) .gt. a. */
-
-	b = 1.;
-	c = dlamc3_(&a, &b);
-
-/* +       WHILE( C.EQ.A )LOOP */
-L20:
-	if (c == a) {
-	    b *= 2;
-	    c = dlamc3_(&a, &b);
-	    goto L20;
-	}
-/* +       END WHILE   
-
-          Now compute the base.  a and c  are neighbouring floating po
-int   
-          numbers  in the  interval  ( beta**t, beta**( t + 1 ) )  and
- so   
-          their difference is beta. Adding 0.25 to c is to ensure that
- it   
-          is truncated to beta and not ( beta - 1 ). */
-
-	qtr = one / 4;
-	savec = c;
-	d__1 = -a;
-	c = dlamc3_(&c, &d__1);
-	lbeta = (int) (c + qtr);
-
-/*        Now determine whether rounding or chopping occurs,  by addin
-g a   
-          bit  less  than  beta/2  and a  bit  more  than  beta/2  to 
- a. */
-
-	b = (double) lbeta;
-	d__1 = b / 2;
-	d__2 = -b / 100;
-	f = dlamc3_(&d__1, &d__2);
-	c = dlamc3_(&f, &a);
-	if (c == a) {
-	    lrnd = TRUE_;
-	} else {
-	    lrnd = FALSE_;
-	}
-	d__1 = b / 2;
-	d__2 = b / 100;
-	f = dlamc3_(&d__1, &d__2);
-	c = dlamc3_(&f, &a);
-	if (lrnd && c == a) {
-	    lrnd = FALSE_;
-	}
-
-/*        Try and decide whether rounding is done in the  IEEE  'round
- to   
-          nearest' style. B/2 is half a unit in the last place of the 
-two   
-          numbers A and SAVEC. Furthermore, A is even, i.e. has last  
-bit   
-          zero, and SAVEC is odd. Thus adding B/2 to A should not  cha
-nge   
-          A, but adding B/2 to SAVEC should change SAVEC. */
-
-	d__1 = b / 2;
-	t1 = dlamc3_(&d__1, &a);
-	d__1 = b / 2;
-	t2 = dlamc3_(&d__1, &savec);
-	lieee1 = t1 == a && t2 > savec && lrnd;
-
-/*        Now find  the  mantissa, t.  It should  be the  integer part
- of   
-          log to the base beta of a,  however it is safer to determine
-  t   
-          by powering.  So we find t as the smallest positive integer 
-for   
-          which   
-
-             fl( beta**t + 1.0 ) = 1.0. */
-
-	lt = 0;
-	a = 1.;
-	c = 1.;
-
-/* +       WHILE( C.EQ.ONE )LOOP */
-L30:
-	if (c == one) {
-	    ++lt;
-	    a *= lbeta;
-	    c = dlamc3_(&a, &one);
-	    d__1 = -a;
-	    c = dlamc3_(&c, &d__1);
-	    goto L30;
-	}
-/* +       END WHILE */
-
-    }
-
-    *beta = lbeta;
-    *t = lt;
-    *rnd = lrnd;
-    *ieee1 = lieee1;
-    return 0;
-
-/*     End of DLAMC1 */
-
-} /* dlamc1_ */
-
-
-/* Subroutine */ int dlamc2_(int *beta, int *t, int *rnd, 
-	double *eps, int *emin, double *rmin, int *emax, 
-	double *rmax)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    DLAMC2 determines the machine parameters specified in its argument   
-    list.   
-
-    Arguments   
-    =========   
-
-    BETA    (output) INT   
-            The base of the machine.   
-
-    T       (output) INT   
-            The number of ( BETA ) digits in the mantissa.   
-
-    RND     (output) INT   
-            Specifies whether proper rounding  ( RND = .TRUE. )  or   
-            chopping  ( RND = .FALSE. )  occurs in addition. This may not 
-  
-            be a reliable guide to the way in which the machine performs 
-  
-            its arithmetic.   
-
-    EPS     (output) DOUBLE PRECISION   
-            The smallest positive number such that   
-
-               fl( 1.0 - EPS ) .LT. 1.0,   
-
-            where fl denotes the computed value.   
-
-    EMIN    (output) INT   
-            The minimum exponent before (gradual) underflow occurs.   
-
-    RMIN    (output) DOUBLE PRECISION   
-            The smallest normalized number for the machine, given by   
-            BASE**( EMIN - 1 ), where  BASE  is the floating point value 
-  
-            of BETA.   
-
-    EMAX    (output) INT   
-            The maximum exponent before overflow occurs.   
-
-    RMAX    (output) DOUBLE PRECISION   
-            The largest positive number for the machine, given by   
-            BASE**EMAX * ( 1 - EPS ), where  BASE  is the floating point 
-  
-            value of BETA.   
-
-    Further Details   
-    ===============   
-
-    The computation of  EPS  is based on a routine PARANOIA by   
-    W. Kahan of the University of California at Berkeley.   
-
-   ===================================================================== 
-*/
-    /* Table of constant values */
-    static int c__1 = 1;
-    
-    /* Initialized data */
-    static int first = TRUE_;
-    static int iwarn = FALSE_;
-    /* System generated locals */
-    int i__1;
-    double d__1, d__2, d__3, d__4, d__5;
-    /* Builtin functions */
-    double pow_di(double *, int *);
-    /* Local variables */
-    static int ieee;
-    static double half;
-    static int lrnd;
-    static double leps, zero, a, b, c;
-    static int i, lbeta;
-    static double rbase;
-    static int lemin, lemax, gnmin;
-    static double small;
-    static int gpmin;
-    static double third, lrmin, lrmax, sixth;
-    extern /* Subroutine */ int dlamc1_(int *, int *, int *, 
-	    int *);
-    extern double dlamc3_(double *, double *);
-    static int lieee1;
-    extern /* Subroutine */ int dlamc4_(int *, double *, int *), 
-	    dlamc5_(int *, int *, int *, int *, int *, 
-	    double *);
-    static int lt, ngnmin, ngpmin;
-    static double one, two;
-
-    if (first) {
-	first = FALSE_;
-	zero = 0.;
-	one = 1.;
-	two = 2.;
-
-/*        LBETA, LT, LRND, LEPS, LEMIN and LRMIN  are the local values
- of   
-          BETA, T, RND, EPS, EMIN and RMIN.   
-
-          Throughout this routine  we use the function  DLAMC3  to ens
-ure   
-          that relevant values are stored  and not held in registers, 
- or   
-          are not affected by optimizers.   
-
-          DLAMC1 returns the parameters  LBETA, LT, LRND and LIEEE1. 
-*/
-
-	dlamc1_(&lbeta, &lt, &lrnd, &lieee1);
-
-/*        Start to find EPS. */
-
-	b = (double) lbeta;
-	i__1 = -lt;
-	a = pow_di(&b, &i__1);
-	leps = a;
-
-/*        Try some tricks to see whether or not this is the correct  E
-PS. */
-
-	b = two / 3;
-	half = one / 2;
-	d__1 = -half;
-	sixth = dlamc3_(&b, &d__1);
-	third = dlamc3_(&sixth, &sixth);
-	d__1 = -half;
-	b = dlamc3_(&third, &d__1);
-	b = dlamc3_(&b, &sixth);
-	b = abs(b);
-	if (b < leps) {
-	    b = leps;
-	}
-
-	leps = 1.;
-
-/* +       WHILE( ( LEPS.GT.B ).AND.( B.GT.ZERO ) )LOOP */
-L10:
-	if (leps > b && b > zero) {
-	    leps = b;
-	    d__1 = half * leps;
-/* Computing 5th power */
-	    d__3 = two, d__4 = d__3, d__3 *= d__3;
-/* Computing 2nd power */
-	    d__5 = leps;
-	    d__2 = d__4 * (d__3 * d__3) * (d__5 * d__5);
-	    c = dlamc3_(&d__1, &d__2);
-	    d__1 = -c;
-	    c = dlamc3_(&half, &d__1);
-	    b = dlamc3_(&half, &c);
-	    d__1 = -b;
-	    c = dlamc3_(&half, &d__1);
-	    b = dlamc3_(&half, &c);
-	    goto L10;
-	}
-/* +       END WHILE */
-
-	if (a < leps) {
-	    leps = a;
-	}
-
-/*        Computation of EPS complete.   
-
-          Now find  EMIN.  Let A = + or - 1, and + or - (1 + BASE**(-3
-)).   
-          Keep dividing  A by BETA until (gradual) underflow occurs. T
-his   
-          is detected when we cannot recover the previous A. */
-
-	rbase = one / lbeta;
-	small = one;
-	for (i = 1; i <= 3; ++i) {
-	    d__1 = small * rbase;
-	    small = dlamc3_(&d__1, &zero);
-/* L20: */
-	}
-	a = dlamc3_(&one, &small);
-	dlamc4_(&ngpmin, &one, &lbeta);
-	d__1 = -one;
-	dlamc4_(&ngnmin, &d__1, &lbeta);
-	dlamc4_(&gpmin, &a, &lbeta);
-	d__1 = -a;
-	dlamc4_(&gnmin, &d__1, &lbeta);
-	ieee = FALSE_;
-
-	if (ngpmin == ngnmin && gpmin == gnmin) {
-	    if (ngpmin == gpmin) {
-		lemin = ngpmin;
-/*            ( Non twos-complement machines, no gradual under
-flow;   
-                e.g.,  VAX ) */
-	    } else if (gpmin - ngpmin == 3) {
-		lemin = ngpmin - 1 + lt;
-		ieee = TRUE_;
-/*            ( Non twos-complement machines, with gradual und
-erflow;   
-                e.g., IEEE standard followers ) */
-	    } else {
-		lemin = min(ngpmin,gpmin);
-/*            ( A guess; no known machine ) */
-		iwarn = TRUE_;
-	    }
-
-	} else if (ngpmin == gpmin && ngnmin == gnmin) {
-	    if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1) {
-		lemin = max(ngpmin,ngnmin);
-/*            ( Twos-complement machines, no gradual underflow
-;   
-                e.g., CYBER 205 ) */
-	    } else {
-		lemin = min(ngpmin,ngnmin);
-/*            ( A guess; no known machine ) */
-		iwarn = TRUE_;
-	    }
-
-	} else if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1 && gpmin == gnmin)
-		 {
-	    if (gpmin - min(ngpmin,ngnmin) == 3) {
-		lemin = max(ngpmin,ngnmin) - 1 + lt;
-/*            ( Twos-complement machines with gradual underflo
-w;   
-                no known machine ) */
-	    } else {
-		lemin = min(ngpmin,ngnmin);
-/*            ( A guess; no known machine ) */
-		iwarn = TRUE_;
-	    }
-
-	} else {
-/* Computing MIN */
-	    i__1 = min(ngpmin,ngnmin), i__1 = min(i__1,gpmin);
-	    lemin = min(i__1,gnmin);
-/*         ( A guess; no known machine ) */
-	    iwarn = TRUE_;
-	}
-/* **   
-   Comment out this if block if EMIN is ok */
-	if (iwarn) {
-	    first = TRUE_;
-	    printf("\n\n WARNING. The value EMIN may be incorrect:- ");
-	    printf("EMIN = %8i\n",lemin);
-	    printf("If, after inspection, the value EMIN looks acceptable");
-            printf("please comment out \n the IF block as marked within the"); 
-            printf("code of routine DLAMC2, \n otherwise supply EMIN"); 
-            printf("explicitly.\n");
-	}
-/* **   
-
-          Assume IEEE arithmetic if we found denormalised  numbers abo
-ve,   
-          or if arithmetic seems to round in the  IEEE style,  determi
-ned   
-          in routine DLAMC1. A true IEEE machine should have both  thi
-ngs   
-          true; however, faulty machines may have one or the other. */
-
-	ieee = ieee || lieee1;
-
-/*        Compute  RMIN by successive division by  BETA. We could comp
-ute   
-          RMIN as BASE**( EMIN - 1 ),  but some machines underflow dur
-ing   
-          this computation. */
-
-	lrmin = 1.;
-	i__1 = 1 - lemin;
-	for (i = 1; i <= 1-lemin; ++i) {
-	    d__1 = lrmin * rbase;
-	    lrmin = dlamc3_(&d__1, &zero);
-/* L30: */
-	}
-
-/*        Finally, call DLAMC5 to compute EMAX and RMAX. */
-
-	dlamc5_(&lbeta, &lt, &lemin, &ieee, &lemax, &lrmax);
-    }
-
-    *beta = lbeta;
-    *t = lt;
-    *rnd = lrnd;
-    *eps = leps;
-    *emin = lemin;
-    *rmin = lrmin;
-    *emax = lemax;
-    *rmax = lrmax;
-
-    return 0;
-
-
-/*     End of DLAMC2 */
-
-} /* dlamc2_ */
-
-
-double dlamc3_(double *a, double *b)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    DLAMC3  is intended to force  A  and  B  to be stored prior to doing 
-  
-    the addition of  A  and  B ,  for use in situations where optimizers 
-  
-    might hold one of these in a register.   
-
-    Arguments   
-    =========   
-
-    A, B    (input) DOUBLE PRECISION   
-            The values A and B.   
-
-   ===================================================================== 
-*/
-/* >>Start of File<<   
-       System generated locals */
-    double ret_val;
-
-    ret_val = *a + *b;
-
-    return ret_val;
-
-/*     End of DLAMC3 */
-
-} /* dlamc3_ */
-
-
-/* Subroutine */ int dlamc4_(int *emin, double *start, int *base)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    DLAMC4 is a service routine for DLAMC2.   
-
-    Arguments   
-    =========   
-
-    EMIN    (output) EMIN   
-            The minimum exponent before (gradual) underflow, computed by 
-  
-            setting A = START and dividing by BASE until the previous A   
-            can not be recovered.   
-
-    START   (input) DOUBLE PRECISION   
-            The starting point for determining EMIN.   
-
-    BASE    (input) INT   
-            The base of the machine.   
-
-   ===================================================================== 
-*/
-    /* System generated locals */
-    int i__1;
-    double d__1;
-    /* Local variables */
-    static double zero, a;
-    static int i;
-    static double rbase, b1, b2, c1, c2, d1, d2;
-    extern double dlamc3_(double *, double *);
-    static double one;
-
-    a = *start;
-    one = 1.;
-    rbase = one / *base;
-    zero = 0.;
-    *emin = 1;
-    d__1 = a * rbase;
-    b1 = dlamc3_(&d__1, &zero);
-    c1 = a;
-    c2 = a;
-    d1 = a;
-    d2 = a;
-/* +    WHILE( ( C1.EQ.A ).AND.( C2.EQ.A ).AND.   
-      $       ( D1.EQ.A ).AND.( D2.EQ.A )      )LOOP */
-L10:
-    if (c1 == a && c2 == a && d1 == a && d2 == a) {
-	--(*emin);
-	a = b1;
-	d__1 = a / *base;
-	b1 = dlamc3_(&d__1, &zero);
-	d__1 = b1 * *base;
-	c1 = dlamc3_(&d__1, &zero);
-	d1 = zero;
-	i__1 = *base;
-	for (i = 1; i <= *base; ++i) {
-	    d1 += b1;
-/* L20: */
-	}
-	d__1 = a * rbase;
-	b2 = dlamc3_(&d__1, &zero);
-	d__1 = b2 / rbase;
-	c2 = dlamc3_(&d__1, &zero);
-	d2 = zero;
-	i__1 = *base;
-	for (i = 1; i <= *base; ++i) {
-	    d2 += b2;
-/* L30: */
-	}
-	goto L10;
-    }
-/* +    END WHILE */
-
-    return 0;
-
-/*     End of DLAMC4 */
-
-} /* dlamc4_ */
-
-
-/* Subroutine */ int dlamc5_(int *beta, int *p, int *emin, 
-	int *ieee, int *emax, double *rmax)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    DLAMC5 attempts to compute RMAX, the largest machine floating-point   
-    number, without overflow.  It assumes that EMAX + abs(EMIN) sum   
-    approximately to a power of 2.  It will fail on machines where this   
-    assumption does not hold, for example, the Cyber 205 (EMIN = -28625, 
-  
-    EMAX = 28718).  It will also fail if the value supplied for EMIN is   
-    too large (i.e. too close to zero), probably with overflow.   
-
-    Arguments   
-    =========   
-
-    BETA    (input) INT   
-            The base of floating-point arithmetic.   
-
-    P       (input) INT   
-            The number of base BETA digits in the mantissa of a   
-            floating-point value.   
-
-    EMIN    (input) INT   
-            The minimum exponent before (gradual) underflow.   
-
-    IEEE    (input) INT   
-            A int flag specifying whether or not the arithmetic   
-            system is thought to comply with the IEEE standard.   
-
-    EMAX    (output) INT   
-            The largest exponent before overflow   
-
-    RMAX    (output) DOUBLE PRECISION   
-            The largest machine floating-point number.   
-
-   ===================================================================== 
-  
-
-
-       First compute LEXP and UEXP, two powers of 2 that bound   
-       abs(EMIN). We then assume that EMAX + abs(EMIN) will sum   
-       approximately to the bound that is closest to abs(EMIN).   
-       (EMAX is the exponent of the required number RMAX). */
-    /* Table of constant values */
-    static double c_b5 = 0.;
-    
-    /* System generated locals */
-    int i__1;
-    double d__1;
-    /* Local variables */
-    static int lexp;
-    static double oldy;
-    static int uexp, i;
-    static double y, z;
-    static int nbits;
-    extern double dlamc3_(double *, double *);
-    static double recbas;
-    static int exbits, expsum, try__;
-
-
-
-    lexp = 1;
-    exbits = 1;
-L10:
-    try__ = lexp << 1;
-    if (try__ <= -(*emin)) {
-	lexp = try__;
-	++exbits;
-	goto L10;
-    }
-    if (lexp == -(*emin)) {
-	uexp = lexp;
-    } else {
-	uexp = try__;
-	++exbits;
-    }
-
-/*     Now -LEXP is less than or equal to EMIN, and -UEXP is greater   
-       than or equal to EMIN. EXBITS is the number of bits needed to   
-       store the exponent. */
-
-    if (uexp + *emin > -lexp - *emin) {
-	expsum = lexp << 1;
-    } else {
-	expsum = uexp << 1;
-    }
-
-/*     EXPSUM is the exponent range, approximately equal to   
-       EMAX - EMIN + 1 . */
-
-    *emax = expsum + *emin - 1;
-    nbits = exbits + 1 + *p;
-
-/*     NBITS is the total number of bits needed to store a   
-       floating-point number. */
-
-    if (nbits % 2 == 1 && *beta == 2) {
-
-/*        Either there are an odd number of bits used to store a   
-          floating-point number, which is unlikely, or some bits are 
-  
-          not used in the representation of numbers, which is possible
-,   
-          (e.g. Cray machines) or the mantissa has an implicit bit,   
-          (e.g. IEEE machines, Dec Vax machines), which is perhaps the
-   
-          most likely. We have to assume the last alternative.   
-          If this is true, then we need to reduce EMAX by one because 
-  
-          there must be some way of representing zero in an implicit-b
-it   
-          system. On machines like Cray, we are reducing EMAX by one 
-  
-          unnecessarily. */
-
-	--(*emax);
-    }
-
-    if (*ieee) {
-
-/*        Assume we are on an IEEE machine which reserves one exponent
-   
-          for infinity and NaN. */
-
-	--(*emax);
-    }
-
-/*     Now create RMAX, the largest machine number, which should   
-       be equal to (1.0 - BETA**(-P)) * BETA**EMAX .   
-
-       First compute 1.0 - BETA**(-P), being careful that the   
-       result is less than 1.0 . */
-
-    recbas = 1. / *beta;
-    z = *beta - 1.;
-    y = 0.;
-    i__1 = *p;
-    for (i = 1; i <= *p; ++i) {
-	z *= recbas;
-	if (y < 1.) {
-	    oldy = y;
-	}
-	y = dlamc3_(&y, &z);
-/* L20: */
-    }
-    if (y >= 1.) {
-	y = oldy;
-    }
-
-/*     Now multiply by BETA**EMAX to get RMAX. */
-
-    i__1 = *emax;
-    for (i = 1; i <= *emax; ++i) {
-	d__1 = y * *beta;
-	y = dlamc3_(&d__1, &c_b5);
-/* L30: */
-    }
-
-    *rmax = y;
-    return 0;
-
-/*     End of DLAMC5 */
-
-} /* dlamc5_ */
-
-double pow_di(double *ap, int *bp)
-{
-    double pow, x;
-    int n;
-
-    pow = 1;
-    x = *ap;
-    n = *bp;
-
-    if(n != 0){
-	if(n < 0) {
-	    n = -n;
-	    x = 1/x;
-	}
-	for( ; ; ) {
-	    if(n & 01) pow *= x;
-	    if(n >>= 1)	x *= x;
-	    else break;
-	}
-    }
-    return(pow);
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/dlamchtst.c hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/dlamchtst.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/dlamchtst.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/dlamchtst.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,34 +0,0 @@
-#include <stdio.h>
-
-main()
-{
-    /* Local variables */
-    double base, emin, prec, emax, rmin, rmax, t, sfmin;
-    extern double dlamch_(char *);
-    double rnd, eps;
-
-    eps = dlamch_("Epsilon");
-    sfmin = dlamch_("Safe minimum");
-    base = dlamch_("Base");
-    prec = dlamch_("Precision");
-    t = dlamch_("Number of digits in mantissa");
-    rnd = dlamch_("Rounding mode");
-    emin = dlamch_("Minnimum exponent");
-    rmin = dlamch_("Underflow threshold");
-    emax = dlamch_("Largest exponent");
-    rmax = dlamch_("Overflow threshold");
-
-    printf(" Epsilon                      = %e\n", eps);
-    printf(" Safe minimum                 = %e\n", sfmin);
-    printf(" Base                         = %.0f\n", base);
-    printf(" Precision                    = %e\n", prec);
-    printf(" Number of digits in mantissa = %.0f\n", t);
-    printf(" Rounding mode                = %.0f\n", rnd);
-    printf(" Minimum exponent             = %.0f\n", emin);
-    printf(" Underflow threshold          = %e\n", rmin);
-    printf(" Largest exponent             = %.0f\n", emax);
-    printf(" Overflow threshold           = %e\n", rmax);
-    printf(" Reciprocal of safe minimum   = %e\n", 1./sfmin);
-
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/install.csh hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/install.csh
--- hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/install.csh	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/install.csh	1970-01-01 00:00:00.000000000 +0000
@@ -1,14 +0,0 @@
-#! /bin/csh
-
-set ofile = install.out			# output file
-
-echo '---- SINGLE PRECISION' >! $ofile
-./testslamch >> $ofile
-echo '' >> $ofile
-echo ---- DOUBLE PRECISION >> $ofile
-./testdlamch >> $ofile
-echo '' >> $ofile
-echo ---- TIMER >> $ofile
-./testtimer >> $ofile
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/lsame.c hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/lsame.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/lsame.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/lsame.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,70 +0,0 @@
-int lsame_(char *ca, char *cb)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       September 30, 1994   
-
-    Purpose   
-    =======   
-
-    LSAME returns .TRUE. if CA is the same letter as CB regardless of case.   
-
-    Arguments   
-    =========   
-
-    CA      (input) CHARACTER*1   
-    CB      (input) CHARACTER*1   
-            CA and CB specify the single characters to be compared.   
-
-   ===================================================================== 
-*/  
-
-    /* System generated locals */
-    int ret_val;
-    
-    /* Local variables */
-    int inta, intb, zcode;
-
-    ret_val = *(unsigned char *)ca == *(unsigned char *)cb;
-    if (ret_val) {
-	return ret_val;
-    }
-
-    /* Now test for equivalence if both characters are alphabetic. */
-
-    zcode = 'Z';
-
-    /* Use 'Z' rather than 'A' so that ASCII can be detected on Prime   
-       machines, on which ICHAR returns a value with bit 8 set.   
-       ICHAR('A') on Prime machines returns 193 which is the same as   
-       ICHAR('A') on an EBCDIC machine. */
-
-    inta = *(unsigned char *)ca;
-    intb = *(unsigned char *)cb;
-
-    if (zcode == 90 || zcode == 122) {
-	/* ASCII is assumed - ZCODE is the ASCII code of either lower or   
-          upper case 'Z'. */
-	if (inta >= 97 && inta <= 122) inta += -32;
-	if (intb >= 97 && intb <= 122) intb += -32;
-
-    } else if (zcode == 233 || zcode == 169) {
-	/* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or   
-          upper case 'Z'. */
-	if (inta >= 129 && inta <= 137 || inta >= 145 && inta <= 153 || inta 
-		>= 162 && inta <= 169)
-	    inta += 64;
-	if (intb >= 129 && intb <= 137 || intb >= 145 && intb <= 153 || intb 
-		>= 162 && intb <= 169)
-	    intb += 64;
-    } else if (zcode == 218 || zcode == 250) {
-	/* ASCII is assumed, on Prime machines - ZCODE is the ASCII code   
-          plus 128 of either lower or upper case 'Z'. */
-	if (inta >= 225 && inta <= 250) inta += -32;
-	if (intb >= 225 && intb <= 250) intb += -32;
-    }
-    ret_val = inta == intb;
-    return ret_val;
-    
-} /* lsame_ */
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/Makefile hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/Makefile
--- hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/Makefile	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/Makefile	1970-01-01 00:00:00.000000000 +0000
@@ -1,27 +0,0 @@
-include ../make.inc
-
-all:  testdlamch testslamch testtimer #install.out
-
-testdlamch: dlamch.o lsame.o dlamchtst.o
-	$(LOADER) $(LOADOPTS) -o testdlamch dlamch.o lsame.o dlamchtst.o
-
-testslamch: slamch.o lsame.o slamchtst.o
-	$(LOADER) $(LOADOPTS) -o testslamch slamch.o lsame.o slamchtst.o
-
-testtimer: superlu_timer.o timertst.o
-	$(LOADER) $(LOADOPTS) -o testtimer superlu_timer.o timertst.o \
-	$(MPILIB)
-
-install.out:
-	@echo Testing machines parameters and timer
-	csh install.csh
-
-slamch.o: slamch.c ; $(CC) $(NOOPTS) -c $<
-dlamch.o: dlamch.c ; $(CC) $(NOOPTS) -c $<
-superlu_timer.o: superlu_timer.c; $(CC) $(NOOPTS) -c $<
-
-.c.o:
-	$(CC) $(CFLAGS) -c $<
-
-clean:
-	rm -f *.o test* *.out
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/slamch.c hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/slamch.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/slamch.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/slamch.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,982 +0,0 @@
-#include <stdio.h>
-#define TRUE_ (1)
-#define FALSE_ (0)
-#define min(a,b) ((a) <= (b) ? (a) : (b))
-#define max(a,b) ((a) >= (b) ? (a) : (b))
-#define abs(x) ((x) >= 0 ? (x) : -(x))
-#define dabs(x) (double)abs(x)
-
-double slamch_(char *cmach)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    SLAMCH determines single precision machine parameters.   
-
-    Arguments   
-    =========   
-
-    CMACH   (input) CHARACTER*1   
-            Specifies the value to be returned by SLAMCH:   
-            = 'E' or 'e',   SLAMCH := eps   
-            = 'S' or 's ,   SLAMCH := sfmin   
-            = 'B' or 'b',   SLAMCH := base   
-            = 'P' or 'p',   SLAMCH := eps*base   
-            = 'N' or 'n',   SLAMCH := t   
-            = 'R' or 'r',   SLAMCH := rnd   
-            = 'M' or 'm',   SLAMCH := emin   
-            = 'U' or 'u',   SLAMCH := rmin   
-            = 'L' or 'l',   SLAMCH := emax   
-            = 'O' or 'o',   SLAMCH := rmax   
-
-            where   
-
-            eps   = relative machine precision   
-            sfmin = safe minimum, such that 1/sfmin does not overflow   
-            base  = base of the machine   
-            prec  = eps*base   
-            t     = number of (base) digits in the mantissa   
-            rnd   = 1.0 when rounding occurs in addition, 0.0 otherwise   
-            emin  = minimum exponent before (gradual) underflow   
-            rmin  = underflow threshold - base**(emin-1)   
-            emax  = largest exponent before overflow   
-            rmax  = overflow threshold  - (base**emax)*(1-eps)   
-
-   ===================================================================== 
-*/
-/* >>Start of File<<   
-       Initialized data */
-    static int first = TRUE_;
-    /* System generated locals */
-    int i__1;
-    float ret_val;
-    /* Builtin functions */
-    double pow_ri(float *, int *);
-    /* Local variables */
-    static float base;
-    static int beta;
-    static float emin, prec, emax;
-    static int imin, imax;
-    static int lrnd;
-    static float rmin, rmax, t, rmach;
-    extern int lsame_(char *, char *);
-    static float small, sfmin;
-    extern /* Subroutine */ int slamc2_(int *, int *, int *, float 
-	    *, int *, float *, int *, float *);
-    static int it;
-    static float rnd, eps;
-
-
-
-    if (first) {
-	first = FALSE_;
-	slamc2_(&beta, &it, &lrnd, &eps, &imin, &rmin, &imax, &rmax);
-	base = (float) beta;
-	t = (float) it;
-	if (lrnd) {
-	    rnd = 1.f;
-	    i__1 = 1 - it;
-	    eps = pow_ri(&base, &i__1) / 2;
-	} else {
-	    rnd = 0.f;
-	    i__1 = 1 - it;
-	    eps = pow_ri(&base, &i__1);
-	}
-	prec = eps * base;
-	emin = (float) imin;
-	emax = (float) imax;
-	sfmin = rmin;
-	small = 1.f / rmax;
-	if (small >= sfmin) {
-
-/*           Use SMALL plus a bit, to avoid the possibility of rou
-nding   
-             causing overflow when computing  1/sfmin. */
-
-	    sfmin = small * (eps + 1.f);
-	}
-    }
-
-    if (lsame_(cmach, "E")) {
-	rmach = eps;
-    } else if (lsame_(cmach, "S")) {
-	rmach = sfmin;
-    } else if (lsame_(cmach, "B")) {
-	rmach = base;
-    } else if (lsame_(cmach, "P")) {
-	rmach = prec;
-    } else if (lsame_(cmach, "N")) {
-	rmach = t;
-    } else if (lsame_(cmach, "R")) {
-	rmach = rnd;
-    } else if (lsame_(cmach, "M")) {
-	rmach = emin;
-    } else if (lsame_(cmach, "U")) {
-	rmach = rmin;
-    } else if (lsame_(cmach, "L")) {
-	rmach = emax;
-    } else if (lsame_(cmach, "O")) {
-	rmach = rmax;
-    }
-
-    ret_val = rmach;
-    return ret_val;
-
-/*     End of SLAMCH */
-
-} /* slamch_ */
-
-
-/* Subroutine */ int slamc1_(int *beta, int *t, int *rnd, int 
-	*ieee1)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    SLAMC1 determines the machine parameters given by BETA, T, RND, and   
-    IEEE1.   
-
-    Arguments   
-    =========   
-
-    BETA    (output) INT   
-            The base of the machine.   
-
-    T       (output) INT   
-            The number of ( BETA ) digits in the mantissa.   
-
-    RND     (output) INT   
-            Specifies whether proper rounding  ( RND = .TRUE. )  or   
-            chopping  ( RND = .FALSE. )  occurs in addition. This may not 
-  
-            be a reliable guide to the way in which the machine performs 
-  
-            its arithmetic.   
-
-    IEEE1   (output) INT   
-            Specifies whether rounding appears to be done in the IEEE   
-            'round to nearest' style.   
-
-    Further Details   
-    ===============   
-
-    The routine is based on the routine  ENVRON  by Malcolm and   
-    incorporates suggestions by Gentleman and Marovich. See   
-
-       Malcolm M. A. (1972) Algorithms to reveal properties of   
-          floating-point arithmetic. Comms. of the ACM, 15, 949-951.   
-
-       Gentleman W. M. and Marovich S. B. (1974) More on algorithms   
-          that reveal properties of floating point arithmetic units.   
-          Comms. of the ACM, 17, 276-277.   
-
-   ===================================================================== 
-*/
-    /* Initialized data */
-    static int first = TRUE_;
-    /* System generated locals */
-    float r__1, r__2;
-    /* Local variables */
-    static int lrnd;
-    static float a, b, c, f;
-    static int lbeta;
-    static float savec;
-    static int lieee1;
-    static float t1, t2;
-    extern double slamc3_(float *, float *);
-    static int lt;
-    static float one, qtr;
-
-
-
-    if (first) {
-	first = FALSE_;
-	one = 1.f;
-
-/*        LBETA,  LIEEE1,  LT and  LRND  are the  local values  of  BE
-TA,   
-          IEEE1, T and RND.   
-
-          Throughout this routine  we use the function  SLAMC3  to ens
-ure   
-          that relevant values are  stored and not held in registers, 
- or   
-          are not affected by optimizers.   
-
-          Compute  a = 2.0**m  with the  smallest positive integer m s
-uch   
-          that   
-
-             fl( a + 1.0 ) = a. */
-
-	a = 1.f;
-	c = 1.f;
-
-/* +       WHILE( C.EQ.ONE )LOOP */
-L10:
-	if (c == one) {
-	    a *= 2;
-	    c = slamc3_(&a, &one);
-	    r__1 = -(double)a;
-	    c = slamc3_(&c, &r__1);
-	    goto L10;
-	}
-/* +       END WHILE   
-
-          Now compute  b = 2.0**m  with the smallest positive integer 
-m   
-          such that   
-
-             fl( a + b ) .gt. a. */
-
-	b = 1.f;
-	c = slamc3_(&a, &b);
-
-/* +       WHILE( C.EQ.A )LOOP */
-L20:
-	if (c == a) {
-	    b *= 2;
-	    c = slamc3_(&a, &b);
-	    goto L20;
-	}
-/* +       END WHILE   
-
-          Now compute the base.  a and c  are neighbouring floating po
-int   
-          numbers  in the  interval  ( beta**t, beta**( t + 1 ) )  and
- so   
-          their difference is beta. Adding 0.25 to c is to ensure that
- it   
-          is truncated to beta and not ( beta - 1 ). */
-
-	qtr = one / 4;
-	savec = c;
-	r__1 = -(double)a;
-	c = slamc3_(&c, &r__1);
-	lbeta = c + qtr;
-
-/*        Now determine whether rounding or chopping occurs,  by addin
-g a   
-          bit  less  than  beta/2  and a  bit  more  than  beta/2  to 
- a. */
-
-	b = (float) lbeta;
-	r__1 = b / 2;
-	r__2 = -(double)b / 100;
-	f = slamc3_(&r__1, &r__2);
-	c = slamc3_(&f, &a);
-	if (c == a) {
-	    lrnd = TRUE_;
-	} else {
-	    lrnd = FALSE_;
-	}
-	r__1 = b / 2;
-	r__2 = b / 100;
-	f = slamc3_(&r__1, &r__2);
-	c = slamc3_(&f, &a);
-	if (lrnd && c == a) {
-	    lrnd = FALSE_;
-	}
-
-/*        Try and decide whether rounding is done in the  IEEE  'round
- to   
-          nearest' style. B/2 is half a unit in the last place of the 
-two   
-          numbers A and SAVEC. Furthermore, A is even, i.e. has last  
-bit   
-          zero, and SAVEC is odd. Thus adding B/2 to A should not  cha
-nge   
-          A, but adding B/2 to SAVEC should change SAVEC. */
-
-	r__1 = b / 2;
-	t1 = slamc3_(&r__1, &a);
-	r__1 = b / 2;
-	t2 = slamc3_(&r__1, &savec);
-	lieee1 = t1 == a && t2 > savec && lrnd;
-
-/*        Now find  the  mantissa, t.  It should  be the  integer part
- of   
-          log to the base beta of a,  however it is safer to determine
-  t   
-          by powering.  So we find t as the smallest positive integer 
-for   
-          which   
-
-             fl( beta**t + 1.0 ) = 1.0. */
-
-	lt = 0;
-	a = 1.f;
-	c = 1.f;
-
-/* +       WHILE( C.EQ.ONE )LOOP */
-L30:
-	if (c == one) {
-	    ++lt;
-	    a *= lbeta;
-	    c = slamc3_(&a, &one);
-	    r__1 = -(double)a;
-	    c = slamc3_(&c, &r__1);
-	    goto L30;
-	}
-/* +       END WHILE */
-
-    }
-
-    *beta = lbeta;
-    *t = lt;
-    *rnd = lrnd;
-    *ieee1 = lieee1;
-    return 0;
-
-/*     End of SLAMC1 */
-
-} /* slamc1_ */
-
-
-/* Subroutine */ int slamc2_(int *beta, int *t, int *rnd, float *
-	eps, int *emin, float *rmin, int *emax, float *rmax)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    SLAMC2 determines the machine parameters specified in its argument   
-    list.   
-
-    Arguments   
-    =========   
-
-    BETA    (output) INT   
-            The base of the machine.   
-
-    T       (output) INT   
-            The number of ( BETA ) digits in the mantissa.   
-
-    RND     (output) INT   
-            Specifies whether proper rounding  ( RND = .TRUE. )  or   
-            chopping  ( RND = .FALSE. )  occurs in addition. This may not 
-  
-            be a reliable guide to the way in which the machine performs 
-  
-            its arithmetic.   
-
-    EPS     (output) FLOAT   
-            The smallest positive number such that   
-
-               fl( 1.0 - EPS ) .LT. 1.0,   
-
-            where fl denotes the computed value.   
-
-    EMIN    (output) INT   
-            The minimum exponent before (gradual) underflow occurs.   
-
-    RMIN    (output) FLOAT   
-            The smallest normalized number for the machine, given by   
-            BASE**( EMIN - 1 ), where  BASE  is the floating point value 
-  
-            of BETA.   
-
-    EMAX    (output) INT   
-            The maximum exponent before overflow occurs.   
-
-    RMAX    (output) FLOAT   
-            The largest positive number for the machine, given by   
-            BASE**EMAX * ( 1 - EPS ), where  BASE  is the floating point 
-  
-            value of BETA.   
-
-    Further Details   
-    ===============   
-
-    The computation of  EPS  is based on a routine PARANOIA by   
-    W. Kahan of the University of California at Berkeley.   
-
-   ===================================================================== 
-*/
-    /* Table of constant values */
-    static int c__1 = 1;
-    
-    /* Initialized data */
-    static int first = TRUE_;
-    static int iwarn = FALSE_;
-    /* System generated locals */
-    int i__1;
-    float r__1, r__2, r__3, r__4, r__5;
-    /* Builtin functions */
-    double pow_ri(float *, int *);
-    /* Local variables */
-    static int ieee;
-    static float half;
-    static int lrnd;
-    static float leps, zero, a, b, c;
-    static int i, lbeta;
-    static float rbase;
-    static int lemin, lemax, gnmin;
-    static float small;
-    static int gpmin;
-    static float third, lrmin, lrmax, sixth;
-    static int lieee1;
-    extern /* Subroutine */ int slamc1_(int *, int *, int *, 
-	    int *);
-    extern double slamc3_(float *, float *);
-    extern /* Subroutine */ int slamc4_(int *, float *, int *), 
-	    slamc5_(int *, int *, int *, int *, int *, 
-	    float *);
-    static int lt, ngnmin, ngpmin;
-    static float one, two;
-
-
-
-    if (first) {
-	first = FALSE_;
-	zero = 0.f;
-	one = 1.f;
-	two = 2.f;
-
-/*        LBETA, LT, LRND, LEPS, LEMIN and LRMIN  are the local values
- of   
-          BETA, T, RND, EPS, EMIN and RMIN.   
-
-          Throughout this routine  we use the function  SLAMC3  to ens
-ure   
-          that relevant values are stored  and not held in registers, 
- or   
-          are not affected by optimizers.   
-
-          SLAMC1 returns the parameters  LBETA, LT, LRND and LIEEE1. 
-*/
-
-	slamc1_(&lbeta, &lt, &lrnd, &lieee1);
-
-/*        Start to find EPS. */
-
-	b = (float) lbeta;
-	i__1 = -lt;
-	a = pow_ri(&b, &i__1);
-	leps = a;
-
-/*        Try some tricks to see whether or not this is the correct  E
-PS. */
-
-	b = two / 3;
-	half = one / 2;
-	r__1 = -(double)half;
-	sixth = slamc3_(&b, &r__1);
-	third = slamc3_(&sixth, &sixth);
-	r__1 = -(double)half;
-	b = slamc3_(&third, &r__1);
-	b = slamc3_(&b, &sixth);
-	b = dabs(b);
-	if (b < leps) {
-	    b = leps;
-	}
-
-	leps = 1.f;
-
-/* +       WHILE( ( LEPS.GT.B ).AND.( B.GT.ZERO ) )LOOP */
-L10:
-	if (leps > b && b > zero) {
-	    leps = b;
-	    r__1 = half * leps;
-/* Computing 5th power */
-	    r__3 = two, r__4 = r__3, r__3 *= r__3;
-/* Computing 2nd power */
-	    r__5 = leps;
-	    r__2 = r__4 * (r__3 * r__3) * (r__5 * r__5);
-	    c = slamc3_(&r__1, &r__2);
-	    r__1 = -(double)c;
-	    c = slamc3_(&half, &r__1);
-	    b = slamc3_(&half, &c);
-	    r__1 = -(double)b;
-	    c = slamc3_(&half, &r__1);
-	    b = slamc3_(&half, &c);
-	    goto L10;
-	}
-/* +       END WHILE */
-
-	if (a < leps) {
-	    leps = a;
-	}
-
-/*        Computation of EPS complete.   
-
-          Now find  EMIN.  Let A = + or - 1, and + or - (1 + BASE**(-3
-)).   
-          Keep dividing  A by BETA until (gradual) underflow occurs. T
-his   
-          is detected when we cannot recover the previous A. */
-
-	rbase = one / lbeta;
-	small = one;
-	for (i = 1; i <= 3; ++i) {
-	    r__1 = small * rbase;
-	    small = slamc3_(&r__1, &zero);
-/* L20: */
-	}
-	a = slamc3_(&one, &small);
-	slamc4_(&ngpmin, &one, &lbeta);
-	r__1 = -(double)one;
-	slamc4_(&ngnmin, &r__1, &lbeta);
-	slamc4_(&gpmin, &a, &lbeta);
-	r__1 = -(double)a;
-	slamc4_(&gnmin, &r__1, &lbeta);
-	ieee = FALSE_;
-
-	if (ngpmin == ngnmin && gpmin == gnmin) {
-	    if (ngpmin == gpmin) {
-		lemin = ngpmin;
-/*            ( Non twos-complement machines, no gradual under
-flow;   
-                e.g.,  VAX ) */
-	    } else if (gpmin - ngpmin == 3) {
-		lemin = ngpmin - 1 + lt;
-		ieee = TRUE_;
-/*            ( Non twos-complement machines, with gradual und
-erflow;   
-                e.g., IEEE standard followers ) */
-	    } else {
-		lemin = min(ngpmin,gpmin);
-/*            ( A guess; no known machine ) */
-		iwarn = TRUE_;
-	    }
-
-	} else if (ngpmin == gpmin && ngnmin == gnmin) {
-	    if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1) {
-		lemin = max(ngpmin,ngnmin);
-/*            ( Twos-complement machines, no gradual underflow
-;   
-                e.g., CYBER 205 ) */
-	    } else {
-		lemin = min(ngpmin,ngnmin);
-/*            ( A guess; no known machine ) */
-		iwarn = TRUE_;
-	    }
-
-	} else if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1 && gpmin == gnmin)
-		 {
-	    if (gpmin - min(ngpmin,ngnmin) == 3) {
-		lemin = max(ngpmin,ngnmin) - 1 + lt;
-/*            ( Twos-complement machines with gradual underflo
-w;   
-                no known machine ) */
-	    } else {
-		lemin = min(ngpmin,ngnmin);
-/*            ( A guess; no known machine ) */
-		iwarn = TRUE_;
-	    }
-
-	} else {
-/* Computing MIN */
-	    i__1 = min(ngpmin,ngnmin), i__1 = min(i__1,gpmin);
-	    lemin = min(i__1,gnmin);
-/*         ( A guess; no known machine ) */
-	    iwarn = TRUE_;
-	}
-/* **   
-   Comment out this if block if EMIN is ok */
-	if (iwarn) {
-	    first = TRUE_;
-	    printf("\n\n WARNING. The value EMIN may be incorrect:- ");
-	    printf("EMIN = %8i\n",lemin);
-	    printf("If, after inspection, the value EMIN looks acceptable");
-            printf("please comment out \n the IF block as marked within the"); 
-            printf("code of routine SLAMC2, \n otherwise supply EMIN"); 
-            printf("explicitly.\n");
-	}
-/* **   
-
-          Assume IEEE arithmetic if we found denormalised  numbers abo
-ve,   
-          or if arithmetic seems to round in the  IEEE style,  determi
-ned   
-          in routine SLAMC1. A true IEEE machine should have both  thi
-ngs   
-          true; however, faulty machines may have one or the other. */
-
-	ieee = ieee || lieee1;
-
-/*        Compute  RMIN by successive division by  BETA. We could comp
-ute   
-          RMIN as BASE**( EMIN - 1 ),  but some machines underflow dur
-ing   
-          this computation. */
-
-	lrmin = 1.f;
-	i__1 = 1 - lemin;
-	for (i = 1; i <= 1-lemin; ++i) {
-	    r__1 = lrmin * rbase;
-	    lrmin = slamc3_(&r__1, &zero);
-/* L30: */
-	}
-
-/*        Finally, call SLAMC5 to compute EMAX and RMAX. */
-
-	slamc5_(&lbeta, &lt, &lemin, &ieee, &lemax, &lrmax);
-    }
-
-    *beta = lbeta;
-    *t = lt;
-    *rnd = lrnd;
-    *eps = leps;
-    *emin = lemin;
-    *rmin = lrmin;
-    *emax = lemax;
-    *rmax = lrmax;
-
-    return 0;
-
-
-/*     End of SLAMC2 */
-
-} /* slamc2_ */
-
-
-double slamc3_(float *a, float *b)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    SLAMC3  is intended to force  A  and  B  to be stored prior to doing 
-  
-    the addition of  A  and  B ,  for use in situations where optimizers 
-  
-    might hold one of these in a register.   
-
-    Arguments   
-    =========   
-
-    A, B    (input) FLOAT   
-            The values A and B.   
-
-   ===================================================================== 
-*/
-/* >>Start of File<<   
-       System generated locals */
-    float ret_val;
-
-
-
-    ret_val = *a + *b;
-
-    return ret_val;
-
-/*     End of SLAMC3 */
-
-} /* slamc3_ */
-
-
-/* Subroutine */ int slamc4_(int *emin, float *start, int *base)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    SLAMC4 is a service routine for SLAMC2.   
-
-    Arguments   
-    =========   
-
-    EMIN    (output) EMIN   
-            The minimum exponent before (gradual) underflow, computed by 
-  
-            setting A = START and dividing by BASE until the previous A   
-            can not be recovered.   
-
-    START   (input) FLOAT   
-            The starting point for determining EMIN.   
-
-    BASE    (input) INT   
-            The base of the machine.   
-
-   ===================================================================== 
-*/
-    /* System generated locals */
-    int i__1;
-    float r__1;
-    /* Local variables */
-    static float zero, a;
-    static int i;
-    static float rbase, b1, b2, c1, c2, d1, d2;
-    extern double slamc3_(float *, float *);
-    static float one;
-
-
-
-    a = *start;
-    one = 1.f;
-    rbase = one / *base;
-    zero = 0.f;
-    *emin = 1;
-    r__1 = a * rbase;
-    b1 = slamc3_(&r__1, &zero);
-    c1 = a;
-    c2 = a;
-    d1 = a;
-    d2 = a;
-/* +    WHILE( ( C1.EQ.A ).AND.( C2.EQ.A ).AND.   
-      $       ( D1.EQ.A ).AND.( D2.EQ.A )      )LOOP */
-L10:
-    if (c1 == a && c2 == a && d1 == a && d2 == a) {
-	--(*emin);
-	a = b1;
-	r__1 = a / *base;
-	b1 = slamc3_(&r__1, &zero);
-	r__1 = b1 * *base;
-	c1 = slamc3_(&r__1, &zero);
-	d1 = zero;
-	i__1 = *base;
-	for (i = 1; i <= *base; ++i) {
-	    d1 += b1;
-/* L20: */
-	}
-	r__1 = a * rbase;
-	b2 = slamc3_(&r__1, &zero);
-	r__1 = b2 / rbase;
-	c2 = slamc3_(&r__1, &zero);
-	d2 = zero;
-	i__1 = *base;
-	for (i = 1; i <= *base; ++i) {
-	    d2 += b2;
-/* L30: */
-	}
-	goto L10;
-    }
-/* +    END WHILE */
-
-    return 0;
-
-/*     End of SLAMC4 */
-
-} /* slamc4_ */
-
-
-/* Subroutine */ int slamc5_(int *beta, int *p, int *emin, 
-	int *ieee, int *emax, float *rmax)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    SLAMC5 attempts to compute RMAX, the largest machine floating-point   
-    number, without overflow.  It assumes that EMAX + abs(EMIN) sum   
-    approximately to a power of 2.  It will fail on machines where this   
-    assumption does not hold, for example, the Cyber 205 (EMIN = -28625, 
-  
-    EMAX = 28718).  It will also fail if the value supplied for EMIN is   
-    too large (i.e. too close to zero), probably with overflow.   
-
-    Arguments   
-    =========   
-
-    BETA    (input) INT   
-            The base of floating-point arithmetic.   
-
-    P       (input) INT   
-            The number of base BETA digits in the mantissa of a   
-            floating-point value.   
-
-    EMIN    (input) INT   
-            The minimum exponent before (gradual) underflow.   
-
-    IEEE    (input) INT   
-            A logical flag specifying whether or not the arithmetic   
-            system is thought to comply with the IEEE standard.   
-
-    EMAX    (output) INT   
-            The largest exponent before overflow   
-
-    RMAX    (output) FLOAT   
-            The largest machine floating-point number.   
-
-   ===================================================================== 
-  
-
-
-       First compute LEXP and UEXP, two powers of 2 that bound   
-       abs(EMIN). We then assume that EMAX + abs(EMIN) will sum   
-       approximately to the bound that is closest to abs(EMIN).   
-       (EMAX is the exponent of the required number RMAX). */
-    /* Table of constant values */
-    static float c_b5 = 0.f;
-    
-    /* System generated locals */
-    int i__1;
-    float r__1;
-    /* Local variables */
-    static int lexp;
-    static float oldy;
-    static int uexp, i;
-    static float y, z;
-    static int nbits;
-    extern double slamc3_(float *, float *);
-    static float recbas;
-    static int exbits, expsum, try__;
-
-
-
-    lexp = 1;
-    exbits = 1;
-L10:
-    try__ = lexp << 1;
-    if (try__ <= -(*emin)) {
-	lexp = try__;
-	++exbits;
-	goto L10;
-    }
-    if (lexp == -(*emin)) {
-	uexp = lexp;
-    } else {
-	uexp = try__;
-	++exbits;
-    }
-
-/*     Now -LEXP is less than or equal to EMIN, and -UEXP is greater   
-       than or equal to EMIN. EXBITS is the number of bits needed to   
-       store the exponent. */
-
-    if (uexp + *emin > -lexp - *emin) {
-	expsum = lexp << 1;
-    } else {
-	expsum = uexp << 1;
-    }
-
-/*     EXPSUM is the exponent range, approximately equal to   
-       EMAX - EMIN + 1 . */
-
-    *emax = expsum + *emin - 1;
-    nbits = exbits + 1 + *p;
-
-/*     NBITS is the total number of bits needed to store a   
-       floating-point number. */
-
-    if (nbits % 2 == 1 && *beta == 2) {
-
-/*        Either there are an odd number of bits used to store a   
-          floating-point number, which is unlikely, or some bits are 
-  
-          not used in the representation of numbers, which is possible
-,   
-          (e.g. Cray machines) or the mantissa has an implicit bit,   
-          (e.g. IEEE machines, Dec Vax machines), which is perhaps the
-   
-          most likely. We have to assume the last alternative.   
-          If this is true, then we need to reduce EMAX by one because 
-  
-          there must be some way of representing zero in an implicit-b
-it   
-          system. On machines like Cray, we are reducing EMAX by one 
-  
-          unnecessarily. */
-
-	--(*emax);
-    }
-
-    if (*ieee) {
-
-/*        Assume we are on an IEEE machine which reserves one exponent
-   
-          for infinity and NaN. */
-
-	--(*emax);
-    }
-
-/*     Now create RMAX, the largest machine number, which should   
-       be equal to (1.0 - BETA**(-P)) * BETA**EMAX .   
-
-       First compute 1.0 - BETA**(-P), being careful that the   
-       result is less than 1.0 . */
-
-    recbas = 1.f / *beta;
-    z = *beta - 1.f;
-    y = 0.f;
-    i__1 = *p;
-    for (i = 1; i <= *p; ++i) {
-	z *= recbas;
-	if (y < 1.f) {
-	    oldy = y;
-	}
-	y = slamc3_(&y, &z);
-/* L20: */
-    }
-    if (y >= 1.f) {
-	y = oldy;
-    }
-
-/*     Now multiply by BETA**EMAX to get RMAX. */
-
-    i__1 = *emax;
-    for (i = 1; i <= *emax; ++i) {
-	r__1 = y * *beta;
-	y = slamc3_(&r__1, &c_b5);
-/* L30: */
-    }
-
-    *rmax = y;
-    return 0;
-
-/*     End of SLAMC5 */
-
-} /* slamc5_ */
-
-
-double pow_ri(float *ap, int *bp)
-{
-double pow, x;
-int n;
-
-pow = 1;
-x = *ap;
-n = *bp;
-
-if(n != 0)
-	{
-	if(n < 0)
-		{
-		n = -n;
-		x = 1/x;
-		}
-	for( ; ; )
-		{
-		if(n & 01)
-			pow *= x;
-		if(n >>= 1)
-			x *= x;
-		else
-			break;
-		}
-	}
-return(pow);
-}
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/slamchtst.c hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/slamchtst.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/slamchtst.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/slamchtst.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,34 +0,0 @@
-#include <stdio.h>
-
-main()
-{
-    /* Local variables */
-    float base, emin, prec, emax, rmin, rmax, t, sfmin;
-    extern double slamch_(char *);
-    float rnd, eps;
-
-    eps = slamch_("Epsilon");
-    sfmin = slamch_("Safe minimum");
-    base = slamch_("Base");
-    prec = slamch_("Precision");
-    t = slamch_("Number of digits in mantissa");
-    rnd = slamch_("Rounding mode");
-    emin = slamch_("Minnimum exponent");
-    rmin = slamch_("Underflow threshold");
-    emax = slamch_("Largest exponent");
-    rmax = slamch_("Overflow threshold");
-
-    printf(" Epsilon                      = %e\n", eps);
-    printf(" Safe minimum                 = %e\n", sfmin);
-    printf(" Base                         = %.0f\n", base);
-    printf(" Precision                    = %e\n", prec);
-    printf(" Number of digits in mantissa = %.0f\n", t);
-    printf(" Rounding mode                = %.0f\n", rnd);
-    printf(" Minimum exponent             = %.0f\n", emin);
-    printf(" Underflow threshold          = %e\n", rmin);
-    printf(" Largest exponent             = %.0f\n", emax);
-    printf(" Overflow threshold           = %e\n", rmax);
-    printf(" Reciprocal of safe minimum   = %e\n", 1./sfmin);
-
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/superlu_timer.c hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/superlu_timer.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/superlu_timer.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/superlu_timer.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,54 +0,0 @@
-/* 
- * Purpose
- * ======= 
- *	Returns the time in seconds used by the process.
- *
- * Note: the timer function call is machine dependent. Use conditional
- *       compilation to choose the appropriate function.
- *
- */
-
-
-#ifdef SUN 
-/*
- * 	It uses the system call gethrtime(3C), which is accurate to 
- *	nanoseconds. 
-*/
-#include <sys/time.h>
- 
-double SuperLU_timer_() {
-    return ( (double)gethrtime() / 1e9 );
-}
-
-#elif defined ( UNIX_TIMER )
-
-#include <sys/types.h>
-#include <sys/times.h>
-#include <time.h>
-#include <sys/time.h>
-
-#ifndef CLK_TCK
-#define CLK_TCK 60
-#endif
-
-double SuperLU_timer_()
-{
-    struct tms use;
-    double tmp;
-    times(&use);
-    tmp = use.tms_utime;
-    tmp += use.tms_stime;
-    return (double)(tmp) / (double) CLK_TCK;
-}
-
-#else
-
-#include <mpi.h>
-
-double SuperLU_timer_()
-{
-    return MPI_Wtime();
-}
-
-#endif
-
Binary files /tmp/tmp6BLdKS/KeOIccOR8n/hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/testdlamch and /tmp/tmp6BLdKS/zS5BJEUpFu/hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/testdlamch differ
Binary files /tmp/tmp6BLdKS/KeOIccOR8n/hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/testslamch and /tmp/tmp6BLdKS/zS5BJEUpFu/hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/testslamch differ
Binary files /tmp/tmp6BLdKS/KeOIccOR8n/hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/testtimer and /tmp/tmp6BLdKS/zS5BJEUpFu/hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/testtimer differ
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/timertst.c hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/timertst.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/INSTALL/timertst.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/INSTALL/timertst.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,72 +0,0 @@
-#include <stdio.h>
-#include <mpi.h>
-
-void mysub(int n, double *x, double *y)
-{
-    return;
-}
-
-main(int argc, char *argv[])
-{
-    /* Parameters */    
-#define NMAX    100
-#define ITS     10000
-    
-    int      i, j;
-    double   alpha, avg, t1, t2, tnotim;
-    double   x[NMAX], y[NMAX];
-    double   SuperLU_timer_();
-
-    MPI_Init( &argc, &argv );
-
-    /* Initialize X and Y */
-    for (i = 0; i < NMAX; ++i) {
-	x[i] = 1.0 / (double)(i+1);
-	y[i] = (double)(NMAX - i) / (double)NMAX;
-    }
-    alpha = 0.315;
-
-    /* Time 1,000,000 DAXPY operations */
-    t1 = SuperLU_timer_();
-    for (j = 0; j < ITS; ++j) {
-	for (i = 0; i < NMAX; ++i)
-	    y[i] += alpha * x[i];
-	alpha = -alpha;
-    }
-    t2 = SuperLU_timer_();
-    printf("Time for 1,000,000 DAXPY ops  = %10.3g seconds\n", t2-t1);
-    if ( t2-t1 > 0. ) 
-	printf("DAXPY performance rate        = %10.3g mflops\n", 2./(t2-t1));
-    else
-	printf("*** Error:  Time for operations was zero\n");
-	
-    tnotim = t2 - t1;
-
-    /* Time 1,000,000 DAXPY operations with SuperLU_timer_() 
-       in the outer loop */
-    t1 = SuperLU_timer_();
-    for (j = 0; j < ITS; ++j) {
-	for (i = 0; i < NMAX; ++i)
-	    y[i] += alpha * x[i];
-	alpha = -alpha;
-	t2 = SuperLU_timer_();
-    }
-
-    /* Compute the time in milliseconds used by an average call to 
-       SuperLU_timer_(). */
-    printf("Including DSECND, time        = %10.3g seconds\n", t2-t1);
-    avg = ( (t2 - t1) - tnotim )*1000. / (double)ITS;
-    printf("Average time for DSECND       = %10.3g milliseconds\n", avg);
-
-    /* Compute the equivalent number of floating point operations used
-       by an average call to DSECND.    */
-    if ( tnotim > 0. )
-	printf("Equivalent floating point ops = %10.3g ops\n",
-	       1000.*avg / tnotim);
-
-    mysub(NMAX, x, y);
-
-    MPI_Finalize();
-    return 0;
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/Makefile hypre-2.13.0/src/FEI_mv/DSuperLU/Makefile
--- hypre-2.11.2/src/FEI_mv/DSuperLU/Makefile	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/Makefile	1970-01-01 00:00:00.000000000 +0000
@@ -1,61 +0,0 @@
-############################################################################
-#
-#  Program:         SuperLU_DIST
-#
-#  Module:          Makefile
-#
-#  Purpose:         Top-level Makefile
-#
-#  Creation date:   September 1, 1999  version 1.0
-#
-#  Modified:        
-#
-############################################################################
-
-############################
-# for hypre
-############################
-all:
-	( cd SRC; $(MAKE) all)
-
-install:
-	( cd SRC; $(MAKE) install)
-
-clean:
-	( cd SRC; $(MAKE) clean)
-
-distclean: clean
-
-############################
-
-#include make.inc
-
-#all: install lib example
-
-#lib: superlulib
-
-example:
-	( cd EXAMPLE; $(MAKE) )
-
-#clean: cleanlib cleantesting
-
-#install:
-	( cd INSTALL; $(MAKE) )
-#	( cd INSTALL; cp lsame.c ../SRC/; \
-#	  cp dlamch.c ../SRC/; cp slamch.c ../SRC/ )
-
-blaslib:
-	( cd CBLAS; $(MAKE) )
-
-superlulib:
-	( cd SRC; $(MAKE) )
-
-cleanlib:
-	( cd SRC; $(MAKE) clean )
-	( cd CBLAS; $(MAKE) clean )
-	( cd lib; rm -f *.a )
-
-cleantesting:
-	( cd INSTALL; $(MAKE) clean )
-	( cd EXAMPLE; $(MAKE) clean )
-	( cd FORTRAN; $(MAKE) clean )
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/Makefile.dsuperlu hypre-2.13.0/src/FEI_mv/DSuperLU/Makefile.dsuperlu
--- hypre-2.11.2/src/FEI_mv/DSuperLU/Makefile.dsuperlu	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/Makefile.dsuperlu	1970-01-01 00:00:00.000000000 +0000
@@ -1,45 +0,0 @@
-############################################################################
-#
-#  Program:         SuperLU_DIST
-#
-#  Module:          Makefile
-#
-#  Purpose:         Top-level Makefile
-#
-#  Creation date:   September 1, 1999  version 1.0
-#
-#  Modified:        
-#
-############################################################################
-
-include make.inc
-
-all: install lib example
-
-lib: superlulib
-
-example:
-	( cd EXAMPLE; $(MAKE) )
-
-clean: cleanlib cleantesting
-
-install:
-	( cd INSTALL; $(MAKE) )
-#	( cd INSTALL; cp lsame.c ../SRC/; \
-#	  cp dlamch.c ../SRC/; cp slamch.c ../SRC/ )
-
-blaslib:
-	( cd CBLAS; $(MAKE) )
-
-superlulib:
-	( cd SRC; $(MAKE) )
-
-cleanlib:
-	( cd SRC; $(MAKE) clean )
-	( cd CBLAS; $(MAKE) clean )
-	( cd lib; rm -f *.a )
-
-cleantesting:
-	( cd INSTALL; $(MAKE) clean )
-	( cd EXAMPLE; $(MAKE) clean )
-	( cd FORTRAN; $(MAKE) clean )
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/make.inc hypre-2.13.0/src/FEI_mv/DSuperLU/make.inc
--- hypre-2.11.2/src/FEI_mv/DSuperLU/make.inc	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/make.inc	1970-01-01 00:00:00.000000000 +0000
@@ -1,65 +0,0 @@
-############################################################################
-#
-#  Program:         SuperLU_DIST
-#
-#  Module:          make.inc
-#
-#  Purpose:         Top-level Definitions
-#
-#  Creation date:   February 4, 1999   version alpha
-#
-#  Modified:	    September 1, 1999  version 1.0
-#                   March 15, 2003     version 2.0
-#
-# 		    January 18, 2006   Sam Adams
-#                                      General Dynamics - Network Systems
-#                       works for i386 Linux, with LAM-MPI 7.1.1 and GCC 4.
-#
-############################################################################
-#
-#  The machine (platform) identifier to append to the library names
-#
-PLAT		= _i386
-
-#
-#  The name of the libraries to be created/linked to
-#
-DSuperLUroot 	= /home/chtong/HYPRE/linear_solvers/FEI_mv/SuperLU_DIST_2.2
-DSUPERLULIB   	= $(DSuperLUroot)/lib/libsuperlu_dist_2.0.a
-#
-BLASDEF	     	= -DUSE_VENDOR_BLAS
-BLASLIB      	= /usr/lib/libblas.so.3
-METISLIB	=
-PARMETISLIB	=
-LIBS            = $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB)
-
-#
-#  The archiver and the flag(s) to use when building archive (library)
-#  If your system has no ranlib, set RANLIB = echo.
-#
-ARCH         	= ar
-ARCHFLAGS    	= cr
-RANLIB       	= ranlib
-
-############################################################################
-# C compiler setup
-CC           	= mpicc
-# CFLAGS should be set to be the C flags that include optimization
-CFLAGS          = -pipe -O2
-#
-# NOOPTS should be set to be the C flags that turn off any optimization
-NOOPTS		= 
-############################################################################
-# FORTRAN compiler setup
-FORTRAN         = mpif77
-FFLAGS 		= 
-############################################################################
-#LOADER	        = mpif77
-LOADER	        = mpicc
-LOADOPTS	= 
-############################################################################
-#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
-#
-#  Need follow the convention of how C calls a Fortran routine.
-#
-CDEFS        = -DAdd__ 
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.altix hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.altix
--- hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.altix	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.altix	1970-01-01 00:00:00.000000000 +0000
@@ -1,63 +0,0 @@
-############################################################################
-#
-#  Program:         SuperLU_DIST
-#
-#  Module:          make.inc
-#
-#  Purpose:         Top-level Definitions
-#
-#  Creation date:   April 10, 2006  version 2.0
-#
-#  Modified: 	    November 1, 2007   version 2.1
-#
-############################################################################
-#
-#  The machine (platform) identifier to append to the library names
-#
-PLAT            = _altix
-
-#
-#  The name of the libraries to be created/linked to
-#
-DSuperLUroot    = $(HOME)/SuperLU_codes/SuperLU_DIST_2.2
-DSUPERLULIB     = $(DSuperLUroot)/lib/libsuperlu$(PLAT).a
-
-MKLHOME		= /usr/common/intel/mkl/8.1.014
-BLASDEF         = -DUSE_VENDOR_BLAS
-BLASLIB         = -L${MKLHOME}/lib/64 -lmkl_ipf -lguide
-PARMETISLIB	= -L$(HOME)/Codes/ParMetis-3.1 -lparmetis
-METISLIB	= -lmetis
-LIBS            = $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB) \
-	          -lmpi -lm -L/usr/common/intel/fc/8.1.029/lib -lifcore
-#
-#  The archiver and the flag(s) to use when building archive (library)
-#  If your system has no ranlib, set RANLIB = echo.
-#
-ARCH            = ar
-ARCHFLAGS       = crv
-RANLIB          = ranlib
-
-#######################################################################
-# C compiler setup
-CC              = icc
-ISA             = -ftz -mp
-# CFLAGS should be set to be the C flags that include optimization
-CFLAGS          = $(ISA) -O3 -DDEBUGlevel=0 -DPRNTlevel=0
-#
-# NOOPTS should be set to be the C flags that turn off any optimization
-NOOPTS          = $(ISA) -O0
-############################################################################
-# FORTRAN compiler setup
-FORTRAN         = ifort
-FFLAGS          = $(CFLAGS)
-F90FLAGS	= -r8 -check all -save -Dmpi -ftz
-############################################################################
-LOADER          = icc
-LOADOPTS        = $(CFLAGS)
-#
-############################################################################
-#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
-#
-#  Need follow the convention of how C calls a Fortran routine.
-#
-CDEFS        = -DAdd_
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.i386_linux hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.i386_linux
--- hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.i386_linux	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.i386_linux	1970-01-01 00:00:00.000000000 +0000
@@ -1,64 +0,0 @@
-############################################################################
-#
-#  Program:         SuperLU_DIST
-#
-#  Module:          make.inc
-#
-#  Purpose:         Top-level Definitions
-#
-#  Creation date:   February 4, 1999   version alpha
-#
-#  Modified:	    September 1, 1999  version 1.0
-#                   March 15, 2003     version 2.0
-#
-# 		    January 18, 2006   Sam Adams
-#                                      General Dynamics - Network Systems
-#                       works for i386 Linux, with LAM-MPI 7.1.1 and GCC 4.
-#
-############################################################################
-#
-#  The machine (platform) identifier to append to the library names
-#
-PLAT		= _i386
-
-#
-#  The name of the libraries to be created/linked to
-#
-DSuperLUroot 	= /opt/SuperLU_DIST_2.0
-DSUPERLULIB   	= $(DSuperLUroot)/lib/libsuperlu_dist_2.0.a
-#
-BLASDEF	     	= -DUSE_VENDOR_BLAS
-BLASLIB      	= /usr/lib/libblas.so.3
-METISLIB	=
-PARMETISLIB	=
-LIBS            = $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB)
-
-#
-#  The archiver and the flag(s) to use when building archive (library)
-#  If your system has no ranlib, set RANLIB = echo.
-#
-ARCH         	= ar
-ARCHFLAGS    	= cr
-RANLIB       	= ranlib
-
-############################################################################
-# C compiler setup
-CC           	= mpicc
-# CFLAGS should be set to be the C flags that include optimization
-CFLAGS          = -pipe -O2
-#
-# NOOPTS should be set to be the C flags that turn off any optimization
-NOOPTS		= 
-############################################################################
-# FORTRAN compiler setup
-FORTRAN         = mpif77
-FFLAGS 		= 
-############################################################################
-LOADER	        = mpif77
-LOADOPTS	= 
-############################################################################
-#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
-#
-#  Need follow the convention of how C calls a Fortran routine.
-#
-CDEFS        = -DAdd__ 
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.opteron hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.opteron
--- hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.opteron	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.opteron	1970-01-01 00:00:00.000000000 +0000
@@ -1,64 +0,0 @@
-############################################################################
-#
-#  Program:         SuperLU_DIST
-#
-#  Module:          make.inc
-#
-#  Purpose:         Top-level Definitions
-#
-#  Creation date:   February 4, 1999   version alpha
-#
-#  Modified:	    September 1, 1999  version 1.0
-#                   March 15, 2003     version 2.0
-#		    November 1, 2007   version 2.1
-#
-############################################################################
-#
-#  The machine (platform) identifier to append to the library names
-#
-PLAT		= _jacquard
-
-#
-#  The name of the libraries to be created/linked to
-#
-DSuperLUroot 	= $(HOME)/SuperLU_DIST_2.2
-DSUPERLULIB   	= $(DSuperLUroot)/lib/libsuperlu_dist_2.2.a
-#
-BLASDEF	     	= -DUSE_VENDOR_BLAS
-BLASLIB      	= -L/usr/common/usg/acml/2.6.0/pathscale64/lib -lacml -lacml_mv
-METISLIB	=
-PARMETISLIB	=
-#MPILIB		= -L/usr/lpp/ppe.poe/lib -lmpi_r
-LIBS            = $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB)
-
-#
-#  The archiver and the flag(s) to use when building archive (library)
-#  If your system has no ranlib, set RANLIB = echo.
-#
-ARCH         	= ar
-ARCHFLAGS    	= cr
-RANLIB       	= ranlib
-
-############################################################################
-# C compiler setup
-CC           	= mpicc
-# CFLAGS should be set to be the C flags that include optimization
-CFLAGS          = -O3 -DDEBUGlevel=0 -DPRNTlevel=1
-#
-# NOOPTS should be set to be the C flags that turn off any optimization
-NOOPTS		= -O0
-############################################################################
-# FORTRAN compiler setup
-FORTRAN         = mpif90
-FFLAGS 		= -O3
-############################################################################
-LOADER	        = mpif90
-
-# 32-bit:
-LOADOPTS	= 
-############################################################################
-#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
-#
-#  Need follow the convention of how C calls a Fortran routine.
-#
-CDEFS        = -DAdd_
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.origin hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.origin
--- hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.origin	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.origin	1970-01-01 00:00:00.000000000 +0000
@@ -1,66 +0,0 @@
-############################################################################
-#
-#  Program:         SuperLU_DIST
-#
-#  Module:          make.inc
-#
-#  Purpose:         Top-level Definitions
-#
-#  Creation date:   February 4, 1997  version 1.0
-#
-#  Modified:        November 11, 2002 (by Tom Oppe)
-#		    November 1, 2007   version 2.1
-#
-############################################################################
-#
-#  The machine (platform) identifier to append to the library names
-#
-PLAT            = _sgi
-
-#
-#  The name of the libraries to be created/linked to
-#
-DSuperLUroot    = $(HOME)/SuperLU_DIST_2.2
-DSUPERLULIB     = $(DSuperLUroot)/lib/libsuperlu_dist_2.2.a
-#
-BLASDEF         = -DUSE_VENDOR_BLAS
-BLASLIB         = -lscs
-MPILIB		= -lmpi
-METISLIB	=
-PARMETISLIB	=
-#
-LIBS            = $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB) \
-	          $(MPILIB) -lfortran
-
-#
-#  The archiver and the flag(s) to use when building archive (library)
-#  If your system has no ranlib, set RANLIB = echo.
-#
-ARCH            = ar
-ARCHFLAGS       = crv
-RANLIB          = touch
-
-#######################################################################
-# C compiler setup
-CC              = cc
-ISA             = -64 -mips4 -TARG:platform=ip35
-# CFLAGS should be set to be the C flags that include optimization
-CFLAGS          = $(ISA) -O2
-#
-# NOOPTS should be set to be the C flags that turn off any optimization
-# This must be enforced to compile the two routines: slamch.c and dlamch.c.
-NOOPTS          = $(ISA) -O0
-############################################################################
-# FORTRAN compiler setup
-FORTRAN         = f90
-FFLAGS          = $(CFLAGS)
-############################################################################
-LOADER          = cc
-LOADOPTS        = $(CFLAGS)
-#
-############################################################################
-#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
-#
-#  Need follow the convention of how C calls a Fortran routine.
-#
-CDEFS        = -DAdd_ -DORIGIN
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.sp hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.sp
--- hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.sp	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.sp	1970-01-01 00:00:00.000000000 +0000
@@ -1,66 +0,0 @@
-############################################################################
-#
-#  Program:         SuperLU_DIST
-#
-#  Module:          make.inc
-#
-#  Purpose:         Top-level Definitions
-#
-#  Creation date:   February 4, 1999   version alpha
-#
-#  Modified:	    September 1, 1999  version 1.0
-#                   March 15, 2003     version 2.0
-#		    November 1, 2007   version 2.1
-#
-############################################################################
-#
-#  The machine (platform) identifier to append to the library names
-#
-PLAT		= _sp
-
-#
-#  The name of the libraries to be created/linked to
-#
-DSuperLUroot 	= $(HOME)/SuperLU_DIST_2.2
-DSUPERLULIB   	= $(DSuperLUroot)/lib/libsuperlu_dist_2.2.a
-#
-BLASDEF	     	= -DUSE_VENDOR_BLAS
-BLASLIB      	= -lessl
-#MPILIB		= -L/usr/lpp/ppe.poe/lib -lmpi
-#PERFLIB     	= -L/vol1/VAMPIR/lib -lVT
-METISLIB	=
-PARMETISLIB	=
-LIBS            = $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB)
-
-#
-#  The archiver and the flag(s) to use when building archive (library)
-#  If your system has no ranlib, set RANLIB = echo.
-#
-ARCH         	= ar
-ARCHFLAGS    	= cr
-RANLIB       	= ranlib
-
-############################################################################
-CC           	= mpcc
-# CFLAGS should be set to be the C flags that include optimization
-CFLAGS          = -D_SP -O3 -qarch=PWR3 -qalias=allptrs \
-		  -DDEBUGlevel=0 -DPRNTlevel=0
-#
-# NOOPTS should be set to be the C flags that turn off any optimization
-# This must be enforced to compile the two routines: slamch.c and dlamch.c.
-NOOPTS		=
-############################################################################
-FORTRAN         = mpxlf90
-FFLAGS          = -WF,-Dsp -O3 -Q -qstrict -qfixed -qinit=f90ptr -qarch=pwr3
-############################################################################
-LOADER	        = mpxlf90
-#LOADOPTS	= -bmaxdata:0x80000000
-LOADOPTS	= -bmaxdata:0x70000000
-#
-############################################################################
-#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
-#
-#  Need follow the convention of how C calls a Fortran routine.
-#
-CDEFS        = -DNoChange
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.sp.64bit hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.sp.64bit
--- hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.sp.64bit	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.sp.64bit	1970-01-01 00:00:00.000000000 +0000
@@ -1,71 +0,0 @@
-############################################################################
-#
-#  Program:         SuperLU_DIST
-#
-#  Module:          make.inc
-#
-#  Purpose:         Top-level Definitions
-#
-#  Creation date:   February 4, 1999   version alpha
-#
-#  Modified:	    September 1, 1999  version 1.0
-#                   March 15, 2003     version 2.0
-#		    November 1, 2007   version 2.1
-#
-############################################################################
-#
-#  The machine (platform) identifier to append to the library names
-#
-PLAT		= _power5
-
-#
-#  The name of the libraries to be created/linked to
-#
-DSuperLUroot 	= $(HOME)/Release_Codes/SuperLU_DIST_2.2
-DSUPERLULIB   	= $(DSuperLUroot)/lib/libsuperlu_dist_2.2.a
-#
-BLASDEF	     	= -DUSE_VENDOR_BLAS
-BLASLIB      	= -lessl
-METISLIB        = -L/usr/common/usg/parmetis/3.1 -lmetis
-PARMETISLIB	= -L/usr/common/usg/parmetis/3.1 -lparmetis
-LIBS	     	= $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB)
-
-#
-#  The archiver and the flag(s) to use when building archive (library)
-#  If your system has no ranlib, set RANLIB = echo.
-#
-ARCH         	= ar
-# 64-bit:
-ARCHFLAGS    	= -X64 cr
-RANLIB       	= ranlib
-
-############################################################################
-# C compiler setup
-# 64-bit
-CC           	= mpcc_r
-# CFLAGS should be set to be the C flags that include optimization
-CFLAGS          = -D_SP -qarch=pwr5 -qalias=allptrs -q64 \
-		  -DDEBUGlevel=0 -DPRNTlevel=0 -O3
-#
-# NOOPTS should be set to be the C flags that turn off any optimization
-# 64-bit
-NOOPTS		= -q64
-
-############################################################################
-# FORTRAN compiler setup
-# 64-bit
-FORTRAN         = mpxlf90_r
-FFLAGS 		= -WF,-Dsp -O3 -Q -qstrict -qfixed -qinit=f90ptr -qarch=pwr5\
-                  -q64 #-qintsize=8
-############################################################################
-# 64-bit
-LOADER	= mpxlf90_r
-
-# 64-bit:
-LOADOPTS	= -q64
-############################################################################
-#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
-#
-#  Need follow the convention of how C calls a Fortran routine.
-#
-CDEFS        = -DNoChange
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.t3e hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.t3e
--- hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.t3e	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.t3e	1970-01-01 00:00:00.000000000 +0000
@@ -1,68 +0,0 @@
-############################################################################
-#
-#  Program:         SuperLU_DIST
-#
-#  Module:          make.inc
-#
-#  Purpose:         Top-level Definitions
-#
-#  Creation date:   February 4, 1997  version alpha
-#
-#  Modified:	    September 1, 1999  version 1.0
-#		    November 1, 2007   version 2.1
-#
-############################################################################
-#
-#  The machine (platform) identifier to append to the library names
-#
-PLAT		= _t3e
-
-#
-#  The name of the libraries to be created/linked to
-#
-DSuperLUroot 	= $(HOME)/SuperLU_DIST_2.2
-DSUPERLULIB   	= $(DSuperLUroot)/lib/libsuperlu_dist_2.2.a
-#
-BLASDEF	     	= -DUSE_VENDOR_BLAS
-#
-#PERFLIB	= -l pat pat.cld
-#PERFLIB	= -lapp
-METISLIB	=
-PARMETISLIB	=
-LIBS            = $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB)
-
-#
-#  The archiver and the flag(s) to use when building archive (library)
-#  If your system has no ranlib, set RANLIB = echo.
-#
-ARCH         	= ar
-ARCHFLAGS    	= cr
-RANLIB       	= ranlib
-
-############################################################################
-# C compiler setup
-CC           	= cc
-# CFLAGS should be set to be the C flags that include optimization
-#CFLAGS       	= -D_CRAY -DPRNTlevel=1 -O3 -h aggress,split,unroll
-CFLAGS       	= -O3 -D_CRAY -DPRNTlevel=0 -DDEBUGlevel=0 -DPROFlevel=0
-#		  -happrentice,inline0
-PTROPT	     	= -h restrict=a
-#
-# NOOPTS should be set to be the C flags that turn off any optimization
-# This must be enforced to compile the two routines: slamch.c and dlamch.c.
-NOOPTS		=
-############################################################################
-# FORTRAN compiler setup
-FORTRAN		= f90
-FFLAGS		= -O3 -dp -i 32
-############################################################################
-LOADER       	= cc
-LOADOPTS   = 
-#
-############################################################################
-#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
-#
-#  Need follow the convention of how C calls a Fortran routine.
-#
-CDEFS        = -DUpCase
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.xt4 hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.xt4
--- hypre-2.11.2/src/FEI_mv/DSuperLU/MAKE_INC/make.xt4	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/MAKE_INC/make.xt4	1970-01-01 00:00:00.000000000 +0000
@@ -1,62 +0,0 @@
-############################################################################
-#
-#  Program:         SuperLU_DIST
-#
-#  Module:          make.inc
-#
-#  Purpose:         Top-level Definitions
-#
-#  Creation date:   February 4, 1999   version alpha
-#
-#  Modified:	    September 1, 1999  version 1.0
-#                   March 15, 2003     version 2.0
-#		    November 1, 2007   version 2.1
-#
-############################################################################
-#
-#  The machine (platform) identifier to append to the library names
-#
-PLAT		= _xt4
-
-#
-#  The name of the libraries to be created/linked to
-#
-DSuperLUroot 	= $(HOME)/Release_Codes/SuperLU_DIST_2.2
-DSUPERLULIB   	= $(DSuperLUroot)/lib/libsuperlu_dist_2.2.a
-#
-BLASDEF	     	= -DUSE_VENDOR_BLAS
-BLASLIB	        =
-METISLIB    	= -L/usr/common/usg/parmetis/3.1 -lmetis
-PARMETISLIB   	= -L/usr/common/usg/parmetis/3.1 -lparmetis
-FLIBS	 	= -lpgf90 -lpgf90_rpm1 -lpgf902 -lpgf90rtl -lpgftnrtl
-LIBS	     	= $(DSUPERLULIB) $(BLASLIB) $(PARMETISLIB) $(METISLIB) $(FLIBS)
-
-#
-#  The archiver and the flag(s) to use when building archive (library)
-#  If your system has no ranlib, set RANLIB = echo.
-#
-ARCH         	= ar
-ARCHFLAGS    	= cr
-RANLIB       	= ranlib
-
-############################################################################
-# C compiler setup
-CC           	= cc
-# CFLAGS should be set to be the C flags that include optimization
-CFLAGS          = -fastsse -DDEBUGlevel=0 -DPRNTlevel=1
-#
-# NOOPTS should be set to be the C flags that turn off any optimization
-NOOPTS		= -O0
-############################################################################
-# FORTRAN compiler setup
-FORTRAN         = ftn
-FFLAGS 		= -fastsse
-############################################################################
-LOADER	        = cc
-LOADOPTS	= 
-############################################################################
-#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
-#
-#  Need follow the convention of how C calls a Fortran routine.
-#
-CDEFS        = -DAdd_
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/README hypre-2.13.0/src/FEI_mv/DSuperLU/README
--- hypre-2.11.2/src/FEI_mv/DSuperLU/README	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/README	1970-01-01 00:00:00.000000000 +0000
@@ -1,175 +0,0 @@
-		SuperLU_DIST (version 2.1)
-		==========================
-
-Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
-
-All rights reserved. 
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met: 
-
-(1) Redistributions of source code must retain the above copyright notice,
-this list of conditions and the following disclaimer. 
-(2) Redistributions in binary form must reproduce the above copyright notice,
-this list of conditions and the following disclaimer in the documentation
-and/or other materials provided with the distribution. 
-(3) Neither the name of Lawrence Berkeley National Laboratory, U.S. Dept. of
-Energy nor the names of its contributors may be used to endorse or promote
-products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-
-  
-SuperLU_DIST contains a set of subroutines to solve a sparse linear system 
-A*X=B. It uses Gaussian elimination with static pivoting (GESP). 
-Static pivoting is a technique that combines the numerical stability of
-partial pivoting with the scalability of Cholesky (no pivoting),
-to run accurately and efficiently on large numbers of processors. 
-
-SuperLU_DIST is a parallel extension to the serial SuperLU library.
-It is targeted for the distributed memory parallel machines.
-SuperLU_DIST is implemented in ANSI C, and MPI for communications.
-Currently, the LU factorization and triangular solution routines,
-which are the most time-consuming part of the solution process,
-are parallelized. The other routines, such as static pivoting and 
-column preordering for sparsity are performed sequentially. 
-This "alpha" release contains double-precision real and double-precision
-complex data types.
-
-The distribution contains the following directory structure:
-
-  SuperLU_DIST/README    instructions on installation
-  SuperLU_DIST/CBLAS/    needed BLAS routines in C, not necessarily fast
-  SuperLU_DIST/DOC/  	 the Users' Guide
-  SuperLU_DIST/EXAMPLE/  example programs
-  SuperLU_DIST/INSTALL/  test machine dependent parameters
-  SuperLU_DIST/SRC/      C source code, to be compiled into libsuperlu_dist.a
-  SuperLU_DIST/lib/      contains library archive libsuperlu_dist.a
-  SuperLU_DIST/Makefile  top level Makefile that does installation and testing
-  SuperLU_DIST/make.inc  compiler, compiler flags, library definitions and C
-                         preprocessor definitions, included in all Makefiles.
-                         (You may need to edit it to suit for your system
-                          before compiling the whole package.)
-  SuperLU_DIST/MAKE_INC/ sample machine-specific make.inc files
-
-Before installing the package, please examine the three things dependent 
-on your system setup:
-
-1. Edit the make.inc include file.
-
-   This make include file is referenced inside each of the Makefiles
-   in the various subdirectories. As a result, there is no need to 
-   edit the Makefiles in the subdirectories. All information that is
-   machine specific has been defined in this include file. 
-
-   Sample machine-specific {\tt make.inc} are provided in the MAKE_INC/
-   directory for several platforms, such as Cray T3E and IBM SP.
-   When you have selected the machine to which you wish to install
-   SuperLU_DIST, copy the appropriate sample include file (if one is present)
-   into make.inc. For example, if you wish to run SuperLU_DIST on a
-   Cray T3E,  you can do
-
-        cp MAKE_INC/make.t3e make.inc
-   
-   For the systems other than listed above, some porting effort is needed
-   for parallel factorization routines. Please refer to the Users' Guide 
-   for detailed instructions on porting.
-
-   
-2. The BLAS library.
-
-   The parallel routines in SuperLU_DIST uses some sequential BLAS routines
-   on each process. If there is BLAS library available on your machine,
-   you may define the following in the file make.inc:
-        BLASDEF = -DUSE_VENDOR_BLAS
-        BLASLIB = <BLAS library you wish to link with>
-
-   The CBLAS/ subdirectory contains the part of the C BLAS needed by 
-   SuperLU_DIST package. However, these codes are intended for use only if
-   there is no faster implementation of the BLAS already available on your
-   machine. In this case, you should go to the top-level SuperLU_DIST/
-   directory and do the following:
-
-    1) In make.inc, undefine (comment out) BLASDEF, and define:
-          BLASLIB = ../lib/libblas$(PLAT).a
-
-    2) Type:
-          make blaslib
-       to make the BLAS library from the routines in the CBLAS/ subdirectory.
-
-
-3. External libraries: Metis and ParMetis.
-
-   If you will use Metis or ParMetis ordering, or parallel
-   symbolic factorization (which depends on ParMetis), you will
-   need to install them yourself. Since ParMetis package already
-   contains the source code for the Metis library, you can just
-   download ParMetis at:
-       http://glaros.dtc.umn.edu/gkhome/metis/parmetis/download
-
-   After you have installed it, you should define the following in make.inc:
-        METISLIB = -L<metis directory> -lmetis
-        PARMETISLIB = -L<parmetis directory> -lparmetis
-
-
-4. C preprocessor definition CDEFS.
-
-   In the header file SRC/Cnames.h, we use macros to determine how
-   C routines should be named so that they are callable by Fortran.
-   (Some vendor-supplied BLAS libraries do not have C interfaces. So the 
-    re-naming is needed in order for the SuperLU BLAS calls (in C) to 
-    interface with the Fortran-style BLAS.)
-   The possible options for CDEFS are:
-
-       o -DAdd_: Fortran expects a C routine to have an underscore
-		 postfixed to the name;
-       o -DNoChange: Fortran expects a C routine name to be identical to
-		     that compiled by C;
-       o -DUpCase: Fortran expects a C routine name to be all uppercase.
-   
-A Makefile is provided in each subdirectory. The installation can be done
-completely automatically by simply typing "make" at the top level.
-
-
-REFERENCES
-
-[1] SuperLU_DIST: A Scalable Distributed-Memory Sparse Direct Solver for
-    Unsymmetric Linear Systems.  Xiaoye S. Li and James W. Demmel.
-    ACM Trans. on Math. Solftware, Vol. 29, No. 2, June 2003, pp. 110-140.
-[2] Parallel Symbolic Factorization for Sparse LU with Static Pivoting.
-    L. Grigori, J. Demmel and X.S. Li. SIAM J. Sci. Comp., Vol. 29, Issue 3,
-    1289-1314, 2007. 
-
-Xiaoye S. Li    Lawrence Berkeley National Lab, xsli@lbl.gov
-James Demmel    Univ. of California Berkeley, demmel@cs.berkeley.edu
-Laura Grigori   INRIA, France, Laura.Grigori@inria.fr
-
-
------------------
-| RELEASE NOTES |
------------------
-
-* Version 2.0, 10-15-2003
-   - "set_default_options" is renamed to "set_default_options_dist",
-     because sequential SuperLU now uses set_default_options.
-   - add a field in input "options" argument to control whether to
-     print the solver's statistics.
-   - add Fortran 90 wrapper in FORTRAN/ directory.     
-   - fixed a bug in triangular solve, which is related to the unfreed
-     MPI_Irecv request handles.
-    
-* Version 2.1, 10-01-2007
-   - include parallel symbolic factorization capability.
-   - include Metis or ParMetis ordering algorithms.
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/Cnames.h hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/Cnames.h
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/Cnames.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/Cnames.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,348 +0,0 @@
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-
-#ifndef __SUPERLU_CNAMES /* allow multiple inclusions */
-#define __SUPERLU_CNAMES
-
-/*
- * These macros define how C routines will be called.  ADD_ assumes that
- * they will be called by fortran, which expects C routines to have an
- * underscore postfixed to the name (Suns, and the Intel expect this).
- * NOCHANGE indicates that fortran will be calling, and that it expects
- * the name called by fortran to be identical to that compiled by the C
- * (RS6K's do this).  UPCASE says it expects C routines called by fortran
- * to be in all upcase (CRAY wants this). 
- */
-
-#define ADD_       0
-#define NOCHANGE   1
-#define UPCASE     2
-#define C_CALL     3
-
-#ifdef UpCase
-#define F77_CALL_C UPCASE
-#endif
-
-#ifdef NoChange
-#define F77_CALL_C NOCHANGE
-#endif
-
-#ifdef Add_
-#define F77_CALL_C ADD_
-#endif
-
-#ifndef F77_CALL_C
-#define F77_CALL_C ADD_
-#endif
-
-#if (F77_CALL_C == ADD_)
-/*
- * These defines set up the naming scheme required to have a fortran 77
- * routine call a C routine
- * No redefinition necessary to have following Fortran to C interface:
- *           FORTRAN CALL               C DECLARATION
- *           call dgemm(...)           void dgemm_(...)
- *
- * This is the default.
- */
-/* These are the functions defined in F90 wraper */
-#define f_create_gridinfo_handle         f_create_gridinfo_handle_
-#define f_create_options_handle          f_create_options_handle_
-#define f_create_ScalePerm_handle        f_create_scaleperm_handle_
-#define f_create_LUstruct_handle         f_create_lustruct_handle_
-#define f_create_SOLVEstruct_handle      f_create_solvestruct_handle_
-#define f_create_SuperMatrix_handle      f_create_supermatrix_handle_
-#define f_destroy_gridinfo_handle        f_destroy_gridinfo_handle_
-#define f_destroy_options_handle         f_destroy_options_handle_
-#define f_destroy_ScalePerm_handle       f_destroy_scaleperm_handle_
-#define f_destroy_LUstruct_handle        f_destroy_lustruct_handle_
-#define f_destroy_SOLVEstruct_handle     f_destroy_solvestruct_handle_
-#define f_destroy_SuperMatrix_handle     f_destroy_supermatrix_handle_
-#define f_create_SuperLUStat_handle      f_create_superlustat_handle_
-#define f_destroy_SuperLUStat_handle     f_destroy_superlustat_handle_
-#define f_get_gridinfo                   f_get_gridinfo_
-#define f_get_SuperMatrix                f_get_supermatrix_
-#define f_set_SuperMatrix                f_set_supermatrix_
-#define f_get_CompRowLoc_Matrix          f_get_comprowloc_matrix_ 
-#define f_set_CompRowLoc_Matrix          f_set_comprowloc_matrix_
-#define f_get_superlu_options            f_get_superlu_options_
-#define f_set_superlu_options            f_set_superlu_options_
-#define f_set_default_options            f_set_default_options_
-#define f_superlu_gridinit               f_superlu_gridinit_
-#define f_superlu_gridexit               f_superlu_gridexit_
-#define f_ScalePermstructInit            f_scalepermstructinit_
-#define f_ScalePermstructFree            f_scalepermstructfree_
-#define f_PStatInit                      f_pstatinit_
-#define f_PStatFree                      f_pstatfree_
-#define f_LUstructInit                   f_lustructinit_
-#define f_LUstructFree                   f_lustructfree_
-#define f_Destroy_LU                     f_destroy_lu_
-#define f_dCreate_CompRowLoc_Mat_dist    f_dcreate_comprowloc_mat_dist_
-#define f_Destroy_CompRowLoc_Mat_dist    f_destroy_comprowloc_mat_dist_
-#define f_Destroy_SuperMat_Store_dist    f_destroy_supermat_store_dist_
-#define f_dSolveFinalize                 f_dsolvefinalize_
-#define f_pdgssvx                        f_pdgssvx_
-#define f_dcreate_dist_matrix            f_dcreate_dist_matrix_
-#define f_check_malloc                   f_check_malloc_
-#endif
-
-#if (F77_CALL_C == UPCASE)
-/*
- * These defines set up the naming scheme required to have a fortran 77
- * routine call a C routine 
- * following Fortran to C interface:
- *           FORTRAN CALL               C DECLARATION
- *           call dgemm(...)           void DGEMM(...)
- */
-/* BLAS */
-#define sasum_    SASUM
-#define isamax_   ISAMAX
-#define scopy_    SCOPY
-#define sscal_    SSCAL
-#define sger_     SGER
-#define snrm2_    SNRM2
-#define ssymv_    SSYMV
-#define sdot_     SDOT
-#define saxpy_    SAXPY
-#define ssyr2_    SSYR2
-#define srot_     SROT
-#define sgemv_    SGEMV
-#define strsv_    STRSV
-#define sgemm_    SGEMM
-#define strsm_    STRSM
-
-#define dasum_    DASUM
-#define idamax_   IDAMAX
-#define dcopy_    DCOPY
-#define dscal_    DSCAL
-#define dger_     DGER
-#define dnrm2_    DNRM2
-#define dsymv_    DSYMV
-#define ddot_     DDOT
-#define daxpy_    DAXPY
-#define dsyr2_    DSYR2
-#define drot_     DROT
-#define dgemv_    DGEMV
-#define dtrsv_    DTRSV
-#define dgemm_    DGEMM
-#define dtrsm_    DTRSM
-
-#define scasum_   SCASUM
-#define icamax_   ICAMAX
-#define ccopy_    CCOPY
-#define cscal_    CSCAL
-#define scnrm2_   SCNRM2
-#define caxpy_    CAXPY
-#define cgemv_    CGEMV
-#define ctrsv_    CTRSV
-#define cgemm_    CGEMM
-#define ctrsm_    CTRSM
-#define cgerc_    CGERC
-#define chemv_    CHEMV
-#define cher2_    CHER2
-
-#define dzasum_   DZASUM
-#define izamax_   IZAMAX
-#define zcopy_    ZCOPY
-#define zscal_    ZSCAL
-#define dznrm2_   DZNRM2
-#define zaxpy_    ZAXPY
-#define zgemv_    ZGEMV
-#define ztrsv_    ZTRSV
-#define zgemm_    ZGEMM
-#define ztrsm_    ZTRSM
-#define zgerc_    ZGERC
-#define zhemv_    ZHEMV
-#define zher2_    ZHER2
-#define zgeru_    ZGERU
-
-/* LAPACK */
-#define dlamch_         DLAMCH
-#define slamch_         SLAMCH
-#define xerbla_         XERBLA
-#define lsame_          LSAME
-
-#define mc64id_         MC64ID
-#define mc64ad_         MC64AD
-#define c_bridge_dgssv_               C_BRIDGE_DGSSV
-#define c_fortran_slugrid_            C_FORTRAN_SLUGRID
-#define c_fortran_pdgssvx_            C_FORTRAN_PDGSSVX
-#define c_fortran_pdgssvx_ABglobal_   C_FORTRAN_PDGSSVX_ABGLOBAL
-#define c_fortran_pzgssvx_            C_FORTRAN_PZGSSVX
-#define c_fortran_pzgssvx_ABglobal_   C_FORTRAN_PZGSSVX_ABGLOBAL
-
-/* These are the functions defined in F90 wraper */
-#define f_create_gridinfo_handle         F_CREATE_GRIDINFO_HANDLE
-#define f_create_options_handle          F_CREATE_OPTIONS_HANDLE
-#define f_create_ScalePerm_handle        F_CREATE_SCALEPERM_HANDLE
-#define f_create_LUstruct_handle         F_CREATE_LUSTRUCT_HANDLE
-#define f_create_SOLVEstruct_handle      F_CREATE_SOLVESTRUCT_HANDLE
-#define f_create_SuperMatrix_handle      F_CREATE_SUPERMATRIX_HANDLE
-#define f_destroy_gridinfo_handle        F_DESTROY_GRIDINFO_HANDLE
-#define f_destroy_options_handle         F_DESTROY_OPTIONS_HANDLE
-#define f_destroy_ScalePerm_handle       F_DESTROY_SCALEPERM_HANDLE
-#define f_destroy_LUstruct_handle        F_DESTROY_LUSTRUCT_HANDLE
-#define f_destroy_SOLVEstruct_handle     F_DESTROY_SOLVESTRUCT_HANDLE
-#define f_destroy_SuperMatrix_handle     F_DESTROY_SUPERMATRIX_HANDLE
-#define f_create_SuperLUStat_handle      F_CREATE_SUPERLUSTAT_HANDLE
-#define f_destroy_SuperLUStat_handle     F_DESTROY_SUPERLUSTAT_HANDLE
-#define f_get_gridinfo                   F_GET_GRIDINFO
-#define f_get_SuperMatrix                F_GET_SUPERMATRIX
-#define f_set_SuperMatrix                F_SET_SUPERMATRIX
-#define f_get_CompRowLoc_Matrix          F_GET_COMPROWLOC_MATRIX
-#define f_set_CompRowLoc_Matrix          F_SET_COMPROWLOC_MATRIX
-#define f_get_superlu_options            F_GET_SUPERLU_OPTIONS
-#define f_set_superlu_options            F_SET_SUPERLU_OPTIONS
-#define f_set_default_options            F_SET_DEFAULT_OPTIONS
-#define f_superlu_gridinit               F_SUPERLU_GRIDINIT
-#define f_superlu_gridexit               F_SUPERLU_GRIDEXIT
-#define f_ScalePermstructInit            F_SCALEPERMSTRUCTINIT
-#define f_ScalePermstructFree            F_SCALEPERMSTRUCTFREE
-#define f_PStatInit                      F_PSTATINIT
-#define f_PStatFree                      F_PSTATFREE
-#define f_LUstructInit                   F_LUSTRUCTINIT
-#define f_LUstructFree                   F_LUSTRUCTFREE
-#define f_Destroy_LU                     F_DESTROY_LU
-#define f_dCreate_CompRowLoc_Mat_dist    F_DCREATE_COMPROWLOC_MAT_DIST
-#define f_Destroy_CompRowLoc_Mat_dist    F_DESTROY_COMPROWLOC_MAT_DIST
-#define f_Destroy_SuperMat_Store_dist    F_DESTROY_SUPERMAT_STORE_DIST
-#define f_dSolveFinalize                 F_DSOLVEFINALIZE
-#define f_pdgssvx                        F_PDGSSVX
-#define f_dcreate_dist_matrix            F_DCREATE_DIST_MATRIX
-#define f_check_malloc                   F_CHECK_MALLOC
-#endif
-
-#if (F77_CALL_C == NOCHANGE)
-/*
- * These defines set up the naming scheme required to have a fortran 77
- * routine call a C routine 
- * for following Fortran to C interface:
- *           FORTRAN CALL               C DECLARATION
- *           call dgemm(...)           void dgemm(...)
- */
-/* BLAS */
-#define sasum_    sasum
-#define isamax_   isamax
-#define scopy_    scopy
-#define sscal_    sscal
-#define sger_     sger
-#define snrm2_    snrm2
-#define ssymv_    ssymv
-#define sdot_     sdot
-#define saxpy_    saxpy
-#define ssyr2_    ssyr2
-#define srot_     srot
-#define sgemv_    sgemv
-#define strsv_    strsv
-#define sgemm_    sgemm
-#define strsm_    strsm
-
-#define dasum_    dasum
-#define idamax_   idamax
-#define dcopy_    dcopy
-#define dscal_    dscal
-#define dger_     dger
-#define dnrm2_    dnrm2
-#define dsymv_    dsymv
-#define ddot_     ddot
-#define daxpy_    daxpy
-#define dsyr2_    dsyr2
-#define drot_     drot
-#define dgemv_    dgemv
-#define dtrsv_    dtrsv
-#define dgemm_    dgemm
-#define dtrsm_    dtrsm
-
-#define scasum_   scasum
-#define icamax_   icamax
-#define ccopy_    ccopy
-#define cscal_    cscal
-#define scnrm2_   scnrm2
-#define caxpy_    caxpy
-#define cgemv_    cgemv
-#define ctrsv_    ctrsv
-#define cgemm_    cgemm
-#define ctrsm_    ctrsm
-#define cgerc_    cgerc
-#define chemv_    chemv
-#define cher2_    cher2
-
-#define dzasum_   dzasum
-#define izamax_   izamax
-#define zcopy_    zcopy
-#define zscal_    zscal
-#define dznrm2_   dznrm2
-#define zaxpy_    zaxpy
-#define zgemv_    zgemv
-#define ztrsv_    ztrsv
-#define zgemm_    zgemm
-#define ztrsm_    ztrsm
-#define zgerc_    zgerc
-#define zhemv_    zhemv
-#define zher2_    zher2
-#define zgeru_    zgeru
-
-/* LAPACK */
-#define dlamch_         dlamch
-#define slamch_         slamch
-#define xerbla_         xerbla
-#define lsame_          lsame
-
-#define mc64id_         mc64id
-#define mc64ad_         mc64ad
-
-#define c_bridge_dgssv_               c_bridge_dgssv
-#define c_fortran_slugrid_            c_fortran_slugrid
-#define c_fortran_pdgssvx_            c_fortran_pdgssvx
-#define c_fortran_pdgssvx_ABglobal_   c_fortran_pdgssvx_abglobal
-#define c_fortran_pzgssvx_            c_fortran_pzgssvx
-#define c_fortran_pzgssvx_ABglobal_   c_fortran_pzgssvx_abglobal
-
-/* These are the functions defined in F90 wraper */
-#define f_create_gridinfo_handle         f_create_gridinfo_handle
-#define f_create_options_handle          f_create_options_handle
-#define f_create_ScalePerm_handle        f_create_scaleperm_handle
-#define f_create_LUstruct_handle         f_create_lustruct_handle
-#define f_create_SOLVEstruct_handle      f_create_solvestruct_handle
-#define f_create_SuperMatrix_handle      f_create_supermatrix_handle
-#define f_destroy_gridinfo_handle        f_destroy_gridinfo_handle
-#define f_destroy_options_handle         f_destroy_options_handle
-#define f_destroy_ScalePerm_handle       f_destroy_scaleperm_handle
-#define f_destroy_LUstruct_handle        f_destroy_lustruct_handle
-#define f_destroy_SOLVEstruct_handle     f_destroy_solvestruct_handle
-#define f_destroy_SuperMatrix_handle     f_destroy_supermatrix_handle
-#define f_create_SuperLUStat_handle      f_create_superlustat_handle
-#define f_destroy_SuperLUStat_handle     f_destroy_superlustat_handle
-#define f_get_gridinfo                   f_get_gridinfo
-#define f_get_SuperMatrix                f_get_supermatrix
-#define f_set_SuperMatrix                f_set_supermatrix
-#define f_get_CompRowLoc_Matrix          f_get_comprowloc_matrix 
-#define f_set_CompRowLoc_Matrix          f_set_comprowloc_matrix
-#define f_get_superlu_options            f_get_superlu_options
-#define f_set_superlu_options            f_set_superlu_options
-#define f_set_default_options            f_set_default_options
-#define f_superlu_gridinit               f_superlu_gridinit
-#define f_superlu_gridexit               f_superlu_gridexit
-#define f_ScalePermstructInit            f_scalepermstructinit
-#define f_ScalePermstructFree            f_scalepermstructfree
-#define f_PStatInit                      f_pstatinit
-#define f_PStatFree                      f_pstatfree
-#define f_LUstructInit                   f_lustructinit
-#define f_LUstructFree                   f_lustructfree
-#define f_Destroy_LU                     f_destroy_lu
-#define f_dCreate_CompRowLoc_Mat_dist    f_dcreate_comprowloc_mat_dist
-#define f_Destroy_CompRowLoc_Mat_dist    f_destroy_comprowloc_mat_dist
-#define f_Destroy_SuperMat_Store_dist    f_destroy_supermat_store_dist
-#define f_dSolveFinalize                 f_dsolvefinalize
-#define f_pdgssvx                        f_pdgssvx
-#define f_dcreate_dist_matrix            f_dcreate_dist_matrix
-#define f_check_malloc                   f_check_malloc
-#endif
-
-#include "_hypre_utilities.h"
-
-#endif /* __SUPERLU_CNAMES */
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/Cnames.h.old hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/Cnames.h.old
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/Cnames.h.old	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/Cnames.h.old	1970-01-01 00:00:00.000000000 +0000
@@ -1,285 +0,0 @@
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-
-#ifndef __SUPERLU_CNAMES /* allow multiple inclusions */
-#define __SUPERLU_CNAMES
-
-/*
- * These macros define how C routines will be called.  ADD_ assumes that
- * they will be called by fortran, which expects C routines to have an
- * underscore postfixed to the name (Suns, and the Intel expect this).
- * NOCHANGE indicates that fortran will be calling, and that it expects
- * the name called by fortran to be identical to that compiled by the C
- * (RS6K's do this).  UPCASE says it expects C routines called by fortran
- * to be in all upcase (CRAY wants this). 
- */
-
-#define ADD_       0
-#define NOCHANGE   1
-#define UPCASE     2
-#define C_CALL     3
-
-#ifdef UpCase
-#define F77_CALL_C UPCASE
-#endif
-
-#ifdef NoChange
-#define F77_CALL_C NOCHANGE
-#endif
-
-#ifdef Add_
-#define F77_CALL_C ADD_
-#endif
-
-#ifndef F77_CALL_C
-#define F77_CALL_C ADD_
-#endif
-
-#if (F77_CALL_C == ADD_)
-/*
- * These defines set up the naming scheme required to have a fortran 77
- * routine call a C routine
- * No redefinition necessary to have following Fortran to C interface:
- *           FORTRAN CALL               C DECLARATION
- *           call dgemm(...)           void dgemm_(...)
- *
- * This is the default.
- */
-/* These are the functions defined in F90 wraper */
-#define f_create_gridinfo_handle         f_create_gridinfo_handle_
-#define f_create_options_handle          f_create_options_handle_
-#define f_create_ScalePerm_handle        f_create_scaleperm_handle_
-#define f_create_LUstruct_handle         f_create_lustruct_handle_
-#define f_create_SOLVEstruct_handle      f_create_solvestruct_handle_
-#define f_create_SuperMatrix_handle      f_create_supermatrix_handle_
-#define f_destroy_gridinfo_handle        f_destroy_gridinfo_handle_
-#define f_destroy_options_handle         f_destroy_options_handle_
-#define f_destroy_ScalePerm_handle       f_destroy_scaleperm_handle_
-#define f_destroy_LUstruct_handle        f_destroy_lustruct_handle_
-#define f_destroy_SOLVEstruct_handle     f_destroy_solvestruct_handle_
-#define f_destroy_SuperMatrix_handle     f_destroy_supermatrix_handle_
-#define f_create_SuperLUStat_handle      f_create_superlustat_handle_
-#define f_destroy_SuperLUStat_handle     f_destroy_superlustat_handle_
-#define f_get_gridinfo                   f_get_gridinfo_
-#define f_get_SuperMatrix                f_get_supermatrix_
-#define f_set_SuperMatrix                f_set_supermatrix_
-#define f_get_CompRowLoc_Matrix          f_get_comprowloc_matrix_ 
-#define f_set_CompRowLoc_Matrix          f_set_comprowloc_matrix_
-#define f_get_superlu_options            f_get_superlu_options_
-#define f_set_superlu_options            f_set_superlu_options_
-#define f_set_default_options            f_set_default_options_
-#define f_superlu_gridinit               f_superlu_gridinit_
-#define f_superlu_gridexit               f_superlu_gridexit_
-#define f_ScalePermstructInit            f_scalepermstructinit_
-#define f_ScalePermstructFree            f_scalepermstructfree_
-#define f_PStatInit                      f_pstatinit_
-#define f_PStatFree                      f_pstatfree_
-#define f_LUstructInit                   f_lustructinit_
-#define f_LUstructFree                   f_lustructfree_
-#define f_Destroy_LU                     f_destroy_lu_
-#define f_dCreate_CompRowLoc_Mat_dist    f_dcreate_comprowloc_mat_dist_
-#define f_Destroy_CompRowLoc_Mat_dist    f_destroy_comprowloc_mat_dist_
-#define f_Destroy_SuperMat_Store_dist    f_destroy_supermat_store_dist_
-#define f_dSolveFinalize                 f_dsolvefinalize_
-#define f_pdgssvx                        f_pdgssvx_
-#define f_dcreate_dist_matrix            f_dcreate_dist_matrix_
-#define f_check_malloc                   f_check_malloc_
-#endif
-
-#if (F77_CALL_C == UPCASE)
-/*
- * These defines set up the naming scheme required to have a fortran 77
- * routine call a C routine 
- * following Fortran to C interface:
- *           FORTRAN CALL               C DECLARATION
- *           call dgemm(...)           void DGEMM(...)
- */
-
-/* LAPACK */
-#define dlamch_         DLAMCH
-#define slamch_         SLAMCH
-#define xerbla_         XERBLA
-#define lsame_          LSAME
-
-#define mc64id_         MC64ID
-#define mc64ad_         MC64AD
-#define c_bridge_dgssv_               C_BRIDGE_DGSSV
-#define c_fortran_slugrid_            C_FORTRAN_SLUGRID
-#define c_fortran_pdgssvx_            C_FORTRAN_PDGSSVX
-#define c_fortran_pdgssvx_ABglobal_   C_FORTRAN_PDGSSVX_ABGLOBAL
-#define c_fortran_pzgssvx_            C_FORTRAN_PZGSSVX
-#define c_fortran_pzgssvx_ABglobal_   C_FORTRAN_PZGSSVX_ABGLOBAL
-
-/* These are the functions defined in F90 wraper */
-#define f_create_gridinfo_handle         F_CREATE_GRIDINFO_HANDLE
-#define f_create_options_handle          F_CREATE_OPTIONS_HANDLE
-#define f_create_ScalePerm_handle        F_CREATE_SCALEPERM_HANDLE
-#define f_create_LUstruct_handle         F_CREATE_LUSTRUCT_HANDLE
-#define f_create_SOLVEstruct_handle      F_CREATE_SOLVESTRUCT_HANDLE
-#define f_create_SuperMatrix_handle      F_CREATE_SUPERMATRIX_HANDLE
-#define f_destroy_gridinfo_handle        F_DESTROY_GRIDINFO_HANDLE
-#define f_destroy_options_handle         F_DESTROY_OPTIONS_HANDLE
-#define f_destroy_ScalePerm_handle       F_DESTROY_SCALEPERM_HANDLE
-#define f_destroy_LUstruct_handle        F_DESTROY_LUSTRUCT_HANDLE
-#define f_destroy_SOLVEstruct_handle     F_DESTROY_SOLVESTRUCT_HANDLE
-#define f_destroy_SuperMatrix_handle     F_DESTROY_SUPERMATRIX_HANDLE
-#define f_create_SuperLUStat_handle      F_CREATE_SUPERLUSTAT_HANDLE
-#define f_destroy_SuperLUStat_handle     F_DESTROY_SUPERLUSTAT_HANDLE
-#define f_get_gridinfo                   F_GET_GRIDINFO
-#define f_get_SuperMatrix                F_GET_SUPERMATRIX
-#define f_set_SuperMatrix                F_SET_SUPERMATRIX
-#define f_get_CompRowLoc_Matrix          F_GET_COMPROWLOC_MATRIX
-#define f_set_CompRowLoc_Matrix          F_SET_COMPROWLOC_MATRIX
-#define f_get_superlu_options            F_GET_SUPERLU_OPTIONS
-#define f_set_superlu_options            F_SET_SUPERLU_OPTIONS
-#define f_set_default_options            F_SET_DEFAULT_OPTIONS
-#define f_superlu_gridinit               F_SUPERLU_GRIDINIT
-#define f_superlu_gridexit               F_SUPERLU_GRIDEXIT
-#define f_ScalePermstructInit            F_SCALEPERMSTRUCTINIT
-#define f_ScalePermstructFree            F_SCALEPERMSTRUCTFREE
-#define f_PStatInit                      F_PSTATINIT
-#define f_PStatFree                      F_PSTATFREE
-#define f_LUstructInit                   F_LUSTRUCTINIT
-#define f_LUstructFree                   F_LUSTRUCTFREE
-#define f_Destroy_LU                     F_DESTROY_LU
-#define f_dCreate_CompRowLoc_Mat_dist    F_DCREATE_COMPROWLOC_MAT_DIST
-#define f_Destroy_CompRowLoc_Mat_dist    F_DESTROY_COMPROWLOC_MAT_DIST
-#define f_Destroy_SuperMat_Store_dist    F_DESTROY_SUPERMAT_STORE_DIST
-#define f_dSolveFinalize                 F_DSOLVEFINALIZE
-#define f_pdgssvx                        F_PDGSSVX
-#define f_dcreate_dist_matrix            F_DCREATE_DIST_MATRIX
-#define f_check_malloc                   F_CHECK_MALLOC
-#endif
-
-#if (F77_CALL_C == NOCHANGE)
-/*
- * These defines set up the naming scheme required to have a fortran 77
- * routine call a C routine 
- * for following Fortran to C interface:
- *           FORTRAN CALL               C DECLARATION
- *           call dgemm(...)           void dgemm(...)
- */
-/* BLAS */
-#define sasum_    sasum
-#define isamax_   isamax
-#define scopy_    scopy
-#define sscal_    sscal
-#define sger_     sger
-#define snrm2_    snrm2
-#define ssymv_    ssymv
-#define sdot_     sdot
-#define saxpy_    saxpy
-#define ssyr2_    ssyr2
-#define srot_     srot
-#define sgemv_    sgemv
-#define strsv_    strsv
-#define sgemm_    sgemm
-#define strsm_    strsm
-
-#define dasum_    dasum
-#define idamax_   idamax
-#define dcopy_    dcopy
-#define dscal_    dscal
-#define dger_     dger
-#define dnrm2_    dnrm2
-#define dsymv_    dsymv
-#define ddot_     ddot
-#define daxpy_    daxpy
-#define dsyr2_    dsyr2
-#define drot_     drot
-#define dgemv_    dgemv
-#define dtrsv_    dtrsv
-#define dgemm_    dgemm
-#define dtrsm_    dtrsm
-
-#define scasum_   scasum
-#define icamax_   icamax
-#define ccopy_    ccopy
-#define cscal_    cscal
-#define scnrm2_   scnrm2
-#define caxpy_    caxpy
-#define cgemv_    cgemv
-#define ctrsv_    ctrsv
-#define cgemm_    cgemm
-#define ctrsm_    ctrsm
-#define cgerc_    cgerc
-#define chemv_    chemv
-#define cher2_    cher2
-
-#define dzasum_   dzasum
-#define izamax_   izamax
-#define zcopy_    zcopy
-#define zscal_    zscal
-#define dznrm2_   dznrm2
-#define zaxpy_    zaxpy
-#define zgemv_    zgemv
-#define ztrsv_    ztrsv
-#define zgemm_    zgemm
-#define ztrsm_    ztrsm
-#define zgerc_    zgerc
-#define zhemv_    zhemv
-#define zher2_    zher2
-#define zgeru_    zgeru
-
-/* LAPACK */
-#define dlamch_         dlamch
-#define slamch_         slamch
-#define xerbla_         xerbla
-#define lsame_          lsame
-
-#define mc64id_         mc64id
-#define mc64ad_         mc64ad
-
-#define c_bridge_dgssv_               c_bridge_dgssv
-#define c_fortran_slugrid_            c_fortran_slugrid
-#define c_fortran_pdgssvx_            c_fortran_pdgssvx
-#define c_fortran_pdgssvx_ABglobal_   c_fortran_pdgssvx_abglobal
-#define c_fortran_pzgssvx_            c_fortran_pzgssvx_
-#define c_fortran_pzgssvx_ABglobal_   c_fortran_pzgssvx_abglobal
-
-/* These are the functions defined in F90 wraper */
-#define f_create_gridinfo_handle         f_create_gridinfo_handle
-#define f_create_options_handle          f_create_options_handle
-#define f_create_ScalePerm_handle        f_create_scaleperm_handle
-#define f_create_LUstruct_handle         f_create_lustruct_handle
-#define f_create_SOLVEstruct_handle      f_create_solvestruct_handle
-#define f_create_SuperMatrix_handle      f_create_supermatrix_handle
-#define f_destroy_gridinfo_handle        f_destroy_gridinfo_handle
-#define f_destroy_options_handle         f_destroy_options_handle
-#define f_destroy_ScalePerm_handle       f_destroy_scaleperm_handle
-#define f_destroy_LUstruct_handle        f_destroy_lustruct_handle
-#define f_destroy_SOLVEstruct_handle     f_destroy_solvestruct_handle
-#define f_destroy_SuperMatrix_handle     f_destroy_supermatrix_handle
-#define f_create_SuperLUStat_handle      f_create_superlustat_handle
-#define f_destroy_SuperLUStat_handle     f_destroy_superlustat_handle
-#define f_get_gridinfo                   f_get_gridinfo
-#define f_get_SuperMatrix                f_get_supermatrix
-#define f_set_SuperMatrix                f_set_supermatrix
-#define f_get_CompRowLoc_Matrix          f_get_comprowloc_matrix 
-#define f_set_CompRowLoc_Matrix          f_set_comprowloc_matrix
-#define f_get_superlu_options            f_get_superlu_options
-#define f_set_superlu_options            f_set_superlu_options
-#define f_set_default_options            f_set_default_options
-#define f_superlu_gridinit               f_superlu_gridinit
-#define f_superlu_gridexit               f_superlu_gridexit
-#define f_ScalePermstructInit            f_scalepermstructinit
-#define f_ScalePermstructFree            f_scalepermstructfree
-#define f_PStatInit                      f_pstatinit
-#define f_PStatFree                      f_pstatfree
-#define f_LUstructInit                   f_lustructinit
-#define f_LUstructFree                   f_lustructfree
-#define f_Destroy_LU                     f_destroy_lu
-#define f_dCreate_CompRowLoc_Mat_dist    f_dcreate_comprowloc_mat_dist
-#define f_Destroy_CompRowLoc_Mat_dist    f_destroy_comprowloc_mat_dist
-#define f_Destroy_SuperMat_Store_dist    f_destroy_supermat_store_dist
-#define f_dSolveFinalize                 f_dsolvefinalize
-#define f_pdgssvx                        f_pdgssvx
-#define f_dcreate_dist_matrix            f_dcreate_dist_matrix
-#define f_check_malloc                   f_check_malloc
-#endif
-
-#endif /* __SUPERLU_CNAMES */
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/comm.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/comm.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/comm.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/comm.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,104 +0,0 @@
-#include "superlu_ddefs.h"
-
-
-void
-bcast_tree(void *buf, int count, MPI_Datatype dtype, int root, int tag,
-	   gridinfo_t *grid, int scope, int *recvcnt)
-/* 
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- * Purpose
- * =======
- *   Broadcast an array of *dtype* numbers. The communication pattern
- *   is a tree with number of branches equal to NBRANCHES.
- *   The process ranks are between 0 and Np-1.
- * 
- *   The following two pairs of graphs give different ways of viewing the same
- *   algorithm.  The first pair shows the trees as they should be visualized
- *   when examining the algorithm.  The second pair are isomorphic graphs of
- *   of the first, which show the actual pattern of data movement.
- *   Note that a tree broadcast with NBRANCHES = 2 is isomorphic with a
- *   hypercube broadcast (however, it does not require the nodes be a
- *   power of two to work).
- *
- *    TREE BROADCAST, NBRANCHES = 2     *    TREE BROADCAST, NBRANCHES = 3
- *       
- *     root=2
- * i=4   &______________                *
- *       |              \               *       root=2
- * i=2   &______         &______        * i=3     &______________________
- *       |      \        |      \       *         |          \           \
- * i=1   &__     &__     &__     &__    * i=1     &______     &______     &__
- *       |  \    |  \    |  \    |  \   *         |  \   \    |  \   \    |  \
- *       2   3   4   5   6   7   0   1  *         2   3   4   5   6   7   0   1
- *
- *
- *          ISOMORPHIC GRAPHS OF ABOVE, SHOWN IN MORE FAMILIAR TERMS:
- *
- *                2                                           2
- *       _________|_________                       ___________|____________
- *      /         |         \                     /           |      |     \
- *     6          4          3                   5            0      3      4
- *    / \         |                             / \           |
- *   0   7        5                            6   7          1
- *   |
- *   1
- *
- *
- * Arguments
- * =========
- * 
- * scope
- */
-{
-    int Iam, i, j, Np, nbranches = 2;
-    int destdist; /* The distance of the destination node. */
-    int mydist;   /* My distance from root. */
-    superlu_scope_t *scp;
-    MPI_Status status;
-
-    if ( scope == COMM_COLUMN ) scp = &grid->cscp;
-    else if ( scope == ROW ) scp = &grid->rscp;
-    Np = scp->Np;
-    if ( Np < 2 ) return;
-    Iam = scp->Iam;
-    
-    if ( Iam == root ) {
-	for (i = nbranches; i < Np; i *= nbranches);
-	for (i /= nbranches; i > 0; i /= nbranches) {
-	    for (j = 1; j < nbranches; ++j) {
-		destdist = i*j;
-		if ( destdist < Np )
-		    MPI_Send( buf, count, dtype, (Iam+destdist)%Np, 
-			     tag, scp->comm );
-	    }
-	}
-    } else {
-	mydist = (Np + Iam - root) % Np;
-	for (i = nbranches; i < Np; i *= nbranches);
-	for (i /= nbranches; (mydist%i); i /= nbranches);
-/*	MPI_Probe( MPI_ANY_SOURCE, tag, scp->comm, &status );*/
-	MPI_Recv( buf, count, dtype, MPI_ANY_SOURCE, tag, scp->comm, &status );
-	MPI_Get_count( &status, dtype, recvcnt );
-
-	/* I need to send data to others. */
-	while ( (i > 1) && !(mydist%i) ) {
-	    i /= nbranches;
-	    for (j = 1; j < nbranches; ++j) {
-		destdist = mydist + j*i;
-		if ( destdist < Np )
-		    MPI_Send( buf, *recvcnt, dtype, (root+destdist)%Np, 
-			     tag, scp->comm );
-	    }
-	}
-    }
-} /* BCAST_TREE */
-
-
-
-
-
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dcomplex.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dcomplex.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dcomplex.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dcomplex.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,105 +0,0 @@
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-
-/*
- * This file defines common arithmetic operations for complex type.
- */
-
-#include <math.h>
-#include <stdio.h>
-#include "dcomplex.h"
-
-
-/* Complex Division c = a/b */
-void z_div(doublecomplex *c, doublecomplex *a, doublecomplex *b)
-{
-    double ratio, den;
-    double abr, abi, cr, ci;
-  
-    if( (abr = b->r) < 0.)
-	abr = - abr;
-    if( (abi = b->i) < 0.)
-	abi = - abi;
-    if( abr <= abi ) {
-	if (abi == 0) {
-	    fprintf(stderr, "z_div.c: division by zero");
-	    exit(-1);
-	}	  
-	ratio = b->r / b->i ;
-	den = b->i * (1 + ratio*ratio);
-	cr = (a->r*ratio + a->i) / den;
-	ci = (a->i*ratio - a->r) / den;
-    } else {
-	ratio = b->i / b->r ;
-	den = b->r * (1 + ratio*ratio);
-	cr = (a->r + a->i*ratio) / den;
-	ci = (a->i - a->r*ratio) / den;
-    }
-    c->r = cr;
-    c->i = ci;
-}
-
-
-/* Returns sqrt(z.r^2 + z.i^2) */
-double z_abs(doublecomplex *z)
-{
-    double temp;
-    double real = z->r;
-    double imag = z->i;
-
-    if (real < 0) real = -real;
-    if (imag < 0) imag = -imag;
-    if (imag > real) {
-	temp = real;
-	real = imag;
-	imag = temp;
-    }
-    if ((real+imag) == real) return(real);
-  
-    temp = imag/real;
-    temp = real*sqrt(1.0 + temp*temp);  /*overflow!!*/
-    return (temp);
-}
-
-
-/* Approximates the abs */
-/* Returns abs(z.r) + abs(z.i) */
-double z_abs1(doublecomplex *z)
-{
-    double real = z->r;
-    double imag = z->i;
-  
-    if (real < 0) real = -real;
-    if (imag < 0) imag = -imag;
-
-    return (real + imag);
-}
-
-/* Return the exponentiation */
-void z_exp(doublecomplex *r, doublecomplex *z)
-{
-    double expx;
-
-    expx = exp(z->r);
-    r->r = expx * cos(z->i);
-    r->i = expx * sin(z->i);
-}
-
-/* Return the complex conjugate */
-void d_cnjg(doublecomplex *r, doublecomplex *z)
-{
-    r->r = z->r;
-    r->i = -z->i;
-}
-
-/* Return the imaginary part */
-double d_imag(doublecomplex *z)
-{
-    return (z->i);
-}
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dcomplex.h hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dcomplex.h
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dcomplex.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dcomplex.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,71 +0,0 @@
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-
-/* 
- * This header file is to be included in source files z*.c
- */
-#ifndef __SUPERLU_DCOMPLEX /* allow multiple inclusions */
-#define __SUPERLU_DCOMPLEX
-
-#include <mpi.h>
-
-typedef struct { double r, i; } doublecomplex;
-
-/*
- * These variables will be defined to be MPI datatypes for complex
- * and double complex. I'm too lazy to declare
- * these guys external in every file that needs them.
- */
-extern MPI_Datatype SuperLU_MPI_DOUBLE_COMPLEX;
-
-
-/* Macro definitions */
-
-/* Complex Addition c = a + b */
-#define z_add(c, a, b) { (c)->r = (a)->r + (b)->r; \
-			 (c)->i = (a)->i + (b)->i; }
-
-/* Complex Subtraction c = a - b */
-#define z_sub(c, a, b) { (c)->r = (a)->r - (b)->r; \
-			 (c)->i = (a)->i - (b)->i; }
-
-/* Complex-Double Multiplication */
-#define zd_mult(c, a, b) { (c)->r = (a)->r * (b); \
-                           (c)->i = (a)->i * (b); }
-
-/* Complex-Complex Multiplication */
-#define zz_mult(c, a, b) { \
-	double cr, ci; \
-    	cr = (a)->r * (b)->r - (a)->i * (b)->i; \
-    	ci = (a)->i * (b)->r + (a)->r * (b)->i; \
-    	(c)->r = cr; \
-    	(c)->i = ci; \
-    }
-
-/* Complex equality testing */
-#define z_eq(a, b)  ( (a)->r == (b)->r && (a)->i == (b)->i )
-
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* Prototypes for functions in dcomplex.c */
-void   z_div(doublecomplex *, doublecomplex *, doublecomplex *);
-double z_abs(doublecomplex *);     /* exact */
-double z_abs1(doublecomplex *);    /* approximate */
-void   z_exp(doublecomplex *, doublecomplex *);
-void   d_cnjg(doublecomplex *r, doublecomplex *z);
-double d_imag(doublecomplex *);
-
-
-#ifdef __cplusplus
-  }
-#endif
-
-
-#endif  /* __SUPERLU_DCOMPLEX */
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/ddistribute.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/ddistribute.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/ddistribute.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/ddistribute.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,725 +0,0 @@
-
-
-#include "superlu_ddefs.h"
-
-int_t
-ddistribute(fact_t fact, int_t n, SuperMatrix *A, 
-            Glu_freeable_t *Glu_freeable,
-	    LUstruct_t *LUstruct, gridinfo_t *grid)
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- *
- * Purpose
- * =======
- *   Distribute the matrix onto the 2D process mesh.
- * 
- * Arguments
- * =========
- * 
- * fact (input) fact_t
- *        Specifies whether or not the L and U structures will be re-used.
- *        = SamePattern_SameRowPerm: L and U structures are input, and
- *                                   unchanged on exit.
- *        = DOFACT or SamePattern: L and U structures are computed and output.
- *
- * n      (input) int
- *        Dimension of the matrix.
- *
- * A      (input) SuperMatrix*
- *	  The original matrix A, permuted by columns, of dimension
- *        (A->nrow, A->ncol). The type of A can be:
- *        Stype = SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE.
- *
- * LUstruct (input) LUstruct_t*
- *        Data structures for L and U factors.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh.
- *
- * Return value
- * ============
- *   > 0, working storage required (in bytes).
- *
- */
-{
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-    int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, jb, jj, k, 
-          len, len1, nsupc;
-    int_t ljb;  /* local block column number */
-    int_t nrbl; /* number of L blocks in current block column */
-    int_t nrbu; /* number of U blocks in current block column */
-    int_t gb;   /* global block number; 0 < gb <= nsuper */
-    int_t lb;   /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
-    int iam, jbrow, kcol, mycol, myrow, pc, pr;
-    int_t mybufmax[NBUFFERS];
-    NCPformat *Astore;
-    double *a;
-    int_t *asub;
-    int_t *xa_begin, *xa_end;
-    int_t *xsup = Glu_persist->xsup;    /* supernode and column mapping */
-    int_t *supno = Glu_persist->supno;   
-    int_t *lsub, *xlsub, *usub, *xusub;
-    int_t nsupers;
-    int_t next_lind;      /* next available position in index[*] */
-    int_t next_lval;      /* next available position in nzval[*] */
-    int_t *index;         /* indices consist of headers and row subscripts */
-    double *lusup, *uval; /* nonzero values in L and U */
-    double **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
-    int_t  **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
-    double **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
-    int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
-
-    /*-- Counts to be used in factorization. --*/
-    int_t  *ToRecv, *ToSendD, **ToSendR;
-
-    /*-- Counts to be used in lower triangular solve. --*/
-    int_t  *fmod;          /* Modification count for L-solve.        */
-    int_t  **fsendx_plist; /* Column process list to send down Xk.   */
-    int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
-    int_t  nfsendx = 0;    /* Number of Xk I will send               */
-    int_t  kseen;
-
-    /*-- Counts to be used in upper triangular solve. --*/
-    int_t  *bmod;          /* Modification count for U-solve.        */
-    int_t  **bsendx_plist; /* Column process list to send down Xk.   */
-    int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
-    int_t  nbsendx = 0;    /* Number of Xk I will send               */
-    int_t  *ilsum;         /* starting position of each supernode in 
-			      the full array (local)                 */
-
-    /*-- Auxiliary arrays; freed on return --*/
-    int_t *rb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
-    int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr)             */
-    int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr)      */
-    int_t *Urb_fstnz;  /* # of fstnz in a block row; size ceil(NSUPERS/Pr)  */
-    int_t *Ucbs;       /* number of column blocks in a block row            */
-    int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr)             */
-    int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr)        */
-    int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr)      */
-    int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr)      */
-    double *dense, *dense_col; /* SPA */
-    double zero = 0.0;
-    int_t  ldaspa;     /* LDA of SPA */
-    int_t mem_use = 0, iword, dword;
-
-#if ( PRNTlevel>=1 )
-    int_t nLblocks = 0, nUblocks = 0;
-#endif
-#if ( PROFlevel>=1 ) 
-    double t, t_u, t_l;
-    int_t u_blks;
-#endif
-
-    /* Initialization. */
-    iam = grid->iam;
-    myrow = MYROW( iam, grid );
-    mycol = MYCOL( iam, grid );
-    for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0;
-    nsupers  = supno[n-1] + 1;
-    Astore   = A->Store;
-    a        = Astore->nzval;
-    asub     = Astore->rowind;
-    xa_begin = Astore->colbeg;
-    xa_end   = Astore->colend;
-#if ( PRNTlevel>=1 )
-    iword = sizeof(int_t);
-    dword = sizeof(double);
-#endif
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter ddistribute()");
-#endif
-
-    if ( fact == SamePattern_SameRowPerm ) {
-        /* ---------------------------------------------------------------
-         * REUSE THE L AND U DATA STRUCTURES FROM A PREVIOUS FACTORIZATION.
-         * --------------------------------------------------------------- */
-
-#if ( PROFlevel>=1 )
-	t_l = t_u = 0; u_blks = 0;
-#endif
-	/* We can propagate the new values of A into the existing
-	   L and U data structures.            */
-	ilsum = Llu->ilsum;
-	ldaspa = Llu->ldalsum;
-	if ( !(dense = doubleCalloc_dist(((size_t)ldaspa) * sp_ienv_dist(3))) )
-	    ABORT("Calloc fails for SPA dense[].");
-	nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */
-	if ( !(Urb_length = intCalloc_dist(nrbu)) )
-	    ABORT("Calloc fails for Urb_length[].");
-	if ( !(Urb_indptr = intMalloc_dist(nrbu)) )
-	    ABORT("Malloc fails for Urb_indptr[].");
-	Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-	Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
-	Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
-	Unzval_br_ptr = Llu->Unzval_br_ptr;
-#if ( PRNTlevel>=1 )
-	mem_use += 2*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword;
-#endif
-#if ( PROFlevel>=1 )
-	t = SuperLU_timer_();
-#endif
-
-	/* Initialize Uval to zero. */
-	for (lb = 0; lb < nrbu; ++lb) {
-	    Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
-	    index = Ufstnz_br_ptr[lb];
-	    if ( index ) {
-		uval = Unzval_br_ptr[lb];
-		len = index[1];
-		for (i = 0; i < len; ++i) uval[i] = zero;
-	    } /* if index != NULL */
-	} /* for lb ... */
-
-	for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */
-	    pc = PCOL( jb, grid );
-	    if ( mycol == pc ) { /* Block column jb in my process column */
-		fsupc = FstBlockC( jb );
-		nsupc = SuperSize( jb );
-
- 		/* Scatter A into SPA (for L), or into U directly. */
-		for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) {
-		    for (i = xa_begin[j]; i < xa_end[j]; ++i) {
-			irow = asub[i];
-			gb = BlockNum( irow );
-			if ( myrow == PROW( gb, grid ) ) {
-			    lb = LBi( gb, grid );
- 			    if ( gb < jb ) { /* in U */
- 				index = Ufstnz_br_ptr[lb];
- 				uval = Unzval_br_ptr[lb];
- 				while (  (k = index[Urb_indptr[lb]]) < jb ) {
- 				    /* Skip nonzero values in this block */
- 				    Urb_length[lb] += index[Urb_indptr[lb]+1];
- 				    /* Move pointer to the next block */
- 				    Urb_indptr[lb] += UB_DESCRIPTOR
- 					+ SuperSize( k );
- 				}
- 				/*assert(k == jb);*/
- 				/* start fstnz */
- 				istart = Urb_indptr[lb] + UB_DESCRIPTOR;
- 				len = Urb_length[lb];
- 				fsupc1 = FstBlockC( gb+1 );
- 				k = j - fsupc;
- 				/* Sum the lengths of the leading columns */
- 				for (jj = 0; jj < k; ++jj)
-				    len += fsupc1 - index[istart++];
-				/*assert(irow>=index[istart]);*/
-				uval[len + irow - index[istart]] = a[i];
-			    } else { /* in L; put in SPA first */
-  				irow = ilsum[lb] + irow - FstBlockC( gb );
-  				dense_col[irow] = a[i];
-  			    }
-  			}
-		    } /* for i ... */
-  		    dense_col += ldaspa;
-		} /* for j ... */
-
-#if ( PROFlevel>=1 )
-		t_u += SuperLU_timer_() - t;
-		t = SuperLU_timer_();
-#endif
-
-		/* Gather the values of A from SPA into Lnzval[]. */
-		ljb = LBj( jb, grid ); /* Local block number */
-		index = Lrowind_bc_ptr[ljb];
-		if ( index ) {
-		    nrbl = index[0];   /* Number of row blocks. */
-		    len = index[1];    /* LDA of lusup[]. */
-		    lusup = Lnzval_bc_ptr[ljb];
-		    next_lind = BC_HEADER;
-		    next_lval = 0;
-		    for (jj = 0; jj < nrbl; ++jj) {
-			gb = index[next_lind++];
-			len1 = index[next_lind++]; /* Rows in the block. */
-			lb = LBi( gb, grid );
-			for (bnnz = 0; bnnz < len1; ++bnnz) {
-			    irow = index[next_lind++]; /* Global index. */
-			    irow = ilsum[lb] + irow - FstBlockC( gb );
-			    k = next_lval++;
-			    for (j = 0, dense_col = dense; j < nsupc; ++j) {
-				lusup[k] = dense_col[irow];
-				dense_col[irow] = zero;
-				k += len;
-				dense_col += ldaspa;
-			    }
-			} /* for bnnz ... */
-		    } /* for jj ... */
-		} /* if index ... */
-#if ( PROFlevel>=1 )
-		t_l += SuperLU_timer_() - t;
-#endif
-	    } /* if mycol == pc */
-	} /* for jb ... */
-
-	SUPERLU_FREE(dense);
-	SUPERLU_FREE(Urb_length);
-	SUPERLU_FREE(Urb_indptr);
-#if ( PROFlevel>=1 )
-	if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n",
-			   t_l, t_u, u_blks, nrbu);
-#endif
-
-    } else { 
-        /* --------------------------------------------------
-         * FIRST TIME CREATING THE L AND U DATA STRUCTURE. 
-         * -------------------------------------------------- */
-
-#if ( PROFlevel>=1 )
-	t_l = t_u = 0; u_blks = 0;
-#endif
-	/* No L and U data structures are available yet.
-	   We need to set up the L and U data structures and propagate
-	   the values of A into them.          */
-	lsub = Glu_freeable->lsub;    /* compressed L subscripts */
-	xlsub = Glu_freeable->xlsub;
-	usub = Glu_freeable->usub;    /* compressed U subscripts */
-	xusub = Glu_freeable->xusub;
-    
-	if ( !(ToRecv = intCalloc_dist(nsupers)) )
-	    ABORT("Calloc fails for ToRecv[].");
-
-	k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */
-	if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
-	    ABORT("Malloc fails for ToSendR[].");
-	j = k * grid->npcol;
-	if ( !(index = intMalloc_dist(j)) )
-	    ABORT("Malloc fails for index[].");
-#if ( PRNTlevel>=1 )
-	mem_use += k*sizeof(int_t*) + (j + nsupers)*iword;
-#endif
-	for (i = 0; i < j; ++i) index[i] = EMPTY;
-	for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j];
-	k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
-
-	/* Pointers to the beginning of each block row of U. */
-	if ( !(Unzval_br_ptr = 
-               (double**)SUPERLU_MALLOC(k * sizeof(double*))) )
-	    ABORT("Malloc fails for Unzval_br_ptr[].");
-	if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
-	    ABORT("Malloc fails for Ufstnz_br_ptr[].");
-	
-	if ( !(ToSendD = intCalloc_dist(k)) )
-	    ABORT("Malloc fails for ToSendD[].");
-	if ( !(ilsum = intMalloc_dist(k+1)) )
-	    ABORT("Malloc fails for ilsum[].");
-
-	/* Auxiliary arrays used to set up U block data structures.
-	   They are freed on return. */
-	if ( !(rb_marker = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for rb_marker[].");
-	if ( !(Urb_length = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for Urb_length[].");
-	if ( !(Urb_indptr = intMalloc_dist(k)) )
-	    ABORT("Malloc fails for Urb_indptr[].");
-	if ( !(Urb_fstnz = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for Urb_fstnz[].");
-	if ( !(Ucbs = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for Ucbs[].");
-#if ( PRNTlevel>=1 )	
-	mem_use += 2*k*sizeof(int_t*) + (7*k+1)*iword;
-#endif
-	/* Compute ldaspa and ilsum[]. */
-	ldaspa = 0;
-	ilsum[0] = 0;
-	for (gb = 0; gb < nsupers; ++gb) {
-	    if ( myrow == PROW( gb, grid ) ) {
-		i = SuperSize( gb );
-		ldaspa += i;
-		lb = LBi( gb, grid );
-		ilsum[lb + 1] = ilsum[lb] + i;
-	    }
-	}
-	
-            
-	/* ------------------------------------------------------------
-	   COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U.
-	   THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U).
-	   ------------------------------------------------------------*/
-	
-	/* Loop through each supernode column. */
-	for (jb = 0; jb < nsupers; ++jb) {
-	    pc = PCOL( jb, grid );
-	    fsupc = FstBlockC( jb );
-	    nsupc = SuperSize( jb );
-	    /* Loop through each column in the block. */
-	    for (j = fsupc; j < fsupc + nsupc; ++j) {
-		/* usub[*] contains only "first nonzero" in each segment. */
-		for (i = xusub[j]; i < xusub[j+1]; ++i) {
-		    irow = usub[i]; /* First nonzero of the segment. */
-		    gb = BlockNum( irow );
-		    kcol = PCOL( gb, grid );
-		    ljb = LBj( gb, grid );
-		    if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES;
-		    pr = PROW( gb, grid );
-		    lb = LBi( gb, grid );
-		    if ( mycol == pc ) {
-			if  ( myrow == pr ) {
-			    ToSendD[lb] = YES;
-			    /* Count nonzeros in entire block row. */
-			    Urb_length[lb] += FstBlockC( gb+1 ) - irow;
-			    if (rb_marker[lb] <= jb) {/* First see the block */
-				rb_marker[lb] = jb + 1;
-				Urb_fstnz[lb] += nsupc;
-				++Ucbs[lb]; /* Number of column blocks
-					       in block row lb. */
-#if ( PRNTlevel>=1 )
-				++nUblocks;
-#endif
-			    }
-			    ToRecv[gb] = 1;
-			} else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */
-		    }
-		} /* for i ... */
-	    } /* for j ... */
-	} /* for jb ... */
-	
-	/* Set up the initial pointers for each block row in U. */
-	nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-	for (lb = 0; lb < nrbu; ++lb) {
-	    len = Urb_length[lb];
-	    rb_marker[lb] = 0; /* Reset block marker. */
-	    if ( len ) {
-		/* Add room for descriptors */
-		len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR;
-		if ( !(index = intMalloc_dist(len1+1)) )
-		    ABORT("Malloc fails for Uindex[].");
-		Ufstnz_br_ptr[lb] = index;
-		if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) )
-		    ABORT("Malloc fails for Unzval_br_ptr[*][].");
-		mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
-		mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
-		index[0] = Ucbs[lb]; /* Number of column blocks */
-		index[1] = len;      /* Total length of nzval[] */
-		index[2] = len1;     /* Total length of index[] */
-		index[len1] = -1;    /* End marker */
-	    } else {
-		Ufstnz_br_ptr[lb] = NULL;
-		Unzval_br_ptr[lb] = NULL;
-	    }
-	    Urb_length[lb] = 0; /* Reset block length. */
-	    Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
- 	    Urb_fstnz[lb] = BR_HEADER;
-	} /* for lb ... */
-
-	SUPERLU_FREE(Ucbs);
-
-#if ( PROFlevel>=1 )
-	t = SuperLU_timer_() - t;
-	if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t);
-#endif
-#if ( PRNTlevel>=1 )
-        mem_use -= 2*k * iword;
-#endif
-	/* Auxiliary arrays used to set up L block data structures.
-	   They are freed on return.
-	   k is the number of local row blocks.   */
-	if ( !(Lrb_length = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for Lrb_length[].");
-	if ( !(Lrb_number = intMalloc_dist(k)) )
-	    ABORT("Malloc fails for Lrb_number[].");
-	if ( !(Lrb_indptr = intMalloc_dist(k)) )
-	    ABORT("Malloc fails for Lrb_indptr[].");
-	if ( !(Lrb_valptr = intMalloc_dist(k)) )
-	    ABORT("Malloc fails for Lrb_valptr[].");
-	if (!(dense=doubleCalloc_dist(SUPERLU_MAX(1,((size_t)ldaspa)
-              *sp_ienv_dist(3)))))
-	    ABORT("Calloc fails for SPA dense[].");
-
-	/* These counts will be used for triangular solves. */
-	if ( !(fmod = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for fmod[].");
-	if ( !(bmod = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for bmod[].");
-#if ( PRNTlevel>=1 )	
-	mem_use += 6*k*iword + ldaspa*sp_ienv_dist(3)*dword;
-#endif
-	k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
-
-	/* Pointers to the beginning of each block column of L. */
-	if ( !(Lnzval_bc_ptr = (double**)SUPERLU_MALLOC(k * sizeof(double*))) )
-	    ABORT("Malloc fails for Lnzval_bc_ptr[].");
-	if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
-	    ABORT("Malloc fails for Lrowind_bc_ptr[].");
-	Lrowind_bc_ptr[k-1] = NULL;
-
-	/* These lists of processes will be used for triangular solves. */
-	if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
-	    ABORT("Malloc fails for fsendx_plist[].");
-	len = k * grid->nprow;
-	if ( !(index = intMalloc_dist(len)) )
-	    ABORT("Malloc fails for fsendx_plist[0]");
-	for (i = 0; i < len; ++i) index[i] = EMPTY;
-	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
-	    fsendx_plist[i] = &index[j];
-	if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
-	    ABORT("Malloc fails for bsendx_plist[].");
-	if ( !(index = intMalloc_dist(len)) )
-	    ABORT("Malloc fails for bsendx_plist[0]");
-	for (i = 0; i < len; ++i) index[i] = EMPTY;
-	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
-	    bsendx_plist[i] = &index[j];
-#if ( PRNTlevel>=1 )
-	mem_use += 4*k*sizeof(int_t*) + 2*len*iword;
-#endif
-	/*------------------------------------------------------------
-	  PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
-	  THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
-	  ------------------------------------------------------------*/
-
-	for (jb = 0; jb < nsupers; ++jb) {
-	    pc = PCOL( jb, grid );
-	    if ( mycol == pc ) { /* Block column jb in my process column */
-		fsupc = FstBlockC( jb );
-		nsupc = SuperSize( jb );
-		ljb = LBj( jb, grid ); /* Local block number */
-		
-		/* Scatter A into SPA. */
-		for (j = fsupc, dense_col = dense; j < FstBlockC( jb+1 ); ++j){
-		    for (i = xa_begin[j]; i < xa_end[j]; ++i) {
-			irow = asub[i];
-			gb = BlockNum( irow );
-			if ( myrow == PROW( gb, grid ) ) {
-			    lb = LBi( gb, grid );
-			    irow = ilsum[lb] + irow - FstBlockC( gb );
-			    dense_col[irow] = a[i];
-			}
-		    }
-		    dense_col += ldaspa;
-		}
-
-		jbrow = PROW( jb, grid );
-
-#if ( PROFlevel>=1 )
-		t = SuperLU_timer_();
-#endif
-		/*------------------------------------------------
-		 * SET UP U BLOCKS.
-		 *------------------------------------------------*/
-		kseen = 0;
-		dense_col = dense;
-		/* Loop through each column in the block column. */
-		for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
-		    istart = xusub[j];
-		    /* NOTE: Only the first nonzero index of the segment
-		       is stored in usub[]. */
-		    for (i = istart; i < xusub[j+1]; ++i) {
-			irow = usub[i]; /* First nonzero in the segment. */
-			gb = BlockNum( irow );
-			pr = PROW( gb, grid );
-			if ( pr != jbrow &&
-			     myrow == jbrow &&  /* diag. proc. owning jb */
-			     bsendx_plist[ljb][pr] == EMPTY ) {
-			    bsendx_plist[ljb][pr] = YES;
-			    ++nbsendx;
-                        }
-			if ( myrow == pr ) {
-			    lb = LBi( gb, grid ); /* Local block number */
-			    index = Ufstnz_br_ptr[lb];
-			    uval = Unzval_br_ptr[lb];
-			    fsupc1 = FstBlockC( gb+1 );
-			    if (rb_marker[lb] <= jb) { /* First time see 
-							  the block       */
-				rb_marker[lb] = jb + 1;
-				Urb_indptr[lb] = Urb_fstnz[lb];;
-				index[Urb_indptr[lb]] = jb; /* Descriptor */
-				Urb_indptr[lb] += UB_DESCRIPTOR;
-				/* Record the first location in index[] of the
-				   next block */
-				Urb_fstnz[lb] = Urb_indptr[lb] + nsupc;
-				len = Urb_indptr[lb];/* Start fstnz in index */
-				index[len-1] = 0;
-				for (k = 0; k < nsupc; ++k)
-				    index[len+k] = fsupc1;
-				if ( gb != jb )/* Exclude diagonal block. */
-				    ++bmod[lb];/* Mod. count for back solve */
-				if ( kseen == 0 && myrow != jbrow ) {
-				    ++nbrecvx;
-				    kseen = 1;
-				}
-			    } else { /* Already saw the block */
-				len = Urb_indptr[lb];/* Start fstnz in index */
-			    }
-			    jj = j - fsupc;
-			    index[len+jj] = irow;
-			    /* Load the numerical values */
-			    k = fsupc1 - irow; /* No. of nonzeros in segment */
-			    index[len-1] += k; /* Increment block length in
-						  Descriptor */
-			    irow = ilsum[lb] + irow - FstBlockC( gb );
-			    for (ii = 0; ii < k; ++ii) {
-				uval[Urb_length[lb]++] = dense_col[irow + ii];
-				dense_col[irow + ii] = zero;
-			    }
-			} /* if myrow == pr ... */
-		    } /* for i ... */
-                    dense_col += ldaspa;
-		} /* for j ... */
-
-#if ( PROFlevel>=1 )
-		t_u += SuperLU_timer_() - t;
-		t = SuperLU_timer_();
-#endif
-
-		/*------------------------------------------------
-		 * SET UP L BLOCKS.
-		 *------------------------------------------------*/
-
-		/* Count number of blocks and length of each block. */
-		nrbl = 0;
-		len = 0; /* Number of row subscripts I own. */
-		kseen = 0;
-		istart = xlsub[fsupc];
-		for (i = istart; i < xlsub[fsupc+1]; ++i) {
-		    irow = lsub[i];
-		    gb = BlockNum( irow ); /* Global block number */
-		    pr = PROW( gb, grid ); /* Process row owning this block */
-		    if ( pr != jbrow &&
-			 myrow == jbrow &&  /* diag. proc. owning jb */
-			 fsendx_plist[ljb][pr] == EMPTY /* first time */ ) {
-			fsendx_plist[ljb][pr] = YES;
-			++nfsendx;
-                    }
-		    if ( myrow == pr ) {
-			lb = LBi( gb, grid );  /* Local block number */
-			if (rb_marker[lb] <= jb) { /* First see this block */
-			    rb_marker[lb] = jb + 1;
-			    Lrb_length[lb] = 1;
-			    Lrb_number[nrbl++] = gb;
-			    if ( gb != jb ) /* Exclude diagonal block. */
-				++fmod[lb]; /* Mod. count for forward solve */
-			    if ( kseen == 0 && myrow != jbrow ) {
-				++nfrecvx;
-				kseen = 1;
-			    }
-#if ( PRNTlevel>=1 )
-			    ++nLblocks;
-#endif
-			} else {
-			    ++Lrb_length[lb];
-			}
-			++len;
-		    }
-		} /* for i ... */
-
-		if ( nrbl ) { /* Do not ensure the blocks are sorted! */
-		    /* Set up the initial pointers for each block in 
-		       index[] and nzval[]. */
-		    /* Add room for descriptors */
-		    len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
-		    if ( !(index = intMalloc_dist(len1)) ) 
-			ABORT("Malloc fails for index[]");
-		    Lrowind_bc_ptr[ljb] = index;
-		    if (!(Lnzval_bc_ptr[ljb] = doubleMalloc_dist(((size_t)len)*nsupc))) {
-			fprintf(stderr, "col block %d ", jb);
-			ABORT("Malloc fails for Lnzval_bc_ptr[*][]");
-		    }
-		    mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
-		    mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
-		    mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
-		    index[0] = nrbl;  /* Number of row blocks */
-		    index[1] = len;   /* LDA of the nzval[] */
-		    next_lind = BC_HEADER;
-		    next_lval = 0;
-		    for (k = 0; k < nrbl; ++k) {
-			gb = Lrb_number[k];
-			lb = LBi( gb, grid );
-			len = Lrb_length[lb];
-			Lrb_length[lb] = 0;  /* Reset vector of block length */
-			index[next_lind++] = gb; /* Descriptor */
-			index[next_lind++] = len; 
-			Lrb_indptr[lb] = next_lind;
-			Lrb_valptr[lb] = next_lval;
-			next_lind += len;
-			next_lval += len;
-		    }
-		    /* Propagate the compressed row subscripts to Lindex[], and
-		       the initial values of A from SPA into Lnzval[]. */
-		    lusup = Lnzval_bc_ptr[ljb];
-		    len = index[1];  /* LDA of lusup[] */
-		    for (i = istart; i < xlsub[fsupc+1]; ++i) {
-			irow = lsub[i];
-			gb = BlockNum( irow );
-			if ( myrow == PROW( gb, grid ) ) {
-			    lb = LBi( gb, grid );
-			    k = Lrb_indptr[lb]++; /* Random access a block */
-			    index[k] = irow;
-			    k = Lrb_valptr[lb]++;
-			    irow = ilsum[lb] + irow - FstBlockC( gb );
-			    for (j = 0, dense_col = dense; j < nsupc; ++j) {
-				lusup[k] = dense_col[irow];
-				dense_col[irow] = 0.0;
-				k += len;
-				dense_col += ldaspa;
-			    }
-			}
-		    } /* for i ... */
-		} else {
-		    Lrowind_bc_ptr[ljb] = NULL;
-		    Lnzval_bc_ptr[ljb] = NULL;
-		} /* if nrbl ... */
-#if ( PROFlevel>=1 )
-		t_l += SuperLU_timer_() - t;
-#endif
-	    } /* if mycol == pc */
-
-	} /* for jb ... */
-
-	Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
-	Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
-	Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
-	Llu->Unzval_br_ptr = Unzval_br_ptr;
-	Llu->ToRecv = ToRecv;
-	Llu->ToSendD = ToSendD;
-	Llu->ToSendR = ToSendR;
-	Llu->fmod = fmod;
-	Llu->fsendx_plist = fsendx_plist;
-	Llu->nfrecvx = nfrecvx;
-	Llu->nfsendx = nfsendx;
-	Llu->bmod = bmod;
-	Llu->bsendx_plist = bsendx_plist;
-	Llu->nbrecvx = nbrecvx;
-	Llu->nbsendx = nbsendx;
-	Llu->ilsum = ilsum;
-	Llu->ldalsum = ldaspa;
-	
-#if ( PRNTlevel>=1 )
-	if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n",
-			   nLblocks, nUblocks);
-#endif
-
-	SUPERLU_FREE(rb_marker);
-	SUPERLU_FREE(Urb_fstnz);
-	SUPERLU_FREE(Urb_length);
-	SUPERLU_FREE(Urb_indptr);
-	SUPERLU_FREE(Lrb_length);
-	SUPERLU_FREE(Lrb_number);
-	SUPERLU_FREE(Lrb_indptr);
-	SUPERLU_FREE(Lrb_valptr);
-	SUPERLU_FREE(dense);
-
-	/* Find the maximum buffer size. */
-	MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, 
-		      MPI_MAX, grid->comm);
-
-#if ( PROFlevel>=1 )
-	if ( !iam ) printf(".. 1st distribute time:\n "
-			   "\tL\t%.2f\n\tU\t%.2f\n"
-			   "\tu_blks %d\tnrbu %d\n--------\n",
-  			   t_l, t_u, u_blks, nrbu);
-#endif
-
-    } /* else fact != SamePattern_SameRowPerm */
-
-#if ( DEBUGlevel>=1 )
-    /* Memory allocated but not freed:
-       ilsum, fmod, fsendx_plist, bmod, bsendx_plist  */
-    CHECK_MALLOC(iam, "Exit ddistribute()");
-#endif
-
-    return (mem_use);
-} /* DDISTRIBUTE */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dgsequ.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dgsequ.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dgsequ.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dgsequ.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,178 +0,0 @@
-
-
-/*
- * File name:	dgsequ.c
- * History:     Modified from LAPACK routine DGEEQU
- */
-#include <math.h>
-#include "superlu_ddefs.h"
-
-void
-dgsequ_dist(SuperMatrix *A, double *r, double *c, double *rowcnd,
-	    double *colcnd, double *amax, int_t *info)
-{
-/*    
-    Purpose   
-    =======   
-
-    DGSEQU_dist computes row and column scalings intended to equilibrate an   
-    M-by-N sparse matrix A and reduce its condition number. R returns the row
-    scale factors and C the column scale factors, chosen to try to make   
-    the largest element in each row and column of the matrix B with   
-    elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.   
-
-    R(i) and C(j) are restricted to be between SMLNUM = smallest safe   
-    number and BIGNUM = largest safe number.  Use of these scaling   
-    factors is not guaranteed to reduce the condition number of A but   
-    works well in practice.   
-
-    See supermatrix.h for the definition of 'SuperMatrix' structure.
- 
-    Arguments   
-    =========   
-
-    A       (input) SuperMatrix*
-            The matrix of dimension (A->nrow, A->ncol) whose equilibration
-            factors are to be computed. The type of A can be:
-            Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE.
-	    
-    R       (output) double*, size A->nrow
-            If INFO = 0 or INFO > M, R contains the row scale factors   
-            for A.
-	    
-    C       (output) double*, size A->ncol
-            If INFO = 0,  C contains the column scale factors for A.
-	    
-    ROWCND  (output) double*
-            If INFO = 0 or INFO > M, ROWCND contains the ratio of the   
-            smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and   
-            AMAX is neither too large nor too small, it is not worth   
-            scaling by R.
-	    
-    COLCND  (output) double*
-            If INFO = 0, COLCND contains the ratio of the smallest   
-            C(i) to the largest C(i).  If COLCND >= 0.1, it is not   
-            worth scaling by C.
-	    
-    AMAX    (output) double*
-            Absolute value of largest matrix element.  If AMAX is very   
-            close to overflow or very close to underflow, the matrix   
-            should be scaled.
-	    
-    INFO    (output) int*
-            = 0:  successful exit   
-            < 0:  if INFO = -i, the i-th argument had an illegal value   
-            > 0:  if INFO = i,  and i is   
-                  <= M:  the i-th row of A is exactly zero   
-                  >  M:  the (i-M)-th column of A is exactly zero   
-
-    ===================================================================== 
-*/
-
-    /* Local variables */
-    NCformat *Astore;
-    double   *Aval;
-    int i, j, irow;
-    double rcmin, rcmax;
-    double bignum, smlnum;
-    extern double dlamch_(char *);
-    
-    /* Test the input parameters. */
-    *info = 0;
-    if ( A->nrow < 0 || A->ncol < 0 ||
-	 A->Stype != SLU_NC || A->Dtype != SLU_D || A->Mtype != SLU_GE )
-	*info = -1;
-    if (*info != 0) {
-	i = -(*info);
-	xerbla_("dgsequ_dist", &i);
-	return;
-    }
-
-    /* Quick return if possible */
-    if ( A->nrow == 0 || A->ncol == 0 ) {
-	*rowcnd = 1.;
-	*colcnd = 1.;
-	*amax = 0.;
-	return;
-    }
-
-    Astore = (NCformat *) A->Store;
-    Aval = (double *) Astore->nzval;
-    
-    /* Get machine constants. */
-    smlnum = dlamch_("S");
-    bignum = 1. / smlnum;
-
-    /* Compute row scale factors. */
-    for (i = 0; i < A->nrow; ++i) r[i] = 0.;
-
-    /* Find the maximum element in each row. */
-    for (j = 0; j < A->ncol; ++j)
-	for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-	    irow = Astore->rowind[i];
-	    r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[i]) );
-	}
-
-    /* Find the maximum and minimum scale factors. */
-    rcmin = bignum;
-    rcmax = 0.;
-    for (i = 0; i < A->nrow; ++i) {
-	rcmax = SUPERLU_MAX(rcmax, r[i]);
-	rcmin = SUPERLU_MIN(rcmin, r[i]);
-    }
-    *amax = rcmax;
-
-    if (rcmin == 0.) {
-	/* Find the first zero scale factor and return an error code. */
-	for (i = 0; i < A->nrow; ++i)
-	    if (r[i] == 0.) {
-		*info = i + 1;
-		return;
-	    }
-    } else {
-	/* Invert the scale factors. */
-	for (i = 0; i < A->nrow; ++i)
-	    r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum );
-	/* Compute ROWCND = min(R(I)) / max(R(I)) */
-	*rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
-    }
-
-    /* Compute column scale factors */
-    for (j = 0; j < A->ncol; ++j) c[j] = 0.;
-
-    /* Find the maximum element in each column, assuming the row
-       scalings computed above. */
-    for (j = 0; j < A->ncol; ++j)
-	for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-	    irow = Astore->rowind[i];
-	    c[j] = SUPERLU_MAX( c[j], fabs(Aval[i]) * r[irow] );
-	}
-
-    /* Find the maximum and minimum scale factors. */
-    rcmin = bignum;
-    rcmax = 0.;
-    for (j = 0; j < A->ncol; ++j) {
-	rcmax = SUPERLU_MAX(rcmax, c[j]);
-	rcmin = SUPERLU_MIN(rcmin, c[j]);
-    }
-
-    if (rcmin == 0.) {
-	/* Find the first zero scale factor and return an error code. */
-	for (j = 0; j < A->ncol; ++j)
-	    if ( c[j] == 0. ) {
-		*info = A->nrow + j + 1;
-		return;
-	    }
-    } else {
-	/* Invert the scale factors. */
-	for (j = 0; j < A->ncol; ++j)
-	    c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum);
-	/* Compute COLCND = min(C(J)) / max(C(J)) */
-	*colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
-    }
-
-    return;
-
-} /* dgsequ_dist */
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dlamch.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dlamch.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dlamch.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dlamch.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,964 +0,0 @@
-#include <stdio.h>
-#include "Cnames.h"
-#define TRUE_ (1)
-#define FALSE_ (0)
-#define abs(x) ((x) >= 0 ? (x) : -(x))
-#define min(a,b) ((a) <= (b) ? (a) : (b))
-#define max(a,b) ((a) >= (b) ? (a) : (b))
-
-double dlamch_(char *cmach)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-    Purpose   
-    =======   
-
-    DLAMCH determines double precision machine parameters.   
-
-    Arguments   
-    =========   
-
-    CMACH   (input) CHARACTER*1   
-            Specifies the value to be returned by DLAMCH:   
-            = 'E' or 'e',   DLAMCH := eps   
-            = 'S' or 's ,   DLAMCH := sfmin   
-            = 'B' or 'b',   DLAMCH := base   
-            = 'P' or 'p',   DLAMCH := eps*base   
-            = 'N' or 'n',   DLAMCH := t   
-            = 'R' or 'r',   DLAMCH := rnd   
-            = 'M' or 'm',   DLAMCH := emin   
-            = 'U' or 'u',   DLAMCH := rmin   
-            = 'L' or 'l',   DLAMCH := emax   
-            = 'O' or 'o',   DLAMCH := rmax   
-
-            where   
-
-            eps   = relative machine precision   
-            sfmin = safe minimum, such that 1/sfmin does not overflow   
-            base  = base of the machine   
-            prec  = eps*base   
-            t     = number of (base) digits in the mantissa   
-            rnd   = 1.0 when rounding occurs in addition, 0.0 otherwise   
-            emin  = minimum exponent before (gradual) underflow   
-            rmin  = underflow threshold - base**(emin-1)   
-            emax  = largest exponent before overflow   
-            rmax  = overflow threshold  - (base**emax)*(1-eps)   
-
-   ===================================================================== 
-*/
-
-    static int first = TRUE_;
-
-    /* System generated locals */
-    int i__1;
-    double ret_val;
-    /* Builtin functions */
-    double pow_di(double *, int *);
-    /* Local variables */
-    static double base;
-    static int beta;
-    static double emin, prec, emax;
-    static int imin, imax;
-    static int lrnd;
-    static double rmin, rmax, t, rmach;
-    extern int lsame_(char *, char *);
-    static double small, sfmin;
-    extern /* Subroutine */ int dlamc2_(int *, int *, int *, 
-	    double *, int *, double *, int *, double *);
-    static int it;
-    static double rnd, eps;
-
-    if (first) {
-	first = FALSE_;
-	dlamc2_(&beta, &it, &lrnd, &eps, &imin, &rmin, &imax, &rmax);
-	base = (double) beta;
-	t = (double) it;
-	if (lrnd) {
-	    rnd = 1.;
-	    i__1 = 1 - it;
-	    eps = pow_di(&base, &i__1) / 2;
-	} else {
-	    rnd = 0.;
-	    i__1 = 1 - it;
-	    eps = pow_di(&base, &i__1);
-	}
-	prec = eps * base;
-	emin = (double) imin;
-	emax = (double) imax;
-	sfmin = rmin;
-	small = 1. / rmax;
-	if (small >= sfmin) {
-
-	/* Use SMALL plus a bit, to avoid the possibility of rounding   
-             causing overflow when computing  1/sfmin. */
-	    sfmin = small * (eps + 1.);
-	}
-    }
-
-    if (lsame_(cmach, "E")) {
-	rmach = eps;
-    } else if (lsame_(cmach, "S")) {
-	rmach = sfmin;
-    } else if (lsame_(cmach, "B")) {
-	rmach = base;
-    } else if (lsame_(cmach, "P")) {
-	rmach = prec;
-    } else if (lsame_(cmach, "N")) {
-	rmach = t;
-    } else if (lsame_(cmach, "R")) {
-	rmach = rnd;
-    } else if (lsame_(cmach, "M")) {
-	rmach = emin;
-    } else if (lsame_(cmach, "U")) {
-	rmach = rmin;
-    } else if (lsame_(cmach, "L")) {
-	rmach = emax;
-    } else if (lsame_(cmach, "O")) {
-	rmach = rmax;
-    }
-
-    ret_val = rmach;
-    return ret_val;
-
-/*     End of DLAMCH */
-
-} /* dlamch_ */
-
-
-/* Subroutine */ int dlamc1_(int *beta, int *t, int *rnd, int 
-	*ieee1)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    DLAMC1 determines the machine parameters given by BETA, T, RND, and   
-    IEEE1.   
-
-    Arguments   
-    =========   
-
-    BETA    (output) INT   
-            The base of the machine.   
-
-    T       (output) INT   
-            The number of ( BETA ) digits in the mantissa.   
-
-    RND     (output) INT   
-            Specifies whether proper rounding  ( RND = .TRUE. )  or   
-            chopping  ( RND = .FALSE. )  occurs in addition. This may not 
-  
-            be a reliable guide to the way in which the machine performs 
-  
-            its arithmetic.   
-
-    IEEE1   (output) INT   
-            Specifies whether rounding appears to be done in the IEEE   
-            'round to nearest' style.   
-
-    Further Details   
-    ===============   
-
-    The routine is based on the routine  ENVRON  by Malcolm and   
-    incorporates suggestions by Gentleman and Marovich. See   
-
-       Malcolm M. A. (1972) Algorithms to reveal properties of   
-          floating-point arithmetic. Comms. of the ACM, 15, 949-951.   
-
-       Gentleman W. M. and Marovich S. B. (1974) More on algorithms   
-          that reveal properties of floating point arithmetic units.   
-          Comms. of the ACM, 17, 276-277.   
-
-   ===================================================================== 
-*/
-    /* Initialized data */
-    static int first = TRUE_;
-    /* System generated locals */
-    double d__1, d__2;
-    /* Local variables */
-    static int lrnd;
-    static double a, b, c, f;
-    static int lbeta;
-    static double savec;
-    extern double dlamc3_(double *, double *);
-    static int lieee1;
-    static double t1, t2;
-    static int lt;
-    static double one, qtr;
-
-    if (first) {
-	first = FALSE_;
-	one = 1.;
-
-/*        LBETA,  LIEEE1,  LT and  LRND  are the  local values  of  BE
-TA,   
-          IEEE1, T and RND.   
-
-          Throughout this routine  we use the function  DLAMC3  to ens
-ure   
-          that relevant values are  stored and not held in registers, 
- or   
-          are not affected by optimizers.   
-
-          Compute  a = 2.0**m  with the  smallest positive integer m s
-uch   
-          that   
-
-             fl( a + 1.0 ) = a. */
-
-	a = 1.;
-	c = 1.;
-
-/* +       WHILE( C.EQ.ONE )LOOP */
-L10:
-	if (c == one) {
-	    a *= 2;
-	    c = dlamc3_(&a, &one);
-	    d__1 = -a;
-	    c = dlamc3_(&c, &d__1);
-	    goto L10;
-	}
-/* +       END WHILE   
-
-          Now compute  b = 2.0**m  with the smallest positive integer 
-m   
-          such that   
-
-             fl( a + b ) .gt. a. */
-
-	b = 1.;
-	c = dlamc3_(&a, &b);
-
-/* +       WHILE( C.EQ.A )LOOP */
-L20:
-	if (c == a) {
-	    b *= 2;
-	    c = dlamc3_(&a, &b);
-	    goto L20;
-	}
-/* +       END WHILE   
-
-          Now compute the base.  a and c  are neighbouring floating po
-int   
-          numbers  in the  interval  ( beta**t, beta**( t + 1 ) )  and
- so   
-          their difference is beta. Adding 0.25 to c is to ensure that
- it   
-          is truncated to beta and not ( beta - 1 ). */
-
-	qtr = one / 4;
-	savec = c;
-	d__1 = -a;
-	c = dlamc3_(&c, &d__1);
-	lbeta = (int) (c + qtr);
-
-/*        Now determine whether rounding or chopping occurs,  by addin
-g a   
-          bit  less  than  beta/2  and a  bit  more  than  beta/2  to 
- a. */
-
-	b = (double) lbeta;
-	d__1 = b / 2;
-	d__2 = -b / 100;
-	f = dlamc3_(&d__1, &d__2);
-	c = dlamc3_(&f, &a);
-	if (c == a) {
-	    lrnd = TRUE_;
-	} else {
-	    lrnd = FALSE_;
-	}
-	d__1 = b / 2;
-	d__2 = b / 100;
-	f = dlamc3_(&d__1, &d__2);
-	c = dlamc3_(&f, &a);
-	if (lrnd && c == a) {
-	    lrnd = FALSE_;
-	}
-
-/*        Try and decide whether rounding is done in the  IEEE  'round
- to   
-          nearest' style. B/2 is half a unit in the last place of the 
-two   
-          numbers A and SAVEC. Furthermore, A is even, i.e. has last  
-bit   
-          zero, and SAVEC is odd. Thus adding B/2 to A should not  cha
-nge   
-          A, but adding B/2 to SAVEC should change SAVEC. */
-
-	d__1 = b / 2;
-	t1 = dlamc3_(&d__1, &a);
-	d__1 = b / 2;
-	t2 = dlamc3_(&d__1, &savec);
-	lieee1 = t1 == a && t2 > savec && lrnd;
-
-/*        Now find  the  mantissa, t.  It should  be the  integer part
- of   
-          log to the base beta of a,  however it is safer to determine
-  t   
-          by powering.  So we find t as the smallest positive integer 
-for   
-          which   
-
-             fl( beta**t + 1.0 ) = 1.0. */
-
-	lt = 0;
-	a = 1.;
-	c = 1.;
-
-/* +       WHILE( C.EQ.ONE )LOOP */
-L30:
-	if (c == one) {
-	    ++lt;
-	    a *= lbeta;
-	    c = dlamc3_(&a, &one);
-	    d__1 = -a;
-	    c = dlamc3_(&c, &d__1);
-	    goto L30;
-	}
-/* +       END WHILE */
-
-    }
-
-    *beta = lbeta;
-    *t = lt;
-    *rnd = lrnd;
-    *ieee1 = lieee1;
-    return 0;
-
-/*     End of DLAMC1 */
-
-} /* dlamc1_ */
-
-
-/* Subroutine */ int dlamc2_(int *beta, int *t, int *rnd, 
-	double *eps, int *emin, double *rmin, int *emax, 
-	double *rmax)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    DLAMC2 determines the machine parameters specified in its argument   
-    list.   
-
-    Arguments   
-    =========   
-
-    BETA    (output) INT   
-            The base of the machine.   
-
-    T       (output) INT   
-            The number of ( BETA ) digits in the mantissa.   
-
-    RND     (output) INT   
-            Specifies whether proper rounding  ( RND = .TRUE. )  or   
-            chopping  ( RND = .FALSE. )  occurs in addition. This may not 
-  
-            be a reliable guide to the way in which the machine performs 
-  
-            its arithmetic.   
-
-    EPS     (output) DOUBLE PRECISION   
-            The smallest positive number such that   
-
-               fl( 1.0 - EPS ) .LT. 1.0,   
-
-            where fl denotes the computed value.   
-
-    EMIN    (output) INT   
-            The minimum exponent before (gradual) underflow occurs.   
-
-    RMIN    (output) DOUBLE PRECISION   
-            The smallest normalized number for the machine, given by   
-            BASE**( EMIN - 1 ), where  BASE  is the floating point value 
-  
-            of BETA.   
-
-    EMAX    (output) INT   
-            The maximum exponent before overflow occurs.   
-
-    RMAX    (output) DOUBLE PRECISION   
-            The largest positive number for the machine, given by   
-            BASE**EMAX * ( 1 - EPS ), where  BASE  is the floating point 
-  
-            value of BETA.   
-
-    Further Details   
-    ===============   
-
-    The computation of  EPS  is based on a routine PARANOIA by   
-    W. Kahan of the University of California at Berkeley.   
-
-   ===================================================================== 
-*/
-    /* Table of constant values */
-    static int c__1 = 1;
-    
-    /* Initialized data */
-    static int first = TRUE_;
-    static int iwarn = FALSE_;
-    /* System generated locals */
-    int i__1;
-    double d__1, d__2, d__3, d__4, d__5;
-    /* Builtin functions */
-    double pow_di(double *, int *);
-    /* Local variables */
-    static int ieee;
-    static double half;
-    static int lrnd;
-    static double leps, zero, a, b, c;
-    static int i, lbeta;
-    static double rbase;
-    static int lemin, lemax, gnmin;
-    static double small;
-    static int gpmin;
-    static double third, lrmin, lrmax, sixth;
-    extern /* Subroutine */ int dlamc1_(int *, int *, int *, 
-	    int *);
-    extern double dlamc3_(double *, double *);
-    static int lieee1;
-    extern /* Subroutine */ int dlamc4_(int *, double *, int *), 
-	    dlamc5_(int *, int *, int *, int *, int *, 
-	    double *);
-    static int lt, ngnmin, ngpmin;
-    static double one, two;
-
-    if (first) {
-	first = FALSE_;
-	zero = 0.;
-	one = 1.;
-	two = 2.;
-
-/*        LBETA, LT, LRND, LEPS, LEMIN and LRMIN  are the local values
- of   
-          BETA, T, RND, EPS, EMIN and RMIN.   
-
-          Throughout this routine  we use the function  DLAMC3  to ens
-ure   
-          that relevant values are stored  and not held in registers, 
- or   
-          are not affected by optimizers.   
-
-          DLAMC1 returns the parameters  LBETA, LT, LRND and LIEEE1. 
-*/
-
-	dlamc1_(&lbeta, &lt, &lrnd, &lieee1);
-
-/*        Start to find EPS. */
-
-	b = (double) lbeta;
-	i__1 = -lt;
-	a = pow_di(&b, &i__1);
-	leps = a;
-
-/*        Try some tricks to see whether or not this is the correct  E
-PS. */
-
-	b = two / 3;
-	half = one / 2;
-	d__1 = -half;
-	sixth = dlamc3_(&b, &d__1);
-	third = dlamc3_(&sixth, &sixth);
-	d__1 = -half;
-	b = dlamc3_(&third, &d__1);
-	b = dlamc3_(&b, &sixth);
-	b = abs(b);
-	if (b < leps) {
-	    b = leps;
-	}
-
-	leps = 1.;
-
-/* +       WHILE( ( LEPS.GT.B ).AND.( B.GT.ZERO ) )LOOP */
-L10:
-	if (leps > b && b > zero) {
-	    leps = b;
-	    d__1 = half * leps;
-/* Computing 5th power */
-	    d__3 = two, d__4 = d__3, d__3 *= d__3;
-/* Computing 2nd power */
-	    d__5 = leps;
-	    d__2 = d__4 * (d__3 * d__3) * (d__5 * d__5);
-	    c = dlamc3_(&d__1, &d__2);
-	    d__1 = -c;
-	    c = dlamc3_(&half, &d__1);
-	    b = dlamc3_(&half, &c);
-	    d__1 = -b;
-	    c = dlamc3_(&half, &d__1);
-	    b = dlamc3_(&half, &c);
-	    goto L10;
-	}
-/* +       END WHILE */
-
-	if (a < leps) {
-	    leps = a;
-	}
-
-/*        Computation of EPS complete.   
-
-          Now find  EMIN.  Let A = + or - 1, and + or - (1 + BASE**(-3
-)).   
-          Keep dividing  A by BETA until (gradual) underflow occurs. T
-his   
-          is detected when we cannot recover the previous A. */
-
-	rbase = one / lbeta;
-	small = one;
-	for (i = 1; i <= 3; ++i) {
-	    d__1 = small * rbase;
-	    small = dlamc3_(&d__1, &zero);
-/* L20: */
-	}
-	a = dlamc3_(&one, &small);
-	dlamc4_(&ngpmin, &one, &lbeta);
-	d__1 = -one;
-	dlamc4_(&ngnmin, &d__1, &lbeta);
-	dlamc4_(&gpmin, &a, &lbeta);
-	d__1 = -a;
-	dlamc4_(&gnmin, &d__1, &lbeta);
-	ieee = FALSE_;
-
-	if (ngpmin == ngnmin && gpmin == gnmin) {
-	    if (ngpmin == gpmin) {
-		lemin = ngpmin;
-/*            ( Non twos-complement machines, no gradual under
-flow;   
-                e.g.,  VAX ) */
-	    } else if (gpmin - ngpmin == 3) {
-		lemin = ngpmin - 1 + lt;
-		ieee = TRUE_;
-/*            ( Non twos-complement machines, with gradual und
-erflow;   
-                e.g., IEEE standard followers ) */
-	    } else {
-		lemin = min(ngpmin,gpmin);
-/*            ( A guess; no known machine ) */
-		iwarn = TRUE_;
-	    }
-
-	} else if (ngpmin == gpmin && ngnmin == gnmin) {
-	    if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1) {
-		lemin = max(ngpmin,ngnmin);
-/*            ( Twos-complement machines, no gradual underflow
-;   
-                e.g., CYBER 205 ) */
-	    } else {
-		lemin = min(ngpmin,ngnmin);
-/*            ( A guess; no known machine ) */
-		iwarn = TRUE_;
-	    }
-
-	} else if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1 && gpmin == gnmin)
-		 {
-	    if (gpmin - min(ngpmin,ngnmin) == 3) {
-		lemin = max(ngpmin,ngnmin) - 1 + lt;
-/*            ( Twos-complement machines with gradual underflo
-w;   
-                no known machine ) */
-	    } else {
-		lemin = min(ngpmin,ngnmin);
-/*            ( A guess; no known machine ) */
-		iwarn = TRUE_;
-	    }
-
-	} else {
-/* Computing MIN */
-	    i__1 = min(ngpmin,ngnmin), i__1 = min(i__1,gpmin);
-	    lemin = min(i__1,gnmin);
-/*         ( A guess; no known machine ) */
-	    iwarn = TRUE_;
-	}
-/* **   
-   Comment out this if block if EMIN is ok */
-	if (iwarn) {
-	    first = TRUE_;
-	    printf("\n\n WARNING. The value EMIN may be incorrect:- ");
-	    printf("EMIN = %8i\n",lemin);
-	    printf("If, after inspection, the value EMIN looks acceptable");
-            printf("please comment out \n the IF block as marked within the"); 
-            printf("code of routine DLAMC2, \n otherwise supply EMIN"); 
-            printf("explicitly.\n");
-	}
-/* **   
-
-          Assume IEEE arithmetic if we found denormalised  numbers abo
-ve,   
-          or if arithmetic seems to round in the  IEEE style,  determi
-ned   
-          in routine DLAMC1. A true IEEE machine should have both  thi
-ngs   
-          true; however, faulty machines may have one or the other. */
-
-	ieee = ieee || lieee1;
-
-/*        Compute  RMIN by successive division by  BETA. We could comp
-ute   
-          RMIN as BASE**( EMIN - 1 ),  but some machines underflow dur
-ing   
-          this computation. */
-
-	lrmin = 1.;
-	i__1 = 1 - lemin;
-	for (i = 1; i <= 1-lemin; ++i) {
-	    d__1 = lrmin * rbase;
-	    lrmin = dlamc3_(&d__1, &zero);
-/* L30: */
-	}
-
-/*        Finally, call DLAMC5 to compute EMAX and RMAX. */
-
-	dlamc5_(&lbeta, &lt, &lemin, &ieee, &lemax, &lrmax);
-    }
-
-    *beta = lbeta;
-    *t = lt;
-    *rnd = lrnd;
-    *eps = leps;
-    *emin = lemin;
-    *rmin = lrmin;
-    *emax = lemax;
-    *rmax = lrmax;
-
-    return 0;
-
-
-/*     End of DLAMC2 */
-
-} /* dlamc2_ */
-
-
-double dlamc3_(double *a, double *b)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    DLAMC3  is intended to force  A  and  B  to be stored prior to doing 
-  
-    the addition of  A  and  B ,  for use in situations where optimizers 
-  
-    might hold one of these in a register.   
-
-    Arguments   
-    =========   
-
-    A, B    (input) DOUBLE PRECISION   
-            The values A and B.   
-
-   ===================================================================== 
-*/
-/* >>Start of File<<   
-       System generated locals */
-    double ret_val;
-
-    ret_val = *a + *b;
-
-    return ret_val;
-
-/*     End of DLAMC3 */
-
-} /* dlamc3_ */
-
-
-/* Subroutine */ int dlamc4_(int *emin, double *start, int *base)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    DLAMC4 is a service routine for DLAMC2.   
-
-    Arguments   
-    =========   
-
-    EMIN    (output) EMIN   
-            The minimum exponent before (gradual) underflow, computed by 
-  
-            setting A = START and dividing by BASE until the previous A   
-            can not be recovered.   
-
-    START   (input) DOUBLE PRECISION   
-            The starting point for determining EMIN.   
-
-    BASE    (input) INT   
-            The base of the machine.   
-
-   ===================================================================== 
-*/
-    /* System generated locals */
-    int i__1;
-    double d__1;
-    /* Local variables */
-    static double zero, a;
-    static int i;
-    static double rbase, b1, b2, c1, c2, d1, d2;
-    extern double dlamc3_(double *, double *);
-    static double one;
-
-    a = *start;
-    one = 1.;
-    rbase = one / *base;
-    zero = 0.;
-    *emin = 1;
-    d__1 = a * rbase;
-    b1 = dlamc3_(&d__1, &zero);
-    c1 = a;
-    c2 = a;
-    d1 = a;
-    d2 = a;
-/* +    WHILE( ( C1.EQ.A ).AND.( C2.EQ.A ).AND.   
-      $       ( D1.EQ.A ).AND.( D2.EQ.A )      )LOOP */
-L10:
-    if (c1 == a && c2 == a && d1 == a && d2 == a) {
-	--(*emin);
-	a = b1;
-	d__1 = a / *base;
-	b1 = dlamc3_(&d__1, &zero);
-	d__1 = b1 * *base;
-	c1 = dlamc3_(&d__1, &zero);
-	d1 = zero;
-	i__1 = *base;
-	for (i = 1; i <= *base; ++i) {
-	    d1 += b1;
-/* L20: */
-	}
-	d__1 = a * rbase;
-	b2 = dlamc3_(&d__1, &zero);
-	d__1 = b2 / rbase;
-	c2 = dlamc3_(&d__1, &zero);
-	d2 = zero;
-	i__1 = *base;
-	for (i = 1; i <= *base; ++i) {
-	    d2 += b2;
-/* L30: */
-	}
-	goto L10;
-    }
-/* +    END WHILE */
-
-    return 0;
-
-/*     End of DLAMC4 */
-
-} /* dlamc4_ */
-
-
-/* Subroutine */ int dlamc5_(int *beta, int *p, int *emin, 
-	int *ieee, int *emax, double *rmax)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    DLAMC5 attempts to compute RMAX, the largest machine floating-point   
-    number, without overflow.  It assumes that EMAX + abs(EMIN) sum   
-    approximately to a power of 2.  It will fail on machines where this   
-    assumption does not hold, for example, the Cyber 205 (EMIN = -28625, 
-  
-    EMAX = 28718).  It will also fail if the value supplied for EMIN is   
-    too large (i.e. too close to zero), probably with overflow.   
-
-    Arguments   
-    =========   
-
-    BETA    (input) INT   
-            The base of floating-point arithmetic.   
-
-    P       (input) INT   
-            The number of base BETA digits in the mantissa of a   
-            floating-point value.   
-
-    EMIN    (input) INT   
-            The minimum exponent before (gradual) underflow.   
-
-    IEEE    (input) INT   
-            A int flag specifying whether or not the arithmetic   
-            system is thought to comply with the IEEE standard.   
-
-    EMAX    (output) INT   
-            The largest exponent before overflow   
-
-    RMAX    (output) DOUBLE PRECISION   
-            The largest machine floating-point number.   
-
-   ===================================================================== 
-  
-
-
-       First compute LEXP and UEXP, two powers of 2 that bound   
-       abs(EMIN). We then assume that EMAX + abs(EMIN) will sum   
-       approximately to the bound that is closest to abs(EMIN).   
-       (EMAX is the exponent of the required number RMAX). */
-    /* Table of constant values */
-    static double c_b5 = 0.;
-    
-    /* System generated locals */
-    int i__1;
-    double d__1;
-    /* Local variables */
-    static int lexp;
-    static double oldy;
-    static int uexp, i;
-    static double y, z;
-    static int nbits;
-    extern double dlamc3_(double *, double *);
-    static double recbas;
-    static int exbits, expsum, try__;
-
-
-
-    lexp = 1;
-    exbits = 1;
-L10:
-    try__ = lexp << 1;
-    if (try__ <= -(*emin)) {
-	lexp = try__;
-	++exbits;
-	goto L10;
-    }
-    if (lexp == -(*emin)) {
-	uexp = lexp;
-    } else {
-	uexp = try__;
-	++exbits;
-    }
-
-/*     Now -LEXP is less than or equal to EMIN, and -UEXP is greater   
-       than or equal to EMIN. EXBITS is the number of bits needed to   
-       store the exponent. */
-
-    if (uexp + *emin > -lexp - *emin) {
-	expsum = lexp << 1;
-    } else {
-	expsum = uexp << 1;
-    }
-
-/*     EXPSUM is the exponent range, approximately equal to   
-       EMAX - EMIN + 1 . */
-
-    *emax = expsum + *emin - 1;
-    nbits = exbits + 1 + *p;
-
-/*     NBITS is the total number of bits needed to store a   
-       floating-point number. */
-
-    if (nbits % 2 == 1 && *beta == 2) {
-
-/*        Either there are an odd number of bits used to store a   
-          floating-point number, which is unlikely, or some bits are 
-  
-          not used in the representation of numbers, which is possible
-,   
-          (e.g. Cray machines) or the mantissa has an implicit bit,   
-          (e.g. IEEE machines, Dec Vax machines), which is perhaps the
-   
-          most likely. We have to assume the last alternative.   
-          If this is true, then we need to reduce EMAX by one because 
-  
-          there must be some way of representing zero in an implicit-b
-it   
-          system. On machines like Cray, we are reducing EMAX by one 
-  
-          unnecessarily. */
-
-	--(*emax);
-    }
-
-    if (*ieee) {
-
-/*        Assume we are on an IEEE machine which reserves one exponent
-   
-          for infinity and NaN. */
-
-	--(*emax);
-    }
-
-/*     Now create RMAX, the largest machine number, which should   
-       be equal to (1.0 - BETA**(-P)) * BETA**EMAX .   
-
-       First compute 1.0 - BETA**(-P), being careful that the   
-       result is less than 1.0 . */
-
-    recbas = 1. / *beta;
-    z = *beta - 1.;
-    y = 0.;
-    i__1 = *p;
-    for (i = 1; i <= *p; ++i) {
-	z *= recbas;
-	if (y < 1.) {
-	    oldy = y;
-	}
-	y = dlamc3_(&y, &z);
-/* L20: */
-    }
-    if (y >= 1.) {
-	y = oldy;
-    }
-
-/*     Now multiply by BETA**EMAX to get RMAX. */
-
-    i__1 = *emax;
-    for (i = 1; i <= *emax; ++i) {
-	d__1 = y * *beta;
-	y = dlamc3_(&d__1, &c_b5);
-/* L30: */
-    }
-
-    *rmax = y;
-    return 0;
-
-/*     End of DLAMC5 */
-
-} /* dlamc5_ */
-
-double pow_di(double *ap, int *bp)
-{
-    double pow, x;
-    int n;
-
-    pow = 1;
-    x = *ap;
-    n = *bp;
-
-    if(n != 0){
-	if(n < 0) {
-	    n = -n;
-	    x = 1/x;
-	}
-	for( ; ; ) {
-	    if(n & 01) pow *= x;
-	    if(n >>= 1)	x *= x;
-	    else break;
-	}
-    }
-    return(pow);
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dlangs.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dlangs.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dlangs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dlangs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,104 +0,0 @@
-
-
-/*
- * File name:	dlangs.c
- * History:     Modified from lapack routine DLANGE
- */
-#include <math.h>
-#include "superlu_ddefs.h"
-
-double dlangs_dist(char *norm, SuperMatrix *A)
-{
-/* 
-    Purpose   
-    =======   
-
-    DLANGS_dist returns the value of the one norm, or the Frobenius norm, or 
-    the infinity norm, or the element of largest absolute value of a 
-    real matrix A.   
-
-    Description   
-    ===========   
-
-    DLANGE returns the value   
-
-       DLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'   
-                (   
-                ( norm1(A),         NORM = '1', 'O' or 'o'   
-                (   
-                ( normI(A),         NORM = 'I' or 'i'   
-                (   
-                ( normF(A),         NORM = 'F', 'f', 'E' or 'e'   
-
-    where  norm1  denotes the  one norm of a matrix (maximum column sum), 
-    normI  denotes the  infinity norm  of a matrix  (maximum row sum) and 
-    normF  denotes the  Frobenius norm of a matrix (square root of sum of 
-    squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.   
-
-    Arguments   
-    =========   
-
-    NORM    (input) CHARACTER*1   
-            Specifies the value to be returned in DLANGE as described above.   
-    A       (input) SuperMatrix*
-            The M by N sparse matrix A. 
-
-   ===================================================================== 
-*/
-    
-    /* Local variables */
-    NCformat *Astore;
-    double   *Aval;
-    int_t    i, j, irow;
-    double   value=0., sum;
-    double   *rwork;
-
-    Astore = (NCformat *) A->Store;
-    Aval   = (double *) Astore->nzval;
-    
-    if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) {
-	value = 0.;
-	
-    } else if (lsame_(norm, "M")) {
-	/* Find max(abs(A(i,j))). */
-	value = 0.;
-	for (j = 0; j < A->ncol; ++j)
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++)
-		value = SUPERLU_MAX( value, fabs( Aval[i]) );
-	
-    } else if (lsame_(norm, "O") || *(unsigned char *)norm == '1') {
-	/* Find norm1(A). */
-	value = 0.;
-	for (j = 0; j < A->ncol; ++j) {
-	    sum = 0.;
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) 
-		sum += fabs(Aval[i]);
-	    value = SUPERLU_MAX(value, sum);
-	}
-	
-    } else if (lsame_(norm, "I")) {
-	/* Find normI(A). */
-	if ( !(rwork = (double *) SUPERLU_MALLOC(A->nrow * sizeof(double))) )
-	    ABORT("SUPERLU_MALLOC fails for rwork.");
-	for (i = 0; i < A->nrow; ++i) rwork[i] = 0.;
-	for (j = 0; j < A->ncol; ++j)
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) {
-		irow = Astore->rowind[i];
-		rwork[irow] += fabs(Aval[i]);
-	    }
-	value = 0.;
-	for (i = 0; i < A->nrow; ++i)
-	    value = SUPERLU_MAX(value, rwork[i]);
-	
-	SUPERLU_FREE (rwork);
-	
-    } else if (lsame_(norm, "F") || lsame_(norm, "E")) {
-	/* Find normF(A). */
-	ABORT("Not implemented.");
-    } else
-	ABORT("Illegal norm specified.");
-
-    return (value);
-
-} /* dlangs_dist */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dlaqgs.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dlaqgs.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dlaqgs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dlaqgs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,130 +0,0 @@
-
-
-/*
- * File name:	dlaqgs.c
- * History:     Modified from LAPACK routine DLAQGE
- */
-#include <math.h>
-#include "superlu_ddefs.h"
-
-void
-dlaqgs_dist(SuperMatrix *A, double *r, double *c, 
-	    double rowcnd, double colcnd, double amax, char *equed)
-{
-/*
-    Purpose   
-    =======   
-
-    DLAQGS_dist equilibrates a general sparse M by N matrix A using the row
-    and column scaling factors in the vectors R and C.   
-
-    See supermatrix.h for the definition of 'SuperMatrix' structure.
-
-    Arguments   
-    =========   
-
-    A       (input/output) SuperMatrix*
-            On exit, the equilibrated matrix.  See EQUED for the form of 
-            the equilibrated matrix. The type of A can be:
-	    Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE.
-	    
-    R       (input) double*, dimension (A->nrow)
-            The row scale factors for A.
-	    
-    C       (input) double*, dimension (A->ncol)
-            The column scale factors for A.
-	    
-    ROWCND  (input) double
-            Ratio of the smallest R(i) to the largest R(i).
-	    
-    COLCND  (input) double
-            Ratio of the smallest C(i) to the largest C(i).
-	    
-    AMAX    (input) double
-            Absolute value of largest matrix entry.
-	    
-    EQUED   (output) char*
-            Specifies the form of equilibration that was done.   
-            = 'N':  No equilibration   
-            = 'R':  Row equilibration, i.e., A has been premultiplied by  
-                    diag(R).   
-            = 'C':  Column equilibration, i.e., A has been postmultiplied  
-                    by diag(C).   
-            = 'B':  Both row and column equilibration, i.e., A has been
-                    replaced by diag(R) * A * diag(C).   
-
-    Internal Parameters   
-    ===================   
-
-    THRESH is a threshold value used to decide if row or column scaling   
-    should be done based on the ratio of the row or column scaling   
-    factors.  If ROWCND < THRESH, row scaling is done, and if   
-    COLCND < THRESH, column scaling is done.   
-
-    LARGE and SMALL are threshold values used to decide if row scaling   
-    should be done based on the absolute size of the largest matrix   
-    element.  If AMAX > LARGE or AMAX < SMALL, row scaling is done.   
-
-    ===================================================================== 
-*/
-
-#define THRESH    (0.1)
-    
-    /* Local variables */
-    NCformat *Astore;
-    double   *Aval;
-    int_t i, j, irow;
-    double large, small, cj;
-    extern double dlamch_(char *);
-
-
-    /* Quick return if possible */
-    if (A->nrow <= 0 || A->ncol <= 0) {
-	*(unsigned char *)equed = 'N';
-	return;
-    }
-
-    Astore = (NCformat *) A->Store;
-    Aval = (double *) Astore->nzval;
-    
-    /* Initialize LARGE and SMALL. */
-    small = dlamch_("Safe minimum") / dlamch_("Precision");
-    large = 1. / small;
-
-    if (rowcnd >= THRESH && amax >= small && amax <= large) {
-	if (colcnd >= THRESH)
-	    *(unsigned char *)equed = 'N';
-	else {
-	    /* Column scaling */
-	    for (j = 0; j < A->ncol; ++j) {
-		cj = c[j];
-		for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-		    Aval[i] *= cj;
-                }
-	    }
-	    *(unsigned char *)equed = 'C';
-	}
-    } else if (colcnd >= THRESH) {
-	/* Row scaling, no column scaling */
-	for (j = 0; j < A->ncol; ++j)
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-		irow = Astore->rowind[i];
-		Aval[i] *= r[irow];
-	    }
-	*(unsigned char *)equed = 'R';
-    } else {
-	/* Row and column scaling */
-	for (j = 0; j < A->ncol; ++j) {
-	    cj = c[j];
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-		irow = Astore->rowind[i];
-		Aval[i] *= cj * r[irow];
-	    }
-	}
-	*(unsigned char *)equed = 'B';
-    }
-
-    return;
-
-} /* dlaqgs_dist */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dldperm.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dldperm.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dldperm.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dldperm.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,157 +0,0 @@
-
-
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-
-#include "superlu_ddefs.h"
-
-extern void mc64ad_(int_t*, int_t*, int_t*, int_t [], int_t [], double [],
-		    int_t*, int_t [], int_t*, int_t[], int_t*, double [],
-		    int_t [], int_t []);
-
-void
-dldperm(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[],
-	double nzval[], int_t *perm, double u[], double v[])
-{
-/*
- * Purpose
- * =======
- *
- *   DLDPERM finds a row permutation so that the matrix has large
- *   entries on the diagonal.
- *
- * Arguments
- * =========
- *
- * job    (input) int
- *        Control the action. Possible values for JOB are:
- *        = 1 : Compute a row permutation of the matrix so that the
- *              permuted matrix has as many entries on its diagonal as
- *              possible. The values on the diagonal are of arbitrary size.
- *              HSL subroutine MC21A/AD is used for this.
- *        = 2 : Compute a row permutation of the matrix so that the smallest 
- *              value on the diagonal of the permuted matrix is maximized.
- *        = 3 : Compute a row permutation of the matrix so that the smallest
- *              value on the diagonal of the permuted matrix is maximized.
- *              The algorithm differs from the one used for JOB = 2 and may
- *              have quite a different performance.
- *        = 4 : Compute a row permutation of the matrix so that the sum
- *              of the diagonal entries of the permuted matrix is maximized.
- *        = 5 : Compute a row permutation of the matrix so that the product
- *              of the diagonal entries of the permuted matrix is maximized
- *              and vectors to scale the matrix so that the nonzero diagonal 
- *              entries of the permuted matrix are one in absolute value and 
- *              all the off-diagonal entries are less than or equal to one in 
- *              absolute value.
- *        Restriction: 1 <= JOB <= 5.
- *
- * n      (input) int
- *        The order of the matrix.
- *
- * nnz    (input) int
- *        The number of nonzeros in the matrix.
- *
- * adjncy (input) int*, of size nnz
- *        The adjacency structure of the matrix, which contains the row
- *        indices of the nonzeros.
- *
- * colptr (input) int*, of size n+1
- *        The pointers to the beginning of each column in ADJNCY.
- *
- * nzval  (input) double*, of size nnz
- *        The nonzero values of the matrix. nzval[k] is the value of
- *        the entry corresponding to adjncy[k].
- *        It is not used if job = 1.
- *
- * perm   (output) int*, of size n
- *        The permutation vector. perm[i] = j means row i in the
- *        original matrix is in row j of the permuted matrix.
- *
- * u      (output) double*, of size n
- *        If job = 5, the natural logarithms of the row scaling factors. 
- *
- * v      (output) double*, of size n
- *        If job = 5, the natural logarithms of the column scaling factors. 
- *        The scaled matrix B has entries b_ij = a_ij * exp(u_i + v_j).
- */
- 
-    int_t i, liw, ldw, num;
-    int_t *iw, icntl[10], info[10];
-    double *dw;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(0, "Enter dldperm()");
-#endif
-    liw = 5*n;
-    if ( job == 3 ) liw = 10*n + nnz;
-    if ( !(iw = intMalloc_dist(liw)) ) ABORT("Malloc fails for iw[]");
-    ldw = 3*n + nnz;
-    if ( !(dw = doubleMalloc_dist(ldw)) ) ABORT("Malloc fails for dw[]");
-	    
-    /* Increment one to get 1-based indexing. */
-    for (i = 0; i <= n; ++i) ++colptr[i];
-    for (i = 0; i < nnz; ++i) ++adjncy[i];
-#if ( DEBUGlevel>=2 )
-    printf("LDPERM(): n %d, nnz %d\n", n, nnz);
-    PrintInt10("colptr", n+1, colptr);
-    PrintInt10("adjncy", nnz, adjncy);
-#endif
-	
-    /* 
-     * NOTE:
-     * =====
-     *
-     * MC64AD assumes that column permutation vector is defined as:
-     * perm(i) = j means column i of permuted A is in column j of original A.
-     *
-     * Since a symmetric permutation preserves the diagonal entries. Then
-     * by the following relation:
-     *     P'(A*P')P = P'A
-     * we can apply inverse(perm) to rows of A to get large diagonal entries.
-     * But, since 'perm' defined in MC64AD happens to be the reverse of
-     * SuperLU's definition of permutation vector, therefore, it is already
-     * an inverse for our purpose. We will thus use it directly.
-     *
-     */
-    mc64id_(icntl);
-#if 0
-    /* Suppress error and warning messages. */
-    icntl[0] = -1;
-    icntl[1] = -1;
-#endif
-
-    mc64ad_(&job, &n, &nnz, colptr, adjncy, nzval, &num, perm,
-	    &liw, iw, &ldw, dw, icntl, info);
-
-#if ( DEBUGlevel>=2 )
-    PrintInt10("perm", n, perm);
-    printf(".. After MC64AD info %d\tsize of matching %d\n", info[0], num);
-#endif
-    if ( info[0] == 1 ) { /* Structurally singular */
-        printf(".. The last %d permutations:\n", n-num);
-	PrintInt10("perm", n-num, &perm[num]);
-    }
-
-    /* Restore to 0-based indexing. */
-    for (i = 0; i <= n; ++i) --colptr[i];
-    for (i = 0; i < nnz; ++i) --adjncy[i];
-    for (i = 0; i < n; ++i) --perm[i];
-
-    if ( job == 5 )
-        for (i = 0; i < n; ++i) {
-	    u[i] = dw[i];
-	    v[i] = dw[n+i];
-	}
-
-    SUPERLU_FREE(iw);
-    SUPERLU_FREE(dw);
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(0, "Exit dldperm()");
-#endif
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dmemory.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dmemory.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dmemory.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dmemory.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,146 +0,0 @@
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-
-#include "superlu_ddefs.h"
-
-
-/* Variables external to this file */
-extern LU_stack_t stack;
-
-
-void *duser_malloc_dist(int_t bytes, int_t which_end)
-{
-    void *buf;
-    
-    if ( StackFull(bytes) ) return (NULL);
-
-    if ( which_end == HEAD ) {
-	buf = (char*) stack.array + stack.top1;
-	stack.top1 += bytes;
-    } else {
-	stack.top2 -= bytes;
-	buf = (char*) stack.array + stack.top2;
-    }
-    
-    stack.used += bytes;
-    return buf;
-}
-
-
-void duser_free_dist(int_t bytes, int_t which_end)
-{
-    if ( which_end == HEAD ) {
-	stack.top1 -= bytes;
-    } else {
-	stack.top2 += bytes;
-    }
-    stack.used -= bytes;
-}
-
-
-
-/*
- * mem_usage consists of the following fields:
- *    - for_lu (float)
- *      The amount of space used in bytes for the L\U data structures.
- *    - total (float)
- *      The amount of space needed in bytes to perform factorization.
- *    - expansions (int)
- *      Number of memory expansions during the LU factorization.
- */
-int_t dQuerySpace_dist(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid,
-		       mem_usage_t *mem_usage)
-{
-    register int_t dword, gb, iword, k, maxsup, nb, nsupers;
-    int_t *index, *xsup;
-    int iam, mycol, myrow;
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-
-    iam = grid->iam;
-    myrow = MYROW( iam, grid );
-    mycol = MYCOL( iam, grid );
-    iword = sizeof(int_t);
-    dword = sizeof(double);
-    maxsup = sp_ienv_dist(3);
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    mem_usage->for_lu = 0;
-
-    /* For L factor */
-    nb = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */
-    for (k = 0; k < nb; ++k) {
-	gb = k * grid->npcol + mycol; /* Global block number. */
-	if ( gb < nsupers ) {
-	    index = Llu->Lrowind_bc_ptr[k];
-	    if ( index ) {
-		mem_usage->for_lu += (float)
-		    ((BC_HEADER + index[0]*LB_DESCRIPTOR + index[1]) * iword);
-		mem_usage->for_lu += (float)(index[1]*SuperSize( gb )*dword);
-	    }
-	}
-    }
-
-    /* For U factor */
-    nb = CEILING( nsupers, grid->nprow ); /* Number of local row blocks */
-    for (k = 0; k < nb; ++k) {
-	gb = k * grid->nprow + myrow; /* Global block number. */
-	if ( gb < nsupers ) {
-	    index = Llu->Ufstnz_br_ptr[k];
-	    if ( index ) {
-		mem_usage->for_lu += (float)(index[2] * iword);
-		mem_usage->for_lu += (float)(index[1] * dword);
-	    }
-	}
-    }
-
-    /* Working storage to support factorization */
-    mem_usage->total = mem_usage->for_lu;
-    mem_usage->total +=
-	(float)(( Llu->bufmax[0] + Llu->bufmax[2] ) * iword +
-		( Llu->bufmax[1] + Llu->bufmax[3] + maxsup ) * dword );
-    /**** another buffer to use mpi_irecv in pdgstrf_irecv.c ****/
-    mem_usage->total +=
-	(float)( Llu->bufmax[0] * iword +  Llu->bufmax[1] * dword );
-    mem_usage->total += (float)( maxsup * maxsup + maxsup) * iword;
-    k = CEILING( nsupers, grid->nprow );
-    mem_usage->total += (float)(2 * k * iword);
-
-    return 0;
-} /* dQuerySpace_dist */
-
-
-/*
- * Allocate storage for original matrix A
- */
-void
-dallocateA_dist(int_t n, int_t nnz, double **a, int_t **asub, int_t **xa)
-{
-    *a    = (double *) doubleMalloc_dist(nnz);
-    *asub = (int_t *) intMalloc_dist(nnz);
-    *xa   = (int_t *) intMalloc_dist(n+1);
-}
-
-
-double *doubleMalloc_dist(int_t n)
-{
-    double *buf;
-    buf = (double *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(double) ); 
-    return (buf);
-}
-
-double *doubleCalloc_dist(int_t n)
-{
-    double *buf;
-    register int_t i;
-    double zero = 0.0;
-    buf = (double *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(double));
-    if ( !buf ) return (buf);
-    for (i = 0; i < n; ++i) buf[i] = zero;
-    return (buf);
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dmemory.c.orig hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dmemory.c.orig
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dmemory.c.orig	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dmemory.c.orig	1970-01-01 00:00:00.000000000 +0000
@@ -1,146 +0,0 @@
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-
-#include "superlu_ddefs.h"
-
-
-/* Variables external to this file */
-extern LU_stack_t stack;
-
-
-void *duser_malloc_dist(int_t bytes, int_t which_end)
-{
-    void *buf;
-    
-    if ( StackFull(bytes) ) return (NULL);
-
-    if ( which_end == HEAD ) {
-	buf = (char*) stack.array + stack.top1;
-	stack.top1 += bytes;
-    } else {
-	stack.top2 -= bytes;
-	buf = (char*) stack.array + stack.top2;
-    }
-    
-    stack.used += bytes;
-    return buf;
-}
-
-
-void duser_free_dist(int_t bytes, int_t which_end)
-{
-    if ( which_end == HEAD ) {
-	stack.top1 -= bytes;
-    } else {
-	stack.top2 += bytes;
-    }
-    stack.used -= bytes;
-}
-
-
-
-/*
- * mem_usage consists of the following fields:
- *    - for_lu (float)
- *      The amount of space used in bytes for the L\U data structures.
- *    - total (float)
- *      The amount of space needed in bytes to perform factorization.
- *    - expansions (int)
- *      Number of memory expansions during the LU factorization.
- */
-int_t dQuerySpace_dist(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid,
-		       mem_usage_t *mem_usage)
-{
-    register int_t dword, gb, iword, k, maxsup, nb, nsupers;
-    int_t *index, *xsup;
-    int iam, mycol, myrow;
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-
-    iam = grid->iam;
-    myrow = MYROW( iam, grid );
-    mycol = MYCOL( iam, grid );
-    iword = sizeof(int_t);
-    dword = sizeof(double);
-    maxsup = sp_ienv_dist(3);
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    mem_usage->for_lu = 0;
-
-    /* For L factor */
-    nb = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */
-    for (k = 0; k < nb; ++k) {
-	gb = k * grid->npcol + mycol; /* Global block number. */
-	if ( gb < nsupers ) {
-	    index = Llu->Lrowind_bc_ptr[k];
-	    if ( index ) {
-		mem_usage->for_lu += (float)
-		    ((BC_HEADER + index[0]*LB_DESCRIPTOR + index[1]) * iword);
-		mem_usage->for_lu += (float)(index[1]*SuperSize( gb )*dword);
-	    }
-	}
-    }
-
-    /* For U factor */
-    nb = CEILING( nsupers, grid->nprow ); /* Number of local row blocks */
-    for (k = 0; k < nb; ++k) {
-	gb = k * grid->nprow + myrow; /* Global block number. */
-	if ( gb < nsupers ) {
-	    index = Llu->Ufstnz_br_ptr[k];
-	    if ( index ) {
-		mem_usage->for_lu += (float)(index[2] * iword);
-		mem_usage->for_lu += (float)(index[1] * dword);
-	    }
-	}
-    }
-
-    /* Working storage to support factorization */
-    mem_usage->total = mem_usage->for_lu;
-    mem_usage->total +=
-	(float)(( Llu->bufmax[0] + Llu->bufmax[2] ) * iword +
-		( Llu->bufmax[1] + Llu->bufmax[3] + maxsup ) * dword );
-    /**** another buffer to use mpi_irecv in pdgstrf_irecv.c ****/
-    mem_usage->total +=
-	(float)( Llu->bufmax[0] * iword +  Llu->bufmax[1] * dword );
-    mem_usage->total += (float)( maxsup * maxsup + maxsup) * iword;
-    k = CEILING( nsupers, grid->nprow );
-    mem_usage->total += (float)(2 * k * iword);
-
-    return 0;
-} /* dQuerySpace_dist */
-
-
-/*
- * Allocate storage for original matrix A
- */
-void
-dallocateA_dist(int_t n, int_t nnz, double **a, int_t **asub, int_t **xa)
-{
-    *a    = (double *) doubleMalloc_dist(nnz);
-    *asub = (int_t *) intMalloc_dist(nnz);
-    *xa   = (int_t *) intMalloc_dist(n+1);
-}
-
-
-double *doubleMalloc_dist(int_t n)
-{
-    double *buf;
-    buf = (double *) SUPERLU_MALLOC(n * sizeof(double)); 
-    return (buf);
-}
-
-double *doubleCalloc_dist(int_t n)
-{
-    double *buf;
-    register int_t i;
-    double zero = 0.0;
-    buf = (double *) SUPERLU_MALLOC(n * sizeof(double));
-    if ( !buf ) return (buf);
-    for (i = 0; i < n; ++i) buf[i] = zero;
-    return (buf);
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dmemory.patch hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dmemory.patch
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dmemory.patch	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dmemory.patch	1970-01-01 00:00:00.000000000 +0000
@@ -1,8 +0,0 @@
-132c132
-<     buf = (double *) SUPERLU_MALLOC(n * sizeof(double)); 
----
->     buf = (double *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(double) ); 
-141c141
-<     buf = (double *) SUPERLU_MALLOC(n * sizeof(double));
----
->     buf = (double *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(double));
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dmyblas2.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dmyblas2.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dmyblas2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dmyblas2.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,225 +0,0 @@
-
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * File name:		dmyblas2.c
- * Purpose:
- *     Level 2 BLAS operations: solves and matvec, written in C.
- * Note:
- *     This is only used when the system lacks an efficient BLAS library.
- */
-
-/*
- * Solves a dense UNIT lower triangular system. The unit lower 
- * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). 
- * The solution will be returned in the rhs vector.
- */
-void dlsolve ( int ldm, int ncol, double *M, double *rhs )
-{
-    int k;
-    double x0, x1, x2, x3, x4, x5, x6, x7;
-    double *M0;
-    register double *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7;
-    register int firstcol = 0;
-
-    M0 = &M[0];
-
-    while ( firstcol < ncol - 7 ) { /* Do 8 columns */
-      Mki0 = M0 + 1;
-      Mki1 = Mki0 + ldm + 1;
-      Mki2 = Mki1 + ldm + 1;
-      Mki3 = Mki2 + ldm + 1;
-      Mki4 = Mki3 + ldm + 1;
-      Mki5 = Mki4 + ldm + 1;
-      Mki6 = Mki5 + ldm + 1;
-      Mki7 = Mki6 + ldm + 1;
-
-      x0 = rhs[firstcol];
-      x1 = rhs[firstcol+1] - x0 * *Mki0++;
-      x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++;
-      x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++;
-      x4 = rhs[firstcol+4] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
-	                   - x3 * *Mki3++;
-      x5 = rhs[firstcol+5] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
-	                   - x3 * *Mki3++ - x4 * *Mki4++;
-      x6 = rhs[firstcol+6] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
-	                   - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++;
-      x7 = rhs[firstcol+7] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
-	                   - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++
-			   - x6 * *Mki6++;
-
-      rhs[++firstcol] = x1;
-      rhs[++firstcol] = x2;
-      rhs[++firstcol] = x3;
-      rhs[++firstcol] = x4;
-      rhs[++firstcol] = x5;
-      rhs[++firstcol] = x6;
-      rhs[++firstcol] = x7;
-      ++firstcol;
-    
-      for (k = firstcol; k < ncol; k++)
-	rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++
-	                - x2 * *Mki2++ - x3 * *Mki3++
-                        - x4 * *Mki4++ - x5 * *Mki5++
-			- x6 * *Mki6++ - x7 * *Mki7++;
- 
-      M0 += 8 * ldm + 8;
-    }
-
-    while ( firstcol < ncol - 3 ) { /* Do 4 columns */
-      Mki0 = M0 + 1;
-      Mki1 = Mki0 + ldm + 1;
-      Mki2 = Mki1 + ldm + 1;
-      Mki3 = Mki2 + ldm + 1;
-
-      x0 = rhs[firstcol];
-      x1 = rhs[firstcol+1] - x0 * *Mki0++;
-      x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++;
-      x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++;
-
-      rhs[++firstcol] = x1;
-      rhs[++firstcol] = x2;
-      rhs[++firstcol] = x3;
-      ++firstcol;
-    
-      for (k = firstcol; k < ncol; k++)
-	rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++
-	                - x2 * *Mki2++ - x3 * *Mki3++;
- 
-      M0 += 4 * ldm + 4;
-    }
-
-    if ( firstcol < ncol - 1 ) { /* Do 2 columns */
-      Mki0 = M0 + 1;
-      Mki1 = Mki0 + ldm + 1;
-
-      x0 = rhs[firstcol];
-      x1 = rhs[firstcol+1] - x0 * *Mki0++;
-
-      rhs[++firstcol] = x1;
-      ++firstcol;
-    
-      for (k = firstcol; k < ncol; k++)
-	rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++;
- 
-    }
-    
-}
-
-/*
- * Solves a dense upper triangular system. The upper triangular matrix is
- * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned
- * in the rhs vector.
- */
-void
-dusolve (
-	int ldm,	/* in */
-	int ncol,	/* in */
-	double *M,	/* in */
-	double *rhs	/* modified */
-)
-{
-    double xj;
-    int jcol, j, irow;
-
-    jcol = ncol - 1;
-
-    for (j = 0; j < ncol; j++) {
-
-	xj = rhs[jcol] / M[jcol + jcol*ldm]; 		/* M(jcol, jcol) */
-	rhs[jcol] = xj;
-	
-	for (irow = 0; irow < jcol; irow++)
-	    rhs[irow] -= xj * M[irow + jcol*ldm];	/* M(irow, jcol) */
-
-	jcol--;
-
-    }
-}
-
-
-/*
- * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec.
- * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[].
- */
-void dmatvec (
-	int ldm,	/* in -- leading dimension of M */
-	int nrow,	/* in */ 
-	int ncol,	/* in */
-	double *M,	/* in */
-	double *vec,	/* in */
-	double *Mxvec	/* in/out */
-)
-{
-    double vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7;
-    double *M0;
-    register double *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7;
-    register int firstcol = 0;
-    int k;
-
-    M0 = &M[0];
-    while ( firstcol < ncol - 7 ) {	/* Do 8 columns */
-
-	Mki0 = M0;
-	Mki1 = Mki0 + ldm;
-        Mki2 = Mki1 + ldm;
-        Mki3 = Mki2 + ldm;
-	Mki4 = Mki3 + ldm;
-	Mki5 = Mki4 + ldm;
-	Mki6 = Mki5 + ldm;
-	Mki7 = Mki6 + ldm;
-
-	vi0 = vec[firstcol++];
-	vi1 = vec[firstcol++];
-	vi2 = vec[firstcol++];
-	vi3 = vec[firstcol++];	
-	vi4 = vec[firstcol++];
-	vi5 = vec[firstcol++];
-	vi6 = vec[firstcol++];
-	vi7 = vec[firstcol++];	
-
-	for (k = 0; k < nrow; k++) 
-	    Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++
-		      + vi2 * *Mki2++ + vi3 * *Mki3++ 
-		      + vi4 * *Mki4++ + vi5 * *Mki5++
-		      + vi6 * *Mki6++ + vi7 * *Mki7++;
-
-	M0 += 8 * ldm;
-    }
-
-    while ( firstcol < ncol - 3 ) {	/* Do 4 columns */
-
-	Mki0 = M0;
-	Mki1 = Mki0 + ldm;
-	Mki2 = Mki1 + ldm;
-	Mki3 = Mki2 + ldm;
-
-	vi0 = vec[firstcol++];
-	vi1 = vec[firstcol++];
-	vi2 = vec[firstcol++];
-	vi3 = vec[firstcol++];	
-	for (k = 0; k < nrow; k++) 
-	    Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++
-		      + vi2 * *Mki2++ + vi3 * *Mki3++ ;
-
-	M0 += 4 * ldm;
-    }
-
-    while ( firstcol < ncol ) {		/* Do 1 column */
-
- 	Mki0 = M0;
-	vi0 = vec[firstcol++];
-	for (k = 0; k < nrow; k++)
-	    Mxvec[k] += vi0 * *Mki0++;
-
-	M0 += ldm;
-    }
-	
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dreadhb.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dreadhb.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dreadhb.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dreadhb.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,371 +0,0 @@
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include "superlu_ddefs.h"
-
-/*
- * Prototypes
- */
-static void ReadVector(FILE *, int_t, int_t *, int_t, int_t);
-static void dReadValues(FILE *, int_t, double *, int_t, int_t);
-static void FormFullA(int_t, int_t *, double **, int_t **, int_t **);
-static int dDumpLine(FILE *);
-static int dParseIntFormat(char *, int_t *, int_t *);
-static int dParseFloatFormat(char *, int_t *, int_t *);
-
-
-void
-dreadhb_dist(int iam, FILE *fp, int_t *nrow, int_t *ncol, int_t *nonz,
-	     double **nzval, int_t **rowind, int_t **colptr)
-{
-/* 
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- *
- * Purpose
- * =======
- * 
- * Read a DOUBLE PRECISION matrix stored in Harwell-Boeing format 
- * as described below.
- * 
- * Line 1 (A72,A8) 
- *  	Col. 1 - 72   Title (TITLE) 
- *	Col. 73 - 80  Key (KEY) 
- * 
- * Line 2 (5I14) 
- * 	Col. 1 - 14   Total number of lines excluding header (TOTCRD) 
- * 	Col. 15 - 28  Number of lines for pointers (PTRCRD) 
- * 	Col. 29 - 42  Number of lines for row (or variable) indices (INDCRD) 
- * 	Col. 43 - 56  Number of lines for numerical values (VALCRD) 
- *	Col. 57 - 70  Number of lines for right-hand sides (RHSCRD) 
- *                    (including starting guesses and solution vectors 
- *		       if present) 
- *           	      (zero indicates no right-hand side data is present) 
- *
- * Line 3 (A3, 11X, 4I14) 
- *   	Col. 1 - 3    Matrix type (see below) (MXTYPE) 
- * 	Col. 15 - 28  Number of rows (or variables) (NROW) 
- * 	Col. 29 - 42  Number of columns (or elements) (NCOL) 
- *	Col. 43 - 56  Number of row (or variable) indices (NNZERO) 
- *	              (equal to number of entries for assembled matrices) 
- * 	Col. 57 - 70  Number of elemental matrix entries (NELTVL) 
- *	              (zero in the case of assembled matrices) 
- * Line 4 (2A16, 2A20) 
- * 	Col. 1 - 16   Format for pointers (PTRFMT) 
- *	Col. 17 - 32  Format for row (or variable) indices (INDFMT) 
- *	Col. 33 - 52  Format for numerical values of coefficient matrix (VALFMT) 
- * 	Col. 53 - 72 Format for numerical values of right-hand sides (RHSFMT) 
- *
- * Line 5 (A3, 11X, 2I14) Only present if there are right-hand sides present 
- *    	Col. 1 	      Right-hand side type: 
- *	         	  F for full storage or M for same format as matrix 
- *    	Col. 2        G if a starting vector(s) (Guess) is supplied. (RHSTYP) 
- *    	Col. 3        X if an exact solution vector(s) is supplied. 
- *	Col. 15 - 28  Number of right-hand sides (NRHS) 
- *	Col. 29 - 42  Number of row indices (NRHSIX) 
- *          	      (ignored in case of unassembled matrices) 
- *
- * The three character type field on line 3 describes the matrix type. 
- * The following table lists the permitted values for each of the three 
- * characters. As an example of the type field, RSA denotes that the matrix 
- * is real, symmetric, and assembled. 
- *
- * First Character: 
- *	R Real matrix 
- *	C Complex matrix 
- *	P Pattern only (no numerical values supplied) 
- *
- * Second Character: 
- *	S Symmetric 
- *	U Unsymmetric 
- *	H Hermitian 
- *	Z Skew symmetric 
- *	R Rectangular 
- *
- * Third Character: 
- *	A Assembled 
- *	E Elemental matrices (unassembled) 
- *
- */
-
-    register int_t i, numer_lines, rhscrd = 0;
-    int_t tmp, colnum, colsize, rownum, rowsize, valnum, valsize;
-    char buf[100], type[4];
-    int_t sym;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(0, "Enter dreadhb_dist()");
-#endif
-
-    /* Line 1 */
-    fgets(buf, 100, fp);
-    /*dDumpLine(fp);*/
-    /*if ( !iam ) fflush(stdout);*/
-
-    /* Line 2 */
-    for (i=0; i<5; i++) {
-	fscanf(fp, "%14c", buf); buf[14] = 0;
-	tmp = atoi(buf); /*sscanf(buf, "%d", &tmp);*/
-	if (i == 3) numer_lines = tmp;
-	if (i == 4 && tmp) rhscrd = tmp;
-    }
-    dDumpLine(fp);
-
-    /* Line 3 */
-    fscanf(fp, "%3c", type);
-    fscanf(fp, "%11c", buf); /* pad */
-    type[3] = 0;
-#if ( DEBUGlevel>=1 )
-    if ( !iam ) printf("Matrix type %s\n", type);
-#endif
-    
-    fscanf(fp, "%14c", buf); *nrow = atoi(buf); /*sscanf(buf, "%d", nrow);*/
-    fscanf(fp, "%14c", buf); *ncol = atoi(buf); /*sscanf(buf, "%d", ncol);*/
-    fscanf(fp, "%14c", buf); *nonz = atoi(buf); /*sscanf(buf, "%d", nonz);*/
-    fscanf(fp, "%14c", buf); tmp = atoi(buf);   /*sscanf(buf, "%d", &tmp);*/
-    
-    if (tmp != 0)
-	if ( !iam ) printf("This is not an assembled matrix!\n");
-    if (*nrow != *ncol)
-	if ( !iam ) printf("Matrix is not square.\n");
-    dDumpLine(fp);
-
-    /* Allocate storage for the three arrays ( nzval, rowind, colptr ) */
-    dallocateA_dist(*ncol, *nonz, nzval, rowind, colptr);
-
-    /* Line 4: format statement */
-    fscanf(fp, "%16c", buf);
-    dParseIntFormat(buf, &colnum, &colsize);
-    fscanf(fp, "%16c", buf);
-    dParseIntFormat(buf, &rownum, &rowsize);
-    fscanf(fp, "%20c", buf);
-    dParseFloatFormat(buf, &valnum, &valsize);
-    fscanf(fp, "%20c", buf);
-    dDumpLine(fp);
-
-    /* Line 5: right-hand side */    
-    if ( rhscrd ) dDumpLine(fp); /* skip RHSFMT */
-
-#if ( DEBUGlevel>=1 )
-    if ( !iam ) {
-	printf("%d rows, %d nonzeros\n", *nrow, *nonz);
-	printf("colnum %d, colsize %d\n", colnum, colsize);
-	printf("rownum %d, rowsize %d\n", rownum, rowsize);
-	printf("valnum %d, valsize %d\n", valnum, valsize);
-    }
-#endif
-    
-    ReadVector(fp, *ncol+1, *colptr, colnum, colsize);
-#if ( DEBUGlevel>=1 )
-    if ( !iam )	printf("read colptr[%d] = %d\n", *ncol, (*colptr)[*ncol]);
-#endif
-    ReadVector(fp, *nonz, *rowind, rownum, rowsize);
-#if ( DEBUGlevel>=1 )
-    if ( !iam )	printf("read rowind[%d] = %d\n", *nonz-1, (*rowind)[*nonz-1]);
-#endif
-    if ( numer_lines ) {
-        dReadValues(fp, *nonz, *nzval, valnum, valsize);
-#if ( DEBUGlevel>=1 )
-	if ( !iam ) printf("read nzval[%d] = %e\n", *nonz-1, (*nzval)[*nonz-1]);
-#endif
-    }
-
-    sym = (type[1] == 'S' || type[1] == 's');
-    if ( sym ) {
-	FormFullA(*ncol, nonz, nzval, rowind, colptr);
-    }
-
-    /*if ( !iam ) fflush(stdout);*/
-    fclose(fp);
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(0, "Exit dreadhb_dist()");
-#endif
-}
-
-/* Eat up the rest of the current line */
-static int dDumpLine(FILE *fp)
-{
-    register int c;
-    while ((c = fgetc(fp)) != '\n') ;
-    return 0;
-}
-
-static int dParseIntFormat(char *buf, int_t *num, int_t *size)
-{
-    char *tmp;
-
-    tmp = buf;
-    while (*tmp++ != '(') ;
-    *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/
-    while (*tmp != 'I' && *tmp != 'i') ++tmp;
-    ++tmp;
-    *size = atoi(tmp); /*sscanf(tmp, "%d", size);*/
-    return 0;
-}
-
-static int dParseFloatFormat(char *buf, int_t *num, int_t *size)
-{
-    char *tmp, *period;
-    
-    tmp = buf;
-    while (*tmp++ != '(') ;
-    *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/
-    while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd'
-	   && *tmp != 'F' && *tmp != 'f') {
-       /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the
-           num picked up refers to P, which should be skipped. */
-        if (*tmp=='p' || *tmp=='P') {
-           ++tmp;
-           *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/
-        } else {
-           ++tmp;
-        }
-    }
-    ++tmp;
-    period = tmp;
-    while (*period != '.' && *period != ')') ++period ;
-    *period = '\0';
-    *size = atoi(tmp); /*sscanf(tmp, "%2d", size);*/
-
-    return 0;
-}
-
-static void
-ReadVector(FILE *fp, int_t n, int_t *where, int_t perline, int_t persize)
-{
-    register int_t i, j, item;
-    char tmp, buf[100];
-    
-    i = 0;
-    while (i < n) {
-	fgets(buf, 100, fp);    /* read a line at a time */
-	for (j=0; j<perline && i<n; j++) {
-	    tmp = buf[(j+1)*persize];     /* save the char at that place */
-	    buf[(j+1)*persize] = 0;       /* null terminate */
-	    item = atoi(&buf[j*persize]); 
-	    buf[(j+1)*persize] = tmp;     /* recover the char at that place */
-	    where[i++] = item - 1;
-	}
-    }
-}
-
-static void
-dReadValues(FILE *fp, int_t n, double *destination, int_t perline, int_t persize)
-{
-    register int_t i, j, k, s;
-    char tmp, buf[100];
-    
-    i = 0;
-    while (i < n) {
-	fgets(buf, 100, fp);    /* read a line at a time */
-	for (j=0; j<perline && i<n; j++) {
-	    tmp = buf[(j+1)*persize];     /* save the char at that place */
-	    buf[(j+1)*persize] = 0;       /* null terminate */
-	    s = j*persize;
-	    for (k = 0; k < persize; ++k) /* No D_ format in C */
-		if ( buf[s+k] == 'D' || buf[s+k] == 'd' ) buf[s+k] = 'E';
-	    destination[i++] = atof(&buf[s]);
-	    buf[(j+1)*persize] = tmp;     /* recover the char at that place */
-	}
-    }
-}
-
-/*
- * On input, nonz/nzval/rowind/colptr represents lower part of a symmetric
- * matrix. On exit, it represents the full matrix with lower and upper parts.
- */
-static void
-FormFullA(int_t n, int_t *nonz, double **nzval, int_t **rowind, int_t **colptr)
-{
-    register int_t i, j, k, col, new_nnz;
-    int_t *t_rowind, *t_colptr, *al_rowind, *al_colptr, *a_rowind, *a_colptr;
-    int_t *marker;
-    double *t_val, *al_val, *a_val;
-
-    al_rowind = *rowind;
-    al_colptr = *colptr;
-    al_val = *nzval;
-
-    if ( !(marker =(int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) )
-	ABORT("SUPERLU_MALLOC fails for marker[]");
-    if ( !(t_colptr = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) )
-	ABORT("SUPERLU_MALLOC t_colptr[]");
-    if ( !(t_rowind = (int_t *) SUPERLU_MALLOC( *nonz * sizeof(int_t)) ) )
-	ABORT("SUPERLU_MALLOC fails for t_rowind[]");
-    if ( !(t_val = (double*) SUPERLU_MALLOC( *nonz * sizeof(double)) ) )
-	ABORT("SUPERLU_MALLOC fails for t_val[]");
-
-    /* Get counts of each column of T, and set up column pointers */
-    for (i = 0; i < n; ++i) marker[i] = 0;
-    for (j = 0; j < n; ++j) {
-	for (i = al_colptr[j]; i < al_colptr[j+1]; ++i)
-	    ++marker[al_rowind[i]];
-    }
-    t_colptr[0] = 0;
-    for (i = 0; i < n; ++i) {
-	t_colptr[i+1] = t_colptr[i] + marker[i];
-	marker[i] = t_colptr[i];
-    }
-
-    /* Transpose matrix A to T */
-    for (j = 0; j < n; ++j)
-	for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) {
-	    col = al_rowind[i];
-	    t_rowind[marker[col]] = j;
-	    t_val[marker[col]] = al_val[i];
-	    ++marker[col];
-	}
-
-    new_nnz = *nonz * 2 - n;
-    if ( !(a_colptr = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) )
-	ABORT("SUPERLU_MALLOC a_colptr[]");
-    if ( !(a_rowind = (int_t *) SUPERLU_MALLOC( new_nnz * sizeof(int_t)) ) )
-	ABORT("SUPERLU_MALLOC fails for a_rowind[]");
-    if ( !(a_val = (double*) SUPERLU_MALLOC( new_nnz * sizeof(double)) ) )
-	ABORT("SUPERLU_MALLOC fails for a_val[]");
-    
-    a_colptr[0] = 0;
-    k = 0;
-    for (j = 0; j < n; ++j) {
-      for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) {
-	if ( t_rowind[i] != j ) { /* not diagonal */
-	  a_rowind[k] = t_rowind[i];
-	  a_val[k] = t_val[i];
-#ifdef DEBUG
-	  if ( fabs(a_val[k]) < 4.047e-300 )
-	      printf("%5d: %e\n", k, a_val[k]);
-#endif
-	  ++k;
-	}
-      }
-
-      for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) {
-	a_rowind[k] = al_rowind[i];
-	a_val[k] = al_val[i];
-#ifdef DEBUG
-	if ( fabs(a_val[k]) < 4.047e-300 )
-	    printf("%5d: %e\n", k, a_val[k]);
-#endif
-	++k;
-      }
-      
-      a_colptr[j+1] = k;
-    }
-
-    printf("FormFullA: new_nnz = %d, k = %d\n", new_nnz, k);
-
-    SUPERLU_FREE(al_val);
-    SUPERLU_FREE(al_rowind);
-    SUPERLU_FREE(al_colptr);
-    SUPERLU_FREE(marker);
-    SUPERLU_FREE(t_val);
-    SUPERLU_FREE(t_rowind);
-    SUPERLU_FREE(t_colptr);
-
-    *nzval = a_val;
-    *rowind = a_rowind;
-    *colptr = a_colptr;
-    *nonz = new_nnz;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dsp_blas2.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dsp_blas2.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dsp_blas2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dsp_blas2.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,487 +0,0 @@
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-
-/*
- * File name:		sp_blas2.c
- * Purpose:		Sparse BLAS 2, using some dense BLAS 2 operations.
- */
-
-#include "superlu_ddefs.h"
-
-#ifndef HYPRE_USING_HYPRE_BLAS
-#define USE_VENDOR_BLAS
-#endif
-
-/* 
- * Function prototypes 
- */
-#ifndef USE_VENDOR_BLAS
-extern void dusolve(int, int, double*, double*);
-extern void dlsolve(int, int, double*, double*);
-extern void dmatvec(int, int, int, double*, double*, double*);
-#endif
-
-
-int
-sp_dtrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L, 
-	      SuperMatrix *U, double *x, int *info)
-{
-/*
- *   Purpose
- *   =======
- *
- *   sp_dtrsv_dist() solves one of the systems of equations   
- *       A*x = b,   or   A'*x = b,
- *   where b and x are n element vectors and A is a sparse unit , or   
- *   non-unit, upper or lower triangular matrix.   
- *   No test for singularity or near-singularity is included in this   
- *   routine. Such tests must be performed before calling this routine.   
- *
- *   Parameters   
- *   ==========   
- *
- *   uplo   - (input) char*
- *            On entry, uplo specifies whether the matrix is an upper or   
- *             lower triangular matrix as follows:   
- *                uplo = 'U' or 'u'   A is an upper triangular matrix.   
- *                uplo = 'L' or 'l'   A is a lower triangular matrix.   
- *
- *   trans  - (input) char*
- *             On entry, trans specifies the equations to be solved as   
- *             follows:   
- *                trans = 'N' or 'n'   A*x = b.   
- *                trans = 'T' or 't'   A'*x = b.   
- *                trans = 'C' or 'c'   A'*x = b.   
- *
- *   diag   - (input) char*
- *             On entry, diag specifies whether or not A is unit   
- *             triangular as follows:   
- *                diag = 'U' or 'u'   A is assumed to be unit triangular.   
- *                diag = 'N' or 'n'   A is not assumed to be unit   
- *                                    triangular.   
- *	     
- *   L       - (input) SuperMatrix*
- *	       The factor L from the factorization Pr*A*Pc=L*U. Use
- *             compressed row subscripts storage for supernodes, i.e.,
- *             L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU.
- *
- *   U       - (input) SuperMatrix*
- *	        The factor U from the factorization Pr*A*Pc=L*U.
- *	        U has types: Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU.
- *    
- *   x       - (input/output) double*
- *             Before entry, the incremented array X must contain the n   
- *             element right-hand side vector b. On exit, X is overwritten 
- *             with the solution vector x.
- *
- *   info    - (output) int*
- *             If *info = -i, the i-th argument had an illegal value.
- *
- */
-#ifdef _CRAY
-    _fcd ftcs1, ftcs2, ftcs3;
-#endif
-    SCformat *Lstore;
-    NCformat *Ustore;
-    double   *Lval, *Uval;
-    int incx = 1, incy = 1;
-    double alpha = 1.0, beta = 1.0;
-    int nrow;
-    int fsupc, nsupr, nsupc, luptr, istart, irow;
-    int i, k, iptr, jcol;
-    double *work;
-    flops_t solve_ops;
-    /*extern SuperLUStat_t SuperLUStat;*/
-
-    /* Test the input parameters */
-    *info = 0;
-    if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1;
-    else if ( !lsame_(trans, "N") && !lsame_(trans, "T") ) *info = -2;
-    else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3;
-    else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4;
-    else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5;
-    if ( *info ) {
-	i = -(*info);
-	xerbla_("sp_dtrsv_dist", &i);
-	return 0;
-    }
-
-    Lstore = (SCformat *) L->Store;
-    Lval = (double *) Lstore->nzval;
-    Ustore = (NCformat *) U->Store;
-    Uval = (double *) Ustore->nzval;
-    solve_ops = 0;
-
-    if ( !(work = doubleCalloc_dist(L->nrow)) )
-	ABORT("Malloc fails for work in sp_dtrsv_dist().");
-    
-    if ( lsame_(trans, "N") ) {	/* Form x := inv(A)*x. */
-	
-	if ( lsame_(uplo, "L") ) {
-	    /* Form x := inv(L)*x */
-    	    if ( L->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = 0; k <= Lstore->nsuper; k++) {
-		fsupc = L_FST_SUPC(k);
-		istart = L_SUB_START(fsupc);
-		nsupr = L_SUB_START(fsupc+1) - istart;
-		nsupc = L_FST_SUPC(k+1) - fsupc;
-		luptr = L_NZ_START(fsupc);
-		nrow = nsupr - nsupc;
-
-	        solve_ops += nsupc * (nsupc - 1);
-	        solve_ops += 2 * nrow * nsupc;
-
-		if ( nsupc == 1 ) {
-		    for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) {
-			irow = L_SUB(iptr);
-			++luptr;
-			x[irow] -= x[fsupc] * Lval[luptr];
-		    }
-		} else {
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		    ftcs1 = _cptofcd("L", strlen("L"));
-		    ftcs2 = _cptofcd("N", strlen("N"));
-		    ftcs3 = _cptofcd("U", strlen("U"));
-		    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
-		       	&x[fsupc], &incx);
-		
-		    SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
-		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
-#else
-		    dtrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr,
-		       	&x[fsupc], &incx, 1, 1, 1);
-		
-		    dgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
-		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy, 1);
-#endif /* _CRAY */		
-#else
-		    dlsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]);
-		
-		    dmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
-			&x[fsupc], &work[0] );
-#endif		
-		
-		    iptr = istart + nsupc;
-		    for (i = 0; i < nrow; ++i, ++iptr) {
-			irow = L_SUB(iptr);
-			x[irow] -= work[i];	/* Scatter */
-			work[i] = 0.0;
-
-		    }
-	 	}
-	    } /* for k ... */
-	    
-	} else {
-	    /* Form x := inv(U)*x */
-	    
-	    if ( U->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = Lstore->nsuper; k >= 0; k--) {
-	    	fsupc = L_FST_SUPC(k);
-	    	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
-	    	nsupc = L_FST_SUPC(k+1) - fsupc;
-	    	luptr = L_NZ_START(fsupc);
-		
-    	        solve_ops += nsupc * (nsupc + 1);
-
-		if ( nsupc == 1 ) {
-		    x[fsupc] /= Lval[luptr];
-		    for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) {
-			irow = U_SUB(i);
-			x[irow] -= x[fsupc] * Uval[i];
-		    }
-		} else {
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		    ftcs1 = _cptofcd("U", strlen("U"));
-		    ftcs2 = _cptofcd("N", strlen("N"));
-		    STRSV(ftcs1, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr,
-		       &x[fsupc], &incx);
-#else
-		    dtrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr,
-		       &x[fsupc], &incx, 1, 1, 1);
-#endif
-#else		
-		    dusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] );
-#endif		
-
-		    for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
-		        solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
-		    	for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); 
-				i++) {
-			    irow = U_SUB(i);
-			    x[irow] -= x[jcol] * Uval[i];
-		    	}
-                    }
-		}
-	    } /* for k ... */
-	    
-	}
-    } else { /* Form x := inv(A')*x */
-	
-	if ( lsame_(uplo, "L") ) {
-	    /* Form x := inv(L')*x */
-    	    if ( L->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = Lstore->nsuper; k >= 0; --k) {
-	    	fsupc = L_FST_SUPC(k);
-	    	istart = L_SUB_START(fsupc);
-	    	nsupr = L_SUB_START(fsupc+1) - istart;
-	    	nsupc = L_FST_SUPC(k+1) - fsupc;
-	    	luptr = L_NZ_START(fsupc);
-
-		solve_ops += 2 * (nsupr - nsupc) * nsupc;
-
-		for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
-		    iptr = istart + nsupc;
-		    for (i = L_NZ_START(jcol) + nsupc; 
-				i < L_NZ_START(jcol+1); i++) {
-			irow = L_SUB(iptr);
-			x[jcol] -= x[irow] * Lval[i];
-			iptr++;
-		    }
-		}
-		
-		if ( nsupc > 1 ) {
-		    solve_ops += nsupc * (nsupc - 1);
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-                    ftcs1 = _cptofcd("L", strlen("L"));
-                    ftcs2 = _cptofcd("T", strlen("T"));
-                    ftcs3 = _cptofcd("U", strlen("U"));
-		    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
-			&x[fsupc], &incx);
-#else
-		    dtrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
-			&x[fsupc], &incx, 1, 1, 1);
-#endif
-#else
-		    hypre_F90_NAME_BLAS(dtrsv,DTRSV)("L", "T", "U", 
-                        &nsupc, &Lval[luptr], &nsupr,
-			&x[fsupc], &incx);
-#endif
-		}
-	    }
-	} else {
-	    /* Form x := inv(U')*x */
-	    if ( U->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = 0; k <= Lstore->nsuper; k++) {
-	    	fsupc = L_FST_SUPC(k);
-	    	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
-	    	nsupc = L_FST_SUPC(k+1) - fsupc;
-	    	luptr = L_NZ_START(fsupc);
-
-		for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
-		    solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
-		    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
-			irow = U_SUB(i);
-			x[jcol] -= x[irow] * Uval[i];
-		    }
-		}
-
-		solve_ops += nsupc * (nsupc + 1);
-
-		if ( nsupc == 1 ) {
-		    x[fsupc] /= Lval[luptr];
-		} else {
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-                    ftcs1 = _cptofcd("U", strlen("U"));
-                    ftcs2 = _cptofcd("T", strlen("T"));
-                    ftcs3 = _cptofcd("N", strlen("N"));
-		    STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
-			    &x[fsupc], &incx);
-#else
-		    dtrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
-			    &x[fsupc], &incx, 1, 1, 1);
-#endif
-#else
-		    hypre_F90_NAME_BLAS(dtrsv,DTRSV)("U", "T", "N", 
-                            &nsupc, &Lval[luptr], &nsupr,
-			    &x[fsupc], &incx);
-#endif
-		}
-	    } /* for k ... */
-	}
-    }
-
-    /*SuperLUStat.ops[SOLVE] += solve_ops;*/
-    SUPERLU_FREE(work);
-    return 0;
-}
-
-
-
-
-int
-sp_dgemv_dist(char *trans, double alpha, SuperMatrix *A, double *x, 
-	      int incx, double beta, double *y, int incy)
-{
-/*  Purpose   
-    =======   
-
-    sp_dgemv_dist()  performs one of the matrix-vector operations   
-       y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   
-    where alpha and beta are scalars, x and y are vectors and A is a
-    sparse A->nrow by A->ncol matrix.   
-
-    Parameters   
-    ==========   
-
-    TRANS  - (input) char*
-             On entry, TRANS specifies the operation to be performed as   
-             follows:   
-                TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.   
-                TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.   
-                TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.   
-
-    ALPHA  - (input) double
-             On entry, ALPHA specifies the scalar alpha.   
-
-    A      - (input) SuperMatrix*
-             Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
-             Currently, the type of A can be:
-                 Stype = SLU_NC or SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE. 
-             In the future, more general A can be handled.
-
-    X      - (input) double*, array of DIMENSION at least   
-             ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'   
-             and at least   
-             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.   
-             Before entry, the incremented array X must contain the   
-             vector x.   
-
-    INCX   - (input) int
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-
-    BETA   - (input) double
-             On entry, BETA specifies the scalar beta. When BETA is   
-             supplied as zero then Y need not be set on input.   
-
-    Y      - (output) double*,  array of DIMENSION at least   
-             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'   
-             and at least   
-             ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.   
-             Before entry with BETA non-zero, the incremented array Y   
-             must contain the vector y. On exit, Y is overwritten by the 
-             updated vector y.
-	     
-    INCY   - (input) int
-             On entry, INCY specifies the increment for the elements of   
-             Y. INCY must not be zero.   
-
-    ==== Sparse Level 2 Blas routine.   
-*/
-
-    /* Local variables */
-    NCformat *Astore;
-    double   *Aval;
-    int info;
-    double temp;
-    int lenx, leny, i, j, irow;
-    int iy, jx, jy, kx, ky;
-    int notran;
-
-    notran = lsame_(trans, "N");
-    Astore = (NCformat *) A->Store;
-    Aval = (double *) Astore->nzval;
-    
-    /* Test the input parameters */
-    info = 0;
-    if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1;
-    else if ( A->nrow < 0 || A->ncol < 0 ) info = 3;
-    else if (incx == 0) info = 5;
-    else if (incy == 0)	info = 8;
-    if (info != 0) {
-	xerbla_("sp_dgemv_dist ", &info);
-	return 0;
-    }
-
-    /* Quick return if possible. */
-    if (A->nrow == 0 || A->ncol == 0 || alpha == 0. && beta == 1.)
-	return 0;
-
-    /* Set  LENX  and  LENY, the lengths of the vectors x and y, and set 
-       up the start points in  X  and  Y. */
-    if (lsame_(trans, "N")) {
-	lenx = A->ncol;
-	leny = A->nrow;
-    } else {
-	lenx = A->nrow;
-	leny = A->ncol;
-    }
-    if (incx > 0) kx = 0;
-    else kx =  - (lenx - 1) * incx;
-    if (incy > 0) ky = 0;
-    else ky =  - (leny - 1) * incy;
-
-    /* Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through A. */
-    /* First form  y := beta*y. */
-    if (beta != 1.) {
-	if (incy == 1) {
-	    if (beta == 0.)
-		for (i = 0; i < leny; ++i) y[i] = 0.;
-	    else
-		for (i = 0; i < leny; ++i) y[i] = beta * y[i];
-	} else {
-	    iy = ky;
-	    if (beta == 0.)
-		for (i = 0; i < leny; ++i) {
-		    y[iy] = 0.;
-		    iy += incy;
-		}
-	    else
-		for (i = 0; i < leny; ++i) {
-		    y[iy] = beta * y[iy];
-		    iy += incy;
-		}
-	}
-    }
-    
-    if (alpha == 0.) return 0;
-
-    if ( notran ) {
-	/* Form  y := alpha*A*x + y. */
-	jx = kx;
-	if (incy == 1) {
-	    for (j = 0; j < A->ncol; ++j) {
-		if (x[jx] != 0.) {
-		    temp = alpha * x[jx];
-		    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-			irow = Astore->rowind[i];
-			y[irow] += temp * Aval[i];
-		    }
-		}
-		jx += incx;
-	    }
-	} else {
-	    ABORT("Not implemented.");
-	}
-    } else {
-	/* Form  y := alpha*A'*x + y. */
-	jy = ky;
-	if (incx == 1) {
-	    for (j = 0; j < A->ncol; ++j) {
-		temp = 0.;
-		for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-		    irow = Astore->rowind[i];
-		    temp += Aval[i] * x[irow];
-		}
-		y[jy] += alpha * temp;
-		jy += incy;
-	    }
-	} else {
-	    ABORT("Not implemented.");
-	}
-    }
-    return 0;
-} /* sp_dgemv_dist */
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dsp_blas3.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dsp_blas3.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dsp_blas3.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dsp_blas3.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,118 +0,0 @@
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-
-/*
- * File name:		sp_blas3.c
- * Purpose:		Sparse BLAS3, using some dense BLAS3 operations.
- */
-
-#include "superlu_ddefs.h"
-
-int
-sp_dgemm_dist(char *transa, char *transb, int m, int n, int k, 
-	      double alpha, SuperMatrix *A, double *b, int ldb, 
-	      double beta, double *c, int ldc)
-{
-/*  Purpose   
-    =======   
-
-    sp_d performs one of the matrix-matrix operations   
-
-       C := alpha*op( A )*op( B ) + beta*C,   
-
-    where  op( X ) is one of 
-
-       op( X ) = X   or   op( X ) = X'   or   op( X ) = conjg( X' ),
-
-    alpha and beta are scalars, and A, B and C are matrices, with op( A ) 
-    an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix. 
-  
-
-    Parameters   
-    ==========   
-
-    TRANSA - (input) char*
-             On entry, TRANSA specifies the form of op( A ) to be used in 
-             the matrix multiplication as follows:   
-                TRANSA = 'N' or 'n',  op( A ) = A.   
-                TRANSA = 'T' or 't',  op( A ) = A'.   
-                TRANSA = 'C' or 'c',  op( A ) = conjg( A' ).   
-             Unchanged on exit.   
-
-    TRANSB - (input) char*
-             On entry, TRANSB specifies the form of op( B ) to be used in 
-             the matrix multiplication as follows:   
-                TRANSB = 'N' or 'n',  op( B ) = B.   
-                TRANSB = 'T' or 't',  op( B ) = B'.   
-                TRANSB = 'C' or 'c',  op( B ) = conjg( B' ).   
-             Unchanged on exit.   
-
-    M      - (input) int   
-             On entry,  M  specifies  the number of rows of the matrix 
-	     op( A ) and of the matrix C.  M must be at least zero. 
-	     Unchanged on exit.   
-
-    N      - (input) int
-             On entry,  N specifies the number of columns of the matrix 
-	     op( B ) and the number of columns of the matrix C. N must be 
-	     at least zero.
-	     Unchanged on exit.   
-
-    K      - (input) int
-             On entry, K specifies the number of columns of the matrix 
-	     op( A ) and the number of rows of the matrix op( B ). K must 
-	     be at least  zero.   
-             Unchanged on exit.
-	     
-    ALPHA  - (input) double
-             On entry, ALPHA specifies the scalar alpha.   
-
-    A      - (input) SuperMatrix*
-             Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
-             Currently, the type of A can be:
-                 Stype = SLU_NC or SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE. 
-             In the future, more general A can be handled.
-
-    B      - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is 
-             n when TRANSB = 'N' or 'n',  and is  k otherwise.   
-             Before entry with  TRANSB = 'N' or 'n',  the leading k by n 
-             part of the array B must contain the matrix B, otherwise 
-             the leading n by k part of the array B must contain the 
-             matrix B.   
-             Unchanged on exit.   
-
-    LDB    - (input) int
-             On entry, LDB specifies the first dimension of B as declared 
-             in the calling (sub) program. LDB must be at least max( 1, n ).  
-             Unchanged on exit.   
-
-    BETA   - (input) double
-             On entry, BETA specifies the scalar beta. When BETA is   
-             supplied as zero then C need not be set on input.   
-
-    C      - DOUBLE PRECISION array of DIMENSION ( LDC, n ).   
-             Before entry, the leading m by n part of the array C must 
-             contain the matrix C,  except when beta is zero, in which 
-             case C need not be set on entry.   
-             On exit, the array C is overwritten by the m by n matrix 
-	     ( alpha*op( A )*B + beta*C ).   
-
-    LDC    - (input) int
-             On entry, LDC specifies the first dimension of C as declared 
-             in the calling (sub)program. LDC must be at least max(1,m).   
-             Unchanged on exit.   
-
-    ==== Sparse Level 3 Blas routine.   
-*/
-    int    incx = 1, incy = 1;
-    int    j;
-
-    for (j = 0; j < n; ++j) {
-	sp_dgemv_dist(transa, alpha, A, &b[ldb*j], incx, beta, &c[ldc*j], incy);
-    }
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dutil.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dutil.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/dutil.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/dutil.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,609 +0,0 @@
-
-
-/*
- * -- Distributed SuperLU routine (version 2.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * March 15, 2003
- *
- */
-
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include <math.h>
-#include "superlu_ddefs.h"
-
-void
-dCreate_CompCol_Matrix_dist(SuperMatrix *A, int_t m, int_t n, int_t nnz, 
-			    double *nzval, int_t *rowind, int_t *colptr,
-			    Stype_t stype, Dtype_t dtype, Mtype_t mtype)
-{
-    NCformat *Astore;
-
-    A->Stype = stype;
-    A->Dtype = dtype;
-    A->Mtype = mtype;
-    A->nrow = m;
-    A->ncol = n;
-    A->Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) );
-    if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store");
-    Astore = (NCformat *) A->Store;
-    Astore->nnz = nnz;
-    Astore->nzval = nzval;
-    Astore->rowind = rowind;
-    Astore->colptr = colptr;
-}
-
-void
-dCreate_CompRowLoc_Matrix_dist(SuperMatrix *A, int_t m, int_t n,
-			       int_t nnz_loc, int_t m_loc, int_t fst_row,
-			       double *nzval, int_t *colind, int_t *rowptr,
-			       Stype_t stype, Dtype_t dtype, Mtype_t mtype)
-{
-    NRformat_loc *Astore;
-
-    A->Stype = stype;
-    A->Dtype = dtype;
-    A->Mtype = mtype;
-    A->nrow = m;
-    A->ncol = n;
-    A->Store = (void *) SUPERLU_MALLOC( sizeof(NRformat_loc) );
-    if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store");
-    Astore = (NRformat_loc *) A->Store;
-    Astore->nnz_loc = nnz_loc;
-    Astore->fst_row = fst_row;
-    Astore->m_loc = m_loc;
-    Astore->nzval = nzval;
-    Astore->colind = colind;
-    Astore->rowptr = rowptr;
-}
-
-/*
- * Convert a row compressed storage into a column compressed storage.
- */
-void
-dCompRow_to_CompCol_dist(int_t m, int_t n, int_t nnz, 
-                         double *a, int_t *colind, int_t *rowptr,
-                         double **at, int_t **rowind, int_t **colptr)
-{
-    register int i, j, col, relpos;
-    int_t *marker;
-
-    /* Allocate storage for another copy of the matrix. */
-    *at = (double *) doubleMalloc_dist(nnz);
-    *rowind = intMalloc_dist(nnz);
-    *colptr = intMalloc_dist(n+1);
-    marker = intCalloc_dist(n);
-    
-    /* Get counts of each column of A, and set up column pointers */
-    for (i = 0; i < m; ++i)
-	for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]];
-    (*colptr)[0] = 0;
-    for (j = 0; j < n; ++j) {
-	(*colptr)[j+1] = (*colptr)[j] + marker[j];
-	marker[j] = (*colptr)[j];
-    }
-
-    /* Transfer the matrix into the compressed column storage. */
-    for (i = 0; i < m; ++i) {
-	for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
-	    col = colind[j];
-	    relpos = marker[col];
-	    (*rowind)[relpos] = i;
-	    (*at)[relpos] = a[j];
-	    ++marker[col];
-	}
-    }
-
-    SUPERLU_FREE(marker);
-}
-
-/* Copy matrix A into matrix B. */
-void
-dCopy_CompCol_Matrix_dist(SuperMatrix *A, SuperMatrix *B)
-{
-    NCformat *Astore, *Bstore;
-    int      ncol, nnz, i;
-
-    B->Stype = A->Stype;
-    B->Dtype = A->Dtype;
-    B->Mtype = A->Mtype;
-    B->nrow  = A->nrow;;
-    B->ncol  = ncol = A->ncol;
-    Astore   = (NCformat *) A->Store;
-    Bstore   = (NCformat *) B->Store;
-    Bstore->nnz = nnz = Astore->nnz;
-    for (i = 0; i < nnz; ++i)
-	((double *)Bstore->nzval)[i] = ((double *)Astore->nzval)[i];
-    for (i = 0; i < nnz; ++i) Bstore->rowind[i] = Astore->rowind[i];
-    for (i = 0; i <= ncol; ++i) Bstore->colptr[i] = Astore->colptr[i];
-}
-
-
-void dPrint_CompCol_Matrix_dist(SuperMatrix *A)
-{
-    NCformat     *Astore;
-    register int i;
-    double       *dp;
-    
-    printf("\nCompCol matrix: ");
-    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
-    Astore = (NCformat *) A->Store;
-    printf("nrow %d, ncol %d, nnz %d\n", A->nrow,A->ncol,Astore->nnz);
-    if ( (dp = (double *) Astore->nzval) != NULL ) {
-        printf("nzval:\n");
-        for (i = 0; i < Astore->nnz; ++i) printf("%f  ", dp[i]);
-    }
-    printf("\nrowind:\n");
-    for (i = 0; i < Astore->nnz; ++i) printf("%d  ", Astore->rowind[i]);
-    printf("\ncolptr:\n");
-    for (i = 0; i <= A->ncol; ++i) printf("%d  ", Astore->colptr[i]);
-    printf("\nend CompCol matrix.\n");
-}
-
-void dPrint_Dense_Matrix_dist(SuperMatrix *A)
-{
-    DNformat     *Astore;
-    register int i;
-    double       *dp;
-    
-    printf("\nDense matrix: ");
-    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
-    Astore = (DNformat *) A->Store;
-    dp = (double *) Astore->nzval;
-    printf("nrow %d, ncol %d, lda %d\n", A->nrow,A->ncol,Astore->lda);
-    printf("\nnzval: ");
-    for (i = 0; i < A->nrow; ++i) printf("%f  ", dp[i]);
-    printf("\nend Dense matrix.\n");
-}
-
-int dPrint_CompRowLoc_Matrix_dist(SuperMatrix *A)
-{
-    NRformat_loc     *Astore;
-    int_t i, nnz_loc, m_loc;
-    double       *dp;
-    
-    printf("\n==== CompRowLoc matrix: ");
-    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
-    Astore = (NRformat_loc *) A->Store;
-    printf("nrow %d, ncol %d\n", A->nrow,A->ncol);
-    nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc;
-    printf("nnz_loc %d, m_loc %d, fst_row %d\n", nnz_loc, m_loc,
-	   Astore->fst_row);
-    PrintInt10("rowptr", m_loc+1, Astore->rowptr);
-    PrintInt10("colind", nnz_loc, Astore->colind);
-    if ( (dp = (double *) Astore->nzval) != NULL )
-        PrintDouble5("nzval", nnz_loc, dp);
-    printf("==== end CompRowLoc matrix\n");
-}
-
-int file_dPrint_CompRowLoc_Matrix_dist(FILE *fp, SuperMatrix *A)
-{
-    NRformat_loc     *Astore;
-    int_t i, nnz_loc, m_loc;
-    double       *dp;
-    
-    fprintf(fp, "\n==== CompRowLoc matrix: ");
-    fprintf(fp, "Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
-    Astore = (NRformat_loc *) A->Store;
-    fprintf(fp, "nrow %d, ncol %d\n", A->nrow, A->ncol);
-    nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc;
-    fprintf(fp, "nnz_loc %d, m_loc %d, fst_row %d\n", nnz_loc, m_loc,
-	    Astore->fst_row);
-    file_PrintInt10(fp, "rowptr", m_loc+1, Astore->rowptr);
-    file_PrintInt10(fp, "colind", nnz_loc, Astore->colind);
-    if ( (dp = (double *) Astore->nzval) != NULL )
-        file_PrintDouble5(fp, "nzval", nnz_loc, dp);
-    fprintf(fp, "==== end CompRowLoc matrix\n");
-}
-
-void
-dCreate_Dense_Matrix_dist(SuperMatrix *X, int_t m, int_t n, double *x,
-			  int_t ldx, Stype_t stype, Dtype_t dtype,
-			  Mtype_t mtype)
-{
-    DNformat    *Xstore;
-    
-    X->Stype = stype;
-    X->Dtype = dtype;
-    X->Mtype = mtype;
-    X->nrow = m;
-    X->ncol = n;
-    X->Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) );
-    if ( !(X->Store) ) ABORT("SUPERLU_MALLOC fails for X->Store");
-    Xstore = (DNformat *) X->Store;
-    Xstore->lda = ldx;
-    Xstore->nzval = (double *) x;
-}
-
-void
-dCopy_Dense_Matrix_dist(int_t M, int_t N, double *X, int_t ldx,
-			double *Y, int_t ldy)
-{
-/*
- *  Purpose
- *  =======
- *
- *  Copies a two-dimensional matrix X to another matrix Y.
- */
-    int    i, j;
-    
-    for (j = 0; j < N; ++j)
-        for (i = 0; i < M; ++i)
-            Y[i + j*ldy] = X[i + j*ldx];
-}
-
-void
-dCreate_SuperNode_Matrix_dist(SuperMatrix *L, int_t m, int_t n, int_t nnz, 
-			      double *nzval, int_t *nzval_colptr,
-			      int_t *rowind, int_t *rowind_colptr,
-			      int_t *col_to_sup, int_t *sup_to_col,
-			      Stype_t stype, Dtype_t dtype, Mtype_t mtype)
-{
-    SCformat *Lstore;
-
-    L->Stype = stype;
-    L->Dtype = dtype;
-    L->Mtype = mtype;
-    L->nrow = m;
-    L->ncol = n;
-    L->Store = (void *) SUPERLU_MALLOC( sizeof(SCformat) );
-    if ( !(L->Store) ) ABORT("SUPERLU_MALLOC fails for L->Store");
-    Lstore = L->Store;
-    Lstore->nnz = nnz;
-    Lstore->nsuper = col_to_sup[n];
-    Lstore->nzval = nzval;
-    Lstore->nzval_colptr = nzval_colptr;
-    Lstore->rowind = rowind;
-    Lstore->rowind_colptr = rowind_colptr;
-    Lstore->col_to_sup = col_to_sup;
-    Lstore->sup_to_col = sup_to_col;
-
-}
-
-void
-dGenXtrue_dist(int_t n, int_t nrhs, double *x, int_t ldx)
-{
-    int  i, j;
-    for (j = 0; j < nrhs; ++j)
-	for (i = 0; i < n; ++i) {
-	    if ( i % 2 ) x[i + j*ldx] = 1.0;/* + (double)(i+1.)/n;*/
-	    else x[i + j*ldx] = 1.0;
-	}
-}
-
-/*
- * Let rhs[i] = sum of i-th row of A, so the solution vector is all 1's
- */
-void
-dFillRHS_dist(char *trans, int_t nrhs, double *x, int_t ldx,
-	      SuperMatrix *A, double *rhs, int_t ldb)
-{
-    double one = 1.0;
-    double zero = 0.0;
-
-    sp_dgemm_dist(trans, "N", A->nrow, nrhs, A->ncol, one, A,
-		  x, ldx, zero, rhs, ldb);
-
-}
-
-/* 
- * Fills a double precision array with a given value.
- */
-void 
-dfill_dist(double *a, int_t alen, double dval)
-{
-    register int_t i;
-    for (i = 0; i < alen; i++) a[i] = dval;
-}
-
-
-
-/* 
- * Check the inf-norm of the error vector 
- */
-void dinf_norm_error_dist(int_t n, int_t nrhs, double *x, int_t ldx,
-			  double *xtrue, int_t ldxtrue,
-                          gridinfo_t *grid)
-{
-    double err, xnorm;
-    double *x_work, *xtrue_work;
-    int i, j;
-
-    for (j = 0; j < nrhs; j++) {
-      x_work = &x[j*ldx];
-      xtrue_work = &xtrue[j*ldxtrue];
-      err = xnorm = 0.0;
-      for (i = 0; i < n; i++) {
-	err = SUPERLU_MAX(err, fabs(x_work[i] - xtrue_work[i]));
-	xnorm = SUPERLU_MAX(xnorm, fabs(x_work[i]));
-      }
-      err = err / xnorm;
-      printf("\tRHS %2d: ||X-Xtrue||/||X|| = %e\n", j, err);
-    }
-}
-
-void PrintDouble5(char *name, int_t len, double *x)
-{
-    register int_t i;
-    
-    printf("%10s:", name);
-    for (i = 0; i < len; ++i) {
-	if ( i % 5 == 0 ) printf("\n[%2d-%2d] ", i, i+4);
-	printf("%14e", x[i]);
-    }
-    printf("\n");
-}
-
-int file_PrintDouble5(FILE *fp, char *name, int_t len, double *x)
-{
-    register int_t i;
-    
-    fprintf(fp, "%10s:", name);
-    for (i = 0; i < len; ++i) {
-	if ( i % 5 == 0 ) fprintf(fp, "\n[%2d-%2d] ", i, i+4);
-	fprintf(fp, "%14e", x[i]);
-    }
-    fprintf(fp, "\n");
-}
-
-/* 
- * Print the blocks in the factored matrix L.
- */
-void dPrintLblocks(int_t iam, int_t nsupers, gridinfo_t *grid,
-		  Glu_persist_t *Glu_persist, LocalLU_t *Llu)
-{
-    register int_t c, extra, gb, j, lb, nsupc, nsupr, len, nb, ncb;
-    register int_t k, mycol, r;
-    int_t *xsup = Glu_persist->xsup;
-    int_t *index;
-    double *nzval;
-
-    printf("\n(%d) L BLOCKS IN COLUMN-MAJOR ORDER -->\n", iam);
-    ncb = nsupers / grid->npcol;
-    extra = nsupers % grid->npcol;
-    mycol = MYCOL( iam, grid );
-    if ( mycol < extra ) ++ncb;
-    for (lb = 0; lb < ncb; ++lb) {
-	index = Llu->Lrowind_bc_ptr[lb];
-	if ( index ) { /* Not an empty column */
-	    nzval = Llu->Lnzval_bc_ptr[lb];
-	    nb = index[0];
-	    nsupr = index[1];
-	    gb = lb * grid->npcol + mycol;
-	    nsupc = SuperSize( gb );
-	    printf("(%d) block column %d (local), # row blocks %d\n",
-		   iam, lb, nb);
-	    for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) {
-		len = index[k+1];
-		printf("(%d) row-block %d: block # %d\tlength %d\n", 
-		       iam, c, index[k], len);
-		PrintInt10("lsub", len, &index[k+LB_DESCRIPTOR]);
-		for (j = 0; j < nsupc; ++j) {
-		    PrintDouble5("nzval", len, &nzval[r + j*nsupr]);
-		}
-		k += LB_DESCRIPTOR + len;
-		r += len;
-	    }
-	}
-	printf("(%d)", iam);
- 	PrintInt10("ToSendR[]", grid->npcol, Llu->ToSendR[lb]);
-	PrintInt10("fsendx_plist[]", grid->nprow, Llu->fsendx_plist[lb]);
-    }
-    printf("nfrecvx %4d\n", Llu->nfrecvx);
-    k = CEILING( nsupers, grid->nprow );
-    PrintInt10("fmod", k, Llu->fmod);
-    
-} /* DPRINTLBLOCKS */
-
-
-/* 
- * Print the blocks in the factored matrix U.
- */
-void dPrintUblocks(int_t iam, int_t nsupers, gridinfo_t *grid, 
-		  Glu_persist_t *Glu_persist, LocalLU_t *Llu)
-{
-    register int_t c, extra, jb, k, lb, len, nb, nrb, nsupc;
-    register int_t myrow, r;
-    int_t *xsup = Glu_persist->xsup;
-    int_t *index;
-    double *nzval;
-
-    printf("\n(%d) U BLOCKS IN ROW-MAJOR ORDER -->\n", iam);
-    nrb = nsupers / grid->nprow;
-    extra = nsupers % grid->nprow;
-    myrow = MYROW( iam, grid );
-    if ( myrow < extra ) ++nrb;
-    for (lb = 0; lb < nrb; ++lb) {
-	index = Llu->Ufstnz_br_ptr[lb];
-	if ( index ) { /* Not an empty row */
-	    nzval = Llu->Unzval_br_ptr[lb];
-	    nb = index[0];
-	    printf("(%d) block row %d (local), # column blocks %d\n",
-		   iam, lb, nb);
-	    r  = 0;
-	    for (c = 0, k = BR_HEADER; c < nb; ++c) {
-		jb = index[k];
-		len = index[k+1];
-		printf("(%d) col-block %d: block # %d\tlength %d\n", 
-		       iam, c, jb, index[k+1]);
-		nsupc = SuperSize( jb );
-		PrintInt10("fstnz", nsupc, &index[k+UB_DESCRIPTOR]);
-		PrintDouble5("nzval", len, &nzval[r]);
-		k += UB_DESCRIPTOR + nsupc;
-		r += len;
-	    }
-
-	    printf("(%d) ToSendD[] %d\n", iam, Llu->ToSendD[lb]);
-	}
-    }
-} /* DPRINTUBLOCKS */
-
-int
-dprint_gsmv_comm(FILE *fp, int_t m_loc, pdgsmv_comm_t *gsmv_comm,
-                 gridinfo_t *grid)
-{
-  int_t procs = grid->nprow*grid->npcol;
-  fprintf(fp, "TotalIndSend %d\tTotalValSend %d\n", gsmv_comm->TotalIndSend,
-	  gsmv_comm->TotalValSend);
-  file_PrintInt10(fp, "extern_start", m_loc, gsmv_comm->extern_start);
-  file_PrintInt10(fp, "ind_tosend", gsmv_comm->TotalIndSend, gsmv_comm->ind_tosend);
-  file_PrintInt10(fp, "ind_torecv", gsmv_comm->TotalValSend, gsmv_comm->ind_torecv);
-  file_PrintInt10(fp, "ptr_ind_tosend", procs+1, gsmv_comm->ptr_ind_tosend);
-  file_PrintInt10(fp, "ptr_ind_torecv", procs+1, gsmv_comm->ptr_ind_torecv);
-  file_PrintInt10(fp, "SendCounts", procs, gsmv_comm->SendCounts);
-  file_PrintInt10(fp, "RecvCounts", procs, gsmv_comm->RecvCounts);
-}
-
-
-void
-GenXtrueRHS(int nrhs, SuperMatrix *A, Glu_persist_t *Glu_persist,
-	    gridinfo_t *grid, double **xact, int *ldx, double **b, int *ldb)
-{
-    int_t gb, gbrow, i, iam, irow, j, lb, lsup, myrow, n, nlrows,
-          nsupr, nsupers, rel;
-    int_t *supno, *xsup, *lxsup;
-    double *x, *bb;
-    NCformat *Astore;
-    double   *Aval;
-
-    n = A->ncol;
-    *ldb = 0;
-    supno = Glu_persist->supno;
-    xsup = Glu_persist->xsup;
-    nsupers = supno[n-1] + 1;
-    iam = grid->iam;
-    myrow = MYROW( iam, grid );
-    Astore = (NCformat *) A->Store;
-    Aval = (double *) Astore->nzval;
-    lb = CEILING( nsupers, grid->nprow ) + 1;
-    if ( !(lxsup = intMalloc_dist(lb)) )
-	ABORT("Malloc fails for lxsup[].");
-
-    lsup = 0;
-    nlrows = 0;
-    for (j = 0; j < nsupers; ++j) {
-	i = PROW( j, grid );
-	if ( myrow == i ) {
-	    nsupr = SuperSize( j );
-	    *ldb += nsupr;
-	    lxsup[lsup++] = nlrows;
-	    nlrows += nsupr;
-	}
-    }
-    *ldx = n;
-    if ( !(x = doubleMalloc_dist(((size_t)*ldx) * nrhs)) )
-	ABORT("Malloc fails for x[].");
-    if ( !(bb = doubleCalloc_dist(*ldb * nrhs)) )
-	ABORT("Calloc fails for bb[].");
-    for (j = 0; j < nrhs; ++j)
-	for (i = 0; i < n; ++i) x[i + j*(*ldx)] = 1.0;
-
-    /* Form b = A*x. */
-    for (j = 0; j < n; ++j)
-	for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-	    irow = Astore->rowind[i];
-	    gb = supno[irow];
-	    gbrow = PROW( gb, grid );
-	    if ( myrow == gbrow ) {
-		rel = irow - xsup[gb];
-		lb = LBi( gb, grid );
-		bb[lxsup[lb] + rel] += Aval[i] * x[j];
-	    }
-	}
-
-    /* Memory allocated but not freed: xact, b */
-    *xact = x;
-    *b = bb;
-
-    SUPERLU_FREE(lxsup);
-
-#if ( PRNTlevel>=2 )
-    for (i = 0; i < grid->nprow*grid->npcol; ++i) {
-	if ( iam == i ) {
-	    printf("\n(%d)\n", iam);
-	    PrintDouble5("rhs", *ldb, *b);
-	}
-	MPI_Barrier( grid->comm );
-    }
-#endif
-
-} /* GENXTRUERHS */
-
-/* g5.rua
-          b = A*x    y = L\b
-   0      1	     1.0000
-   1      0	     0.2500
-   2      1	     1.0000
-   3      2	     2.0000
-   4      1	     1.7500
-   5      1	     1.8917
-   6      0	     1.1879
-   7      2	     2.0000
-   8      2	     2.0000
-   9      1	     1.0000
-   10     1	     1.7500
-   11     0	          0
-   12     1	     1.8750
-   13     2	     2.0000
-   14     1	     1.0000
-   15     0	     0.2500
-   16     1	     1.7667
-   17     0	     0.6419
-   18     1	     2.2504
-   19     0	     1.1563
-   20     0	     0.9069
-   21     0	     1.4269
-   22     1	     2.7510
-   23     1	     2.2289
-   24     0	     2.4332
-
-   g6.rua
-       b=A*x  y=L\b
-    0    0         0
-    1    1    1.0000
-    2    1    1.0000
-    3    2    2.5000
-    4    0         0
-    5    2    2.0000
-    6    1    1.0000
-    7    1    1.7500
-    8    1    1.0000
-    9    0    0.2500
-   10    0    0.5667
-   11    1    2.0787
-   12    0    0.8011
-   13    1    1.9838
-   14    1    1.0000
-   15    1    1.0000
-   16    2    2.5000
-   17    0    0.8571
-   18    0         0
-   19    1    1.0000
-   20    0    0.2500
-   21    1    1.0000
-   22    2    2.0000
-   23    1    1.7500
-   24    1    1.8917
-   25    0    1.1879
-   26    0    0.8011
-   27    1    1.9861
-   28    1    2.0199
-   29    0    1.3620
-   30    0    0.6136
-   31    1    2.3677
-   32    0    1.1011
-   33    0    1.5258
-   34    0    1.7628
-   35    0    2.1658
-*/
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/etree.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/etree.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/etree.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/etree.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,419 +0,0 @@
-
-/*  Elimination tree computation and layout routines */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "superlu_ddefs.h"
-
-/* 
- *  Implementation of disjoint set union routines.
- *  Elements are integers in 0..n-1, and the 
- *  names of the sets themselves are of type int.
- *  
- *  Calls are:
- *  initialize_disjoint_sets (n) initial call.
- *  s = make_set (i)             returns a set containing only i.
- *  s = link (t, u)		 returns s = t union u, destroying t and u.
- *  s = find (i)		 return name of set containing i.
- *  finalize_disjoint_sets 	 final call.
- *
- *  This implementation uses path compression but not weighted union.
- *  See Tarjan's book for details.
- *  John Gilbert, CMI, 1987.
- *
- *  Implemented path-halving by XL 7/5/95.
- */
-
-
-static 
-int_t *mxCallocInt(int_t n)
-{
-    register int_t i;
-    int_t *buf;
-
-    buf = (int_t *) SUPERLU_MALLOC( n * sizeof(int_t) );
-    if ( buf ) 
-	for (i = 0; i < n; i++) buf[i] = 0;
-    return (buf);
-}
-      
-static
-void initialize_disjoint_sets (
-			       int_t n,
-			       int_t **pp    /* parent array for sets */
-			       )
-{
-	if ( !( (*pp) = mxCallocInt(n)) ) 
-	    ABORT("mxCallocInit fails for pp[]");
-}
-
-
-static
-int_t make_set (
-		int_t   i,
-		int_t	*pp    /* parent array for sets */
-		)
-{
-	pp[i] = i;
-	return i;
-}
-
-
-static
-int_t link (
-	    int_t s,
-	    int_t t,
-	    int_t *pp
-	    )
-{
-	pp[s] = t;
-	return t;
-}
-
-
-/* PATH HALVING */
-static
-int_t find (
-	    int_t i,
-	    int_t *pp
-	    )
-{
-    register int_t p, gp;
-    
-    p = pp[i];
-    gp = pp[p];
-    while (gp != p) {
-	pp[i] = gp;
-	i = gp;
-	p = pp[i];
-	gp = pp[p];
-    }
-    return (p);
-}
-
-#if 0
-/* PATH COMPRESSION */
-static
-int_t find (
-	    int_t i
-	    )
-{
-	if (pp[i] != i) 
-		pp[i] = find (pp[i]);
-	return pp[i];
-}
-#endif
-
-static
-void finalize_disjoint_sets (
-			     int_t *pp
-			    )
-{
-	SUPERLU_FREE(pp);
-}
-
-/*
- *      p = spsymetree (A);
- *
- *      Find the elimination tree for symmetric matrix A.
- *      This uses Liu's algorithm, and runs in time O(nz*log n).
- *
- *      Input:
- *        Square sparse matrix A.  No check is made for symmetry;
- *        elements below and on the diagonal are ignored.
- *        Numeric values are ignored, so any explicit zeros are 
- *        treated as nonzero.
- *      Output:
- *        Integer array of parents representing the etree, with n
- *        meaning a root of the elimination forest.
- *      Note:  
- *        This routine uses only the upper triangle, while sparse
- *        Cholesky (as in spchol.c) uses only the lower.  Matlab's
- *        dense Cholesky uses only the upper.  This routine could
- *        be modified to use the lower triangle either by transposing
- *        the matrix or by traversing it by rows with auxiliary
- *        pointer and link arrays.
- *
- *      John R. Gilbert, Xerox, 10 Dec 1990
- *      Based on code by JRG dated 1987, 1988, and 1990.
- *      Modified by X.S. Li, November 1999.
- */
-
-/*
- * Symmetric elimination tree
- */
-int_t
-sp_symetree_dist(
-	    int_t *acolst, int_t *acolend, /* column starts and ends past 1 */
-	    int_t *arow,            /* row indices of A */
-	    int_t n,                /* dimension of A */
-	    int_t *parent	    /* parent in elim tree */
-	    )
-{
-	int_t	*root;		    /* root of subtee of etree 	*/
-	int_t	rset, cset;             
-	int_t	row, col;
-	int_t	rroot;
-	int_t	p;
-	int_t   *pp;
-
-#if ( DEBUGlevel>=1 )
-	CHECK_MALLOC(0, "Enter sp_symetree()");
-#endif
-
-	root = mxCallocInt (n);
-	initialize_disjoint_sets (n, &pp);
-
-	for (col = 0; col < n; col++) {
-		cset = make_set (col, pp);
-		root[cset] = col;
-		parent[col] = n; /* Matlab */
-		for (p = acolst[col]; p < acolend[col]; p++) {
-			row = arow[p];
-			if (row >= col) continue;
-			rset = find (row, pp);
-			rroot = root[rset];
-			if (rroot != col) {
-				parent[rroot] = col;
-				cset = link (cset, rset, pp);
-				root[cset] = col;
-			}
-		}
-	}
-	SUPERLU_FREE (root);
-	finalize_disjoint_sets (pp);
-
-#if ( DEBUGlevel>=1 )
-	CHECK_MALLOC(0, "Exit sp_symetree()");
-#endif
-	return 0;
-} /* SP_SYMETREE_DIST */
-
-
-/*
- *      Find the elimination tree for A'*A.
- *      This uses something similar to Liu's algorithm. 
- *      It runs in time O(nz(A)*log n) and does not form A'*A.
- *
- *      Input:
- *        Sparse matrix A.  Numeric values are ignored, so any
- *        explicit zeros are treated as nonzero.
- *      Output:
- *        Integer array of parents representing the elimination
- *        tree of the symbolic product A'*A.  Each vertex is a
- *        column of A, and nc means a root of the elimination forest.
- *
- *      John R. Gilbert, Xerox, 10 Dec 1990
- *      Based on code by JRG dated 1987, 1988, and 1990.
- */
-
-/*
- * Nonsymmetric elimination tree
- */
-int_t
-sp_coletree_dist(
-	    int_t *acolst, int_t *acolend, /* column start and end past 1 */
-	    int_t *arow,                   /* row indices of A */
-	    int_t nr, int_t nc,            /* dimension of A */
-	    int_t *parent	           /* parent in elim tree */
-	    )
-{
-	int_t	*root;			/* root of subtee of etree 	*/
-	int_t   *firstcol;		/* first nonzero col in each row*/
-	int_t	rset, cset;             
-	int_t	row, col;
-	int_t	rroot;
-	int_t	p;
-	int_t   *pp;
-
-#if ( DEBUGlevel>=1 )
-	int_t iam = 0;
-	CHECK_MALLOC(iam, "Enter sp_coletree()");
-#endif
-
-	root = mxCallocInt (nc);
-	initialize_disjoint_sets (nc, &pp);
-
-	/* Compute firstcol[row] = first nonzero column in row */
-
-	firstcol = mxCallocInt (nr);
-	for (row = 0; row < nr; firstcol[row++] = nc);
-	for (col = 0; col < nc; col++) 
-		for (p = acolst[col]; p < acolend[col]; p++) {
-			row = arow[p];
-			firstcol[row] = SUPERLU_MIN(firstcol[row], col);
-		}
-
-	/* Compute etree by Liu's algorithm for symmetric matrices,
-           except use (firstcol[r],c) in place of an edge (r,c) of A.
-	   Thus each row clique in A'*A is replaced by a star
-	   centered at its first vertex, which has the same fill. */
-
-	for (col = 0; col < nc; col++) {
-		cset = make_set (col, pp);
-		root[cset] = col;
-		parent[col] = nc; /* Matlab */
-		for (p = acolst[col]; p < acolend[col]; p++) {
-			row = firstcol[arow[p]];
-			if (row >= col) continue;
-			rset = find (row, pp);
-			rroot = root[rset];
-			if (rroot != col) {
-				parent[rroot] = col;
-				cset = link (cset, rset, pp);
-				root[cset] = col;
-			}
-		}
-	}
-
-	SUPERLU_FREE (root);
-	SUPERLU_FREE (firstcol);
-	finalize_disjoint_sets (pp);
-
-#if ( DEBUGlevel>=1 )
-	CHECK_MALLOC(iam, "Exit sp_coletree()");
-#endif
-	return 0;
-} /* SP_COLETREE_DIST */
-
-/*
- *  q = TreePostorder_dist (n, p);
- *
- *	Postorder a tree.
- *	Input:
- *	  p is a vector of parent pointers for a forest whose
- *        vertices are the integers 0 to n-1; p[root]==n.
- *	Output:
- *	  q is a vector indexed by 0..n-1 such that q[i] is the
- *	  i-th vertex in a postorder numbering of the tree.
- *
- *        ( 2/7/95 modified by X.Li:
- *          q is a vector indexed by 0:n-1 such that vertex i is the
- *          q[i]-th vertex in a postorder numbering of the tree.
- *          That is, this is the inverse of the previous q. )
- *
- *	In the child structure, lower-numbered children are represented
- *	first, so that a tree which is already numbered in postorder
- *	will not have its order changed.
- *    
- *  Written by John Gilbert, Xerox, 10 Dec 1990.
- *  Based on code written by John Gilbert at CMI in 1987.
- */
-
-static int_t	*first_kid, *next_kid;	/* Linked list of children.	*/
-static int_t	*post, postnum;
-
-static
-/*
- * Depth-first search from vertex v.
- */
-void etdfs (
-	    int_t	  v,
-	    int_t   first_kid[],
-	    int_t   next_kid[],
-	    int_t   post[], 
-	    int_t   *postnum
-	    )
-{
-	int	w;
-
-	for (w = first_kid[v]; w != -1; w = next_kid[w]) {
-		etdfs (w, first_kid, next_kid, post, postnum);
-	}
-	/* post[postnum++] = v; in Matlab */
-	post[v] = (*postnum)++;    /* Modified by X. Li on 08/10/07 */
-}
-
-
-static
-/*
- * Depth-first search from vertex n.
- * No recursion.
- */
-void nr_etdfs (int_t n, int_t *parent,
-	       int_t *first_kid, int_t *next_kid,
-	       int_t *post, int_t postnum)
-{
-    int_t current = n, first, next;
-
-    while (postnum != n){
-     
-        /* no kid for the current node */
-        first = first_kid[current];
-
-        /* no first kid for the current node */
-        if (first == -1){
-
-            /* numbering this node because it has no kid */
-            post[current] = postnum++;
-
-            /* looking for the next kid */
-            next = next_kid[current];
-
-            while (next == -1){
-
-                /* no more kids : back to the parent node */
-                current = parent[current];
-
-                /* numbering the parent node */
-                post[current] = postnum++;
-
-                /* get the next kid */
-                next = next_kid[current];
-	    }
-            
-            /* stopping criterion */
-            if (postnum==n+1) return;
-
-            /* updating current node */
-            current = next;
-        }
-        /* updating current node */
-        else {
-            current = first;
-	}
-    }
-}
-
-/*
- * Post order a tree
- */
-int_t *TreePostorder_dist(
-			  int_t n,
-			  int_t *parent
-			  )
-{
-	int_t	v, dad;
-	int_t   *first_kid, *next_kid, *post, postnum;
-
-	/* Allocate storage for working arrays and results	*/
-	if ( !(first_kid = mxCallocInt (n+1)) )
-	    ABORT("mxCallocInt fails for first_kid[]");
-	if ( !(next_kid = mxCallocInt (n+1)) )
-	    ABORT("mxCallocInt fails for next_kid[]");
-	if ( !(post = mxCallocInt (n+1)) )
-	    ABORT("mxCallocInt fails for post[]");
-
-	/* Set up structure describing children */
-	for (v = 0; v <= n; first_kid[v++] = -1);
-	for (v = n-1; v >= 0; v--) {
-		dad = parent[v];
-		next_kid[v] = first_kid[dad];
-		first_kid[dad] = v;
-	}
-
-	/* Depth-first search from dummy root vertex #n */
-	postnum = 0;
-#if 0
-	/* recursion */
-	etdfs (n, first_kid, next_kid, post, &postnum);
-#else
-	/* no recursion */
-	nr_etdfs(n, parent, first_kid, next_kid, post, postnum);
-#endif
-
-	SUPERLU_FREE(first_kid);
-	SUPERLU_FREE(next_kid);
-	return post;
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/GetDiagU.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/GetDiagU.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/GetDiagU.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/GetDiagU.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,101 +0,0 @@
-/* 
- * -- Auxiliary routine in distributed SuperLU (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * Xiaoye S. Li
- * April 16, 2002
- *
- */
-
-#include "superlu_ddefs.h"
-
-void GetDiagU(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, double *diagU)
-{
-  /*
-   * Purpose
-   * =======
-   *
-   * GetDiagU extracts the main diagonal of matrix U of the LU factorization.
-   *  
-   * Arguments
-   * =========
-   *
-   * n        (input) int
-   *          Dimension of the matrix.
-   *
-   * LUstruct (input) LUstruct_t*
-   *          The data structures to store the distributed L and U factors.
-   *          see superlu_ddefs.h for its definition.
-   *
-   * grid     (input) gridinfo_t*
-   *          The 2D process mesh. It contains the MPI communicator, the number
-   *          of process rows (NPROW), the number of process columns (NPCOL),
-   *          and my process rank. It is an input argument to all the
-   *          parallel routines.
-   *
-   * diagU    (output) double*, dimension (n)
-   *          The main diagonal of matrix U.
-   *          On exit, it is available on all processes.
-   *
-   *
-   * Note
-   * ====
-   *
-   * The diagonal blocks of the L and U matrices are stored in the L
-   * data structures, and are on the diagonal processes of the
-   * 2D process grid.
-   *
-   * This routine is modified from gather_diag_to_all() in pdgstrs_Bglobal.c.
-   *
-   */
-    int_t *xsup;
-    int iam, knsupc, pkk;
-    int nsupr; /* number of rows in the block L(:,k) (LDA) */
-    int_t i, j, jj, k, lk, lwork, nsupers, p;
-    int_t num_diag_procs, *diag_procs, *diag_len;
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-    double *dblock, *dwork, *lusup;
-
-    iam = grid->iam;
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-
-    get_diag_procs(n, Glu_persist, grid, &num_diag_procs,
-		   &diag_procs, &diag_len);
-    jj = diag_len[0];
-    for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] );
-    if ( !(dwork = doubleMalloc_dist(jj)) ) ABORT("Malloc fails for dwork[]");
-
-    for (p = 0; p < num_diag_procs; ++p) {
-	pkk = diag_procs[p];
-	if ( iam == pkk ) {
-	    /* Copy diagonal into buffer dwork[]. */
-	    lwork = 0;
-	    for (k = p; k < nsupers; k += num_diag_procs) {
-		knsupc = SuperSize( k );
-		lk = LBj( k, grid );
-		nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */
-		lusup = Llu->Lnzval_bc_ptr[lk];
-		for (i = 0; i < knsupc; ++i) /* Copy the diagonal. */
-		    dwork[lwork+i] = lusup[i*(nsupr+1)];
-		lwork += knsupc;
-	    }
-	    MPI_Bcast( dwork, lwork, MPI_DOUBLE, pkk, grid->comm );
-	} else {
-	    MPI_Bcast( dwork, diag_len[p], MPI_DOUBLE, pkk, grid->comm );
-	}
-
-	/* Scatter dwork[] into global diagU vector. */
-	lwork = 0;
-	for (k = p; k < nsupers; k += num_diag_procs) {
-	    knsupc = SuperSize( k );
-	    dblock = &diagU[FstBlockC( k )];
-	    for (i = 0; i < knsupc; ++i) dblock[i] = dwork[lwork+i];
-	    lwork += knsupc;
-	}
-    } /* for p = ... */
-
-    SUPERLU_FREE(diag_procs);
-    SUPERLU_FREE(diag_len);
-    SUPERLU_FREE(dwork);
-}
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/get_perm_c.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/get_perm_c.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/get_perm_c.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/get_perm_c.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,474 +0,0 @@
-/*
- * -- Distributed SuperLU routine (version 2.1) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley,
- * November 1, 2007
- * Feburary 20, 2008
- */
-
-#include "superlu_ddefs.h"
-
-
-void
-get_metis(
-	  int_t n,         /* dimension of matrix B */
-	  int_t bnz,       /* number of nonzeros in matrix A. */
-	  int_t *b_colptr, /* column pointer of size n+1 for matrix B. */
-	  int_t *b_rowind, /* row indices of size bnz for matrix B. */
-	  int_t *perm_c    /* out - the column permutation vector. */
-	  )
-{
-#define METISOPTIONS 8
-    int ct, i, j, nm, numflag = 0; /* C-Style ordering */
-    int metis_options[METISOPTIONS];
-    int_t *perm, *iperm;
-
-    metis_options[0] = 0; /* Use Defaults for now */
-    perm = intMalloc_dist(n);
-    iperm = intMalloc_dist(n);
-
-    nm = n;
-
-#ifdef USE_METIS
-    /* Call metis */
-#undef USEEND
-#ifdef USEEND
-      METIS_EdgeND(&nm, b_colptr, b_rowind, &numflag, metis_options,
-		   perm, iperm);
-#else
-      METIS_NodeND(&nm, b_colptr, b_rowind, &numflag, metis_options,
-		   perm, iperm);
-#endif
-#endif
-
-      /* Copy the permutation vector into SuperLU data structure. */
-      for (i = 0; i < n; ++i) perm_c[i] = iperm[i];
-
-      SUPERLU_FREE(b_colptr);
-      SUPERLU_FREE(b_rowind);
-      SUPERLU_FREE(perm);
-      SUPERLU_FREE(iperm);
-}
-
-void
-getata_dist(
-	    const int_t m,    /* number of rows in matrix A. */
-	    const int_t n,    /* number of columns in matrix A. */
-	    const int_t nz,   /* number of nonzeros in matrix A */
-	    int_t *colptr,    /* column pointer of size n+1 for matrix A. */
-	    int_t *rowind,    /* row indices of size nz for matrix A. */
-	    int_t *atanz,     /* out - on exit, returns the actual number of
-				 nonzeros in matrix A'*A. */
-	    int_t **ata_colptr, /* out - size n+1 */
-	    int_t **ata_rowind  /* out - size *atanz */
-	    )
-{
-/*
- * Purpose
- * =======
- *
- * Form the structure of A'*A. A is an m-by-n matrix in column oriented
- * format represented by (colptr, rowind). The output A'*A is in column
- * oriented format (symmetrically, also row oriented), represented by
- * (ata_colptr, ata_rowind).
- *
- * This routine is modified from GETATA routine by Tim Davis.
- * The complexity of this algorithm is: SUM_{i=1,m} r(i)^2,
- * i.e., the sum of the square of the row counts.
- *
- * Questions
- * =========
- *     o  Do I need to withhold the *dense* rows?
- *     o  How do I know the number of nonzeros in A'*A?
- * 
- */
-    register int_t i, j, k, col, num_nz, ti, trow;
-    int_t *marker, *b_colptr, *b_rowind;
-    int_t *t_colptr, *t_rowind; /* a column oriented form of T = A' */
-
-    if ( !(marker = (int_t*) SUPERLU_MALLOC( (SUPERLU_MAX(m,n)+1) * sizeof(int_t)) ) )
-	ABORT("SUPERLU_MALLOC fails for marker[]");
-    if ( !(t_colptr = (int_t*) SUPERLU_MALLOC( (m+1) * sizeof(int_t)) ) )
-	ABORT("SUPERLU_MALLOC t_colptr[]");
-    if ( !(t_rowind = (int_t*) SUPERLU_MALLOC( nz * sizeof(int_t)) ) )
-	ABORT("SUPERLU_MALLOC fails for t_rowind[]");
-
-    
-    /* Get counts of each column of T, and set up column pointers */
-    for (i = 0; i < m; ++i) marker[i] = 0;
-    for (j = 0; j < n; ++j) {
-	for (i = colptr[j]; i < colptr[j+1]; ++i)
-	    ++marker[rowind[i]];
-    }
-    t_colptr[0] = 0;
-    for (i = 0; i < m; ++i) {
-	t_colptr[i+1] = t_colptr[i] + marker[i];
-	marker[i] = t_colptr[i];
-    }
-
-    /* Transpose the matrix from A to T */
-    for (j = 0; j < n; ++j)
-	for (i = colptr[j]; i < colptr[j+1]; ++i) {
-	    col = rowind[i];
-	    t_rowind[marker[col]] = j;
-	    ++marker[col];
-	}
-
-    
-    /* ----------------------------------------------------------------
-       compute B = T * A, where column j of B is:
-
-       Struct (B_*j) =    UNION   ( Struct (T_*k) )
-                        A_kj != 0
-
-       do not include the diagonal entry
-   
-       ( Partition A as: A = (A_*1, ..., A_*n)
-         Then B = T * A = (T * A_*1, ..., T * A_*n), where
-         T * A_*j = (T_*1, ..., T_*m) * A_*j.  )
-       ---------------------------------------------------------------- */
-
-    /* Zero the diagonal flag */
-    for (i = 0; i < n; ++i) marker[i] = -1;
-
-    /* First pass determines number of nonzeros in B */
-    num_nz = 0;
-    for (j = 0; j < n; ++j) {
-	/* Flag the diagonal so it's not included in the B matrix */
-	marker[j] = j;
-
-	for (i = colptr[j]; i < colptr[j+1]; ++i) {
-	    /* A_kj is nonzero, add pattern of column T_*k to B_*j */
-	    k = rowind[i];
-	    for (ti = t_colptr[k]; ti < t_colptr[k+1]; ++ti) {
-		trow = t_rowind[ti];
-		if ( marker[trow] != j ) {
-		    marker[trow] = j;
-		    num_nz++;
-		}
-	    }
-	}
-    }
-    *atanz = num_nz;
-    
-    /* Allocate storage for A'*A */
-    if ( !(*ata_colptr = (int_t*) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) )
-	ABORT("SUPERLU_MALLOC fails for ata_colptr[]");
-    if ( *atanz ) {
-	if ( !(*ata_rowind = (int_t*)SUPERLU_MALLOC(*atanz*sizeof(int_t)) ) ) {
-	    fprintf(stderr, ".. atanz = %ld\n", *atanz);
-	    ABORT("SUPERLU_MALLOC fails for ata_rowind[]");
-	}
-    }
-    b_colptr = *ata_colptr; /* aliasing */
-    b_rowind = *ata_rowind;
-    
-    /* Zero the diagonal flag */
-    for (i = 0; i < n; ++i) marker[i] = -1;
-    
-    /* Compute each column of B, one at a time */
-    num_nz = 0;
-    for (j = 0; j < n; ++j) {
-	b_colptr[j] = num_nz;
-	
-	/* Flag the diagonal so it's not included in the B matrix */
-	marker[j] = j;
-
-	for (i = colptr[j]; i < colptr[j+1]; ++i) {
-	    /* A_kj is nonzero, add pattern of column T_*k to B_*j */
-	    k = rowind[i];
-	    for (ti = t_colptr[k]; ti < t_colptr[k+1]; ++ti) {
-		trow = t_rowind[ti];
-		if ( marker[trow] != j ) {
-		    marker[trow] = j;
-		    b_rowind[num_nz++] = trow;
-		}
-	    }
-	}
-    }
-    b_colptr[n] = num_nz;
-       
-    SUPERLU_FREE(marker);
-    SUPERLU_FREE(t_colptr);
-    SUPERLU_FREE(t_rowind);
-}
-
-
-void
-at_plus_a_dist(
-	       const int_t n,    /* number of columns in matrix A. */
-	       const int_t nz,   /* number of nonzeros in matrix A */
-	       int_t *colptr,    /* column pointer of size n+1 for matrix A. */
-	       int_t *rowind,    /* row indices of size nz for matrix A. */
-	       int_t *bnz,       /* out - on exit, returns the actual number of
-				    nonzeros in matrix A'+A. */
-	       int_t **b_colptr, /* out - size n+1 */
-	       int_t **b_rowind  /* out - size *bnz */
-	       )
-{
-/*
- * Purpose
- * =======
- *
- * Form the structure of A'+A. A is an n-by-n matrix in column oriented
- * format represented by (colptr, rowind). The output A'+A is in column
- * oriented format (symmetrically, also row oriented), represented by
- * (b_colptr, b_rowind).
- *
- */
-    register int_t i, j, k, col, num_nz;
-    int_t *t_colptr, *t_rowind; /* a column oriented form of T = A' */
-    int_t *marker;
-
-    if ( !(marker = (int_t*) SUPERLU_MALLOC( n * sizeof(int_t)) ) )
-	ABORT("SUPERLU_MALLOC fails for marker[]");
-    if ( !(t_colptr = (int_t*) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) )
-	ABORT("SUPERLU_MALLOC fails for t_colptr[]");
-    if ( !(t_rowind = (int_t*) SUPERLU_MALLOC( nz * sizeof(int_t)) ) )
-	ABORT("SUPERLU_MALLOC fails t_rowind[]");
-
-    
-    /* Get counts of each column of T, and set up column pointers */
-    for (i = 0; i < n; ++i) marker[i] = 0;
-    for (j = 0; j < n; ++j) {
-	for (i = colptr[j]; i < colptr[j+1]; ++i)
-	    ++marker[rowind[i]];
-    }
-    t_colptr[0] = 0;
-    for (i = 0; i < n; ++i) {
-	t_colptr[i+1] = t_colptr[i] + marker[i];
-	marker[i] = t_colptr[i];
-    }
-
-    /* Transpose the matrix from A to T */
-    for (j = 0; j < n; ++j)
-	for (i = colptr[j]; i < colptr[j+1]; ++i) {
-	    col = rowind[i];
-	    t_rowind[marker[col]] = j;
-	    ++marker[col];
-	}
-
-
-    /* ----------------------------------------------------------------
-       compute B = A + T, where column j of B is:
-
-       Struct (B_*j) = Struct (A_*k) UNION Struct (T_*k)
-
-       do not include the diagonal entry
-       ---------------------------------------------------------------- */
-
-    /* Zero the diagonal flag */
-    for (i = 0; i < n; ++i) marker[i] = -1;
-
-    /* First pass determines number of nonzeros in B */
-    num_nz = 0;
-    for (j = 0; j < n; ++j) {
-	/* Flag the diagonal so it's not included in the B matrix */
-	marker[j] = j;
-
-	/* Add pattern of column A_*k to B_*j */
-	for (i = colptr[j]; i < colptr[j+1]; ++i) {
-	    k = rowind[i];
-	    if ( marker[k] != j ) {
-		marker[k] = j;
-		++num_nz;
-	    }
-	}
-
-	/* Add pattern of column T_*k to B_*j */
-	for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) {
-	    k = t_rowind[i];
-	    if ( marker[k] != j ) {
-		marker[k] = j;
-		++num_nz;
-	    }
-	}
-    }
-    *bnz = num_nz;
-    
-    /* Allocate storage for A+A' */
-    if ( !(*b_colptr = (int_t*) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) )
-	ABORT("SUPERLU_MALLOC fails for b_colptr[]");
-    if ( *bnz ) {
-	if ( !(*b_rowind = (int_t*) SUPERLU_MALLOC( *bnz * sizeof(int_t)) ) )
-	    ABORT("SUPERLU_MALLOC fails for b_rowind[]");
-    }
-    
-    /* Zero the diagonal flag */
-    for (i = 0; i < n; ++i) marker[i] = -1;
-    
-    /* Compute each column of B, one at a time */
-    num_nz = 0;
-    for (j = 0; j < n; ++j) {
-	(*b_colptr)[j] = num_nz;
-	
-	/* Flag the diagonal so it's not included in the B matrix */
-	marker[j] = j;
-
-	/* Add pattern of column A_*k to B_*j */
-	for (i = colptr[j]; i < colptr[j+1]; ++i) {
-	    k = rowind[i];
-	    if ( marker[k] != j ) {
-		marker[k] = j;
-		(*b_rowind)[num_nz++] = k;
-	    }
-	}
-
-	/* Add pattern of column T_*k to B_*j */
-	for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) {
-	    k = t_rowind[i];
-	    if ( marker[k] != j ) {
-		marker[k] = j;
-		(*b_rowind)[num_nz++] = k;
-	    }
-	}
-    }
-    (*b_colptr)[n] = num_nz;
-       
-    SUPERLU_FREE(marker);
-    SUPERLU_FREE(t_colptr);
-    SUPERLU_FREE(t_rowind);
-} /* at_plus_a_dist */
-
-
-void
-get_perm_c_dist(int_t pnum, int_t ispec, SuperMatrix *A, int_t *perm_c)
-/*
- * Purpose
- * =======
- *
- * GET_PERM_C_DIST obtains a permutation matrix Pc, by applying the multiple
- * minimum degree ordering code by Joseph Liu to matrix A'*A or A+A',
- * or using approximate minimum degree column ordering by Davis et. al.
- * The LU factorization of A*Pc tends to have less fill than the LU 
- * factorization of A.
- *
- * Arguments
- * =========
- *
- * ispec   (input) colperm_t
- *         Specifies what type of column permutation to use to reduce fill.
- *         = NATURAL: natural ordering (i.e., Pc = I)
- *         = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A
- *         = MMD_ATA: minimum degree ordering on structure of A'*A
- *         = METIS_AT_PLUS_A: MeTis on A'+A
- * 
- * A       (input) SuperMatrix*
- *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
- *         of the linear equations is A->nrow. Currently, the type of A 
- *         can be: Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE.
- *         In the future, more general A can be handled.
- *
- * perm_c  (output) int*
- *	   Column permutation vector of size A->ncol, which defines the 
- *         permutation matrix Pc; perm_c[i] = j means column i of A is 
- *         in position j in A*Pc.
- *
- */
-{
-    NCformat *Astore = A->Store;
-    int_t m, n, bnz = 0, *b_colptr, *b_rowind, i;
-    int_t delta, maxint, nofsub, *invp;
-    int_t *dhead, *qsize, *llist, *marker;
-    double t, SuperLU_timer_();
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(pnum, "Enter get_perm_c_dist()");
-#endif
-
-    m = A->nrow;
-    n = A->ncol;
-
-    t = SuperLU_timer_();
-
-    switch ( ispec ) {
-
-        case NATURAL: /* Natural ordering */
-	      for (i = 0; i < n; ++i) perm_c[i] = i;
-#if ( PRNTlevel>=1 )
-	      if ( !pnum ) printf(".. Use natural column ordering\n");
-#endif
-	      return;
-
-        case MMD_AT_PLUS_A: /* Minimum degree ordering on A'+A */
-	      if ( m != n ) ABORT("Matrix is not square");
-	      at_plus_a_dist(n, Astore->nnz, Astore->colptr, Astore->rowind,
-			     &bnz, &b_colptr, &b_rowind);
-	      t = SuperLU_timer_() - t;
-	      /*printf("Form A'+A time = %8.3f\n", t);*/
-#if ( PRNTlevel>=1 )
-	      if ( !pnum ) printf(".. Use minimum degree ordering on A'+A.\n");
-#endif
-	      break;
-
-        case MMD_ATA: /* Minimum degree ordering on A'*A */
-	      getata_dist(m, n, Astore->nnz, Astore->colptr, Astore->rowind,
-			  &bnz, &b_colptr, &b_rowind);
-	      t = SuperLU_timer_() - t;
-	      /*printf("Form A'*A time = %8.3f\n", t);*/
-#if ( PRNTlevel>=1 )
-	      if ( !pnum ) printf(".. Use minimum degree ordering on A'*A\n");
-#endif
-	      break;
-
-        case METIS_AT_PLUS_A: /* METIS ordering on A'+A */
-	      if ( m != n ) ABORT("Matrix is not square");
-	      at_plus_a_dist(n, Astore->nnz, Astore->colptr, Astore->rowind,
-			     &bnz, &b_colptr, &b_rowind);
-	      get_metis(n, bnz, b_colptr, b_rowind, perm_c);
-#if ( PRNTlevel>=1 )
-	      if ( !pnum ) printf(".. Use METIS ordering on A'+A\n");
-#endif
-	      return;
-
-        default:
-	      ABORT("Invalid ISPEC");
-    }
-
-    if ( bnz ) {
-	t = SuperLU_timer_();
-
-	/* Initialize and allocate storage for GENMMD. */
-	delta = 0; /* DELTA is a parameter to allow the choice of nodes
-		      whose degree <= min-degree + DELTA. */
-	maxint = 2147483647; /* 2**31 - 1 */
-	invp = (int_t *) SUPERLU_MALLOC((n+delta)*sizeof(int_t));
-	if ( !invp ) ABORT("SUPERLU_MALLOC fails for invp.");
-	dhead = (int_t *) SUPERLU_MALLOC((n+delta)*sizeof(int_t));
-	if ( !dhead ) ABORT("SUPERLU_MALLOC fails for dhead.");
-	qsize = (int_t *) SUPERLU_MALLOC((n+delta)*sizeof(int_t));
-	if ( !qsize ) ABORT("SUPERLU_MALLOC fails for qsize.");
-	llist = (int_t *) SUPERLU_MALLOC(n*sizeof(int_t));
-	if ( !llist ) ABORT("SUPERLU_MALLOC fails for llist.");
-	marker = (int_t *) SUPERLU_MALLOC(n*sizeof(int_t));
-	if ( !marker ) ABORT("SUPERLU_MALLOC fails for marker.");
-	
-	/* Transform adjacency list into 1-based indexing required by GENMMD.*/
-	for (i = 0; i <= n; ++i) ++b_colptr[i];
-	for (i = 0; i < bnz; ++i) ++b_rowind[i];
-	
-	genmmd_dist_(&n, b_colptr, b_rowind, perm_c, invp, &delta, dhead, 
-		     qsize, llist, marker, &maxint, &nofsub);
-
-	/* Transform perm_c into 0-based indexing. */
-	for (i = 0; i < n; ++i) --perm_c[i];
-
-	SUPERLU_FREE(invp);
-	SUPERLU_FREE(dhead);
-	SUPERLU_FREE(qsize);
-	SUPERLU_FREE(llist);
-	SUPERLU_FREE(marker);
-	SUPERLU_FREE(b_rowind);
-
-	t = SuperLU_timer_() - t;
-	/*    printf("call GENMMD time = %8.3f\n", t);*/
-
-    } else { /* Empty adjacency structure */
-	for (i = 0; i < n; ++i) perm_c[i] = i;
-    }
-
-    SUPERLU_FREE(b_colptr);
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(pnum, "Exit get_perm_c_dist()");
-#endif
-} /* get_perm_c_dist */
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/get_perm_c_parmetis.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/get_perm_c_parmetis.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/get_perm_c_parmetis.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/get_perm_c_parmetis.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,898 +0,0 @@
-
-/*
- * -- Distributed symbolic factorization auxialiary routine  (version 2.1) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley - July 2003
- * INRIA France - January 2004
- * Laura Grigori
- *
- * November 1, 2007
- */
-
-/* limits.h:  the largest positive integer (INT_MAX) */
-#include <limits.h>
-#include <math.h>
-#include "superlu_ddefs.h"
-
-/*
- * Internal protypes
- */
-
-static float
-a_plus_at_CompRow_loc
-(int, int_t *, int, int_t *, int_t , int_t *, int_t *,  
- int, int_t *, int_t *, int_t **,  int_t **, gridinfo_t *);
-
-float
-get_perm_c_parmetis (SuperMatrix *A, int_t *perm_r, int_t *perm_c,
-		     int nprocs_i, int noDomains, 
-		     int_t **sizes, int_t **fstVtxSep,
-		     gridinfo_t *grid, MPI_Comm *metis_comm)
-/*
- * Purpose
- * =======
- *
- * GET_PERM_C_PARMETIS obtains a permutation matrix Pc, by applying a
- * graph partitioning algorithm to the symmetrized graph A+A'.  The
- * multilevel graph partitioning algorithm used is the
- * ParMETIS_V3_NodeND routine available in the parallel graph
- * partitioning package parMETIS.  
- *
- * The number of independent sub-domains noDomains computed by this
- * algorithm has to be a power of 2.  Hence noDomains is the larger
- * number power of 2 that is smaller than nprocs_i, where nprocs_i = nprow
- * * npcol is the number of processors used in SuperLU_DIST.
- *
- * Arguments
- * =========
- *
- * A       (input) SuperMatrix*
- *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
- *         of the linear equations is A->nrow.  Matrix A is distributed
- *         in NRformat_loc format.
- *
- * perm_r  (input) int_t*
- *         Row permutation vector of size A->nrow, which defines the 
- *         permutation matrix Pr; perm_r[i] = j means row i of A is in 
- *         position j in Pr*A.
- *
- * perm_c  (output) int_t*
- *	   Column permutation vector of size A->ncol, which defines the 
- *         permutation matrix Pc; perm_c[i] = j means column i of A is 
- *         in position j in A*Pc.
- *
- * nprocs_i (input) int*
- *         Number of processors the input matrix is distributed on in a block
- *         row format.  It corresponds to number of processors used in
- *         SuperLU_DIST.
- *
- * noDomains (input) int*, must be power of 2
- *         Number of independent domains to be computed by the graph
- *         partitioning algorithm.  ( noDomains <= nprocs_i )
- *
- * sizes   (output) int_t**, of size 2 * noDomains
- *         Returns pointer to an array containing the number of nodes
- *         for each sub-domain and each separator.  Separators are stored 
- *         from left to right.
- *         Memory for the array is allocated in this routine.
- *
- * fstVtxSep (output) int_t**, of size 2 * noDomains
- *         Returns pointer to an array containing first node for each
- *         sub-domain and each separator.
- *         Memory for the array is allocated in this routine.
- *
- * Return value
- * ============
- *   < 0, number of bytes allocated on return from the symbolic factorization.
- *   > 0, number of bytes allocated when out of memory.
- *
- */
-{
-  NRformat_loc *Astore;
-  int   iam, p;
-  int   *b_rowptr_int, *b_colind_int, *l_sizes_int, *dist_order_int, *vtxdist_o_int;
-  int   *options, numflag;
-  int_t m_loc, nnz_loc, fst_row;
-  int_t m, n, bnz, i, j;
-  int_t *rowptr, *colind, *l_fstVtxSep, *l_sizes;
-  int_t *b_rowptr, *b_colind;
-  int_t *dist_order;
-  int  *recvcnts, *displs;
-  /* first row index on each processor when the matrix is distributed
-     on nprocs (vtxdist_i) or noDomains processors (vtxdist_o) */
-  int_t  *vtxdist_i, *vtxdist_o; 
-  int_t szSep, k, noNodes;
-  float apat_mem_l; /* memory used during the computation of the graph of A+A' */
-  float mem;  /* Memory used during this routine */
-  MPI_Status status;
-
-  /* Initialization. */
-  MPI_Comm_rank (grid->comm, &iam);
-  n = A->ncol;
-  m = A->nrow;
-  if ( m != n ) ABORT("Matrix is not square");
-  mem = 0.;
-
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Enter get_perm_c_parmetis()");
-#endif
-
-  Astore = (NRformat_loc *) A->Store;
-  nnz_loc = Astore->nnz_loc; /* number of nonzeros in the local submatrix */
-  m_loc = Astore->m_loc;     /* number of rows local to this processor */
-  fst_row = Astore->fst_row; /* global index of the first row */
-  rowptr = Astore->rowptr;   /* pointer to rows and column indices */
-  colind = Astore->colind;
-  
-#if ( PRNTlevel>=1 )
-  if ( !iam ) printf(".. Use parMETIS ordering on A'+A with %d sub-domains.\n",
-		     noDomains);
-#endif
-
-  numflag = 0;
-  /* determine first row on each processor */
-  vtxdist_i = (int_t *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int_t));
-  if ( !vtxdist_i ) ABORT("SUPERLU_MALLOC fails for vtxdist_i.");
-  vtxdist_o = (int_t *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int_t));
-  if ( !vtxdist_o ) ABORT("SUPERLU_MALLOC fails for vtxdist_o.");
-
-  MPI_Allgather (&fst_row, 1, mpi_int_t, vtxdist_i, 1, mpi_int_t,
-		 grid->comm);
-  vtxdist_i[nprocs_i] = m;
-
-  if (noDomains == nprocs_i) {
-    /* keep the same distribution of A */
-    for (p = 0; p <= nprocs_i; p++)
-      vtxdist_o[p] = vtxdist_i[p];
-  }
-  else {
-    i = n / noDomains;
-    j = n % noDomains;
-    for (k = 0, p = 0; p < noDomains; p++) {
-      vtxdist_o[p] = k;
-      k += i;
-      if (p < j)  k++;
-    }
-    /* The remaining non-participating processors get the same 
-       first-row-number as the last processor.   */
-    for (p = noDomains; p <= nprocs_i; p++)
-      vtxdist_o[p] = k;
-  }
-
-#if ( DEBUGlevel>=2 )
-  if (!iam)
-    PrintInt10 ("vtxdist_o", nprocs_i + 1, vtxdist_o);
-#endif  
-
-  /* Compute distributed A + A' */
-  if ((apat_mem_l = 
-       a_plus_at_CompRow_loc(iam, perm_r, nprocs_i, vtxdist_i,
-			     n, rowptr, colind, noDomains, vtxdist_o,
-			     &bnz, &b_rowptr, &b_colind, grid)) > 0)
-    return (apat_mem_l);
-  mem += -apat_mem_l;
-  
-  /* Initialize and allocate storage for parMetis. */    
-  (*sizes) = (int_t *) SUPERLU_MALLOC(2 * noDomains * sizeof(int_t));
-  if (!(*sizes)) ABORT("SUPERLU_MALLOC fails for sizes.");
-  l_sizes = *sizes;
-  (*fstVtxSep) = (int_t *) SUPERLU_MALLOC(2 * noDomains * sizeof(int_t));
-  if (!(*fstVtxSep)) ABORT("SUPERLU_MALLOC fails for fstVtxSep.");
-  l_fstVtxSep = *fstVtxSep;
-  m_loc = vtxdist_o[iam+1] - vtxdist_o[iam];
-  
-  if ( iam < noDomains) 
-    /* dist_order_int is the perm returned by parMetis, distributed */
-    if (! (dist_order_int = (int *) SUPERLU_MALLOC(m_loc * sizeof(int))))
-      ABORT("SUPERLU_MALLOC fails for dist_order_int.");
-
-  /* ParMETIS represents the column pointers and row indices of *
-   * the input matrix using integers. When SuperLU_DIST uses    *
-   * long int for the int_t type, then several supplementary    *
-   * copies need to be performed in order to call ParMETIS.     */
-#if defined (_LONGINT)
-  l_sizes_int = (int *) SUPERLU_MALLOC(2 * noDomains * sizeof(int));
-  if (!(l_sizes_int)) ABORT("SUPERLU_MALLOC fails for l_sizes_int.");
-  
-  /* Allocate storage */
-  if ( !(b_rowptr_int = (int*) SUPERLU_MALLOC((m_loc+1) * sizeof(int))))
-    ABORT("SUPERLU_MALLOC fails for b_rowptr_int[]");
-  for (i = 0; i <= m_loc; i++)
-    b_rowptr_int[i] = b_rowptr[i];
-  SUPERLU_FREE (b_rowptr);
-  
-  if ( bnz ) {
-    if ( !(b_colind_int = (int *) SUPERLU_MALLOC( bnz * sizeof(int))))
-      ABORT("SUPERLU_MALLOC fails for b_colind_int[]");
-    for (i = 0; i < bnz; i++)
-      b_colind_int[i] = b_colind[i];
-    SUPERLU_FREE (b_colind);
-  }
-  
-  if ( !(vtxdist_o_int = 
-	 (int *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int))))
-    ABORT("SUPERLU_MALLOC fails for vtxdist_o_int.");
-  for (i = 0; i <= nprocs_i; i++)
-    vtxdist_o_int[i] = vtxdist_o[i];
-  SUPERLU_FREE (vtxdist_o);
-
-#else  /* Default */
-
-  vtxdist_o_int = vtxdist_o;
-  b_rowptr_int = b_rowptr; b_colind_int = b_colind;
-  l_sizes_int = l_sizes;
-
-#endif
-    
-  if ( iam < noDomains) {
-    options = (int *) SUPERLU_MALLOC(4 * sizeof(int));
-    options[0] = 0;
-    options[1] = 0;
-    options[2] = 0;
-    options[3] = 1;
-
-#if 0
-    ParMETIS_V3_NodeND(vtxdist_o_int, b_rowptr_int, b_colind_int, 
-		       &numflag, options,
-		       dist_order_int, l_sizes_int, metis_comm);
-#endif
-  }
-  
-  if (bnz) 
-    SUPERLU_FREE (b_colind_int);
-  if ( iam < noDomains) {
-    SUPERLU_FREE (options);
-  }
-  SUPERLU_FREE (b_rowptr_int);
-  
-#if defined (_LONGINT)
-  /* Copy data from dist_order_int to dist_order */
-  if ( iam < noDomains) {
-    /* dist_order is the perm returned by parMetis, distributed */
-    if (!(dist_order = (int_t *) SUPERLU_MALLOC(m_loc * sizeof(int_t))))
-      ABORT("SUPERLU_MALLOC fails for dist_order.");
-    for (i = 0; i < m_loc; i++)
-      dist_order[i] = dist_order_int[i];
-    SUPERLU_FREE(dist_order_int);
-    
-    for (i = 0; i < 2*noDomains; i++)
-      l_sizes[i] = l_sizes_int[i];
-    SUPERLU_FREE(l_sizes_int);
-  }
-#else 
-  dist_order = dist_order_int;
-#endif
-  
-  /* Allgatherv dist_order to get perm_c */
-  if (!(displs = (int *) SUPERLU_MALLOC (nprocs_i * sizeof(int))))
-    ABORT ("SUPERLU_MALLOC fails for displs.");
-  if ( !(recvcnts = (int *) SUPERLU_MALLOC (nprocs_i * sizeof(int))))
-    ABORT ("SUPERLU_MALLOC fails for recvcnts.");
-  for (i = 0; i < nprocs_i; i++)
-    recvcnts[i] = vtxdist_o_int[i+1] - vtxdist_o_int[i];
-  displs[0]=0;
-  for(i=1; i < nprocs_i; i++) 
-    displs[i] = displs[i-1] + recvcnts[i-1];
-  
-  MPI_Allgatherv (dist_order, m_loc, mpi_int_t, perm_c, recvcnts, displs, 
-		  mpi_int_t, grid->comm);
-
-  if ( iam < noDomains) {
-    SUPERLU_FREE (dist_order);
-  }
-  SUPERLU_FREE (vtxdist_i);
-  SUPERLU_FREE (vtxdist_o_int);
-  SUPERLU_FREE (recvcnts);
-  SUPERLU_FREE (displs);
-  
-  /* send l_sizes to every processor p >= noDomains */
-  if (!iam)
-    for (p = noDomains; p < nprocs_i; p++)
-      MPI_Send (l_sizes, 2*noDomains, mpi_int_t, p, 0, grid->comm);
-  if (noDomains <= iam && iam < nprocs_i)
-    MPI_Recv (l_sizes, 2*noDomains, mpi_int_t, 0, 0, grid->comm,
-	      &status);
-  
-  /* Determine the first node in each separator, store it in l_fstVtxSep */  
-  for (j = 0; j < 2 * noDomains; j++)
-    l_fstVtxSep[j] = 0;
-  l_fstVtxSep[2*noDomains - 2] = l_sizes[2*noDomains - 2];
-  szSep = noDomains;
-  i = 0;
-  while (szSep != 1) {
-    for (j = i; j < i + szSep; j++) {
-      l_fstVtxSep[j] += l_sizes[j]; 	      
-    }
-    for (j = i; j < i + szSep; j++) {
-      k = i + szSep + (j-i) / 2;
-      l_fstVtxSep[k] += l_fstVtxSep[j]; 
-    }
-    i += szSep;
-    szSep = szSep / 2;
-  }
-  
-  l_fstVtxSep[2 * noDomains - 2] -= l_sizes[2 * noDomains - 2];
-  i = 2 * noDomains - 2;
-  szSep = 1;
-  while (i > 0) {
-    for (j = i; j < i + szSep; j++) {
-      k = (i - 2 * szSep) + (j-i) * 2 + 1;
-      noNodes = l_fstVtxSep[k];
-      l_fstVtxSep[k] = l_fstVtxSep[j] - l_sizes[k];
-      l_fstVtxSep[k-1] = l_fstVtxSep[k] + l_sizes[k] - 
-	noNodes - l_sizes[k-1];
-    }
-    szSep *= 2;
-    i -= szSep;
-  }
-
-#if ( PRNTlevel>=2 )
-  if (!iam ) {
-    PrintInt10 ("Sizes of separators", 2 * noDomains-1, l_sizes);
-    PrintInt10 ("First Vertex Separator", 2 * noDomains-1, l_fstVtxSep);
-  }
-#endif
-
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Exit get_perm_c_parmetis()");
-#endif
-  
-  return (-mem);
-
-} /* get_perm_c_parmetis */
-
-
-static float
-a_plus_at_CompRow_loc
-(
- int   iam,         /* Input - my processor number */
- int_t *perm_r,     /* Input - row permutation vector Pr */
- int   nprocs_i,    /* Input - number of processors the input matrix
-		       is distributed on */
- int_t *vtxdist_i,  /* Input - index of first row on each processor of the input matrix */
- int_t n,           /* Input - number of columns in matrix A. */
- int_t *rowptr,     /* Input - row pointers of size m_loc+1 for matrix A. */
- int_t *colind,     /* Input - column indices of size nnz_loc for matrix A. */
- int   nprocs_o,    /* Input - number of processors the output matrix
-		       is distributed on */
- int_t *vtxdist_o,  /* Input - index of first row on each processor of the output matrix */
- int_t *p_bnz,      /* Output - on exit, returns the actual number of
-		       local nonzeros in matrix A'+A. */
- int_t **p_b_rowptr, /* Output - output matrix, row pointers of size m_loc+1 */
- int_t **p_b_colind, /* Output - output matrix, column indices of size *p_bnz */
- gridinfo_t *grid    /* Input - grid of processors information */
- )
-{
-/*
- * Purpose
- * =======
- *
- * Form the structure of Pr*A +A'Pr'. A is an n-by-n matrix in
- * NRformat_loc format, represented by (rowptr, colind). The output
- * B=Pr*A +A'Pr' is in NRformat_loc format (symmetrically, also row
- * oriented), represented by (b_rowptr, b_colind).
- *
- * The input matrix A is distributed in block row format on nprocs_i
- * processors.  The output matrix B is distributed in block row format
- * on nprocs_o processors, where nprocs_o <= nprocs_i.  On output, the
- * matrix B has its rows permuted according to perm_r.
- *
- * Sketch of the algorithm
- * =======================
- *
- * Let iam by my process number.  Let fst_row, lst_row = m_loc +
- * fst_row be the first/last row stored on iam.
- * 
- * Compute Pr' - the inverse row permutation, stored in iperm_r.
- *
- * Compute the transpose  of the block row of Pr*A that iam owns:
- *    T[:,Pr(fst_row:lst_row)] = Pr' * A[:,fst_row:lst_row] * Pr'
- *
- *
- * All to all communication such that every processor iam receives all
- * the blocks of the transpose matrix that it needs, that is
- *           T[fst_row:lst_row, :]
- *
- * Compute B = A[fst_row:lst_row, :] + T[fst_row:lst_row, :]
- *
- * If Pr != I or nprocs_i != nprocs_o then permute the rows of B (that
- * is compute Pr*B) and redistribute from nprocs_i to nprocs_o
- * according to the block row distribution in vtxdist_i, vtxdist_o.
- */
-  
-  int_t i, j, k, col, num_nz, nprocs;
-  int_t *tcolind_recv; /* temporary receive buffer */
-  int_t *tcolind_send; /* temporary send buffer */
-  int_t sz_tcolind_send, sz_tcolind_loc, sz_tcolind_recv;
-  int_t ind, ind_tmp, ind_rcv;
-  int redist_pra; /* TRUE if Pr != I or nprocs_i != nprocs_o */
-  int_t *marker, *iperm_r;
-  int_t *sendCnts, *recvCnts;
-  int_t *sdispls, *rdispls;
-  int_t bnz, *b_rowptr, *b_colind, bnz_t, *b_rowptr_t, *b_colind_t;
-  int_t p, t_ind, nelts, ipcol;
-  int_t m_loc, m_loc_o;      /* number of local rows */ 
-  int_t fst_row, fst_row_o;  /* index of first local row */
-  int_t nnz_loc;    /* number of local nonzeros in matrix A */
-  float apat_mem, apat_mem_max;
-  int   *intBuf1, *intBuf2, *intBuf3, *intBuf4;  
-
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Enter a_plus_at_CompRow_loc()");
-#endif
-  
-  fst_row    = vtxdist_i[iam];
-  m_loc      = vtxdist_i[iam+1] - vtxdist_i[iam];
-  nnz_loc    = rowptr[m_loc];
-  redist_pra = FALSE;  
-  nprocs     = SUPERLU_MAX(nprocs_i, nprocs_o);
-  apat_mem_max = 0.;
-  
-  if (!(marker = (int_t*) SUPERLU_MALLOC( (n+1) * sizeof(int_t))))
-    ABORT("SUPERLU_MALLOC fails for marker[]");
-  if (!(iperm_r = (int_t*) SUPERLU_MALLOC( n * sizeof(int_t))))
-    ABORT("SUPERLU_MALLOC fails for iperm_r[]");
-  if (!(sendCnts = (int_t*) SUPERLU_MALLOC(nprocs * sizeof(int_t))))
-    ABORT("SUPERLU_MALLOC fails for sendCnts[]");
-  if (!(recvCnts = (int_t*) SUPERLU_MALLOC(nprocs * sizeof(int_t))))
-    ABORT("SUPERLU_MALLOC fails for recvCnts[]");
-  if (!(sdispls = (int_t*) SUPERLU_MALLOC((nprocs+1) * sizeof(int_t))))
-    ABORT("SUPERLU_MALLOC fails for sdispls[]");
-  if (!(rdispls = (int_t*) SUPERLU_MALLOC((nprocs+1) * sizeof(int_t))))
-    ABORT("SUPERLU_MALLOC fails for rdispls[]");
-  apat_mem = 2 * n + 4 * nprocs + 3;
-
-#if defined (_LONGINT)
-  intBuf1 = (int *) SUPERLU_MALLOC(4 * nprocs * sizeof(int));
-  intBuf2 = intBuf1 + nprocs;
-  intBuf3 = intBuf1 + 2 * nprocs;
-  intBuf4 = intBuf1 + 3 * nprocs;
-  apat_mem += 4*nprocs*sizeof(int) / sizeof(int_t);
-#endif  
-
-  /* compute the inverse row permutation vector */
-  for (i = 0; i < n; i++) {
-    marker[i] = 1;
-    if (perm_r[i] != i)
-      redist_pra = TRUE;
-    iperm_r[perm_r[i]] = i;
-  }
-
-  /* TRANSPOSE LOCAL ROWS ON MY PROCESSOR iam.         */
-  /* THE RESULT IS STORED IN TCOLIND_SEND.             */
-  /* THIS COUNTS FOR TWO PASSES OF THE LOCAL MATRIX.   */
-
-  /* First pass to get counts of each row of T, and set up column pointers */
-  for (j = 0; j < m_loc; j++) {
-    for (i = rowptr[j]; i < rowptr[j+1]; i++){
-      marker[iperm_r[colind[i]]]++;
-    }
-  }
-  /* determine number of elements to be sent to each processor */
-  for (p = 0; p < nprocs_i; p++) {
-    sendCnts[p] = 0;
-    for (i = vtxdist_i[p]; i < vtxdist_i[p+1]; i++) 
-      sendCnts[p] += marker[i];
-  }
-  /* exchange send/receive counts information in between all processors */
-  MPI_Alltoall (sendCnts, 1, mpi_int_t,
-		recvCnts, 1, mpi_int_t, grid->comm);
-  sendCnts[iam] = 0;
-  sz_tcolind_loc = recvCnts[iam];
-  
-  for (i = 0, j = 0, p = 0; p < nprocs_i; p++) {
-    rdispls[p] = j;
-    j += recvCnts[p];
-    sdispls[p] = i;  
-    i += sendCnts[p];
-  }
-  recvCnts[iam] = 0;
-  sz_tcolind_recv = j;
-  sz_tcolind_send = i;
-  
-  /* allocate memory to receive necessary blocks of transpose matrix T */
-  if (sz_tcolind_recv) {
-    if ( !(tcolind_recv = (int_t*) SUPERLU_MALLOC( sz_tcolind_recv 
-						   * sizeof(int_t) )))
-      ABORT("SUPERLU_MALLOC fails tcolind_recv[]");
-    apat_mem += sz_tcolind_recv;
-  }
-  /* allocate memory to send blocks of local transpose matrix T to other processors */
-  if (sz_tcolind_send) {
-    if (!(tcolind_send = (int_t*) SUPERLU_MALLOC( (sz_tcolind_send) 
-						  * sizeof(int_t))))
-      ABORT("SUPERLU_MALLOC fails for tcolind_send[]");
-    apat_mem += sz_tcolind_send;
-  }
-
-  /* Set up marker[] to point at the beginning of each row in the
-     send/receive buffer.  For each row, we store first its number of
-     elements, and then the elements. */  
-  ind_rcv = rdispls[iam];
-  for (p = 0; p < nprocs_i; p++) {
-    for (i = vtxdist_i[p]; i < vtxdist_i[p+1]; i++) {
-      nelts = marker[i] - 1;
-      if (p == iam) {
-	tcolind_recv[ind_rcv] = nelts;
-	marker[i] = ind_rcv + 1;
-	ind_rcv += nelts + 1;
-      }
-      else {
-	tcolind_send[sdispls[p]] = nelts;
-	marker[i] = sdispls[p] + 1;
-	sdispls[p] += nelts + 1;
-      }
-    }
-  }
-  /* reset sdispls vector */
-  for (i = 0, p = 0; p < nprocs_i; p++) {
-    sdispls[p] = i;  
-    i += sendCnts[p];
-  }
-  /* Second pass of the local matrix A to copy data to be send */
-  for (j = 0; j < m_loc; j++)
-    for (i = rowptr[j]; i < rowptr[j+1]; i++) {
-      col = colind[i];
-      ipcol = iperm_r[col];      
-      if (ipcol >= fst_row && ipcol < fst_row + m_loc)  /* local data */
-	tcolind_recv[marker[ipcol]] = perm_r[j + fst_row];      
-      else /* remote */ 
-	tcolind_send[marker[ipcol]] = perm_r[j + fst_row];
-      marker[ipcol] ++;
-    }
-  sendCnts[iam] = 0;
-  recvCnts[iam] = 0;
-
-#if defined (_LONGINT)
-  for (p=0; p<nprocs; p++) {
-    if (sendCnts[p] > INT_MAX || sdispls[p] > INT_MAX ||
-	recvCnts[p] > INT_MAX || rdispls[p] > INT_MAX)
-      ABORT("ERROR in dist_symbLU size to send > INT_MAX\n");
-    intBuf1[p] = (int) sendCnts[p];
-    intBuf2[p] = (int) sdispls[p];
-    intBuf3[p] = (int) recvCnts[p];
-    intBuf4[p] = (int) rdispls[p];
-  }
-#else  /* Default */
-  intBuf1 = sendCnts;  intBuf2 = sdispls;
-  intBuf3 = recvCnts;  intBuf4 = rdispls;
-#endif
-  
-  /* send/receive transpose matrix T */
-  MPI_Alltoallv (tcolind_send, intBuf1, intBuf2, mpi_int_t,
-		 tcolind_recv, intBuf3, intBuf4, mpi_int_t,
-		 grid->comm);
-  /* ------------------------------------------------------------
-     DEALLOCATE SEND COMMUNICATION STORAGE
-     ------------------------------------------------------------*/
-  if (sz_tcolind_send) {
-    SUPERLU_FREE( tcolind_send );
-    apat_mem_max = apat_mem;
-    apat_mem -= sz_tcolind_send;
-  }
-
-  /* ----------------------------------------------------------------
-     FOR LOCAL ROWS:
-       compute B = A + T, where row j of B is:
-       Struct (B(j,:)) = Struct (A(j,:)) UNION Struct (T(j,:))
-       do not include the diagonal entry
-     THIS COUNTS FOR TWO PASSES OF THE LOCAL ROWS OF A AND T.   
-     ------------------------------------------------------------------ */
-  
-  /* Reset marker to EMPTY */
-  for (i = 0; i < n; ++i) marker[i] = EMPTY;
-  /* save rdispls information */
-  for (p = 0; p < nprocs_i; p++)
-    sdispls[p] = rdispls[p];
-
-  /* First pass determines number of nonzeros in B */
-  num_nz = 0;
-  for (j = 0; j < m_loc; j++) {
-    /* Flag the diagonal so it's not included in the B matrix */
-    marker[perm_r[j + fst_row]] = j;
-    
-    /* Add pattern of row A(j,:) to B(j,:) */
-    for (i = rowptr[j]; i < rowptr[j+1]; i++) {
-      k = colind[i];
-      if ( marker[k] != j ) {
-	marker[k] = j;
-	++num_nz;
-      }
-    }
-    
-    /* Add pattern of row T(j,:) to B(j,:) */
-    for (p = 0; p < nprocs_i; p++) {
-      t_ind = rdispls[p];
-      nelts = tcolind_recv[t_ind]; t_ind ++;
-      for (i = t_ind; i < t_ind + nelts; i++) {
-	k = tcolind_recv[i];
-	if ( marker[k] != j ) {
-	  marker[k] = j;
-	  ++num_nz;
-	}
-      }
-      t_ind += nelts;
-      rdispls[p] = t_ind;
-    }
-  }
-  bnz_t = num_nz;
-
-  /* Allocate storage for B=Pr*A+A'*Pr' */
-  if ( !(b_rowptr_t = (int_t*) SUPERLU_MALLOC((m_loc+1) * sizeof(int_t))))
-    ABORT("SUPERLU_MALLOC fails for b_rowptr_t[]");
-  if ( bnz_t ) {
-    if ( !(b_colind_t = (int_t*) SUPERLU_MALLOC( bnz_t * sizeof(int_t))))
-      ABORT("SUPERLU_MALLOC fails for b_colind_t[]");
-  }
-  apat_mem += m_loc + 1 + bnz_t;
-  if (apat_mem > apat_mem_max)
-    apat_mem_max = apat_mem;
-  
-  /* Reset marker to EMPTY */
-  for (i = 0; i < n; i++) marker[i] = EMPTY;
-  /* restore rdispls information */
-  for (p = 0; p < nprocs_i; p++)
-    rdispls[p] = sdispls[p];
-  
-  /* Second pass, compute each row of B, one at a time */
-  num_nz = 0;
-  t_ind = 0;
-  for (j = 0; j < m_loc; j++) {
-    b_rowptr_t[j] = num_nz;
-    
-    /* Flag the diagonal so it's not included in the B matrix */
-    marker[perm_r[j + fst_row]] = j;
-
-    /* Add pattern of row A(j,:) to B(j,:) */
-    for (i = rowptr[j]; i < rowptr[j+1]; i++) {
-      k = colind[i];
-      if ( marker[k] != j ) {
-	marker[k] = j;
-	b_colind_t[num_nz] = k; num_nz ++;
-      }
-    }
-
-    /* Add pattern of row T(j,:) to B(j,:) */
-    for (p = 0; p < nprocs_i; p++) {
-      t_ind = rdispls[p];
-      nelts = tcolind_recv[t_ind]; t_ind++;
-      for (i = t_ind; i < t_ind + nelts; i++) {
-	k = tcolind_recv[i];
-	if ( marker[k] != j ) {
-	  marker[k] = j;
-	  b_colind_t[num_nz] = k; num_nz++;
-	}
-      }
-      t_ind += nelts;
-      rdispls[p] = t_ind;
-    }
-  }
-  b_rowptr_t[m_loc] = num_nz;
-
-  for (p = 0; p <= SUPERLU_MIN(nprocs_i, nprocs_o); p++) 
-    if (vtxdist_i[p] != vtxdist_o[p])
-      redist_pra = TRUE;
-  
-  if (sz_tcolind_recv) {
-    SUPERLU_FREE (tcolind_recv);
-    apat_mem -= sz_tcolind_recv;
-  }
-  SUPERLU_FREE (marker);
-  SUPERLU_FREE (iperm_r);
-  apat_mem -= 2 * n + 1;
-  
-  /* redistribute permuted matrix (by rows) from nproc_i processors
-     to nproc_o processors */
-  if (redist_pra) {
-    m_loc_o = vtxdist_o[iam+1] - vtxdist_o[iam];
-    fst_row_o = vtxdist_o[iam];
-    nnz_loc = 0;
-    
-    if ( !(b_rowptr = intMalloc_dist(m_loc_o + 1)) )
-      ABORT("Malloc fails for *b_rowptr[].");
-    apat_mem += m_loc_o + 1;
-    if (apat_mem > apat_mem_max)
-      apat_mem_max = apat_mem;
-
-    for (p = 0; p < nprocs_i; p++) {
-      sendCnts[p] = 0;
-      recvCnts[p] = 0;
-    }
-
-    for (i = 0; i < m_loc; i++) {
-      k = perm_r[i+fst_row];
-      /* find the processor to which row k belongs */
-      j = FALSE; p = 0;
-      while (!j) {
-	if (vtxdist_o[p] <= k && k < vtxdist_o[p+1])
-	  j = TRUE;
-	else 
-	  p ++;
-      }
-      if (p == iam) {
-	b_rowptr[k-fst_row_o] = b_rowptr_t[i + 1] - b_rowptr_t[i];
-	nnz_loc += b_rowptr[k-fst_row_o];
-      }
-      else
-	sendCnts[p] += b_rowptr_t[i + 1] - b_rowptr_t[i] + 2;
-    }
-    /* exchange send/receive counts information in between all processors */
-    MPI_Alltoall (sendCnts, 1, mpi_int_t,
-		  recvCnts, 1, mpi_int_t, grid->comm);
-    
-    for (i = 0, j = 0, p = 0; p < nprocs_i; p++) {
-      rdispls[p] = j;
-      j += recvCnts[p];
-      sdispls[p] = i;  
-      i += sendCnts[p];
-    }
-    rdispls[p] = j;
-    sdispls[p] = i;
-    sz_tcolind_recv = j;
-    sz_tcolind_send = i;
-
-    /* allocate memory for local data */
-    tcolind_recv = NULL;
-    tcolind_send = NULL;
-    if (sz_tcolind_recv) {
-      if ( !(tcolind_recv = (int_t*) SUPERLU_MALLOC( sz_tcolind_recv 
-						     * sizeof(int_t) )))
-	ABORT("SUPERLU_MALLOC fails tcolind_recv[]");
-      apat_mem += sz_tcolind_recv;
-    }
-    /* allocate memory to receive necessary data */
-    if (sz_tcolind_send) {
-      if (!(tcolind_send = (int_t*) SUPERLU_MALLOC( (sz_tcolind_send) 
-						    * sizeof(int_t))))
-	ABORT("SUPERLU_MALLOC fails for tcolind_send[]");
-      apat_mem += sz_tcolind_send;
-    }
-    if (apat_mem > apat_mem_max)
-      apat_mem_max = apat_mem;
-
-    /* Copy data to be send */
-    ind_rcv = rdispls[iam];
-    for (i = 0; i < m_loc; i++) {
-      k = perm_r[i+fst_row];
-      /* find the processor to which row k belongs */
-      j = FALSE; p = 0;
-      while (!j) {
-	if (vtxdist_o[p] <= k && k < vtxdist_o[p+1])
-	  j = TRUE;
-	else 
-	  p ++;
-      }
-      if (p != iam) { /* remote */ 
-	tcolind_send[sdispls[p]] = k;
-	tcolind_send[sdispls[p]+1] = b_rowptr_t[i+1] - b_rowptr_t[i];
-	sdispls[p] += 2;
-	for (j = b_rowptr_t[i]; j < b_rowptr_t[i+1]; j++) {
-	  tcolind_send[sdispls[p]] = b_colind_t[j]; sdispls[p] ++;
-	}
-      }
-    }
-  
-    /* reset sdispls vector */
-    for (i = 0, p = 0; p < nprocs_i; p++) {
-      sdispls[p] = i;  
-      i += sendCnts[p];
-    }
-    sendCnts[iam] = 0;
-    recvCnts[iam] = 0;
-    
-#if defined (_LONGINT)
-    for (p=0; p<nprocs; p++) {
-      if (sendCnts[p] > INT_MAX || sdispls[p] > INT_MAX ||
-	  recvCnts[p] > INT_MAX || rdispls[p] > INT_MAX)
-	ABORT("ERROR in dist_symbLU size to send > INT_MAX\n");
-      intBuf1[p] = (int) sendCnts[p];
-      intBuf2[p] = (int) sdispls[p];
-      intBuf3[p] = (int) recvCnts[p];
-      intBuf4[p] = (int) rdispls[p];
-    }
-#else  /* Default */
-    intBuf1 = sendCnts;  intBuf2 = sdispls;
-    intBuf3 = recvCnts;  intBuf4 = rdispls;
-#endif
-
-    /* send/receive permuted matrix T by rows */
-    MPI_Alltoallv (tcolind_send, intBuf1, intBuf2, mpi_int_t,
-		   tcolind_recv, intBuf3, intBuf4, mpi_int_t,
-		   grid->comm);
-    /* ------------------------------------------------------------
-       DEALLOCATE COMMUNICATION STORAGE
-       ------------------------------------------------------------*/
-    if (sz_tcolind_send) {
-      SUPERLU_FREE( tcolind_send );
-      apat_mem -= sz_tcolind_send;
-    }
-    
-    /* ------------------------------------------------------------
-       STORE ROWS IN ASCENDING ORDER OF THEIR NUMBER
-       ------------------------------------------------------------*/
-    for (p = 0; p < nprocs; p++) {
-      if (p != iam) {
-	i = rdispls[p];
-	while (i < rdispls[p+1]) {
-	  j = tcolind_recv[i];
-	  nelts = tcolind_recv[i+1];
-	  i += 2 + nelts;
-	  b_rowptr[j-fst_row_o] = nelts;
-	  nnz_loc += nelts;
-	}
-      }
-    }
-
-    if (nnz_loc)
-      if ( !(b_colind = intMalloc_dist(nnz_loc)) ) {
-	ABORT("Malloc fails for bcolind[].");
-	apat_mem += nnz_loc;
-	if (apat_mem > apat_mem_max)
-	  apat_mem_max = apat_mem;
-      }
-
-    /* Initialize the array of row pointers */
-    k = 0;
-    for (j = 0; j < m_loc_o; j++) {
-      i = b_rowptr[j];
-      b_rowptr[j] = k;
-      k += i;
-    }
-    if (m_loc_o) b_rowptr[j] = k;
-    
-    /* Copy the data into the row oriented storage */
-    for (p = 0; p < nprocs; p++) {
-      if (p != iam) {
-	i = rdispls[p];
-	while (i < rdispls[p+1]) {
-	  j = tcolind_recv[i];
-	  nelts = tcolind_recv[i+1];
-	  for (i += 2, k = b_rowptr[j-fst_row_o]; 
-	       k < b_rowptr[j-fst_row_o+1]; i++, k++) 
-	    b_colind[k] = tcolind_recv[i];
-	}
-      }
-    }
-    for (i = 0; i < m_loc; i++) {
-      k = perm_r[i+fst_row];
-      if (k >= vtxdist_o[iam] && k < vtxdist_o[iam+1]) {
-	ind = b_rowptr[k-fst_row_o];
-	for (j = b_rowptr_t[i]; j < b_rowptr_t[i+1]; j++, ind++)
-	  b_colind[ind] = b_colind_t[j];
-      }
-    }
-    
-    SUPERLU_FREE(b_rowptr_t);
-    if ( bnz_t )
-      SUPERLU_FREE(b_colind_t);
-    if (sz_tcolind_recv)
-      SUPERLU_FREE(tcolind_recv);
-    apat_mem -= bnz_t + m_loc + sz_tcolind_recv;
-    
-    *p_bnz = nnz_loc;
-    *p_b_rowptr = b_rowptr;
-    *p_b_colind = b_colind;
-  }
-  else {
-    *p_bnz = bnz_t;
-    *p_b_rowptr = b_rowptr_t;
-    *p_b_colind = b_colind_t;
-  }
-  
-  SUPERLU_FREE (rdispls);
-  SUPERLU_FREE (sdispls);
-  SUPERLU_FREE (sendCnts);
-  SUPERLU_FREE (recvCnts);
-  apat_mem -= 4 * nprocs + 2;
-#if defined (_LONGINT)
-  SUPERLU_FREE (intBuf1);
-  apat_mem -= 4*nprocs*sizeof(int) / sizeof(int_t);
-#endif
-  
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Exit a_plus_at_CompRow_loc()");
-#endif
-  
-  return (- apat_mem_max * sizeof(int_t));
-} /* a_plus_at_CompRow_loc */
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/hypre_interface.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/hypre_interface.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/hypre_interface.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/hypre_interface.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,6 +0,0 @@
-#include "fortran.h"
-double hypre_F90_NAME(hy_dlamch,HY_DLAMCH)(char* in)
-{
-   return hypre_F90_NAME_LAPACK(dlamch,DLAMCH)(in);
-} 
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/lsame.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/lsame.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/lsame.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/lsame.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,72 +0,0 @@
-#include "Cnames.h"
-
-int lsame_(char *ca, char *cb)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       September 30, 1994   
-
-    Purpose   
-    =======   
-
-    LSAME returns .TRUE. if CA is the same letter as CB regardless of case.   
-
-    Arguments   
-    =========   
-
-    CA      (input) CHARACTER*1   
-    CB      (input) CHARACTER*1   
-            CA and CB specify the single characters to be compared.   
-
-   ===================================================================== 
-*/  
-
-    /* System generated locals */
-    int ret_val;
-    
-    /* Local variables */
-    int inta, intb, zcode;
-
-    ret_val = *(unsigned char *)ca == *(unsigned char *)cb;
-    if (ret_val) {
-	return ret_val;
-    }
-
-    /* Now test for equivalence if both characters are alphabetic. */
-
-    zcode = 'Z';
-
-    /* Use 'Z' rather than 'A' so that ASCII can be detected on Prime   
-       machines, on which ICHAR returns a value with bit 8 set.   
-       ICHAR('A') on Prime machines returns 193 which is the same as   
-       ICHAR('A') on an EBCDIC machine. */
-
-    inta = *(unsigned char *)ca;
-    intb = *(unsigned char *)cb;
-
-    if (zcode == 90 || zcode == 122) {
-	/* ASCII is assumed - ZCODE is the ASCII code of either lower or   
-          upper case 'Z'. */
-	if (inta >= 97 && inta <= 122) inta += -32;
-	if (intb >= 97 && intb <= 122) intb += -32;
-
-    } else if (zcode == 233 || zcode == 169) {
-	/* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or   
-          upper case 'Z'. */
-	if (inta >= 129 && inta <= 137 || inta >= 145 && inta <= 153 || inta 
-		>= 162 && inta <= 169)
-	    inta += 64;
-	if (intb >= 129 && intb <= 137 || intb >= 145 && intb <= 153 || intb 
-		>= 162 && intb <= 169)
-	    intb += 64;
-    } else if (zcode == 218 || zcode == 250) {
-	/* ASCII is assumed, on Prime machines - ZCODE is the ASCII code   
-          plus 128 of either lower or upper case 'Z'. */
-	if (inta >= 225 && inta <= 250) inta += -32;
-	if (intb >= 225 && intb <= 250) intb += -32;
-    }
-    ret_val = inta == intb;
-    return ret_val;
-    
-} /* lsame_ */
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/machines.h hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/machines.h
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/machines.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/machines.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,49 +0,0 @@
-/*
- * -- SuperLU MT routine (version 1.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * August 15, 1997
- *
- * These macros define which machine will be used.
- */
-
-#ifndef __SUPERLU_MACHINES /* allow multiple inclusions */
-#define __SUPERLU_MACHINES
-
-#define SGI	        0
-#define ORIGIN	        1
-#define DEC	        2
-#define CRAY_T3E	3
-#define SUN             4
-#define PTHREAD         5
-#define IBM             6
-
-#ifdef _SGI
-#define MACH SGI 
-#endif
-
-#ifdef _ORIGIN
-#define MACH ORIGIN 
-#endif
-
-#ifdef _DEC
-#define MACH DEC 
-#endif
-
-#ifdef _CRAY
-#define MACH CRAY_T3E 
-#endif
-
-#ifdef _SOLARIS
-#define MACH SUN 
-#endif
-
-#ifdef _PTHREAD
-#define MACH PTHREAD
-#endif
-
-#if ( defined(_SP2) || defined(_SP) )
-#define MACH IBM
-#endif
-
-#endif /* __SUPERLU_MACHINES */
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/Makefile hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/Makefile
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/Makefile	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/Makefile	1970-01-01 00:00:00.000000000 +0000
@@ -1,191 +0,0 @@
-#######################################################################
-#
-#  This makefile creates a library for distributed SuperLU.
-#  The files are organized as follows:
-#
-#       ALLAUX  -- Auxiliary routines called from all precisions
-#       DSLUSRC -- Double precision real serial SuperLU routines
-#       DPLUSRC -- Double precision real parallel SuperLU routines
-#       ZSLUSRC -- Double precision complex serial SuperLU routines
-#       ZPLUSRC -- Double precision complex parallel SuperLU routines
-#
-#  The library can be set up to include routines for any combination
-#  of the two precisions.  To create or add to the library, enter make
-#  followed by one or more of the precisions desired.  Some examples:
-#       make double
-#       make double complex16
-#  Alternatively, the command
-#       make
-#  without any arguments creates a library of all two precisions.
-#  The library is called
-#       superlu.a
-#  and is created at the next higher directory level.
-#
-#  To remove the object files after the library is created, enter
-#       make clean
-#
-#######################################################################
-#include ../make.inc
-#
-include ../../../config/Makefile.config
-CINCLUDES = ${INCLUDES} ${MPIINCLUDE}
-C_COMPILE_FLAGS = \
- -I$(srcdir)\
- -I$(srcdir)/../../../\
- -I$(srcdir)/../../../utilities\
- ${CINCLUDES}
-
-# Precision independent routines
-#
-ALLAUX 	= sp_ienv.o etree.o sp_colorder.o get_perm_c.o \
-	  mmd.o comm.o memory.o util.o superlu_grid.o \
-	  pxerbla.o superlu_timer.o GetDiagU.o mc64ad.o mc21.o symbfact.o \
-	  psymbfact.o psymbfact_util.o \
-	  get_perm_c_parmetis.o
-
-#### LAPACK auxiliary routines
-LAAUX	= lsame.o xerbla.o slamch.o dlamch.o
-
-#
-# Routines to permute large entries to diagonal
-#
-# DLDPERM = dldperm.o mc64ad.o mc21.o #fd05.o
-
-#
-# Routines literally taken from SuperLU
-#
-DSLUSRC	= dlangs.o dgsequ.o dlaqgs.o dutil.o \
-	  dmemory.o dmyblas2.o dsp_blas2.o dsp_blas3.o
-ZSLUSRC	= dcomplex.o zlangs.o zgsequ.o zlaqgs.o zutil.o \
-	  zmemory.o zmyblas2.o dmemory.o zsp_blas2.o zsp_blas3.o
-
-#
-# Routines for double precision parallel SuperLU
-DPLUSRC = dldperm.o ddistribute.o pdgstrf_irecv.o pdgstrs_Bglobal.o \
-	  pdgstrs1.o pdgssvx_ABglobal.o pdgsrfs_ABXglobal.o pdgsmv_AXglobal.o\
-	  pdgssvx.o pdgstrs.o pddistribute.o pdlangs.o pdutil.o \
-	  pdgsequ.o pdlaqgs.o pdgsrfs.o pdgsmv.o pdgstrs_lsum.o \
-	  pdsymbfact_distdata.o
-
-#
-# Routines for double complex parallel SuperLU
-ZPLUSRC = zldperm.o zdistribute.o pzgssvx_ABglobal.o pzgstrf_irecv.o \
-	  pzgstrs1.o pzgstrs_Bglobal.o pzgsrfs_ABXglobal.o pzgsmv_AXglobal.o \
-	  pzgssvx.o pzgstrs.o pzdistribute.o pzlangs.o pzutil.o \
-	  pzgsequ.o pzlaqgs.o pzgsrfs.o pzgsmv.o pzgstrs_lsum.o \
-	  pzsymbfact_distdata.o
-
-#######################################
-### changed for hypre
-#######################################
-
-FILES =\
- dldperm.c \
- ddistribute.c \
- pdgstrf_irecv.c \
- pdgstrs_Bglobal.c \
- pdgstrs1.c \
- pdgssvx_ABglobal.c \
- pdgsrfs_ABXglobal.c \
- pdgsmv_AXglobal.c \
- pdgssvx.c \
- pdgstrs.c \
- pddistribute.c \
- pdlangs.c \
- pdutil.c \
- pdgsequ.c \
- pdlaqgs.c \
- pdgsrfs.c \
- pdgsmv.c \
- pdgstrs_lsum.c \
- pdsymbfact_distdata.c \
- get_perm_c_parmetis.o \
- sp_ienv.c \
- etree.c \
- sp_colorder.c \
- get_perm_c.c \
- mmd.c \
- comm.c \
- memory.c \
- util.c \
- superlu_grid.c \
- pxerbla.c \
- superlu_timer.c \
- GetDiagU.c \
- symbfact.c \
- psymbfact.c \
- psymbfact_util.c \
- lsame.c \
- xerbla.c \
- dlangs.c \
- dgsequ.c \
- dlaqgs.c \
- dutil.c \
- dmemory.c \
- slamch.c \
- dmyblas2.c \
- dsp_blas2.c \
- dsp_blas3.c \
- hypre_interface.c
-
-FILES2 =\
- mc64ad.f \
- mc21.f
-
-OBJS = ${FILES:.c=.o} ${FILES2:.f=.o}
-
-all: libHYPRE_dsuperlu${HYPRE_LIB_SUFFIX}
-	cp -fp *.h $(HYPRE_BUILD_DIR)/include
-#	cp -fp libHYPRE* $(HYPRE_BUILD_DIR)/lib
-
-install: libHYPRE_dsuperlu${HYPRE_LIB_SUFFIX}
-	cp -f *.h $(HYPRE_INC_INSTALL)
-#	cp -f libHYPRE* $(HYPRE_LIB_INSTALL)
-	@echo  " "
-
-clean:
-	rm -f *.o libHYPRE*
-	rm -rf pchdir tca.map *inslog*
-distclean: clean
-
-libHYPRE_dsuperlu.a: ${OBJS}
-	@echo  "Building $@ ... "
-	${AR} $@ ${OBJS}
-	${RANLIB} $@
-
-libHYPRE_dsuperlu.so: ${OBJS}
-	@echo  "Building $@ ... "
-	${BUILD_CC_SHARED} -o $@ ${OBJS}
-
-#######################################
-
-#all:  double complex16
-
-double: $(DSLUSRC) $(DPLUSRC) $(ALLAUX) $(LAAUX)
-	$(ARCH) $(ARCHFLAGS) $(DSUPERLULIB) \
-		$(DSLUSRC) $(DPLUSRC) $(ALLAUX) $(LAAUX)
-	$(RANLIB) $(DSUPERLULIB)
-
-complex16: $(ZSLUSRC) $(ZPLUSRC) $(ALLAUX) $(LAAUX)
-	$(ARCH) $(ARCHFLAGS) $(DSUPERLULIB) \
-		$(ZSLUSRC) $(ZPLUSRC) $(ALLAUX) $(LAAUX)
-	$(RANLIB) $(DSUPERLULIB)
-
-
-##################################
-# Do not optimize these routines #
-##################################
-slamch.o: slamch.c ; $(CC) $(CFLAGS) $(NOOPTS) $(CDEFS) -c $<
-dlamch.o: dlamch.c ; $(CC) $(CFLAGS) $(NOOPTS) $(CDEFS) -c $<
-##################################
-
-#.c.o:
-#	$(CC) $(CFLAGS) $(CDEFS) -DUSE_VENDOR_BLAS -c $< $(VERBOSE)
-
-#.f.o:
-#	f77 $(FFLAGS) -c $< $(VERBOSE)
-
-#clean:	
-#	rm -f *.o $(DSUPERLULIB)
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/Makefile.dsuperlu hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/Makefile.dsuperlu
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/Makefile.dsuperlu	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/Makefile.dsuperlu	1970-01-01 00:00:00.000000000 +0000
@@ -1,100 +0,0 @@
-#######################################################################
-#
-#  This makefile creates a library for distributed SuperLU.
-#  The files are organized as follows:
-#
-#       ALLAUX  -- Auxiliary routines called from all precisions
-#       DSLUSRC -- Double precision real serial SuperLU routines
-#       DPLUSRC -- Double precision real parallel SuperLU routines
-#       ZSLUSRC -- Double precision complex serial SuperLU routines
-#       ZPLUSRC -- Double precision complex parallel SuperLU routines
-#
-#  The library can be set up to include routines for any combination
-#  of the two precisions.  To create or add to the library, enter make
-#  followed by one or more of the precisions desired.  Some examples:
-#       make double
-#       make double complex16
-#  Alternatively, the command
-#       make
-#  without any arguments creates a library of all two precisions.
-#  The library is called
-#       superlu.a
-#  and is created at the next higher directory level.
-#
-#  To remove the object files after the library is created, enter
-#       make clean
-#
-#######################################################################
-include ../make.inc
-#
-# Precision independent routines
-#
-ALLAUX 	= sp_ienv.o etree.o sp_colorder.o get_perm_c.o \
-	  mmd.o comm.o memory.o util.o superlu_grid.o \
-	  pxerbla.o superlu_timer.o GetDiagU.o mc64ad.o mc21.o symbfact.o \
-	  psymbfact.o psymbfact_util.o \
-	  get_perm_c_parmetis.o
-
-#### LAPACK auxiliary routines
-LAAUX	= lsame.o xerbla.o slamch.o dlamch.o
-
-#
-# Routines to permute large entries to diagonal
-#
-# DLDPERM = dldperm.o mc64ad.o mc21.o #fd05.o
-
-#
-# Routines literally taken from SuperLU
-#
-DSLUSRC	= dlangs.o dgsequ.o dlaqgs.o dutil.o \
-	  dmemory.o dmyblas2.o dsp_blas2.o dsp_blas3.o
-ZSLUSRC	= dcomplex.o zlangs.o zgsequ.o zlaqgs.o zutil.o \
-	  zmemory.o zmyblas2.o dmemory.o zsp_blas2.o zsp_blas3.o
-
-#
-# Routines for double precision parallel SuperLU
-DPLUSRC = dldperm.o ddistribute.o pdgstrf_irecv.o pdgstrs_Bglobal.o \
-	  pdgstrs1.o pdgssvx_ABglobal.o pdgsrfs_ABXglobal.o pdgsmv_AXglobal.o\
-	  pdgssvx.o pdgstrs.o pddistribute.o pdlangs.o pdutil.o \
-	  pdgsequ.o pdlaqgs.o pdgsrfs.o pdgsmv.o pdgstrs_lsum.o \
-	  pdsymbfact_distdata.o
-
-#
-# Routines for double complex parallel SuperLU
-ZPLUSRC = zldperm.o zdistribute.o pzgssvx_ABglobal.o pzgstrf_irecv.o \
-	  pzgstrs1.o pzgstrs_Bglobal.o pzgsrfs_ABXglobal.o pzgsmv_AXglobal.o \
-	  pzgssvx.o pzgstrs.o pzdistribute.o pzlangs.o pzutil.o \
-	  pzgsequ.o pzlaqgs.o pzgsrfs.o pzgsmv.o pzgstrs_lsum.o \
-	  pzsymbfact_distdata.o
-
-#all:  double complex16
-all:  double 
-
-double: $(DSLUSRC) $(DPLUSRC) $(ALLAUX) $(LAAUX)
-	$(ARCH) $(ARCHFLAGS) $(DSUPERLULIB) \
-		$(DSLUSRC) $(DPLUSRC) $(ALLAUX) $(LAAUX)
-	$(RANLIB) $(DSUPERLULIB)
-
-complex16: $(ZSLUSRC) $(ZPLUSRC) $(ALLAUX) $(LAAUX)
-	$(ARCH) $(ARCHFLAGS) $(DSUPERLULIB) \
-		$(ZSLUSRC) $(ZPLUSRC) $(ALLAUX) $(LAAUX)
-	$(RANLIB) $(DSUPERLULIB)
-
-
-##################################
-# Do not optimize these routines #
-##################################
-slamch.o: slamch.c ; $(CC) $(NOOPTS) $(CDEFS) -c $<
-dlamch.o: dlamch.c ; $(CC) $(NOOPTS) $(CDEFS) -c $<
-##################################
-
-.c.o:
-	$(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) -c $< $(VERBOSE)
-
-.f.o:
-	$(FORTRAN) $(FFLAGS) -c $< $(VERBOSE)
-
-clean:	
-	rm -f *.o $(DSUPERLULIB)
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/mc21.f hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/mc21.f
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/mc21.f	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/mc21.f	1970-01-01 00:00:00.000000000 +0000
@@ -1,143 +0,0 @@
-*######DATE 8 Oct 1992 COPYRIGHT Rutherford Appleton Laboratory
-C######8/10/92 Toolpack tool decs employed.
-C######8/10/92 D version created by name change only.
-      SUBROUTINE MC21AD(N,ICN,LICN,IP,LENR,IPERM,NUMNZ,IW)
-C     .. Scalar Arguments ..
-      INTEGER LICN,N,NUMNZ
-C     ..
-C     .. Array Arguments ..
-      INTEGER ICN(LICN),IP(N),IPERM(N),IW(N,4),LENR(N)
-C     ..
-C     .. External Subroutines ..
-      EXTERNAL MC21BD
-C     ..
-C     .. Executable Statements ..
-      CALL MC21BD(N,ICN,LICN,IP,LENR,IPERM,NUMNZ,IW(1,1),IW(1,2),
-     +            IW(1,3),IW(1,4))
-      RETURN
-C
-      END
-      SUBROUTINE MC21BD(N,ICN,LICN,IP,LENR,IPERM,NUMNZ,PR,ARP,CV,OUT)
-C   PR(I) IS THE PREVIOUS ROW TO I IN THE DEPTH FIRST SEARCH.
-C IT IS USED AS A WORK ARRAY IN THE SORTING ALGORITHM.
-C   ELEMENTS (IPERM(I),I) I=1, ... N  ARE NON-ZERO AT THE END OF THE
-C ALGORITHM UNLESS N ASSIGNMENTS HAVE NOT BEEN MADE.  IN WHICH CASE
-C (IPERM(I),I) WILL BE ZERO FOR N-NUMNZ ENTRIES.
-C   CV(I) IS THE MOST RECENT ROW EXTENSION AT WHICH COLUMN I
-C WAS VISITED.
-C   ARP(I) IS ONE LESS THAN THE NUMBER OF NON-ZEROS IN ROW I
-C WHICH HAVE NOT BEEN SCANNED WHEN LOOKING FOR A CHEAP ASSIGNMENT.
-C   OUT(I) IS ONE LESS THAN THE NUMBER OF NON-ZEROS IN ROW I
-C WHICH HAVE NOT BEEN SCANNED DURING ONE PASS THROUGH THE MAIN LOOP.
-C
-C   INITIALIZATION OF ARRAYS.
-C     .. Scalar Arguments ..
-      INTEGER LICN,N,NUMNZ
-C     ..
-C     .. Array Arguments ..
-      INTEGER ARP(N),CV(N),ICN(LICN),IP(N),IPERM(N),LENR(N),OUT(N),PR(N)
-C     ..
-C     .. Local Scalars ..
-      INTEGER I,II,IN1,IN2,IOUTK,J,J1,JORD,K,KK
-C     ..
-C     .. Executable Statements ..
-      DO 10 I = 1,N
-        ARP(I) = LENR(I) - 1
-        CV(I) = 0
-        IPERM(I) = 0
-   10 CONTINUE
-      NUMNZ = 0
-C
-C
-C   MAIN LOOP.
-C   EACH PASS ROUND THIS LOOP EITHER RESULTS IN A NEW ASSIGNMENT
-C OR GIVES A ROW WITH NO ASSIGNMENT.
-      DO 100 JORD = 1,N
-        J = JORD
-        PR(J) = -1
-        DO 70 K = 1,JORD
-C LOOK FOR A CHEAP ASSIGNMENT
-          IN1 = ARP(J)
-          IF (IN1.LT.0) GO TO 30
-          IN2 = IP(J) + LENR(J) - 1
-          IN1 = IN2 - IN1
-          DO 20 II = IN1,IN2
-            I = ICN(II)
-            IF (IPERM(I).EQ.0) GO TO 80
-   20     CONTINUE
-C   NO CHEAP ASSIGNMENT IN ROW.
-          ARP(J) = -1
-C   BEGIN LOOKING FOR ASSIGNMENT CHAIN STARTING WITH ROW J.
-   30     CONTINUE
-          OUT(J) = LENR(J) - 1
-C INNER LOOP.  EXTENDS CHAIN BY ONE OR BACKTRACKS.
-          DO 60 KK = 1,JORD
-            IN1 = OUT(J)
-            IF (IN1.LT.0) GO TO 50
-            IN2 = IP(J) + LENR(J) - 1
-            IN1 = IN2 - IN1
-C FORWARD SCAN.
-            DO 40 II = IN1,IN2
-              I = ICN(II)
-              IF (CV(I).EQ.JORD) GO TO 40
-C   COLUMN I HAS NOT YET BEEN ACCESSED DURING THIS PASS.
-              J1 = J
-              J = IPERM(I)
-              CV(I) = JORD
-              PR(J) = J1
-              OUT(J1) = IN2 - II - 1
-              GO TO 70
-C
-   40       CONTINUE
-C
-C   BACKTRACKING STEP.
-   50       CONTINUE
-            J = PR(J)
-            IF (J.EQ.-1) GO TO 100
-   60     CONTINUE
-C
-   70   CONTINUE
-C
-C   NEW ASSIGNMENT IS MADE.
-   80   CONTINUE
-        IPERM(I) = J
-        ARP(J) = IN2 - II - 1
-        NUMNZ = NUMNZ + 1
-        DO 90 K = 1,JORD
-          J = PR(J)
-          IF (J.EQ.-1) GO TO 100
-          II = IP(J) + LENR(J) - OUT(J) - 2
-          I = ICN(II)
-          IPERM(I) = J
-   90   CONTINUE
-C
-  100 CONTINUE
-C
-C   IF MATRIX IS STRUCTURALLY SINGULAR, WE NOW COMPLETE THE
-C PERMUTATION IPERM.
-      IF (NUMNZ.EQ.N) RETURN
-      DO 110 I = 1,N
-        ARP(I) = 0
-  110 CONTINUE
-      K = 0
-      DO 130 I = 1,N
-        IF (IPERM(I).NE.0) GO TO 120
-        K = K + 1
-        OUT(K) = I
-        GO TO 130
-C
-  120   CONTINUE
-        J = IPERM(I)
-        ARP(J) = I
-  130 CONTINUE
-      K = 0
-      DO 140 I = 1,N
-        IF (ARP(I).NE.0) GO TO 140
-        K = K + 1
-        IOUTK = OUT(K)
-        IPERM(IOUTK) = I
-  140 CONTINUE
-      RETURN
-C
-      END
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/mc64ad.f hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/mc64ad.f
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/mc64ad.f	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/mc64ad.f	1970-01-01 00:00:00.000000000 +0000
@@ -1,2002 +0,0 @@
-CCCCC COPYRIGHT (c) 1999  Council for the Central Laboratory of the
-CCCCC Research Councils.    All rights reserved.
-CCCCC PACKAGE MC64A/AD
-CCCCC AUTHORS Iain Duff (i.duff@rl.ac.uk) and Jacko Koster (jak@ii.uib.no)
-CCCCC LAST UPDATE 20/09/99
-CCCCC
-C *** Conditions on external use ***
-C
-C The user shall acknowledge the contribution of this
-C package in any publication of material dependent upon the use of
-C the package. The user shall use reasonable endeavours to notify
-C the authors of the package of this publication.
-C
-C The user can modify this code but, at no time
-C shall the right or title to all or any part of this package pass
-C to the user. The user shall make available free of charge
-C to the authors for any purpose all information relating to any
-C alteration or addition made to this package for the purposes of
-C extending the capabilities or enhancing the performance of this
-C package.
-C
-C The user shall not pass this code directly to a third party without the
-C express prior consent of the authors.  Users wanting to licence their
-C own copy of these routines should send email to hsl@aeat.co.uk
-C
-C None of the comments from the Copyright notice up to and including this
-C one shall be removed or altered in any way.
-
-C**********************************************************************
-      SUBROUTINE MC64ID(ICNTL)
-      IMPLICIT NONE
-C
-C *** Copyright (c) 1999  Council for the Central Laboratory of the
-C     Research Councils                                             ***
-C *** Although every effort has been made to ensure robustness and  ***
-C *** reliability of the subroutines in this MC64 suite, we         ***
-C *** disclaim any liability arising through the use or misuse of   ***
-C *** any of the subroutines.                                       ***
-C *** Any problems?   Contact ...
-C     Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no)   ***
-C
-C  Purpose
-C  =======
-C
-C  The components of the array ICNTL control the action of MC64A/AD.
-C  Default values for these are set in this subroutine.
-C
-C  Parameters
-C  ==========
-C
-      INTEGER ICNTL(10)
-C
-C  Local variables
-      INTEGER I
-C
-C    ICNTL(1) has default value 6.
-C     It is the output stream for error messages. If it
-C     is negative, these messages will be suppressed.
-C
-C    ICNTL(2) has default value 6.
-C     It is the output stream for warning messages.
-C     If it is negative, these messages are suppressed.
-C
-C    ICNTL(3) has default value -1.
-C     It is the output stream for monitoring printing.
-C     If it is negative, these messages are suppressed.
-C
-C    ICNTL(4) has default value 0.
-C     If left at the defaut value, the incoming data is checked for
-C     out-of-range indices and duplicates.  Setting ICNTL(4) to any
-C     other will avoid the checks but is likely to cause problems
-C     later if out-of-range indices or duplicates are present.
-C     The user should only set ICNTL(4) non-zero, if the data is
-C     known to avoid these problems.
-C
-C    ICNTL(5) to ICNTL(10) are not used by MC64A/AD but are set to
-C     zero in this routine.
-
-C Initialization of the ICNTL array.
-      ICNTL(1) = 6
-      ICNTL(2) = 6
-      ICNTL(3) = -1
-      DO 10 I = 4,10
-        ICNTL(I) = 0
-   10 CONTINUE
-
-      RETURN
-      END
-
-C**********************************************************************
-      SUBROUTINE MC64AD(JOB,N,NE,IP,IRN,A,NUM,CPERM,LIW,IW,LDW,DW,
-     &           ICNTL,INFO)
-      IMPLICIT NONE
-C
-C *** Copyright (c) 1999  Council for the Central Laboratory of the
-C     Research Councils                                             ***
-C *** Although every effort has been made to ensure robustness and  ***
-C *** reliability of the subroutines in this MC64 suite, we         ***
-C *** disclaim any liability arising through the use or misuse of   ***
-C *** any of the subroutines.                                       ***
-C *** Any problems?   Contact ...
-C     Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no)   ***
-C
-C  Purpose
-C  =======
-C
-C This subroutine attempts to find a column permutation for an NxN 
-C sparse matrix A = {a_ij} that makes the permuted matrix have N 
-C entries on its diagonal.
-C If the matrix is structurally nonsingular, the subroutine optionally
-C returns a column permutation that maximizes the smallest element 
-C on the diagonal, maximizes the sum of the diagonal entries, or 
-C maximizes the product of the diagonal entries of the permuted matrix.
-C For the latter option, the subroutine also finds scaling factors 
-C that may be used to scale the matrix so that the nonzero diagonal 
-C entries of the permuted matrix are one in absolute value and all the 
-C off-diagonal entries are less than or equal to one in absolute value.
-C The natural logarithms of the scaling factors u(i), i=1..N, for the 
-C rows and v(j), j=1..N, for the columns are returned so that the 
-C scaled matrix B = {b_ij} has entries b_ij = a_ij * EXP(u_i + v_j).
-C
-C  Parameters
-C  ==========
-C
-      INTEGER JOB,N,NE,NUM,LIW,LDW
-      INTEGER IP(N+1),IRN(NE),CPERM(N),IW(LIW),ICNTL(10),INFO(10)
-      DOUBLE PRECISION A(NE),DW(LDW)
-C
-C JOB is an INTEGER variable which must be set by the user to
-C control the action. It is not altered by the subroutine.
-C Possible values for JOB are:
-C   1 Compute a column permutation of the matrix so that the
-C     permuted matrix has as many entries on its diagonal as possible. 
-C     The values on the diagonal are of arbitrary size. HSL subroutine 
-C     MC21A/AD is used for this. See [1].
-C   2 Compute a column permutation of the matrix so that the smallest 
-C     value on the diagonal of the permuted matrix is maximized.
-C     See [3].
-C   3 Compute a column permutation of the matrix so that the smallest
-C     value on the diagonal of the permuted matrix is maximized.
-C     The algorithm differs from the one used for JOB = 2 and may
-C     have quite a different performance. See [2].
-C   4 Compute a column permutation of the matrix so that the sum
-C     of the diagonal entries of the permuted matrix is maximized.
-C     See [3].
-C   5 Compute a column permutation of the matrix so that the product
-C     of the diagonal entries of the permuted matrix is maximized
-C     and vectors to scale the matrix so that the nonzero diagonal 
-C     entries of the permuted matrix are one in absolute value and 
-C     all the off-diagonal entries are less than or equal to one in 
-C     absolute value. See [3].
-C  Restriction: 1 <= JOB <= 5.
-C 
-C N is an INTEGER variable which must be set by the user to the
-C   order of the matrix A. It is not altered by the subroutine.
-C   Restriction: N >= 1.
-C
-C NE is an INTEGER variable which must be set by the user to the
-C   number of entries in the matrix. It is not altered by the 
-C   subroutine.
-C   Restriction: NE >= 1.
-C
-C IP is an INTEGER array of length N+1.
-C   IP(J), J=1..N, must be set by the user to the position in array IRN 
-C   of the first row index of an entry in column J. IP(N+1) must be set
-C   to NE+1. It is not altered by the subroutine.
-C
-C IRN is an INTEGER array of length NE. 
-C   IRN(K), K=1..NE, must be set by the user to hold the row indices of
-C   the entries of the matrix. Those belonging to column J must be 
-C   stored contiguously in the positions IP(J)..IP(J+1)-1. The ordering
-C   of the row indices within each column is unimportant. Repeated 
-C   entries are not allowed. The array IRN is not altered by the 
-C   subroutine.
-C
-C A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. 
-C   The user must set A(K), K=1..NE, to the numerical value of the 
-C   entry that corresponds to IRN(K). 
-C   It is not used by the subroutine when JOB = 1. 
-C   It is not altered by the subroutine.
-C
-C NUM is an INTEGER variable that need not be set by the user.
-C   On successful exit, NUM will be the number of entries on the 
-C   diagonal of the permuted matrix.
-C   If NUM < N, the matrix is structurally singular.
-C
-C CPERM is an INTEGER array of length N that need not be set by the 
-C   user. On successful exit, CPERM contains the column permutation.
-C   Column CPERM(J) of the original matrix is column J in the permuted 
-C   matrix, J=1..N.
-C
-C LIW is an INTEGER variable that must be set by the user to
-C   the dimension of array IW. It is not altered by the subroutine.
-C   Restriction:
-C     JOB = 1 :  LIW >= 5N
-C     JOB = 2 :  LIW >= 4N
-C     JOB = 3 :  LIW >= 10N + NE
-C     JOB = 4 :  LIW >= 5N
-C     JOB = 5 :  LIW >= 5N
-C 
-C IW is an INTEGER array of length LIW that is used for workspace.
-C 
-C LDW is an INTEGER variable that must be set by the user to the
-C   dimension of array DW. It is not altered by the subroutine.
-C   Restriction:
-C     JOB = 1 :  LDW is not used
-C     JOB = 2 :  LDW >= N
-C     JOB = 3 :  LDW >= NE
-C     JOB = 4 :  LDW >= 2N + NE
-C     JOB = 5 :  LDW >= 3N + NE
-C 
-C DW is a REAL (DOUBLE PRECISION in the D-version) array of length LDW 
-C   that is used for workspace. If JOB = 5, on return,
-C   DW(i) contains u_i, i=1..N, and DW(N+j) contains v_j, j=1..N.
-C 
-C ICNTL is an INTEGER array of length 10. Its components control the 
-C   output of MC64A/AD and must be set by the user before calling
-C   MC64A/AD. They are not altered by the subroutine.
-C
-C   ICNTL(1) must be set to specify the output stream for
-C   error messages. If ICNTL(1) < 0, messages are suppressed.
-C   The default value set by MC46I/ID is 6.
-C
-C   ICNTL(2) must be set by the user to specify the output stream for
-C   warning messages. If ICNTL(2) < 0, messages are suppressed.
-C   The default value set by MC46I/ID is 6.
-C
-C   ICNTL(3) must be set by the user to specify the output stream for
-C   diagnostic messages. If ICNTL(3) < 0, messages are suppressed.
-C   The default value set by MC46I/ID is -1.
-C
-C   ICNTL(4) must be set by the user to a value other than 0 to avoid
-C   checking of the input data.
-C   The default value set by MC46I/ID is 0.
-C    
-C INFO is an INTEGER array of length 10 which need not be set by the 
-C   user. INFO(1) is set non-negative to indicate success. A negative 
-C   value is returned if an error occurred, a positive value if a 
-C   warning occurred. INFO(2) holds further information on the error.
-C   On exit from the subroutine, INFO(1) will take one of the 
-C   following values:
-C    0 : successful entry (for structurally nonsingular matrix).
-C   +1 : successful entry (for structurally singular matrix).
-C   +2 : the returned scaling factors are large and may cause
-C        overflow when used to scale the matrix. 
-C        (For JOB = 5 entry only.)
-C   -1 : JOB < 1 or JOB > 5.  Value of JOB held in INFO(2).
-C   -2 : N < 1.  Value of N held in INFO(2).
-C   -3 : NE < 1. Value of NE held in INFO(2).
-C   -4 : the defined length LIW violates the restriction on LIW.
-C        Value of LIW required given by INFO(2).
-C   -5 : the defined length LDW violates the restriction on LDW.
-C        Value of LDW required given by INFO(2).
-C   -6 : entries are found whose row indices are out of range. INFO(2)
-C        contains the index of a column in which such an entry is found.
-C   -7 : repeated entries are found. INFO(2) contains the index of a 
-C        column in which such entries are found.
-C  INFO(3) to INFO(10) are not currently used and are set to zero by 
-C        the routine.
-C
-C References:
-C  [1]  I. S. Duff, (1981),
-C       "Algorithm 575. Permutations for a zero-free diagonal",
-C       ACM Trans. Math. Software 7(3), 387-390.
-C  [2]  I. S. Duff and J. Koster, (1998),
-C       "The design and use of algorithms for permuting large
-C       entries to the diagonal of sparse matrices",
-C       SIAM J. Matrix Anal. Appl., vol. 20, no. 4, pp. 889-901.
-C  [3]  I. S. Duff and J. Koster, (1999),
-C       "On algorithms for permuting large entries to the diagonal 
-C       of sparse matrices",
-C       Technical Report RAL-TR-1999-030, RAL, Oxfordshire, England.
-
-C Local variables and parameters
-      INTEGER I,J,K
-      DOUBLE PRECISION FACT,ZERO,RINF
-      PARAMETER (ZERO=0.0D+00)
-C External routines and functions
-c     EXTERNAL FD05AD
-c     DOUBLE PRECISION FD05AD
-      EXTERNAL MC21AD,MC64BD,MC64RD,MC64SD,MC64WD, HY_DLAMCH
-      DOUBLE PRECISION HY_DLAMCH
-C Intrinsic functions
-      INTRINSIC ABS,LOG
-
-C Set RINF to largest positive real number (infinity)
-c XSL    RINF = FD05AD(5)
-      RINF = HY_DLAMCH('Overflow')
-
-C Check value of JOB
-      IF (JOB.LT.1 .OR. JOB.GT.5) THEN
-        INFO(1) = -1
-        INFO(2) = JOB
-        IF (ICNTL(1).GE.0) WRITE(ICNTL(1),9001) INFO(1),'JOB',JOB
-        GO TO 99
-      ENDIF
-C Check value of N
-      IF (N.LT.1) THEN
-        INFO(1) = -2
-        INFO(2) = N
-        IF (ICNTL(1).GE.0) WRITE(ICNTL(1),9001) INFO(1),'N',N
-        GO TO 99
-      ENDIF
-C Check value of NE
-      IF (NE.LT.1) THEN
-        INFO(1) = -3
-        INFO(2) = NE
-        IF (ICNTL(1).GE.0) WRITE(ICNTL(1),9001) INFO(1),'NE',NE
-        GO TO 99
-      ENDIF
-C Check LIW
-      IF (JOB.EQ.1) K = 5*N
-      IF (JOB.EQ.2) K = 4*N
-      IF (JOB.EQ.3) K = 10*N + NE
-      IF (JOB.EQ.4) K = 5*N
-      IF (JOB.EQ.5) K = 5*N
-      IF (LIW.LT.K) THEN
-        INFO(1) = -4
-        INFO(2) = K
-        IF (ICNTL(1).GE.0) WRITE(ICNTL(1),9004) INFO(1),K
-        GO TO 99
-      ENDIF
-C Check LDW
-C If JOB = 1, do not check
-      IF (JOB.GT.1) THEN
-        IF (JOB.EQ.2) K = N
-        IF (JOB.EQ.3) K = NE
-        IF (JOB.EQ.4) K = 2*N + NE
-        IF (JOB.EQ.5) K = 3*N + NE
-        IF (LDW.LT.K) THEN
-          INFO(1) = -5
-          INFO(2) = K
-          IF (ICNTL(1).GE.0) WRITE(ICNTL(1),9005) INFO(1),K
-          GO TO 99
-        ENDIF
-      ENDIF
-      IF (ICNTL(4).EQ.0) THEN
-C Check row indices. Use IW(1:N) as workspace
-        DO 3 I = 1,N
-          IW(I) = 0
-    3   CONTINUE
-        DO 6 J = 1,N
-          DO 4 K = IP(J),IP(J+1)-1
-            I = IRN(K)
-C Check for row indices that are out of range
-            IF (I.LT.1 .OR. I.GT.N) THEN
-              INFO(1) = -6
-              INFO(2) = J
-              IF (ICNTL(1).GE.0) WRITE(ICNTL(1),9006) INFO(1),J,I
-              GO TO 99
-            ENDIF
-C Check for repeated row indices within a column
-            IF (IW(I).EQ.J) THEN
-              INFO(1) = -7
-              INFO(2) = J
-              IF (ICNTL(1).GE.0) WRITE(ICNTL(1),9007) INFO(1),J,I 
-              GO TO 99
-            ELSE
-              IW(I) = J
-            ENDIF
-    4     CONTINUE
-    6   CONTINUE
-      ENDIF
-
-C Print diagnostics on input
-      IF (ICNTL(3).GE.0) THEN
-        WRITE(ICNTL(3),9020) JOB,N,NE
-        WRITE(ICNTL(3),9021) (IP(J),J=1,N+1)
-        WRITE(ICNTL(3),9022) (IRN(J),J=1,NE)
-        IF (JOB.GT.1) WRITE(ICNTL(3),9023) (A(J),J=1,NE)
-      ENDIF
-
-C Set components of INFO to zero
-      DO 8 I=1,10
-        INFO(I) = 0
-    8 CONTINUE
-
-C Compute maximum matching with MC21A/AD
-      IF (JOB.EQ.1) THEN
-C Put length of column J in IW(J)
-        DO 10 J = 1,N
-          IW(J) = IP(J+1) - IP(J)
-   10   CONTINUE
-C IW(N+1:5N) is workspace
-        CALL MC21AD(N,IRN,NE,IP,IW(1),CPERM,NUM,IW(N+1))
-        GO TO 90
-      ENDIF
-
-C Compute bottleneck matching
-      IF (JOB.EQ.2) THEN
-C IW(1:5N), DW(1:N) are workspaces
-        CALL MC64BD(N,NE,IP,IRN,A,CPERM,NUM,
-     &     IW(1),IW(N+1),IW(2*N+1),IW(3*N+1),DW)
-        GO TO 90
-      ENDIF
-
-C Compute bottleneck matching
-      IF (JOB.EQ.3) THEN
-C Copy IRN(K) into IW(K), ABS(A(K)) into DW(K), K=1..NE
-        DO 20 K = 1,NE
-          IW(K) = IRN(K)
-          DW(K) = ABS(A(K))
-   20   CONTINUE
-C Sort entries in each column by decreasing value. 
-        CALL MC64RD(N,NE,IP,IW,DW)
-C IW(NE+1:NE+10N) is workspace
-        CALL MC64SD(N,NE,IP,IW(1),DW,CPERM,NUM,IW(NE+1),
-     &     IW(NE+N+1),IW(NE+2*N+1),IW(NE+3*N+1),IW(NE+4*N+1),
-     &     IW(NE+5*N+1),IW(NE+6*N+1))
-        GO TO 90
-      ENDIF
-
-      IF (JOB.EQ.4) THEN
-        DO 50 J = 1,N
-          FACT = ZERO
-          DO 30 K = IP(J),IP(J+1)-1
-            IF (ABS(A(K)).GT.FACT) FACT = ABS(A(K))
-   30     CONTINUE
-          DO 40 K = IP(J),IP(J+1)-1
-            DW(2*N+K) = FACT - ABS(A(K))
-   40     CONTINUE
-   50   CONTINUE
-C B = DW(2N+1:2N+NE); IW(1:5N) and DW(1:2N) are workspaces
-        CALL MC64WD(N,NE,IP,IRN,DW(2*N+1),CPERM,NUM,
-     &     IW(1),IW(N+1),IW(2*N+1),IW(3*N+1),IW(4*N+1),
-     &     DW(1),DW(N+1))
-        GO TO 90
-      ENDIF
-
-      IF (JOB.EQ.5) THEN
-        DO 75 J = 1,N
-          FACT = ZERO
-          DO 60 K = IP(J),IP(J+1)-1
-            DW(3*N+K) = ABS(A(K))
-            IF (DW(3*N+K).GT.FACT) FACT = DW(3*N+K)
-   60     CONTINUE
-          DW(2*N+J) = FACT
-          IF (FACT.NE.ZERO) THEN
-            FACT = LOG(FACT)
-          ELSE
-            FACT = RINF/N
-          ENDIF
-          DO 70 K = IP(J),IP(J+1)-1
-            IF (DW(3*N+K).NE.ZERO) THEN
-              DW(3*N+K) = FACT - LOG(DW(3*N+K))
-            ELSE
-              DW(3*N+K) = RINF/N
-            ENDIF
-   70     CONTINUE
-   75   CONTINUE
-C B = DW(3N+1:3N+NE); IW(1:5N) and DW(1:2N) are workspaces
-        CALL MC64WD(N,NE,IP,IRN,DW(3*N+1),CPERM,NUM,
-     &     IW(1),IW(N+1),IW(2*N+1),IW(3*N+1),IW(4*N+1),
-     &     DW(1),DW(N+1))
-        IF (NUM.EQ.N) THEN
-          DO 80 J = 1,N
-            IF (DW(2*N+J).NE.ZERO) THEN
-              DW(N+J) = DW(N+J) - LOG(DW(2*N+J))
-            ELSE
-              DW(N+J) = ZERO
-            ENDIF
-   80     CONTINUE
-        ENDIF
-C Check size of scaling factors
-        FACT = 0.5*LOG(RINF)
-        DO 86 J = 1,N
-          IF (DW(J).LT.FACT .AND. DW(N+J).LT.FACT) GO TO 86
-          INFO(1) = 2
-          GO TO 90
-   86   CONTINUE 
-C       GO TO 90
-      ENDIF
-
-   90 IF (INFO(1).EQ.0 .AND. NUM.LT.N) THEN
-C Matrix is structurally singular, return with warning
-        INFO(1) = 1
-        IF (ICNTL(2).GE.0) WRITE(ICNTL(2),9011) INFO(1)
-      ENDIF
-      IF (INFO(1).EQ.2) THEN
-C Scaling factors are large, return with warning
-        IF (ICNTL(2).GE.0) WRITE(ICNTL(2),9012) INFO(1)
-      ENDIF
-
-C Print diagnostics on output
-      IF (ICNTL(3).GE.0) THEN
-        WRITE(ICNTL(3),9030) (INFO(J),J=1,2)
-        WRITE(ICNTL(3),9031) NUM
-        WRITE(ICNTL(3),9032) (CPERM(J),J=1,N)
-        IF (JOB.EQ.5) THEN
-          WRITE(ICNTL(3),9033) (DW(J),J=1,N)
-          WRITE(ICNTL(3),9034) (DW(N+J),J=1,N)
-        ENDIF
-      ENDIF
-
-C Return from subroutine.
-   99 RETURN
-
- 9001 FORMAT (' ****** Error in MC64A/AD. INFO(1) = ',I2,
-     &        ' because ',(A),' = ',I10)
- 9004 FORMAT (' ****** Error in MC64A/AD. INFO(1) = ',I2/
-     &        '        LIW too small, must be at least ',I8)
- 9005 FORMAT (' ****** Error in MC64A/AD. INFO(1) = ',I2/
-     &        '        LDW too small, must be at least ',I8)
- 9006 FORMAT (' ****** Error in MC64A/AD. INFO(1) = ',I2/
-     &        '        Column ',I8,
-     &        ' contains an entry with invalid row index ',I8)
- 9007 FORMAT (' ****** Error in MC64A/AD. INFO(1) = ',I2/
-     &        '        Column ',I8,
-     &        ' contains two or more entries with row index ',I8)
- 9011 FORMAT (' ****** Warning from MC64A/AD. INFO(1) = ',I2/
-     &        '        The matrix is structurally singular.')
- 9012 FORMAT (' ****** Warning from MC64A/AD. INFO(1) = ',I2/
-     &        '        Some scaling factors may be too large.')
- 9020 FORMAT (' ****** Input parameters for MC64A/AD:'/
-     &        ' JOB = ',I8/' N   = ',I8/' NE  = ',I8)
- 9021 FORMAT (' IP(1:N+1)  = ',8I8/(14X,8I8))
- 9022 FORMAT (' IRN(1:NE)  = ',8I8/(14X,8I8))
- 9023 FORMAT (' A(1:NE)    = ',4(1PD14.4)/(14X,4(1PD14.4)))
- 9030 FORMAT (' ****** Output parameters for MC64A/AD:'/
-     &        ' INFO(1:2)  = ',2I8)
- 9031 FORMAT (' NUM        = ',I8)
- 9032 FORMAT (' CPERM(1:N) = ',8I8/(14X,8I8))
- 9033 FORMAT (' DW(1:N)    = ',5(F11.3)/(14X,5(F11.3)))
- 9034 FORMAT (' DW(N+1:2N) = ',5(F11.3)/(14X,5(F11.3)))
-      END
-
-C**********************************************************************
-      SUBROUTINE MC64BD(N,NE,IP,IRN,A,IPERM,NUM,JPERM,PR,Q,L,D) 
-      IMPLICIT NONE
-C
-C *** Copyright (c) 1999  Council for the Central Laboratory of the
-C     Research Councils                                             ***
-C *** Although every effort has been made to ensure robustness and  ***
-C *** reliability of the subroutines in this MC64 suite, we         ***
-C *** disclaim any liability arising through the use or misuse of   ***
-C *** any of the subroutines.                                       ***
-C *** Any problems?   Contact ...
-C     Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no)   ***
-C
-      INTEGER N,NE,NUM
-      INTEGER IP(N+1),IRN(NE),IPERM(N),JPERM(N),PR(N),Q(N),L(N)
-      DOUBLE PRECISION A(NE),D(N)
-
-C N, NE, IP, IRN are described in MC64A/AD.
-C A is a REAL (DOUBLE PRECISION in the D-version) array of length
-C   NE. A(K), K=1..NE, must be set to the value of the entry
-C   that corresponds to IRN(K). It is not altered.
-C IPERM is an INTEGER array of length N. On exit, it contains the 
-C    matching: IPERM(I) = 0 or row I is matched to column IPERM(I).
-C NUM is INTEGER variable. On exit, it contains the cardinality of the
-C    matching stored in IPERM.
-C IW is an INTEGER work array of length 4N.
-C DW is a REAL (DOUBLE PRECISION in D-version) work array of length N.
-
-C Local variables
-      INTEGER I,II,J,JJ,JORD,Q0,QLEN,IDUM,JDUM,ISP,JSP,
-     &        K,KK,KK1,KK2,I0,UP,LOW
-      DOUBLE PRECISION CSP,DI,DNEW,DQ0,AI,A0,BV
-C Local parameters 
-      DOUBLE PRECISION RINF,ZERO,MINONE
-      PARAMETER (ZERO=0.0D+0,MINONE=-1.0D+0)
-C Intrinsic functions
-      INTRINSIC ABS,MIN
-C External subroutines and/or functions
-c      EXTERNAL FD05AD,MC64DD,MC64ED,MC64FD, DLAMCH
-c      DOUBLE PRECISION FD05AD, DLAMCH
-      EXTERNAL MC64DD,MC64ED,MC64FD, HY_DLAMCH
-      DOUBLE PRECISION HY_DLAMCH
-
-C Set RINF to largest positive real number
-c XSL  RINF = FD05AD(5)
-      RINF = HY_DLAMCH('Overflow')
-
-C Initialization
-      NUM = 0
-      BV = RINF
-      DO 10 K = 1,N
-        IPERM(K) = 0
-        JPERM(K) = 0
-        PR(K) = IP(K)
-        D(K) = ZERO
-   10 CONTINUE
-C Scan columns of matrix;
-      DO 20 J = 1,N
-        A0 = MINONE
-        DO 30 K = IP(J),IP(J+1)-1
-          I = IRN(K)
-          AI = ABS(A(K))
-          IF (AI.GT.D(I)) D(I) = AI
-          IF (JPERM(J).NE.0) GO TO 30
-          IF (AI.GE.BV) THEN
-            A0 = BV
-            IF (IPERM(I).NE.0) GO TO 30
-            JPERM(J) = I 
-            IPERM(I) = J
-            NUM = NUM + 1
-          ELSE
-            IF (AI.LE.A0) GO TO 30
-            A0 = AI
-            I0 = I
-          ENDIF
-   30   CONTINUE
-        IF (A0.NE.MINONE .AND. A0.LT.BV) THEN
-          BV = A0
-          IF (IPERM(I0).NE.0) GO TO 20
-          IPERM(I0) = J
-          JPERM(J) = I0
-          NUM = NUM + 1
-        ENDIF
-   20 CONTINUE
-C Update BV with smallest of all the largest maximum absolute values 
-C of the rows.
-      DO 25 I = 1,N
-        BV = MIN(BV,D(I))
-   25 CONTINUE
-      IF (NUM.EQ.N) GO TO 1000
-C Rescan unassigned columns; improve initial assignment
-      DO 95 J = 1,N
-        IF (JPERM(J).NE.0) GO TO 95
-        DO 50 K = IP(J),IP(J+1)-1
-          I = IRN(K)
-          AI = ABS(A(K))
-          IF (AI.LT.BV) GO TO 50
-          IF (IPERM(I).EQ.0) GO TO 90
-          JJ = IPERM(I)
-          KK1 = PR(JJ)
-          KK2 = IP(JJ+1) - 1
-          IF (KK1.GT.KK2) GO TO 50
-          DO 70 KK = KK1,KK2
-            II = IRN(KK)
-            IF (IPERM(II).NE.0) GO TO 70
-            IF (ABS(A(KK)).GE.BV) GO TO 80
-   70     CONTINUE
-          PR(JJ) = KK2 + 1
-   50   CONTINUE
-        GO TO 95
-   80   JPERM(JJ) = II
-        IPERM(II) = JJ
-        PR(JJ) = KK + 1
-   90   NUM = NUM + 1
-        JPERM(J) = I
-        IPERM(I) = J
-        PR(J) = K + 1
-   95 CONTINUE
-      IF (NUM.EQ.N) GO TO 1000
-
-C Prepare for main loop
-      DO 99 I = 1,N
-        D(I) = MINONE
-        L(I) = 0
-   99 CONTINUE
-
-C Main loop ... each pass round this loop is similar to Dijkstra's 
-C algorithm for solving the single source shortest path problem 
-
-      DO 100 JORD = 1,N
-
-        IF (JPERM(JORD).NE.0) GO TO 100
-        QLEN = 0
-        LOW = N + 1
-        UP = N + 1
-C CSP is cost of shortest path to any unassigned row
-C ISP is matrix position of unassigned row element in shortest path
-C JSP is column index of unassigned row element in shortest path
-        CSP = MINONE
-C Build shortest path tree starting from unassigned column JORD
-        J = JORD
-        PR(J) = -1
-
-C Scan column J
-        DO 115 K = IP(J),IP(J+1)-1
-          I = IRN(K)
-          DNEW = ABS(A(K))
-          IF (CSP.GE.DNEW) GO TO 115
-          IF (IPERM(I).EQ.0) THEN
-C Row I is unassigned; update shortest path info
-            CSP = DNEW
-            ISP = I
-            JSP = J
-            IF (CSP.GE.BV) GO TO 160
-          ELSE
-            D(I) = DNEW
-            IF (DNEW.GE.BV) THEN
-C Add row I to Q2
-              LOW = LOW - 1
-              Q(LOW) = I
-            ELSE
-C Add row I to Q, and push it 
-              QLEN = QLEN + 1
-              L(I) = QLEN
-              CALL MC64DD(I,N,Q,D,L,1)
-            ENDIF
-            JJ = IPERM(I)
-            PR(JJ) = J
-          ENDIF
-  115   CONTINUE
-
-        DO 150 JDUM = 1,NUM
-C If Q2 is empty, extract new rows from Q
-          IF (LOW.EQ.UP) THEN
-            IF (QLEN.EQ.0) GO TO 160
-            I = Q(1)
-            IF (CSP.GE.D(I)) GO TO 160
-            BV = D(I)
-            DO 152 IDUM = 1,N
-              CALL MC64ED(QLEN,N,Q,D,L,1)
-              L(I) = 0
-              LOW = LOW - 1
-              Q(LOW) = I
-              IF (QLEN.EQ.0) GO TO 153
-              I = Q(1)
-              IF (D(I).NE.BV) GO TO 153
-  152       CONTINUE
-C End of dummy loop; this point is never reached
-          ENDIF
-C Move row Q0 
-  153     UP = UP - 1
-          Q0 = Q(UP)
-          DQ0 = D(Q0)
-          L(Q0) = UP
-C Scan column that matches with row Q0
-          J = IPERM(Q0)
-          DO 155 K = IP(J),IP(J+1)-1
-            I = IRN(K)
-C Update D(I) 
-            IF (L(I).GE.UP) GO TO 155
-            DNEW = MIN(DQ0,ABS(A(K)))
-            IF (CSP.GE.DNEW) GO TO 155
-            IF (IPERM(I).EQ.0) THEN
-C Row I is unassigned; update shortest path info
-              CSP = DNEW
-              ISP = I
-              JSP = J
-              IF (CSP.GE.BV) GO TO 160
-            ELSE
-              DI = D(I)
-              IF (DI.GE.BV .OR. DI.GE.DNEW) GO TO 155
-              D(I) = DNEW
-              IF (DNEW.GE.BV) THEN
-C Delete row I from Q (if necessary); add row I to Q2
-                IF (DI.NE.MINONE)
-     *            CALL MC64FD(L(I),QLEN,N,Q,D,L,1)
-                L(I) = 0
-                LOW = LOW - 1
-                Q(LOW) = I
-              ELSE
-C Add row I to Q (if necessary); push row I up Q
-                IF (DI.EQ.MINONE) THEN
-                  QLEN = QLEN + 1
-                  L(I) = QLEN
-                ENDIF
-                CALL MC64DD(I,N,Q,D,L,1)
-              ENDIF
-C Update tree
-              JJ = IPERM(I)
-              PR(JJ) = J
-            ENDIF
-  155     CONTINUE
-  150   CONTINUE
-       
-C If CSP = MINONE, no augmenting path is found
-  160   IF (CSP.EQ.MINONE) GO TO 190
-C Update bottleneck value
-        BV = MIN(BV,CSP)
-C Find augmenting path by tracing backward in PR; update IPERM,JPERM
-        NUM = NUM + 1
-        I = ISP
-        J = JSP
-        DO 170 JDUM = 1,NUM+1
-          I0 = JPERM(J)
-          JPERM(J) = I
-          IPERM(I) = J
-          J = PR(J)
-          IF (J.EQ.-1) GO TO 190
-          I = I0
-  170   CONTINUE
-C End of dummy loop; this point is never reached
-  190   DO 191 KK = UP,N
-          I = Q(KK)
-          D(I) = MINONE
-          L(I) = 0
-  191   CONTINUE 
-        DO 192 KK = LOW,UP-1
-          I = Q(KK)
-          D(I) = MINONE
-  192   CONTINUE
-        DO 193 KK = 1,QLEN
-          I = Q(KK)
-          D(I) = MINONE
-          L(I) = 0
-  193   CONTINUE
-
-  100 CONTINUE
-C End of main loop
-
-C BV is bottleneck value of final matching
-      IF (NUM.EQ.N) GO TO 1000
-
-C Matrix is structurally singular, complete IPERM.
-C JPERM, PR are work arrays
-      DO 300 J = 1,N
-        JPERM(J) = 0
-  300 CONTINUE
-      K = 0
-      DO 310 I = 1,N
-        IF (IPERM(I).EQ.0) THEN
-          K = K + 1
-          PR(K) = I
-        ELSE
-          J = IPERM(I)
-          JPERM(J) = I
-        ENDIF
-  310 CONTINUE
-      K = 0
-      DO 320 I = 1,N
-        IF (JPERM(I).NE.0) GO TO 320
-        K = K + 1
-        JDUM = PR(K)
-        IPERM(JDUM) = I
-  320 CONTINUE
-
- 1000 RETURN
-      END
-
-C**********************************************************************
-      SUBROUTINE MC64DD(I,N,Q,D,L,IWAY)
-      IMPLICIT NONE
-C
-C *** Copyright (c) 1999  Council for the Central Laboratory of the
-C     Research Councils                                             ***
-C *** Although every effort has been made to ensure robustness and  ***
-C *** reliability of the subroutines in this MC64 suite, we         ***
-C *** disclaim any liability arising through the use or misuse of   ***
-C *** any of the subroutines.                                       ***
-C *** Any problems?   Contact ...
-C     Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no)   ***
-C
-      INTEGER I,N,IWAY
-      INTEGER Q(N),L(N)
-      DOUBLE PRECISION D(N)
-
-C Variables N,Q,D,L are described in MC64B/BD
-C IF IWAY is equal to 1, then
-C node I is pushed from its current position upwards
-C IF IWAY is not equal to 1, then
-C node I is pushed from its current position downwards
-
-C Local variables and parameters
-      INTEGER IDUM,K,POS,POSK,QK
-      PARAMETER (K=2)
-      DOUBLE PRECISION DI
-
-      DI = D(I)
-      POS = L(I)
-C POS is index of current position of I in the tree
-      IF (IWAY.EQ.1) THEN
-        DO 10 IDUM = 1,N
-          IF (POS.LE.1) GO TO 20
-          POSK = POS/K
-          QK = Q(POSK)
-          IF (DI.LE.D(QK)) GO TO 20 
-          Q(POS) = QK
-          L(QK) = POS 
-          POS = POSK
-   10   CONTINUE
-C End of dummy loop; this point is never reached
-      ELSE
-        DO 15 IDUM = 1,N
-          IF (POS.LE.1) GO TO 20
-          POSK = POS/K
-          QK = Q(POSK)
-          IF (DI.GE.D(QK)) GO TO 20
-          Q(POS) = QK
-          L(QK) = POS
-          POS = POSK
-   15   CONTINUE
-C End of dummy loop; this point is never reached
-      ENDIF
-C End of dummy if; this point is never reached
-   20 Q(POS) = I
-      L(I) = POS
-
-      RETURN
-      END
- 
-C**********************************************************************
-      SUBROUTINE MC64ED(QLEN,N,Q,D,L,IWAY)
-      IMPLICIT NONE
-C
-C *** Copyright (c) 1999  Council for the Central Laboratory of the
-C     Research Councils                                             ***
-C *** Although every effort has been made to ensure robustness and  ***
-C *** reliability of the subroutines in this MC64 suite, we         ***
-C *** disclaim any liability arising through the use or misuse of   ***
-C *** any of the subroutines.                                       ***
-C *** Any problems?   Contact ...
-C     Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no)   ***
-C
-      INTEGER QLEN,N,IWAY
-      INTEGER Q(N),L(N)
-      DOUBLE PRECISION D(N)
-
-C Variables QLEN,N,Q,D,L are described in MC64B/BD (IWAY = 1) or
-C     MC64W/WD (IWAY = 2)
-C The root node is deleted from the binary heap.
-
-C Local variables and parameters
-      INTEGER I,IDUM,K,POS,POSK
-      PARAMETER (K=2)
-      DOUBLE PRECISION DK,DR,DI
-
-C Move last element to begin of Q
-      I = Q(QLEN)
-      DI = D(I)
-      QLEN = QLEN - 1
-      POS = 1
-      IF (IWAY.EQ.1) THEN
-        DO 10 IDUM = 1,N
-          POSK = K*POS
-          IF (POSK.GT.QLEN) GO TO 20
-          DK = D(Q(POSK))
-          IF (POSK.LT.QLEN) THEN
-            DR = D(Q(POSK+1))
-            IF (DK.LT.DR) THEN
-              POSK = POSK + 1
-              DK = DR
-            ENDIF
-          ENDIF
-          IF (DI.GE.DK) GO TO 20
-C Exchange old last element with larger priority child
-          Q(POS) = Q(POSK)
-          L(Q(POS)) = POS
-          POS = POSK
-   10   CONTINUE
-C End of dummy loop; this point is never reached
-      ELSE
-        DO 15 IDUM = 1,N
-          POSK = K*POS
-          IF (POSK.GT.QLEN) GO TO 20
-          DK = D(Q(POSK))
-          IF (POSK.LT.QLEN) THEN
-            DR = D(Q(POSK+1))
-            IF (DK.GT.DR) THEN
-              POSK = POSK + 1
-              DK = DR
-            ENDIF
-          ENDIF
-          IF (DI.LE.DK) GO TO 20
-C Exchange old last element with smaller child
-          Q(POS) = Q(POSK)
-          L(Q(POS)) = POS
-          POS = POSK
-   15   CONTINUE
-C End of dummy loop; this point is never reached
-      ENDIF
-C End of dummy if; this point is never reached
-   20 Q(POS) = I
-      L(I) = POS
-
-      RETURN
-      END
-
-C**********************************************************************
-      SUBROUTINE MC64FD(POS0,QLEN,N,Q,D,L,IWAY)
-      IMPLICIT NONE
-C
-C *** Copyright (c) 1999  Council for the Central Laboratory of the
-C     Research Councils                                             ***
-C *** Although every effort has been made to ensure robustness and  ***
-C *** reliability of the subroutines in this MC64 suite, we         ***
-C *** disclaim any liability arising through the use or misuse of   ***
-C *** any of the subroutines.                                       ***
-C *** Any problems?   Contact ...
-C     Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no)   ***
-C
-      INTEGER POS0,QLEN,N,IWAY
-      INTEGER Q(N),L(N)
-      DOUBLE PRECISION D(N)
-
-C Variables QLEN,N,Q,D,L are described in MC64B/BD (IWAY = 1) or
-C     MC64WD (IWAY = 2).
-C Move last element in the heap 
-
-      INTEGER I,IDUM,K,POS,POSK,QK
-      PARAMETER (K=2)
-      DOUBLE PRECISION DK,DR,DI
- 
-C Quick return, if possible
-      IF (QLEN.EQ.POS0) THEN
-        QLEN = QLEN - 1
-        RETURN
-      ENDIF
-
-C Move last element from queue Q to position POS0
-C POS is current position of node I in the tree
-      I = Q(QLEN)
-      DI = D(I)
-      QLEN = QLEN - 1
-      POS = POS0
-      IF (IWAY.EQ.1) THEN
-        DO 10 IDUM = 1,N
-          IF (POS.LE.1) GO TO 20
-          POSK = POS/K
-          QK = Q(POSK)
-          IF (DI.LE.D(QK)) GO TO 20 
-          Q(POS) = QK
-          L(QK) = POS 
-          POS = POSK
-   10   CONTINUE
-C End of dummy loop; this point is never reached
-   20   Q(POS) = I
-        L(I) = POS
-        DO 30 IDUM = 1,N
-          POSK = K*POS
-          IF (POSK.GT.QLEN) GO TO 40
-          DK = D(Q(POSK))
-          IF (POSK.LT.QLEN) THEN
-            DR = D(Q(POSK+1))
-            IF (DK.LT.DR) THEN
-              POSK = POSK + 1
-              DK = DR
-            ENDIF
-          ENDIF
-          IF (DI.GE.DK) GO TO 40
-          QK = Q(POSK)
-          Q(POS) = QK
-          L(QK) = POS
-          POS = POSK
-   30   CONTINUE
-C End of dummy loop; this point is never reached
-      ELSE
-        DO 32 IDUM = 1,N
-          IF (POS.LE.1) GO TO 34
-          POSK = POS/K
-          QK = Q(POSK)
-          IF (DI.GE.D(QK)) GO TO 34 
-          Q(POS) = QK
-          L(QK) = POS 
-          POS = POSK
-   32   CONTINUE
-C End of dummy loop; this point is never reached
-   34   Q(POS) = I
-        L(I) = POS
-        DO 36 IDUM = 1,N
-          POSK = K*POS
-          IF (POSK.GT.QLEN) GO TO 40
-          DK = D(Q(POSK))
-          IF (POSK.LT.QLEN) THEN
-            DR = D(Q(POSK+1))
-            IF (DK.GT.DR) THEN
-              POSK = POSK + 1
-              DK = DR
-            ENDIF
-          ENDIF
-          IF (DI.LE.DK) GO TO 40
-          QK = Q(POSK)
-          Q(POS) = QK
-          L(QK) = POS
-          POS = POSK
-   36   CONTINUE
-C End of dummy loop; this point is never reached
-      ENDIF
-C End of dummy if; this point is never reached
-   40 Q(POS) = I
-      L(I) = POS
-
-      RETURN
-      END
-
-C**********************************************************************
-      SUBROUTINE MC64RD(N,NE,IP,IRN,A)
-      IMPLICIT NONE
-C
-C *** Copyright (c) 1999  Council for the Central Laboratory of the
-C     Research Councils                                             ***
-C *** Although every effort has been made to ensure robustness and  ***
-C *** reliability of the subroutines in this MC64 suite, we         ***
-C *** disclaim any liability arising through the use or misuse of   ***
-C *** any of the subroutines.                                       ***
-C *** Any problems?   Contact ...
-C     Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no)   ***
-C
-      INTEGER N,NE
-      INTEGER IP(N+1),IRN(NE)
-      DOUBLE PRECISION A(NE)
-
-C This subroutine sorts the entries in each column of the 
-C sparse matrix (defined by N,NE,IP,IRN,A) by decreasing
-C numerical value.
-
-C Local constants
-      INTEGER THRESH,TDLEN
-      PARAMETER (THRESH=15,TDLEN=50)
-C Local variables
-      INTEGER J,IPJ,K,LEN,R,S,HI,FIRST,MID,LAST,TD
-      DOUBLE PRECISION HA,KEY
-C Local arrays
-      INTEGER TODO(TDLEN)
-      
-      DO 100 J = 1,N
-        LEN = IP(J+1) - IP(J)
-        IF (LEN.LE.1) GO TO 100
-        IPJ = IP(J)
-
-C Sort array roughly with partial quicksort
-        IF (LEN.LT.THRESH) GO TO 400
-        TODO(1) = IPJ
-        TODO(2) = IPJ + LEN
-        TD = 2
-  500   CONTINUE
-        FIRST = TODO(TD-1)
-        LAST = TODO(TD)
-C KEY is the smallest of two values present in interval [FIRST,LAST)
-        KEY = A((FIRST+LAST)/2)
-        DO 475 K = FIRST,LAST-1
-          HA = A(K)
-          IF (HA.EQ.KEY) GO TO 475
-          IF (HA.GT.KEY) GO TO 470
-          KEY = HA
-          GO TO 470
-  475   CONTINUE
-C Only one value found in interval, so it is already sorted
-        TD = TD - 2
-        GO TO 425
-
-C Reorder interval [FIRST,LAST) such that entries before MID are gt KEY
-  470   MID = FIRST
-        DO 450 K = FIRST,LAST-1
-          IF (A(K).LE.KEY) GO TO 450
-          HA = A(MID)
-          A(MID) = A(K)
-          A(K) = HA
-          HI = IRN(MID)
-          IRN(MID) = IRN(K)
-          IRN(K) = HI
-          MID = MID + 1
-  450   CONTINUE
-C Both subintervals [FIRST,MID), [MID,LAST) are nonempty
-C Stack the longest of the two subintervals first
-        IF (MID-FIRST.GE.LAST-MID) THEN
-          TODO(TD+2) = LAST
-          TODO(TD+1) = MID
-          TODO(TD) = MID
-C          TODO(TD-1) = FIRST
-        ELSE
-          TODO(TD+2) = MID
-          TODO(TD+1) = FIRST
-          TODO(TD) = LAST
-          TODO(TD-1) = MID
-        ENDIF
-        TD = TD + 2
-
-  425   CONTINUE
-        IF (TD.EQ.0) GO TO 400 
-C There is still work to be done
-        IF (TODO(TD)-TODO(TD-1).GE.THRESH) GO TO 500
-C Next interval is already short enough for straightforward insertion
-        TD = TD - 2
-        GO TO 425
-
-C Complete sorting with straightforward insertion
-  400   DO 200 R = IPJ+1,IPJ+LEN-1
-          IF (A(R-1) .LT. A(R)) THEN
-            HA = A(R)
-            HI = IRN(R)
-            A(R) = A(R-1)
-            IRN(R) = IRN(R-1)
-            DO 300 S = R-1,IPJ+1,-1
-              IF (A(S-1) .LT. HA) THEN
-                A(S) = A(S-1)
-                IRN(S) = IRN(S-1)
-              ELSE
-                A(S) = HA
-                IRN(S) = HI
-                GO TO 200 
-              END IF
-  300       CONTINUE
-            A(IPJ) = HA
-            IRN(IPJ) = HI
-          END IF
-  200   CONTINUE
-
-  100 CONTINUE
-
-      RETURN
-      END
-
-C**********************************************************************
-      SUBROUTINE MC64SD(N,NE,IP,IRN,A,IPERM,NUMX,
-     &           W,LEN,LENL,LENH,FC,IW,IW4)
-      IMPLICIT NONE
-C
-C *** Copyright (c) 1999  Council for the Central Laboratory of the
-C     Research Councils                                             ***
-C *** Although every effort has been made to ensure robustness and  ***
-C *** reliability of the subroutines in this MC64 suite, we         ***
-C *** disclaim any liability arising through the use or misuse of   ***
-C *** any of the subroutines.                                       ***
-C *** Any problems?   Contact ...
-C     Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no)   ***
-C
-      INTEGER N,NE,NUMX
-      INTEGER IP(N+1),IRN(NE),IPERM(N), 
-     &        W(N),LEN(N),LENL(N),LENH(N),FC(N),IW(N),IW4(4*N)
-      DOUBLE PRECISION A(NE)
-
-C N, NE, IP, IRN, are described in MC64A/AD.
-C A is a REAL (DOUBLE PRECISION in the D-version) array of length NE.
-C   A(K), K=1..NE, must be set to the value of the entry that 
-C   corresponds to IRN(k). The entries in each column must be 
-C   non-negative and ordered by decreasing value. 
-C IPERM is an INTEGER array of length N. On exit, it contains the 
-C   bottleneck matching: IPERM(I) - 0 or row I is matched to column 
-C   IPERM(I).
-C NUMX is an INTEGER variable. On exit, it contains the cardinality 
-C   of the matching stored in IPERM.
-C IW is an INTEGER work array of length 10N.
-
-C FC is an integer array of length N that contains the list of 
-C   unmatched columns. 
-C LEN(J), LENL(J), LENH(J) are integer arrays of length N that point 
-C   to entries in matrix column J.
-C   In the matrix defined by the column parts IP(J)+LENL(J) we know 
-C   a matching does not exist; in the matrix defined by the column 
-C   parts IP(J)+LENH(J) we know one exists.
-C   LEN(J) lies between LENL(J) and LENH(J) and determines the matrix
-C   that is tested for a maximum matching.
-C W is an integer array of length N and contains the indices of the 
-C   columns for which LENL ne LENH.
-C WLEN is number of indices stored in array W.
-C IW is integer work array of length N.
-C IW4 is integer work array of length 4N used by MC64U/UD.
-
-      INTEGER NUM,NVAL,WLEN,II,I,J,K,L,CNT,MOD,IDUM1,IDUM2,IDUM3
-      DOUBLE PRECISION BVAL,BMIN,BMAX,RINF
-c      EXTERNAL FD05AD,MC64QD,MC64UD
-c      DOUBLE PRECISION FD05AD
-      EXTERNAL MC64QD,MC64UD, HY_DLAMCH
-      DOUBLE PRECISION HY_DLAMCH
-
-C BMIN and BMAX are such that a maximum matching exists for the input
-C   matrix in which all entries smaller than BMIN are dropped. 
-C   For BMAX, a maximum matching does not exist.
-C BVAL is a value between BMIN and BMAX.
-C CNT is the number of calls made to MC64U/UD so far.
-C NUM is the cardinality of last matching found.
-
-C Set RINF to largest positive real number
-c XSL      RINF = FD05AD(5)
-      RINF = HY_DLAMCH('Overflow')
-
-C Compute a first maximum matching from scratch on whole matrix.
-      DO 20 J = 1,N
-        FC(J) = J
-        IW(J) = 0
-        LEN(J) = IP(J+1) - IP(J)
-   20 CONTINUE
-C The first call to MC64U/UD
-      CNT = 1
-      MOD = 1
-      NUMX = 0
-      CALL MC64UD(CNT,MOD,N,IRN,NE,IP,LEN,FC,IW,NUMX,N,
-     &            IW4(1),IW4(N+1),IW4(2*N+1),IW4(3*N+1))
-
-C IW contains a maximum matching of length NUMX.
-      NUM = NUMX
-
-      IF (NUM.NE.N) THEN
-C Matrix is structurally singular
-        BMAX = RINF
-      ELSE
-C Matrix is structurally nonsingular, NUM=NUMX=N;
-C Set BMAX just above the smallest of all the maximum absolute 
-C values of the columns
-        BMAX = RINF
-        DO 30 J = 1,N
-          BVAL = 0.0
-          DO 25 K = IP(J),IP(J+1)-1
-            IF (A(K).GT.BVAL) BVAL = A(K)
-   25     CONTINUE
-          IF (BVAL.LT.BMAX) BMAX = BVAL
-   30   CONTINUE
-        BMAX = 1.001 * BMAX
-      ENDIF
-
-C Initialize BVAL,BMIN
-      BVAL = 0.0
-      BMIN = 0.0
-C Initialize LENL,LEN,LENH,W,WLEN according to BMAX.
-C Set LEN(J), LENH(J) just after last entry in column J. 
-C Set LENL(J) just after last entry in column J with value ge BMAX.
-      WLEN = 0
-      DO 48 J = 1,N
-        L = IP(J+1) - IP(J)
-        LENH(J) = L
-        LEN(J) = L
-        DO 45 K = IP(J),IP(J+1)-1
-          IF (A(K).LT.BMAX) GO TO 46
-   45   CONTINUE
-C Column J is empty or all entries are ge BMAX
-        K = IP(J+1)
-   46   LENL(J) = K - IP(J)
-C Add J to W if LENL(J) ne LENH(J)
-        IF (LENL(J).EQ.L) GO TO 48
-        WLEN = WLEN + 1
-        W(WLEN) = J
-   48 CONTINUE
-
-C Main loop
-      DO 90 IDUM1 = 1,NE
-        IF (NUM.EQ.NUMX) THEN
-C We have a maximum matching in IW; store IW in IPERM
-          DO 50 I = 1,N
-            IPERM(I) = IW(I)
-   50     CONTINUE
-C Keep going round this loop until matching IW is no longer maximum.
-          DO 80 IDUM2 = 1,NE
-            BMIN = BVAL
-            IF (BMAX .EQ. BMIN) GO TO 99
-C Find splitting value BVAL
-            CALL MC64QD(IP,LENL,LEN,W,WLEN,A,NVAL,BVAL)
-            IF (NVAL.LE.1) GO TO 99 
-C Set LEN such that all matrix entries with value lt BVAL are 
-C discarded. Store old LEN in LENH. Do this for all columns W(K).
-C Each step, either K is incremented or WLEN is decremented.
-            K = 1
-            DO 70 IDUM3 = 1,N
-              IF (K.GT.WLEN) GO TO 71
-              J = W(K)
-              DO 55 II = IP(J)+LEN(J)-1,IP(J)+LENL(J),-1
-                IF (A(II).GE.BVAL) GO TO 60 
-                I = IRN(II)
-                IF (IW(I).NE.J) GO TO 55
-C Remove entry from matching
-                IW(I) = 0
-                NUM = NUM - 1
-                FC(N-NUM) = J
-   55         CONTINUE
-   60         LENH(J) = LEN(J)
-C IP(J)+LEN(J)-1 is last entry in column ge BVAL
-              LEN(J) = II - IP(J) + 1
-C If LENH(J) = LENL(J), remove J from W
-              IF (LENL(J).EQ.LENH(J)) THEN
-                W(K) = W(WLEN)
-                WLEN = WLEN - 1
-              ELSE
-                K = K + 1
-              ENDIF
-   70       CONTINUE
-   71       IF (NUM.LT.NUMX) GO TO 81
-   80     CONTINUE
-C End of dummy loop; this point is never reached
-C Set mode for next call to MC64U/UD
-   81     MOD = 1
-        ELSE
-C We do not have a maximum matching in IW. 
-          BMAX = BVAL
-C BMIN is the bottleneck value of a maximum matching;
-C for BMAX the matching is not maximum, so BMAX>BMIN
-C          IF (BMAX .EQ. BMIN) GO TO 99
-C Find splitting value BVAL
-          CALL MC64QD(IP,LEN,LENH,W,WLEN,A,NVAL,BVAL)
-          IF (NVAL.EQ.0. OR. BVAL.EQ.BMIN) GO TO 99 
-C Set LEN such that all matrix entries with value ge BVAL are
-C inside matrix. Store old LEN in LENL. Do this for all columns W(K).
-C Each step, either K is incremented or WLEN is decremented.
-          K = 1
-          DO 87 IDUM3 = 1,N
-            IF (K.GT.WLEN) GO TO 88
-            J = W(K)
-            DO 85 II = IP(J)+LEN(J),IP(J)+LENH(J)-1
-              IF (A(II).LT.BVAL) GO TO 86
-   85       CONTINUE
-   86       LENL(J) = LEN(J)
-            LEN(J) = II - IP(J)
-            IF (LENL(J).EQ.LENH(J)) THEN
-              W(K) = W(WLEN)
-              WLEN = WLEN - 1
-            ELSE
-              K = K + 1
-            ENDIF
-   87     CONTINUE
-C End of dummy loop; this point is never reached
-C Set mode for next call to MC64U/UD
-   88     MOD = 0
-        ENDIF
-        CNT = CNT + 1
-        CALL MC64UD(CNT,MOD,N,IRN,NE,IP,LEN,FC,IW,NUM,NUMX,
-     &            IW4(1),IW4(N+1),IW4(2*N+1),IW4(3*N+1))
-
-C IW contains maximum matching of length NUM
-   90 CONTINUE 
-C End of dummy loop; this point is never reached
-
-C BMIN is bottleneck value of final matching
-   99 IF (NUMX.EQ.N) GO TO 1000
-C The matrix is structurally singular, complete IPERM
-C W, IW are work arrays
-      DO 300 J = 1,N
-        W(J) = 0
-  300 CONTINUE
-      K = 0
-      DO 310 I = 1,N
-        IF (IPERM(I).EQ.0) THEN
-          K = K + 1
-          IW(K) = I
-        ELSE
-          J = IPERM(I)
-          W(J) = I
-        ENDIF
-  310 CONTINUE
-      K = 0
-      DO 320 J = 1,N
-        IF (W(J).NE.0) GO TO 320
-        K = K + 1
-        IDUM1 = IW(K)
-        IPERM(IDUM1) = J
-  320 CONTINUE
-
- 1000 RETURN
-      END
-
-C**********************************************************************
-      SUBROUTINE MC64QD(IP,LENL,LENH,W,WLEN,A,NVAL,VAL)
-      IMPLICIT NONE
-C
-C *** Copyright (c) 1999  Council for the Central Laboratory of the
-C     Research Councils                                             ***
-C *** Although every effort has been made to ensure robustness and  ***
-C *** reliability of the subroutines in this MC64 suite, we         ***
-C *** disclaim any liability arising through the use or misuse of   ***
-C *** any of the subroutines.                                       ***
-C *** Any problems?   Contact ...
-C     Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no)   ***
-C
-      INTEGER WLEN,NVAL
-      INTEGER IP(*),LENL(*),LENH(*),W(*)
-      DOUBLE PRECISION A(*),VAL
-
-C This routine searches for at most XX different numerical values 
-C in the columns W(1:WLEN). XX>=2. 
-C Each column J is scanned between IP(J)+LENL(J) and IP(J)+LENH(J)-1 
-C until XX values are found or all columns have been considered.
-C On output, NVAL is the number of different values that is found 
-C and SPLIT(1:NVAL) contains the values in decreasing order.
-C If NVAL > 0, the routine returns VAL = SPLIT((NVAL+1)/2). 
-C
-      INTEGER XX,J,K,II,S,POS
-      PARAMETER (XX=10)
-      DOUBLE PRECISION SPLIT(XX),HA
-
-C Scan columns in W(1:WLEN). For each encountered value, if value not
-C already present in SPLIT(1:NVAL), insert value such that SPLIT 
-C remains sorted by decreasing value. 
-C The sorting is done by straightforward insertion; therefore the use
-C of this routine should be avoided for large XX (XX < 20).
-      NVAL = 0 
-      DO 10 K = 1,WLEN
-        J = W(K)
-        DO 15 II = IP(J)+LENL(J),IP(J)+LENH(J)-1
-          HA = A(II)
-          IF (NVAL.EQ.0) THEN
-            SPLIT(1) = HA
-            NVAL = 1
-          ELSE
-C Check presence of HA in SPLIT
-            DO 20 S = NVAL,1,-1
-              IF (SPLIT(S).EQ.HA) GO TO 15
-              IF (SPLIT(S).GT.HA) THEN
-                POS = S + 1
-                GO TO 21
-              ENDIF
-  20        CONTINUE
-            POS = 1
-C The insertion 
-  21        DO 22 S = NVAL,POS,-1
-              SPLIT(S+1) = SPLIT(S)
-  22        CONTINUE
-            SPLIT(POS) = HA
-            NVAL = NVAL + 1
-          ENDIF
-C Exit loop if XX values are found
-          IF (NVAL.EQ.XX) GO TO 11
-  15    CONTINUE
-  10  CONTINUE
-C Determine VAL
-  11  IF (NVAL.GT.0) VAL = SPLIT((NVAL+1)/2)
-
-      RETURN
-      END
-      
-C**********************************************************************
-      SUBROUTINE MC64UD(ID,MOD,N,IRN,LIRN,IP,LENC,FC,IPERM,NUM,NUMX,
-     &           PR,ARP,CV,OUT)
-      IMPLICIT NONE
-C
-C *** Copyright (c) 1999  Council for the Central Laboratory of the
-C     Research Councils                                             ***
-C *** Although every effort has been made to ensure robustness and  ***
-C *** reliability of the subroutines in this MC64 suite, we         ***
-C *** disclaim any liability arising through the use or misuse of   ***
-C *** any of the subroutines.                                       ***
-C *** Any problems?   Contact ...
-C     Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no)   ***
-C
-      INTEGER ID,MOD,N,LIRN,NUM,NUMX
-      INTEGER ARP(N),CV(N),IRN(LIRN),IP(N),
-     &        FC(N),IPERM(N),LENC(N),OUT(N),PR(N)
-
-C PR(J) is the previous column to J in the depth first search.
-C   Array PR is used as workspace in the sorting algorithm.
-C Elements (I,IPERM(I)) I=1,..,N are entries at the end of the
-C   algorithm unless N assignments have not been made in which case
-C   N-NUM pairs (I,IPERM(I)) will not be entries in the matrix.
-C CV(I) is the most recent loop number (ID+JORD) at which row I
-C   was visited.
-C ARP(J) is the number of entries in column J which have been scanned 
-C   when looking for a cheap assignment.
-C OUT(J) is one less than the number of entries in column J which have 
-C   not been scanned during one pass through the main loop.
-C NUMX is maximum possible size of matching.
-
-      INTEGER I,II,IN1,IN2,J,J1,JORD,K,KK,LAST,NFC,
-     &        NUM0,NUM1,NUM2,ID0,ID1
-
-      IF (ID.EQ.1) THEN
-C The first call to MC64U/UD.
-C Initialize CV and ARP; parameters MOD, NUMX are not accessed
-        DO 5 I = 1,N
-          CV(I) = 0
-          ARP(I) = 0
-    5   CONTINUE
-        NUM1 = N
-        NUM2 = N
-      ELSE
-C Not the first call to MC64U/UD.
-C Re-initialize ARP if entries were deleted since last call to MC64U/UD
-        IF (MOD.EQ.1) THEN
-          DO 8 I = 1,N
-            ARP(I) = 0
-    8     CONTINUE
-        ENDIF
-        NUM1 = NUMX
-        NUM2 = N - NUMX
-      ENDIF
-      NUM0 = NUM
-
-C NUM0 is size of input matching
-C NUM1 is maximum possible size of matching
-C NUM2 is maximum allowed number of unassigned rows/columns
-C NUM is size of current matching
-
-C Quick return if possible
-C      IF (NUM.EQ.N) GO TO 199
-C NFC is number of rows/columns that could not be assigned
-      NFC = 0
-C Integers ID0+1 to ID0+N are unique numbers for call ID to MC64U/UD,
-C so 1st call uses 1..N, 2nd call uses N+1..2N, etc
-      ID0 = (ID-1)*N 
-
-C Main loop. Each pass round this loop either results in a new
-C assignment or gives a column with no assignment
-
-      DO 100 JORD = NUM0+1,N
-
-C Each pass uses unique number ID1
-        ID1 = ID0 + JORD
-C J is unmatched column
-        J = FC(JORD-NUM0)
-        PR(J) = -1
-        DO 70 K = 1,JORD
-C Look for a cheap assignment
-          IF (ARP(J).GE.LENC(J)) GO TO 30
-          IN1 = IP(J) + ARP(J)
-          IN2 = IP(J) + LENC(J) - 1
-          DO 20 II = IN1,IN2
-            I = IRN(II)
-            IF (IPERM(I).EQ.0) GO TO 80
-   20     CONTINUE
-C No cheap assignment in row
-          ARP(J) = LENC(J)
-C Begin looking for assignment chain starting with row J
-   30     OUT(J) = LENC(J) - 1
-C Inner loop.  Extends chain by one or backtracks
-          DO 60 KK = 1,JORD
-            IN1 = OUT(J)
-            IF (IN1.LT.0) GO TO 50
-            IN2 = IP(J) + LENC(J) - 1
-            IN1 = IN2 - IN1
-C Forward scan
-            DO 40 II = IN1,IN2
-              I = IRN(II)
-              IF (CV(I).EQ.ID1) GO TO 40
-C Column J has not yet been accessed during this pass
-              J1 = J
-              J = IPERM(I)
-              CV(I) = ID1
-              PR(J) = J1
-              OUT(J1) = IN2 - II - 1
-              GO TO 70
-   40       CONTINUE
-C Backtracking step.
-   50       J1 = PR(J)
-            IF (J1.EQ.-1) THEN
-C No augmenting path exists for column J.
-              NFC = NFC + 1
-              FC(NFC) = J
-              IF (NFC.GT.NUM2) THEN
-C A matching of maximum size NUM1 is not possible
-                LAST = JORD
-                GO TO 101
-              ENDIF
-              GO TO 100
-            ENDIF
-            J = J1
-   60     CONTINUE
-C End of dummy loop; this point is never reached
-   70   CONTINUE
-C End of dummy loop; this point is never reached
-
-C New assignment is made.
-   80   IPERM(I) = J
-        ARP(J) = II - IP(J) + 1
-        NUM = NUM + 1
-        DO 90 K = 1,JORD
-          J = PR(J)
-          IF (J.EQ.-1) GO TO 95
-          II = IP(J) + LENC(J) - OUT(J) - 2
-          I = IRN(II)
-          IPERM(I) = J
-   90   CONTINUE
-C End of dummy loop; this point is never reached
-
-   95   IF (NUM.EQ.NUM1) THEN
-C A matching of maximum size NUM1 is found
-          LAST = JORD
-          GO TO 101
-        ENDIF
-C
-  100 CONTINUE
-
-C All unassigned columns have been considered
-      LAST = N
-
-C Now, a transversal is computed or is not possible.
-C Complete FC before returning.
-  101 DO 110 JORD = LAST+1,N
-        NFC = NFC + 1
-        FC(NFC) = FC(JORD-NUM0)
-  110 CONTINUE
-
-C  199 RETURN
-      RETURN
-      END
-
-C**********************************************************************
-      SUBROUTINE MC64WD(N,NE,IP,IRN,A,IPERM,NUM,
-     &           JPERM,OUT,PR,Q,L,U,D) 
-      IMPLICIT NONE
-C
-C *** Copyright (c) 1999  Council for the Central Laboratory of the
-C     Research Councils                                             ***
-C *** Although every effort has been made to ensure robustness and  ***
-C *** reliability of the subroutines in this MC64 suite, we         ***
-C *** disclaim any liability arising through the use or misuse of   ***
-C *** any of the subroutines.                                       ***
-C *** Any problems?   Contact ...
-C     Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no)   ***
-C
-      INTEGER N,NE,NUM
-      INTEGER IP(N+1),IRN(NE),IPERM(N),
-     &        JPERM(N),OUT(N),PR(N),Q(N),L(N)
-      DOUBLE PRECISION A(NE),U(N),D(N)
-
-C N, NE, IP, IRN are described in MC64A/AD.
-C A is a REAL (DOUBLE PRECISION in the D-version) array of length NE.
-C   A(K), K=1..NE, must be set to the value of the entry that
-C   corresponds to IRN(K). It is not altered.
-C   All values A(K) must be non-negative.
-C IPERM is an INTEGER array of length N. On exit, it contains the 
-C   weighted matching: IPERM(I) = 0 or row I is matched to column 
-C   IPERM(I). 
-C NUM is an INTEGER variable. On exit, it contains the cardinality of 
-C   the matching stored in IPERM.
-C IW is an INTEGER work array of length 5N.
-C DW is a REAL (DOUBLE PRECISION in the D-version) array of length 2N.
-C   On exit, U = D(1:N) contains the dual row variable and 
-C   V = D(N+1:2N) contains the dual column variable. If the matrix 
-C   is structurally nonsingular (NUM = N), the following holds:
-C      U(I)+V(J) <= A(I,J)  if IPERM(I) |= J
-C      U(I)+V(J)  = A(I,J)  if IPERM(I)  = J
-C      U(I) = 0  if IPERM(I) = 0
-C      V(J) = 0  if there is no I for which IPERM(I) = J
-
-C Local variables
-      INTEGER I,I0,II,J,JJ,JORD,Q0,QLEN,JDUM,ISP,JSP,
-     &        K,K0,K1,K2,KK,KK1,KK2,UP,LOW
-      DOUBLE PRECISION CSP,DI,DMIN,DNEW,DQ0,VJ
-C Local parameters
-      DOUBLE PRECISION RINF,ZERO
-      PARAMETER (ZERO=0.0D+0)
-C External subroutines and/or functions
-c      EXTERNAL FD05AD,MC64DD,MC64ED,MC64FD
-c      DOUBLE PRECISION FD05AD
-      EXTERNAL MC64DD,MC64ED,MC64FD, HY_DLAMCH
-      DOUBLE PRECISION HY_DLAMCH
-
-
-C Set RINF to largest positive real number
-c XSL      RINF = FD05AD(5)
-      RINF = HY_DLAMCH('Overflow')
-
-C Initialization
-      NUM = 0
-      DO 10 K = 1,N
-        U(K) = RINF
-        D(K) = ZERO
-        IPERM(K) = 0
-        JPERM(K) = 0
-        PR(K) = IP(K)
-        L(K) = 0
-   10 CONTINUE
-C Initialize U(I) 
-      DO 30 J = 1,N
-        DO 20 K = IP(J),IP(J+1)-1
-          I = IRN(K)
-          IF (A(K).GT.U(I)) GO TO 20
-          U(I) = A(K)
-          IPERM(I) = J
-          L(I) = K
-   20   CONTINUE
-   30 CONTINUE
-      DO 40 I = 1,N
-        J = IPERM(I)
-        IF (J.EQ.0) GO TO 40
-C Row I is not empty
-        IPERM(I) = 0
-        IF (JPERM(J).NE.0) GO TO 40
-C Assignment of column J to row I
-        NUM = NUM + 1
-        IPERM(I) = J
-        JPERM(J) = L(I)
-   40 CONTINUE
-      IF (NUM.EQ.N) GO TO 1000
-C Scan unassigned columns; improve assignment
-      DO 95 J = 1,N
-C JPERM(J) ne 0 iff column J is already assigned
-        IF (JPERM(J).NE.0) GO TO 95
-        K1 = IP(J)
-        K2 = IP(J+1) - 1
-C Continue only if column J is not empty
-        IF (K1.GT.K2) GO TO 95
-        VJ = RINF
-        DO 50 K = K1,K2
-          I = IRN(K)
-          DI = A(K) - U(I)
-          IF (DI.GT.VJ) GO TO 50
-          IF (DI.LT.VJ .OR. DI.EQ.RINF) GO TO 55
-          IF (IPERM(I).NE.0 .OR. IPERM(I0).EQ.0) GO TO 50
-   55     VJ = DI
-          I0 = I
-          K0 = K
-   50   CONTINUE
-        D(J) = VJ
-        K = K0
-        I = I0
-        IF (IPERM(I).EQ.0) GO TO 90
-        DO 60 K = K0,K2
-          I = IRN(K)
-          IF (A(K)-U(I).GT.VJ) GO TO 60 
-          JJ = IPERM(I)
-C Scan remaining part of assigned column JJ 
-          KK1 = PR(JJ)
-          KK2 = IP(JJ+1) - 1
-          IF (KK1.GT.KK2) GO TO 60
-          DO 70 KK = KK1,KK2
-            II = IRN(KK)
-            IF (IPERM(II).GT.0) GO TO 70
-            IF (A(KK)-U(II).LE.D(JJ)) GO TO 80
-   70     CONTINUE
-          PR(JJ) = KK2 + 1
-   60   CONTINUE
-        GO TO 95
-   80   JPERM(JJ) = KK
-        IPERM(II) = JJ
-        PR(JJ) = KK + 1
-   90   NUM = NUM + 1
-        JPERM(J) = K
-        IPERM(I) = J
-        PR(J) = K + 1
-   95 CONTINUE
-      IF (NUM.EQ.N) GO TO 1000
-  
-C Prepare for main loop
-      DO 99 I = 1,N
-        D(I) = RINF
-        L(I) = 0
-   99 CONTINUE
-
-C Main loop ... each pass round this loop is similar to Dijkstra's 
-C algorithm for solving the single source shortest path problem 
-
-      DO 100 JORD = 1,N
-
-        IF (JPERM(JORD).NE.0) GO TO 100
-C JORD is next unmatched column
-C DMIN is the length of shortest path in the tree
-        DMIN = RINF
-        QLEN = 0
-        LOW = N + 1
-        UP = N + 1
-C CSP is the cost of the shortest augmenting path to unassigned row
-C IRN(ISP). The corresponding column index is JSP.
-        CSP = RINF
-C Build shortest path tree starting from unassigned column (root) JORD
-        J = JORD
-        PR(J) = -1
-
-C Scan column J
-        DO 115 K = IP(J),IP(J+1)-1
-          I = IRN(K)
-          DNEW = A(K) - U(I)
-          IF (DNEW.GE.CSP) GO TO 115
-          IF (IPERM(I).EQ.0) THEN
-            CSP = DNEW
-            ISP = K
-            JSP = J
-          ELSE
-            IF (DNEW.LT.DMIN) DMIN = DNEW
-            D(I) = DNEW
-            QLEN = QLEN + 1
-            Q(QLEN) = K
-          ENDIF
-  115   CONTINUE
-C Initialize heap Q and Q2 with rows held in Q(1:QLEN)
-        Q0 = QLEN
-        QLEN = 0
-        DO 120 KK = 1,Q0
-          K = Q(KK)
-          I = IRN(K)
-          IF (CSP.LE.D(I)) THEN
-            D(I) = RINF
-            GO TO 120
-          ENDIF
-          IF (D(I).LE.DMIN) THEN
-            LOW = LOW - 1
-            Q(LOW) = I
-            L(I) = LOW
-          ELSE
-            QLEN = QLEN + 1
-            L(I) = QLEN
-            CALL MC64DD(I,N,Q,D,L,2)
-          ENDIF
-C Update tree
-          JJ = IPERM(I)
-          OUT(JJ) = K
-          PR(JJ) = J
-  120   CONTINUE
-
-        DO 150 JDUM = 1,NUM
-
-C If Q2 is empty, extract rows from Q 
-          IF (LOW.EQ.UP) THEN
-            IF (QLEN.EQ.0) GO TO 160
-            I = Q(1)
-            IF (D(I).GE.CSP) GO TO 160
-            DMIN = D(I)
-  152       CALL MC64ED(QLEN,N,Q,D,L,2)
-            LOW = LOW - 1
-            Q(LOW) = I
-            L(I) = LOW
-            IF (QLEN.EQ.0) GO TO 153
-            I = Q(1)
-            IF (D(I).GT.DMIN) GO TO 153
-            GO TO 152
-          ENDIF
-C Q0 is row whose distance D(Q0) to the root is smallest
-  153     Q0 = Q(UP-1)
-          DQ0 = D(Q0)
-C Exit loop if path to Q0 is longer than the shortest augmenting path 
-          IF (DQ0.GE.CSP) GO TO 160
-          UP = UP - 1
-
-C Scan column that matches with row Q0
-          J = IPERM(Q0)
-          VJ = DQ0 - A(JPERM(J)) + U(Q0)
-          DO 155 K = IP(J),IP(J+1)-1
-            I = IRN(K)
-            IF (L(I).GE.UP) GO TO 155
-C DNEW is new cost
-            DNEW = VJ + A(K)-U(I)
-C Do not update D(I) if DNEW ge cost of shortest path
-            IF (DNEW.GE.CSP) GO TO 155
-            IF (IPERM(I).EQ.0) THEN
-C Row I is unmatched; update shortest path info
-              CSP = DNEW
-              ISP = K
-              JSP = J
-            ELSE
-C Row I is matched; do not update D(I) if DNEW is larger
-              DI = D(I)
-              IF (DI.LE.DNEW) GO TO 155
-              IF (L(I).GE.LOW) GO TO 155
-              D(I) = DNEW
-              IF (DNEW.LE.DMIN) THEN
-                IF (L(I).NE.0) 
-     *            CALL MC64FD(L(I),QLEN,N,Q,D,L,2)
-                LOW = LOW - 1
-                Q(LOW) = I
-                L(I) = LOW
-              ELSE   
-                IF (L(I).EQ.0) THEN
-                  QLEN = QLEN + 1
-                  L(I) = QLEN
-                ENDIF
-                CALL MC64DD(I,N,Q,D,L,2)
-              ENDIF
-C Update tree
-              JJ = IPERM(I)
-              OUT(JJ) = K
-              PR(JJ) = J
-            ENDIF
-  155     CONTINUE
-  150   CONTINUE
-       
-C If CSP = RINF, no augmenting path is found
-  160   IF (CSP.EQ.RINF) GO TO 190
-C Find augmenting path by tracing backward in PR; update IPERM,JPERM
-        NUM = NUM + 1
-        I = IRN(ISP)
-        IPERM(I) = JSP
-        JPERM(JSP) = ISP
-        J = JSP
-        DO 170 JDUM = 1,NUM
-          JJ = PR(J) 
-          IF (JJ.EQ.-1) GO TO 180
-          K = OUT(J)
-          I = IRN(K)
-          IPERM(I) = JJ
-          JPERM(JJ) = K
-          J = JJ
-  170   CONTINUE
-C End of dummy loop; this point is never reached
-
-C Update U for rows in Q(UP:N)
-  180   DO 185 KK = UP,N
-          I = Q(KK)
-          U(I) = U(I) + D(I) - CSP
-  185   CONTINUE 
-  190   DO 191 KK = LOW,N
-          I = Q(KK)
-          D(I) = RINF
-          L(I) = 0
-  191   CONTINUE 
-        DO 193 KK = 1,QLEN
-          I = Q(KK)
-          D(I) = RINF
-          L(I) = 0
-  193   CONTINUE
-
-  100 CONTINUE
-C End of main loop
-
-
-C Set dual column variable in D(1:N)
- 1000 DO 200 J = 1,N
-        K = JPERM(J)
-        IF (K.NE.0) THEN
-          D(J) = A(K) - U(IRN(K))
-        ELSE
-          D(J) = ZERO
-        ENDIF
-        IF (IPERM(J).EQ.0) U(J) = ZERO
-  200 CONTINUE
-
-      IF (NUM.EQ.N) GO TO 1100
-
-C The matrix is structurally singular, complete IPERM.
-C JPERM, OUT are work arrays
-      DO 300 J = 1,N
-        JPERM(J) = 0
-  300 CONTINUE
-      K = 0
-      DO 310 I = 1,N
-        IF (IPERM(I).EQ.0) THEN
-          K = K + 1
-          OUT(K) = I
-        ELSE
-          J = IPERM(I)
-          JPERM(J) = I
-        ENDIF
-  310 CONTINUE
-      K = 0
-      DO 320 J = 1,N
-        IF (JPERM(J).NE.0) GO TO 320
-        K = K + 1
-        JDUM = OUT(K)
-        IPERM(JDUM) = J
-  320 CONTINUE
- 1100 RETURN
-      END
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/memory.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/memory.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/memory.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/memory.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,545 +0,0 @@
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-
-#include "superlu_ddefs.h"
-
-/*
- * Global variables
- */
-ExpHeader *expanders; /* Array of pointers to 4 types of memory */
-LU_stack_t stack;
-int_t no_expand;
-
-
-/*
- * Prototype
- */
-static int_t memory_usage(const int_t, const int_t, const int_t);
-static void *expand(int_t *, MemType, int_t, int_t,
-		    Glu_freeable_t *);
-
-/*
- * Internal prototypes
- */
-void  SetupSpace (void *, int_t, LU_space_t *);
-
-
-void
-superlu_abort_and_exit_dist(char *msg)
-{
-    /*fprintf(stderr, msg);
-    fflush(stderr);*/
-    printf(msg);
-    exit (-1);
-}
-
-long int superlu_malloc_total = 0;
-
-#if ( DEBUGlevel>=1 )           /* Debug malloc/free. */
-
-#define PAD_FACTOR  2
-#define DWORD  (sizeof(double)) /* Be sure it's no smaller than double. */
-
-void *superlu_malloc_dist(size_t size)
-{
-    char *buf;
-    int iam;
-
-    MPI_Comm_rank(MPI_COMM_WORLD, &iam);
-    buf = (char *) malloc(size + DWORD);
-    if ( !buf ) {
-	printf("(%d) superlu_malloc fails: malloc_total %.0f MB, size %ld\n",
-	       iam, superlu_malloc_total*1e-6, size);
-	ABORT("superlu_malloc: out of memory");
-    }
-
-    ((int_t *) buf)[0] = size;
-#if 0
-    superlu_malloc_total += size + DWORD;
-#else
-    superlu_malloc_total += size;
-#endif
-    return (void *) (buf + DWORD);
-}
-
-void superlu_free_dist(void *addr)
-{
-    char *p = ((char *) addr) - DWORD;
-
-    if ( !addr )
-	ABORT("superlu_free: tried to free NULL pointer");
-
-    if ( !p )
-	ABORT("superlu_free: tried to free NULL+DWORD pointer");
-
-    { 
-	int_t n = ((int_t *) p)[0];
-	
-	if ( !n )
-	    ABORT("superlu_free: tried to free a freed pointer");
-	*((int_t *) p) = 0; /* Set to zero to detect duplicate free's. */
-#if 0	
-	superlu_malloc_total -= (n + DWORD);
-#else
-	superlu_malloc_total -= n;
-#endif
-
-	if ( superlu_malloc_total < 0 )
-	    ABORT("superlu_malloc_total went negative");
-	
-	/*free (addr);*/
-	free (p);
-    }
-
-}
-
-#else  /* The production mode. */
-
-void *superlu_malloc_dist(size_t size)
-{
-    void *buf;
-    buf = (void *) malloc(size);
-    return (buf);
-}
-
-void superlu_free_dist(void *addr)
-{
-    free (addr);
-}
-
-#endif  /* End debug malloc/free. */
-
-
-
-static void
-copy_mem_int(int_t howmany, void *old, void *new)
-{
-    register int_t i;
-    int_t *iold = old;
-    int_t *inew = new;
-    for (i = 0; i < howmany; i++) inew[i] = iold[i];
-}
-
-
-static void
-user_bcopy(char *src, char *dest, int_t bytes)
-{
-    char *s_ptr, *d_ptr;
-
-    s_ptr = src + bytes - 1;
-    d_ptr = dest + bytes - 1;
-    for (; d_ptr >= dest; --s_ptr, --d_ptr ) *d_ptr = *s_ptr;
-}
-
-
-
-int_t *intMalloc_dist(int_t n)
-{
-    int_t *buf;
-    buf = (int_t *) SUPERLU_MALLOC((size_t) SUPERLU_MAX(1,n) * sizeof(int_t));
-    return (buf);
-}
-
-int_t *intCalloc_dist(int_t n)
-{
-    int_t *buf;
-    register int_t i;
-    buf = (int_t *) SUPERLU_MALLOC((size_t) SUPERLU_MAX(1,n) * sizeof(int_t));
-    if ( buf )
-	for (i = 0; i < n; ++i) buf[i] = 0;
-    return (buf);
-}
-
-
-void *user_malloc_dist(int_t bytes, int_t which_end)
-{
-    void *buf;
-    
-    if ( StackFull(bytes) ) return (NULL);
-
-    if ( which_end == HEAD ) {
-	buf = (char*) stack.array + stack.top1;
-	stack.top1 += bytes;
-    } else {
-	stack.top2 -= bytes;
-	buf = (char*) stack.array + stack.top2;
-    }
-    
-    stack.used += bytes;
-    return buf;
-}
-
-void user_free_dist(int_t bytes, int_t which_end)
-{
-    if ( which_end == HEAD ) {
-	stack.top1 -= bytes;
-    } else {
-	stack.top2 += bytes;
-    }
-    stack.used -= bytes;
-}
-
-
-/*
- * Setup the memory model to be used for factorization.
- *    lwork = 0: use system malloc;
- *    lwork > 0: use user-supplied work[] space.
- */
-void SetupSpace(void *work, int_t lwork, LU_space_t *MemModel)
-{
-    if ( lwork == 0 ) {
-	*MemModel = SYSTEM; /* malloc/free */
-    } else if ( lwork > 0 ) {
-	*MemModel = USER;   /* user provided space */
-	stack.used = 0;
-	stack.top1 = 0;
-	stack.top2 = (lwork/4)*4; /* must be word addressable */
-	stack.size = stack.top2;
-	stack.array = (void *) work;
-    }
-}
-
-
-
-/*
- * Allocate storage for the data structures common to symbolic factorization
- * routines. For those unpredictable size, make a guess as FILL * nnz(A).
- * Return value:
- *     If lwork = -1, return the estimated amount of space required, plus n;
- *     otherwise, return the amount of space actually allocated when
- *     memory allocation failure occurred.
- */
-/************************************************************************/
-int_t symbfact_SubInit
-/************************************************************************/
-(
- fact_t fact, void *work, int_t lwork, int_t m, int_t n, int_t annz,
- Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable
- )
-{
-    int_t  iword;
-    int_t  *xsup, *supno;
-    int_t  *lsub, *xlsub;
-    int_t  *usub, *xusub;
-    int_t  nzlmax, nzumax;
-    int_t  FILL = sp_ienv_dist(6);
-
-#if ( DEBUGlevel>=1 )
-    int iam;
-    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
-    CHECK_MALLOC(iam, "Enter symbfact_SubInit()");
-#endif
-
-    no_expand = 0;
-    iword     = sizeof(int_t);
-
-    expanders = (ExpHeader *) SUPERLU_MALLOC( NO_MEMTYPE*sizeof(ExpHeader) );
-    if ( !expanders ) ABORT("SUPERLU_MALLOC fails for expanders");
-    
-    if ( fact == DOFACT || fact == SamePattern ) {
-	/* Guess for L\U factors */
-	nzlmax = FILL * annz;
-	nzumax = FILL/2.0 * annz;
-
-	if ( lwork == -1 ) {
-	    return ( GluIntArray(n) * iword + TempSpace(m,1)
-		    + (nzlmax+nzumax)*iword + n );
-        } else {
-	    SetupSpace(work, lwork, &Glu_freeable->MemModel);
-	}
-	
-#if ( PRNTlevel>=2 )
-	printf(".. symbfact_SubInit(): annz %ld, nzlmax %ld, nzumax %ld\n", 
-		annz, nzlmax, nzumax);
-#endif	
-	
-	/* Integer pointers for L\U factors */
-	if ( Glu_freeable->MemModel == SYSTEM ) {
-	    xsup   = intMalloc_dist(n+1);
-	    supno  = intMalloc_dist(n+1);
-	    xlsub  = intMalloc_dist(n+1);
-	    xusub  = intMalloc_dist(n+1);
-	} else {
-	    xsup   = (int_t *)user_malloc_dist((n+1) * iword, HEAD);
-	    supno  = (int_t *)user_malloc_dist((n+1) * iword, HEAD);
-	    xlsub  = (int_t *)user_malloc_dist((n+1) * iword, HEAD);
-	    xusub  = (int_t *)user_malloc_dist((n+1) * iword, HEAD);
-	}
-
-	lsub  = (int_t *) expand(&nzlmax, LSUB, 0, 0, Glu_freeable);
-	usub  = (int_t *) expand(&nzumax, USUB, 0, 0, Glu_freeable);
-
-	while ( !lsub || !usub ) {
-	    if ( Glu_freeable->MemModel == SYSTEM ) {
-		SUPERLU_FREE(lsub); 
-		SUPERLU_FREE(usub);
-	    } else {
-		user_free_dist((nzlmax+nzumax)*iword, HEAD);
-	    }
-	    nzlmax /= 2;
-	    nzumax /= 2;
-	    if ( nzumax < annz/2 ) {
-		printf("Not enough memory to perform factorization.\n");
-		return (memory_usage(nzlmax, nzumax, n) + n);
-	    }
-#if ( PRNTlevel>=1 )
-	    printf(".. symbfact_SubInit() reduce size:"
-		   "nzlmax %ld, nzumax %ld\n", nzlmax, nzumax);
-	    fflush(stdout);
-#endif
-	    lsub  = (int_t *) expand( &nzlmax, LSUB, 0, 0, Glu_freeable );
-	    usub  = (int_t *) expand( &nzumax, USUB, 0, 1, Glu_freeable );
-	}
-
-	Glu_persist->xsup    = xsup;
-	Glu_persist->supno   = supno;
-	Glu_freeable->lsub   = lsub;
-	Glu_freeable->xlsub  = xlsub;
-	Glu_freeable->usub   = usub;
-	Glu_freeable->xusub  = xusub;
-	Glu_freeable->nzlmax = nzlmax;
-	Glu_freeable->nzumax = nzumax;
-    } else {
-	/* fact == SamePattern_SameRowPerm */
-	if ( lwork == -1 ) {
-	    return ( GluIntArray(n) * iword + TempSpace(m, 1)
-		    + (nzlmax+nzumax)*iword + n );
-        } else if ( lwork == 0 ) {
-	    Glu_freeable->MemModel = SYSTEM;
-	} else {
-	    Glu_freeable->MemModel = USER;
-	    stack.top2 = (lwork/4)*4; /* must be word-addressable */
-	    stack.size = stack.top2;
-	}
-	
-	expanders[USUB].mem = Glu_freeable->usub;
-	expanders[LSUB].mem = Glu_freeable->lsub;
-	expanders[USUB].size = nzumax;
-	expanders[LSUB].size = nzlmax;
-    }
-
-    ++no_expand;
-
-#if ( DEBUGlevel>=1 )
-    /* Memory allocated but not freed: xsup, supno */
-    CHECK_MALLOC(iam, "Exit symbfact_SubInit()");
-#endif
-
-    return 0;
-    
-} /* SYMBFACT_SUBINIT */
-
-/*
- * Expand the data structures for L and U during the factorization.
- * Return value:   0 - successful return
- *               > 0 - number of bytes allocated when run out of space
- */
-/************************************************************************/
-int_t symbfact_SubXpand
-/************************************************************************/
-(
- int_t n,           /* total number of columns */
- int_t jcol,        /* current column */
- int_t next,        /* number of elements currently in the factors */
- MemType mem_type,  /* which type of memory to expand  */
- int_t *maxlen,     /* modified - maximum length of a data structure */
- Glu_freeable_t *Glu_freeable  /* modified - global LU data structures */
- )
-{
-    void   *new_mem;
-    
-#if ( DEBUGlevel>=1 )
-    printf("symbfact_SubXpand(): jcol %d, next %ld, maxlen %ld, MemType %d\n",
-	   jcol, next, *maxlen, mem_type);
-#endif    
-
-    new_mem = expand(maxlen, mem_type, next, 0, Glu_freeable);
-    
-    if ( !new_mem ) {
-	int_t    nzlmax  = Glu_freeable->nzlmax;
-	int_t    nzumax  = Glu_freeable->nzumax;
-    	fprintf(stderr, "Can't expand MemType %d: jcol %d\n", mem_type, jcol);
-    	return (memory_usage(nzlmax, nzumax, n) + n);
-    }
-
-    if ( mem_type == LSUB ) {
-	Glu_freeable->lsub   = (int_t *) new_mem;
-	Glu_freeable->nzlmax = *maxlen;
-    } else if ( mem_type == USUB ) {
-	Glu_freeable->usub   = (int_t *) new_mem;
-	Glu_freeable->nzumax = *maxlen;
-    } else ABORT("Tries to expand nonexisting memory type.\n");
-    
-    return 0;
-    
-} /* LUSUB_XPAND */
-
-/*
- * Deallocate storage of the data structures common to symbolic
- * factorization routines.
- */
-/************************************************************************/
-int_t symbfact_SubFree(Glu_freeable_t *Glu_freeable)
-/************************************************************************/
-{
-#if ( DEBUGlevel>=1 )
-    int iam;
-    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
-    CHECK_MALLOC(iam, "Enter symbfact_SubFree()");
-#endif
-    
-    SUPERLU_FREE(expanders);
-    SUPERLU_FREE(Glu_freeable->lsub);
-    SUPERLU_FREE(Glu_freeable->xlsub);
-    SUPERLU_FREE(Glu_freeable->usub);
-    SUPERLU_FREE(Glu_freeable->xusub);
-
-#if ( DEBUGlevel>=1 )    
-    CHECK_MALLOC(iam, "Exit symbfact_SubFree()");
-#endif
-    return 0;
-} /* SYMBFACT_SUBFREE */
-
-
-/*
- * Expand the existing storage to accommodate more fill-ins.
- */
-/************************************************************************/
-static void *expand
-/************************************************************************/
-(
- int_t *prev_len,   /* length used from previous call */
- MemType type,    /* which part of the memory to expand */
- int_t len_to_copy, /* size of the memory to be copied to new store */
- int_t keep_prev,   /* = 1: use prev_len;
-		     = 0: compute new_len to expand */
- Glu_freeable_t *Glu_freeable  /* modified - global LU data structures */
- )
-{
-    float    EXPAND = 1.5;
-    float    alpha;
-    void     *new_mem;
-    int_t    new_len, tries, lword, extra, bytes_to_copy;
-
-    alpha = EXPAND;
-    lword = sizeof(int_t);
-
-    if ( no_expand == 0 || keep_prev ) /* First time allocate requested */
-        new_len = *prev_len;
-    else {
-	new_len = alpha * *prev_len;
-    }
-
-    if ( Glu_freeable->MemModel == SYSTEM ) {
-	new_mem = (void *) SUPERLU_MALLOC((size_t) new_len * lword);
-	/*new_mem = (void *) calloc(new_len, lword); */
-	if ( no_expand != 0 ) {
-	    tries = 0;
-	    if ( keep_prev ) {
-		if ( !new_mem ) return (NULL);
-	    } else {
-		while ( !new_mem ) {
-		    if ( ++tries > 10 ) return (NULL);
-		    alpha = Reduce(alpha);
-		    new_len = alpha * *prev_len;
-		    new_mem = (void*) SUPERLU_MALLOC((size_t)new_len * lword); 
-		    /* new_mem = (void *) calloc(new_len, lword); */
-		}
-	    }
-	    copy_mem_int(len_to_copy, expanders[type].mem, new_mem);
-	    SUPERLU_FREE (expanders[type].mem);
-	}
-	expanders[type].mem = (void *) new_mem;
-	
-    } else { /* MemModel == USER */
-	if ( no_expand == 0 ) {
-	    new_mem = user_malloc_dist((size_t)new_len * lword, HEAD);
-	    expanders[type].mem = (void *) new_mem;
-	}
-	else {
-	    tries = 0;
-	    extra = (new_len - *prev_len) * lword;
-	    if ( keep_prev ) {
-		if ( StackFull(extra) ) return (NULL);
-	    } else {
-		while ( StackFull(extra) ) {
-		    if ( ++tries > 10 ) return (NULL);
-		    alpha = Reduce(alpha);
-		    new_len = alpha * *prev_len;
-		    extra = (new_len - *prev_len) * lword;	    
-		}
-	    }
-
-	    if ( type != USUB ) {
-		new_mem = (void*)((char*)expanders[type + 1].mem + extra);
-		bytes_to_copy = (char*)stack.array + stack.top1
-		    - (char*)expanders[type + 1].mem;
-		user_bcopy(expanders[type+1].mem, new_mem, bytes_to_copy);
-
-		if ( type < USUB ) {
-		    Glu_freeable->usub = expanders[USUB].mem =
-			(void*)((char*)expanders[USUB].mem + extra);
-		}
-		if ( type < LSUB ) {
-		    Glu_freeable->lsub = expanders[LSUB].mem =
-			(void*)((char*)expanders[LSUB].mem + extra);
-		}
-		stack.top1 += extra;
-		stack.used += extra;
-		
-	    } /* if ... */
-
-	} /* else ... */
-    }
-
-    expanders[type].size = new_len;
-    *prev_len = new_len;
-    if ( no_expand ) ++no_expand;
-    
-    return (void *) expanders[type].mem;
-    
-} /* EXPAND */
-
-/*
- * mem_usage consists of the following fields:
- *    - for_lu (float)
- *      The amount of space used in bytes for the L\U data structures.
- *    - total (float)
- *      The amount of space needed in bytes to perform factorization.
- *    - expansions (int)
- *      Number of memory expansions during the LU factorization.
- */
-/************************************************************************/
-int_t QuerySpace_dist(int_t n, int_t lsub_size, Glu_freeable_t *Glu_freeable,
-		      mem_usage_t *mem_usage)
-/************************************************************************/
-{
-    register int_t iword = sizeof(int_t);
-    extern int_t no_expand;
-
-    /* For the adjacency graphs of L and U. */
-    /*mem_usage->for_lu = (float)( (4*n + 3) * iword +
-				Glu_freeable->xlsub[n]*iword );*/
-    mem_usage->for_lu = (float)( (4*n + 3) * iword +
-				lsub_size * iword );
-    mem_usage->for_lu += (float)( (n + 1) * iword +
-				 Glu_freeable->xusub[n]*iword );
-
-    /* Working storage to support factorization */
-    mem_usage->total = mem_usage->for_lu + 9*n*iword;
-
-    mem_usage->expansions = --no_expand;
-    return 0;
-} /* QUERYSPACE_DIST */
-
-static int_t
-memory_usage(const int_t nzlmax, const int_t nzumax, const int_t n)
-{
-    register int_t iword = sizeof(int_t);
-    return (10*n*iword + (nzlmax+nzumax)*iword);
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/memory.c.orig hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/memory.c.orig
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/memory.c.orig	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/memory.c.orig	1970-01-01 00:00:00.000000000 +0000
@@ -1,546 +0,0 @@
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-
-#include "superlu_ddefs.h"
-
-#define PRNTlevel 0
-/*
- * Global variables
- */
-ExpHeader *expanders; /* Array of pointers to 4 types of memory */
-LU_stack_t stack;
-int_t no_expand;
-
-
-/*
- * Prototype
- */
-static int_t memory_usage(const int_t, const int_t, const int_t);
-static void *expand(int_t *, MemType, int_t, int_t,
-		    Glu_freeable_t *);
-
-/*
- * Internal prototypes
- */
-void  SetupSpace (void *, int_t, LU_space_t *);
-
-
-void
-superlu_abort_and_exit_dist(char *msg)
-{
-    /*fprintf(stderr, msg);
-    fflush(stderr);*/
-    printf(msg);
-    exit (-1);
-}
-
-long int superlu_malloc_total = 0;
-
-#if ( DEBUGlevel>=1 )           /* Debug malloc/free. */
-
-#define PAD_FACTOR  2
-#define DWORD  (sizeof(double)) /* Be sure it's no smaller than double. */
-
-void *superlu_malloc_dist(size_t size)
-{
-    char *buf;
-    int iam;
-
-    MPI_Comm_rank(MPI_COMM_WORLD, &iam);
-    buf = (char *) malloc(size + DWORD);
-    if ( !buf ) {
-	printf("(%d) superlu_malloc fails: malloc_total %.0f MB, size %d\n",
-	       iam, superlu_malloc_total*1e-6, size);
-	ABORT("superlu_malloc: out of memory");
-    }
-
-    ((int_t *) buf)[0] = size;
-#if 0
-    superlu_malloc_total += size + DWORD;
-#else
-    superlu_malloc_total += size;
-#endif
-    return (void *) (buf + DWORD);
-}
-
-void superlu_free_dist(void *addr)
-{
-    char *p = ((char *) addr) - DWORD;
-
-    if ( !addr )
-	ABORT("superlu_free: tried to free NULL pointer");
-
-    if ( !p )
-	ABORT("superlu_free: tried to free NULL+DWORD pointer");
-
-    { 
-	int_t n = ((int_t *) p)[0];
-	
-	if ( !n )
-	    ABORT("superlu_free: tried to free a freed pointer");
-	*((int_t *) p) = 0; /* Set to zero to detect duplicate free's. */
-#if 0	
-	superlu_malloc_total -= (n + DWORD);
-#else
-	superlu_malloc_total -= n;
-#endif
-
-	if ( superlu_malloc_total < 0 )
-	    ABORT("superlu_malloc_total went negative");
-	
-	/*free (addr);*/
-	free (p);
-    }
-
-}
-
-#else  /* The production mode. */
-
-void *superlu_malloc_dist(size_t size)
-{
-    void *buf;
-    buf = (void *) malloc(size);
-    return (buf);
-}
-
-void superlu_free_dist(void *addr)
-{
-    free (addr);
-}
-
-#endif  /* End debug malloc/free. */
-
-
-
-static void
-copy_mem_int(int_t howmany, void *old, void *new)
-{
-    register int_t i;
-    int_t *iold = old;
-    int_t *inew = new;
-    for (i = 0; i < howmany; i++) inew[i] = iold[i];
-}
-
-
-static void
-user_bcopy(char *src, char *dest, int_t bytes)
-{
-    char *s_ptr, *d_ptr;
-
-    s_ptr = src + bytes - 1;
-    d_ptr = dest + bytes - 1;
-    for (; d_ptr >= dest; --s_ptr, --d_ptr ) *d_ptr = *s_ptr;
-}
-
-
-
-int_t *intMalloc_dist(int_t n)
-{
-    int_t *buf;
-    buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t));
-    return (buf);
-}
-
-int_t *intCalloc_dist(int_t n)
-{
-    int_t *buf;
-    register int_t i;
-    buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t));
-    if ( buf )
-	for (i = 0; i < n; ++i) buf[i] = 0;
-    return (buf);
-}
-
-
-void *user_malloc_dist(int_t bytes, int_t which_end)
-{
-    void *buf;
-    
-    if ( StackFull(bytes) ) return (NULL);
-
-    if ( which_end == HEAD ) {
-	buf = (char*) stack.array + stack.top1;
-	stack.top1 += bytes;
-    } else {
-	stack.top2 -= bytes;
-	buf = (char*) stack.array + stack.top2;
-    }
-    
-    stack.used += bytes;
-    return buf;
-}
-
-void user_free_dist(int_t bytes, int_t which_end)
-{
-    if ( which_end == HEAD ) {
-	stack.top1 -= bytes;
-    } else {
-	stack.top2 += bytes;
-    }
-    stack.used -= bytes;
-}
-
-
-/*
- * Setup the memory model to be used for factorization.
- *    lwork = 0: use system malloc;
- *    lwork > 0: use user-supplied work[] space.
- */
-void SetupSpace(void *work, int_t lwork, LU_space_t *MemModel)
-{
-    if ( lwork == 0 ) {
-	*MemModel = SYSTEM; /* malloc/free */
-    } else if ( lwork > 0 ) {
-	*MemModel = USER;   /* user provided space */
-	stack.used = 0;
-	stack.top1 = 0;
-	stack.top2 = (lwork/4)*4; /* must be word addressable */
-	stack.size = stack.top2;
-	stack.array = (void *) work;
-    }
-}
-
-
-
-/*
- * Allocate storage for the data structures common to symbolic factorization
- * routines. For those unpredictable size, make a guess as FILL * nnz(A).
- * Return value:
- *     If lwork = -1, return the estimated amount of space required, plus n;
- *     otherwise, return the amount of space actually allocated when
- *     memory allocation failure occurred.
- */
-/************************************************************************/
-int_t symbfact_SubInit
-/************************************************************************/
-(
- fact_t fact, void *work, int_t lwork, int_t m, int_t n, int_t annz,
- Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable
- )
-{
-    int_t  iword;
-    int_t  *xsup, *supno;
-    int_t  *lsub, *xlsub;
-    int_t  *usub, *xusub;
-    int_t  nzlmax, nzumax;
-    int_t  FILL = sp_ienv_dist(6);
-
-#if ( DEBUGlevel>=1 )
-    int iam;
-    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
-    CHECK_MALLOC(iam, "Enter symbfact_SubInit()");
-#endif
-
-    no_expand = 0;
-    iword     = sizeof(int_t);
-
-    expanders = (ExpHeader *) SUPERLU_MALLOC( NO_MEMTYPE*sizeof(ExpHeader) );
-    if ( !expanders ) ABORT("SUPERLU_MALLOC fails for expanders");
-    
-    if ( fact == DOFACT || fact == SamePattern ) {
-	/* Guess for L\U factors */
-	nzlmax = FILL * annz;
-	nzumax = FILL/2.0 * annz;
-
-	if ( lwork == -1 ) {
-	    return ( GluIntArray(n) * iword + TempSpace(m,1)
-		    + (nzlmax+nzumax)*iword + n );
-        } else {
-	    SetupSpace(work, lwork, &Glu_freeable->MemModel);
-	}
-	
-#if ( PRNTlevel>=1 )
-	printf(".. symbfact_SubInit(): annz %ld, nzlmax %ld, nzumax %ld\n", 
-		annz, nzlmax, nzumax);
-#endif	
-	
-	/* Integer pointers for L\U factors */
-	if ( Glu_freeable->MemModel == SYSTEM ) {
-	    xsup   = intMalloc_dist(n+1);
-	    supno  = intMalloc_dist(n+1);
-	    xlsub  = intMalloc_dist(n+1);
-	    xusub  = intMalloc_dist(n+1);
-	} else {
-	    xsup   = (int_t *)user_malloc_dist((n+1) * iword, HEAD);
-	    supno  = (int_t *)user_malloc_dist((n+1) * iword, HEAD);
-	    xlsub  = (int_t *)user_malloc_dist((n+1) * iword, HEAD);
-	    xusub  = (int_t *)user_malloc_dist((n+1) * iword, HEAD);
-	}
-
-	lsub  = (int_t *) expand(&nzlmax, LSUB, 0, 0, Glu_freeable);
-	usub  = (int_t *) expand(&nzumax, USUB, 0, 0, Glu_freeable);
-
-	while ( !lsub || !usub ) {
-	    if ( Glu_freeable->MemModel == SYSTEM ) {
-		SUPERLU_FREE(lsub); 
-		SUPERLU_FREE(usub);
-	    } else {
-		user_free_dist((nzlmax+nzumax)*iword, HEAD);
-	    }
-	    nzlmax /= 2;
-	    nzumax /= 2;
-	    if ( nzumax < annz/2 ) {
-		printf("Not enough memory to perform factorization.\n");
-		return (memory_usage(nzlmax, nzumax, n) + n);
-	    }
-#if ( PRNTlevel>=1 )
-	    printf(".. symbfact_SubInit() reduce size:"
-		   "nzlmax %ld, nzumax %ld\n", nzlmax, nzumax);
-	    fflush(stdout);
-#endif
-	    lsub  = (int_t *) expand( &nzlmax, LSUB, 0, 0, Glu_freeable );
-	    usub  = (int_t *) expand( &nzumax, USUB, 0, 1, Glu_freeable );
-	}
-
-	Glu_persist->xsup    = xsup;
-	Glu_persist->supno   = supno;
-	Glu_freeable->lsub   = lsub;
-	Glu_freeable->xlsub  = xlsub;
-	Glu_freeable->usub   = usub;
-	Glu_freeable->xusub  = xusub;
-	Glu_freeable->nzlmax = nzlmax;
-	Glu_freeable->nzumax = nzumax;
-    } else {
-	/* fact == SamePattern_SameRowPerm */
-	if ( lwork == -1 ) {
-	    return ( GluIntArray(n) * iword + TempSpace(m, 1)
-		    + (nzlmax+nzumax)*iword + n );
-        } else if ( lwork == 0 ) {
-	    Glu_freeable->MemModel = SYSTEM;
-	} else {
-	    Glu_freeable->MemModel = USER;
-	    stack.top2 = (lwork/4)*4; /* must be word-addressable */
-	    stack.size = stack.top2;
-	}
-	
-	expanders[USUB].mem = Glu_freeable->usub;
-	expanders[LSUB].mem = Glu_freeable->lsub;
-	expanders[USUB].size = nzumax;
-	expanders[LSUB].size = nzlmax;
-    }
-
-    ++no_expand;
-
-#if ( DEBUGlevel>=1 )
-    /* Memory allocated but not freed: xsup, supno */
-    CHECK_MALLOC(iam, "Exit symbfact_SubInit()");
-#endif
-
-    return 0;
-    
-} /* SYMBFACT_SUBINIT */
-
-/*
- * Expand the data structures for L and U during the factorization.
- * Return value:   0 - successful return
- *               > 0 - number of bytes allocated when run out of space
- */
-/************************************************************************/
-int_t symbfact_SubXpand
-/************************************************************************/
-(
- int_t n,           /* total number of columns */
- int_t jcol,        /* current column */
- int_t next,        /* number of elements currently in the factors */
- MemType mem_type,  /* which type of memory to expand  */
- int_t *maxlen,     /* modified - maximum length of a data structure */
- Glu_freeable_t *Glu_freeable  /* modified - global LU data structures */
- )
-{
-    void   *new_mem;
-    
-#if ( DEBUGlevel>=1 )
-    printf("symbfact_SubXpand(): jcol %d, next %ld, maxlen %ld, MemType %d\n",
-	   jcol, next, *maxlen, mem_type);
-#endif    
-
-    new_mem = expand(maxlen, mem_type, next, 0, Glu_freeable);
-    
-    if ( !new_mem ) {
-	int_t    nzlmax  = Glu_freeable->nzlmax;
-	int_t    nzumax  = Glu_freeable->nzumax;
-    	fprintf(stderr, "Can't expand MemType %d: jcol %d\n", mem_type, jcol);
-    	return (memory_usage(nzlmax, nzumax, n) + n);
-    }
-
-    if ( mem_type == LSUB ) {
-	Glu_freeable->lsub   = (int_t *) new_mem;
-	Glu_freeable->nzlmax = *maxlen;
-    } else if ( mem_type == USUB ) {
-	Glu_freeable->usub   = (int_t *) new_mem;
-	Glu_freeable->nzumax = *maxlen;
-    } else ABORT("Tries to expand nonexisting memory type.\n");
-    
-    return 0;
-    
-} /* LUSUB_XPAND */
-
-/*
- * Deallocate storage of the data structures common to symbolic
- * factorization routines.
- */
-/************************************************************************/
-int_t symbfact_SubFree(Glu_freeable_t *Glu_freeable)
-/************************************************************************/
-{
-#if ( DEBUGlevel>=1 )
-    int iam;
-    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
-    CHECK_MALLOC(iam, "Enter symbfact_SubFree()");
-#endif
-    
-    SUPERLU_FREE(expanders);
-    SUPERLU_FREE(Glu_freeable->lsub);
-    SUPERLU_FREE(Glu_freeable->xlsub);
-    SUPERLU_FREE(Glu_freeable->usub);
-    SUPERLU_FREE(Glu_freeable->xusub);
-
-#if ( DEBUGlevel>=1 )    
-    CHECK_MALLOC(iam, "Exit symbfact_SubFree()");
-#endif
-    return 0;
-} /* SYMBFACT_SUBFREE */
-
-
-/*
- * Expand the existing storage to accommodate more fill-ins.
- */
-/************************************************************************/
-static void *expand
-/************************************************************************/
-(
- int_t *prev_len,   /* length used from previous call */
- MemType type,    /* which part of the memory to expand */
- int_t len_to_copy, /* size of the memory to be copied to new store */
- int_t keep_prev,   /* = 1: use prev_len;
-		     = 0: compute new_len to expand */
- Glu_freeable_t *Glu_freeable  /* modified - global LU data structures */
- )
-{
-    float    EXPAND = 1.5;
-    float    alpha;
-    void     *new_mem;
-    int_t    new_len, tries, lword, extra, bytes_to_copy;
-
-    alpha = EXPAND;
-    lword = sizeof(int_t);
-
-    if ( no_expand == 0 || keep_prev ) /* First time allocate requested */
-        new_len = *prev_len;
-    else {
-	new_len = alpha * *prev_len;
-    }
-
-    if ( Glu_freeable->MemModel == SYSTEM ) {
-	new_mem = (void *) SUPERLU_MALLOC(new_len * lword);
-	/*new_mem = (void *) calloc(new_len, lword); */
-	if ( no_expand != 0 ) {
-	    tries = 0;
-	    if ( keep_prev ) {
-		if ( !new_mem ) return (NULL);
-	    } else {
-		while ( !new_mem ) {
-		    if ( ++tries > 10 ) return (NULL);
-		    alpha = Reduce(alpha);
-		    new_len = alpha * *prev_len;
-		    new_mem = (void *) SUPERLU_MALLOC(new_len * lword); 
-		    /* new_mem = (void *) calloc(new_len, lword); */
-		}
-	    }
-	    copy_mem_int(len_to_copy, expanders[type].mem, new_mem);
-	    SUPERLU_FREE (expanders[type].mem);
-	}
-	expanders[type].mem = (void *) new_mem;
-	
-    } else { /* MemModel == USER */
-	if ( no_expand == 0 ) {
-	    new_mem = user_malloc_dist(new_len * lword, HEAD);
-	    expanders[type].mem = (void *) new_mem;
-	}
-	else {
-	    tries = 0;
-	    extra = (new_len - *prev_len) * lword;
-	    if ( keep_prev ) {
-		if ( StackFull(extra) ) return (NULL);
-	    } else {
-		while ( StackFull(extra) ) {
-		    if ( ++tries > 10 ) return (NULL);
-		    alpha = Reduce(alpha);
-		    new_len = alpha * *prev_len;
-		    extra = (new_len - *prev_len) * lword;	    
-		}
-	    }
-
-	    if ( type != USUB ) {
-		new_mem = (void*)((char*)expanders[type + 1].mem + extra);
-		bytes_to_copy = (char*)stack.array + stack.top1
-		    - (char*)expanders[type + 1].mem;
-		user_bcopy(expanders[type+1].mem, new_mem, bytes_to_copy);
-
-		if ( type < USUB ) {
-		    Glu_freeable->usub = expanders[USUB].mem =
-			(void*)((char*)expanders[USUB].mem + extra);
-		}
-		if ( type < LSUB ) {
-		    Glu_freeable->lsub = expanders[LSUB].mem =
-			(void*)((char*)expanders[LSUB].mem + extra);
-		}
-		stack.top1 += extra;
-		stack.used += extra;
-		
-	    } /* if ... */
-
-	} /* else ... */
-    }
-
-    expanders[type].size = new_len;
-    *prev_len = new_len;
-    if ( no_expand ) ++no_expand;
-    
-    return (void *) expanders[type].mem;
-    
-} /* EXPAND */
-
-/*
- * mem_usage consists of the following fields:
- *    - for_lu (float)
- *      The amount of space used in bytes for the L\U data structures.
- *    - total (float)
- *      The amount of space needed in bytes to perform factorization.
- *    - expansions (int)
- *      Number of memory expansions during the LU factorization.
- */
-/************************************************************************/
-int_t QuerySpace_dist(int_t n, int_t lsub_size, Glu_freeable_t *Glu_freeable,
-		      mem_usage_t *mem_usage)
-/************************************************************************/
-{
-    register int_t iword = sizeof(int_t);
-    extern int_t no_expand;
-
-    /* For the adjacency graphs of L and U. */
-    /*mem_usage->for_lu = (float)( (4*n + 3) * iword +
-				Glu_freeable->xlsub[n]*iword );*/
-    mem_usage->for_lu = (float)( (4*n + 3) * iword +
-				lsub_size * iword );
-    mem_usage->for_lu += (float)( (n + 1) * iword +
-				 Glu_freeable->xusub[n]*iword );
-
-    /* Working storage to support factorization */
-    mem_usage->total = mem_usage->for_lu + 9*n*iword;
-
-    mem_usage->expansions = --no_expand;
-    return 0;
-} /* QUERYSPACE_DIST */
-
-static int_t
-memory_usage(const int_t nzlmax, const int_t nzumax, const int_t n)
-{
-    register int_t iword = sizeof(int_t);
-    return (10*n*iword + (nzlmax+nzumax)*iword);
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/memory.patch hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/memory.patch
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/memory.patch	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/memory.patch	1970-01-01 00:00:00.000000000 +0000
@@ -1,10 +0,0 @@
-118d117
-< 
-144c143
-<     buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t));
----
->     buf = (int_t *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(int_t));
-152c151
-<     buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t));
----
->     buf = (int_t *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(int_t));
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/mmd.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/mmd.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/mmd.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/mmd.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,1019 +0,0 @@
-#ifdef _CRAY
-#define int short
-typedef short shortint;
-#elif defined (_LONGINT)
-#define int long
-typedef long shortint;
-#else
-typedef int shortint;
-#endif
-
-/* *************************************************************** */
-/* *************************************************************** */
-/* ****     GENMMD ..... MULTIPLE MINIMUM EXTERNAL DEGREE     **** */
-/* *************************************************************** */
-/* *************************************************************** */
-
-/*     AUTHOR - JOSEPH W.H. LIU */
-/*              DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */
-
-/*     PURPOSE - THIS ROUTINE IMPLEMENTS THE MINIMUM DEGREE */
-/*        ALGORITHM.  IT MAKES USE OF THE IMPLICIT REPRESENTATION */
-/*        OF ELIMINATION GRAPHS BY QUOTIENT GRAPHS, AND THE */
-/*        NOTION OF INDISTINGUISHABLE NODES.  IT ALSO IMPLEMENTS */
-/*        THE MODIFICATIONS BY MULTIPLE ELIMINATION AND MINIMUM */
-/*        EXTERNAL DEGREE. */
-/*        --------------------------------------------- */
-/*        CAUTION - THE ADJACENCY VECTOR ADJNCY WILL BE */
-/*        DESTROYED. */
-/*        --------------------------------------------- */
-
-/*     INPUT PARAMETERS - */
-/*        NEQNS  - NUMBER OF EQUATIONS. */
-/*        (XADJ,ADJNCY) - THE ADJACENCY STRUCTURE. */
-/*        DELTA  - TOLERANCE VALUE FOR MULTIPLE ELIMINATION. */
-/*        MAXINT - MAXIMUM MACHINE REPRESENTABLE (SHORT) INTEGER */
-/*                 (ANY SMALLER ESTIMATE WILL DO) FOR MARKING */
-/*                 NODES. */
-
-/*     OUTPUT PARAMETERS - */
-/*        PERM   - THE MINIMUM DEGREE ORDERING. */
-/*        INVP   - THE INVERSE OF PERM. */
-/*        NOFSUB - AN UPPER BOUND ON THE NUMBER OF NONZERO */
-/*                 SUBSCRIPTS FOR THE COMPRESSED STORAGE SCHEME. */
-
-/*     WORKING PARAMETERS - */
-/*        DHEAD  - VECTOR FOR HEAD OF DEGREE LISTS. */
-/*        INVP   - USED TEMPORARILY FOR DEGREE FORWARD LINK. */
-/*        PERM   - USED TEMPORARILY FOR DEGREE BACKWARD LINK. */
-/*        QSIZE  - VECTOR FOR SIZE OF SUPERNODES. */
-/*        LLIST  - VECTOR FOR TEMPORARY LINKED LISTS. */
-/*        MARKER - A TEMPORARY MARKER VECTOR. */
-
-/*     PROGRAM SUBROUTINES - */
-/*        MMDELM, MMDINT, MMDNUM, MMDUPD. */
-
-/* *************************************************************** */
-
-/* Subroutine */ int genmmd_dist_(int *neqns, int *xadj, shortint *adjncy, 
-	shortint *invp, shortint *perm, int *delta, shortint *dhead, 
-	shortint *qsize, shortint *llist, shortint *marker, int *maxint, 
-	int *nofsub)
-{
-    /* System generated locals */
-    int i__1;
-
-    /* Local variables */
-    static int mdeg, ehead, i, mdlmt, mdnode;
-    extern /* Subroutine */ int mmdelm_dist(int *, int *, shortint *, 
-	    shortint *, shortint *, shortint *, shortint *, shortint *, 
-	    shortint *, int *, int *), mmdupd_dist(int *, int *, 
-	    int *, shortint *, int *, int *, shortint *, shortint 
-	    *, shortint *, shortint *, shortint *, shortint *, int *, 
-	    int *), mmdint_dist(int *, int *, shortint *, shortint *, 
-	    shortint *, shortint *, shortint *, shortint *, shortint *), 
-	    mmdnum_dist(int *, shortint *, shortint *, shortint *);
-    static int nextmd, tag, num;
-
-
-/* *************************************************************** */
-
-
-/* *************************************************************** */
-
-    /* Parameter adjustments */
-    --marker;
-    --llist;
-    --qsize;
-    --dhead;
-    --perm;
-    --invp;
-    --adjncy;
-    --xadj;
-
-    /* Function Body */
-    if (*neqns <= 0) {
-	return 0;
-    }
-
-/*        ------------------------------------------------ */
-/*        INITIALIZATION FOR THE MINIMUM DEGREE ALGORITHM. */
-/*        ------------------------------------------------ */
-    *nofsub = 0;
-    mmdint_dist(neqns, &xadj[1], &adjncy[1], &dhead[1], &invp[1], &perm[1],
-		&qsize[1], &llist[1], &marker[1]);
-
-/*        ---------------------------------------------- */
-/*        NUM COUNTS THE NUMBER OF ORDERED NODES PLUS 1. */
-/*        ---------------------------------------------- */
-    num = 1;
-
-/*        ----------------------------- */
-/*        ELIMINATE ALL ISOLATED NODES. */
-/*        ----------------------------- */
-    nextmd = dhead[1];
-L100:
-    if (nextmd <= 0) {
-	goto L200;
-    }
-    mdnode = nextmd;
-    nextmd = invp[mdnode];
-    marker[mdnode] = *maxint;
-    invp[mdnode] = -num;
-    ++num;
-    goto L100;
-
-L200:
-/*        ---------------------------------------- */
-/*        SEARCH FOR NODE OF THE MINIMUM DEGREE. */
-/*        MDEG IS THE CURRENT MINIMUM DEGREE; */
-/*        TAG IS USED TO FACILITATE MARKING NODES. */
-/*        ---------------------------------------- */
-    if (num > *neqns) {
-	goto L1000;
-    }
-    tag = 1;
-    dhead[1] = 0;
-    mdeg = 2;
-L300:
-    if (dhead[mdeg] > 0) {
-	goto L400;
-    }
-    ++mdeg;
-    goto L300;
-L400:
-/*            ------------------------------------------------- */
-/*            USE VALUE OF DELTA TO SET UP MDLMT, WHICH GOVERNS */
-/*            WHEN A DEGREE UPDATE IS TO BE PERFORMED. */
-/*            ------------------------------------------------- */
-    mdlmt = mdeg + *delta;
-    ehead = 0;
-
-L500:
-    mdnode = dhead[mdeg];
-    if (mdnode > 0) {
-	goto L600;
-    }
-    ++mdeg;
-    if (mdeg > mdlmt) {
-	goto L900;
-    }
-    goto L500;
-L600:
-/*                ---------------------------------------- */
-/*                REMOVE MDNODE FROM THE DEGREE STRUCTURE. */
-/*                ---------------------------------------- */
-    nextmd = invp[mdnode];
-    dhead[mdeg] = nextmd;
-    if (nextmd > 0) {
-	perm[nextmd] = -mdeg;
-    }
-    invp[mdnode] = -num;
-    *nofsub = *nofsub + mdeg + qsize[mdnode] - 2;
-    if (num + qsize[mdnode] > *neqns) {
-	goto L1000;
-    }
-/*                ---------------------------------------------- */
-/*                ELIMINATE MDNODE AND PERFORM QUOTIENT GRAPH */
-/*                TRANSFORMATION.  RESET TAG VALUE IF NECESSARY. */
-/*                ---------------------------------------------- */
-    ++tag;
-    if (tag < *maxint) {
-	goto L800;
-    }
-    tag = 1;
-    i__1 = *neqns;
-    for (i = 1; i <= i__1; ++i) {
-	if (marker[i] < *maxint) {
-	    marker[i] = 0;
-	}
-/* L700: */
-    }
-L800:
-    mmdelm_dist(&mdnode, &xadj[1], &adjncy[1], &dhead[1], &invp[1], &perm[1],
-		&qsize[1], &llist[1], &marker[1], maxint, &tag);
-    num += qsize[mdnode];
-    llist[mdnode] = ehead;
-    ehead = mdnode;
-    if (*delta >= 0) {
-	goto L500;
-    }
-L900:
-/*            ------------------------------------------- */
-/*            UPDATE DEGREES OF THE NODES INVOLVED IN THE */
-/*            MINIMUM DEGREE NODES ELIMINATION. */
-/*            ------------------------------------------- */
-    if (num > *neqns) {
-	goto L1000;
-    }
-    mmdupd_dist(&ehead, neqns, &xadj[1], &adjncy[1], delta, &mdeg, &dhead[1],
-		&invp[1], &perm[1], &qsize[1], &llist[1], &marker[1], maxint,
-		&tag);
-    goto L300;
-
-L1000:
-    mmdnum_dist(neqns, &perm[1], &invp[1], &qsize[1]);
-    return 0;
-
-} /* genmmd_dist_ */
-
-/* *************************************************************** */
-/* *************************************************************** */
-/* ***     MMDINT ..... MULT MINIMUM DEGREE INITIALIZATION     *** */
-/* *************************************************************** */
-/* *************************************************************** */
-
-/*     AUTHOR - JOSEPH W.H. LIU */
-/*              DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */
-
-/*     PURPOSE - THIS ROUTINE PERFORMS INITIALIZATION FOR THE */
-/*        MULTIPLE ELIMINATION VERSION OF THE MINIMUM DEGREE */
-/*        ALGORITHM. */
-
-/*     INPUT PARAMETERS - */
-/*        NEQNS  - NUMBER OF EQUATIONS. */
-/*        (XADJ,ADJNCY) - ADJACENCY STRUCTURE. */
-
-/*     OUTPUT PARAMETERS - */
-/*        (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */
-/*        QSIZE  - SIZE OF SUPERNODE (INITIALIZED TO ONE). */
-/*        LLIST  - LINKED LIST. */
-/*        MARKER - MARKER VECTOR. */
-
-/* *************************************************************** */
-
-/* Subroutine */ int mmdint_dist(int *neqns, int *xadj, shortint *adjncy, 
-	shortint *dhead, shortint *dforw, shortint *dbakw, shortint *qsize, 
-	shortint *llist, shortint *marker)
-{
-    /* System generated locals */
-    int i__1;
-
-    /* Local variables */
-    static int ndeg, node, fnode;
-
-
-/* *************************************************************** */
-
-
-/* *************************************************************** */
-
-    /* Parameter adjustments */
-    --marker;
-    --llist;
-    --qsize;
-    --dbakw;
-    --dforw;
-    --dhead;
-    --adjncy;
-    --xadj;
-
-    /* Function Body */
-    i__1 = *neqns;
-    for (node = 1; node <= i__1; ++node) {
-	dhead[node] = 0;
-	qsize[node] = 1;
-	marker[node] = 0;
-	llist[node] = 0;
-/* L100: */
-    }
-/*        ------------------------------------------ */
-/*        INITIALIZE THE DEGREE DOUBLY LINKED LISTS. */
-/*        ------------------------------------------ */
-    i__1 = *neqns;
-    for (node = 1; node <= i__1; ++node) {
-	ndeg = xadj[node + 1] - xadj[node] + 1;
-	fnode = dhead[ndeg];
-	dforw[node] = fnode;
-	dhead[ndeg] = node;
-	if (fnode > 0) {
-	    dbakw[fnode] = node;
-	}
-	dbakw[node] = -ndeg;
-/* L200: */
-    }
-    return 0;
-
-} /* mmdint_dist */
-
-/* *************************************************************** */
-/* *************************************************************** */
-/* **     MMDELM ..... MULTIPLE MINIMUM DEGREE ELIMINATION     *** */
-/* *************************************************************** */
-/* *************************************************************** */
-
-/*     AUTHOR - JOSEPH W.H. LIU */
-/*              DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */
-
-/*     PURPOSE - THIS ROUTINE ELIMINATES THE NODE MDNODE OF */
-/*        MINIMUM DEGREE FROM THE ADJACENCY STRUCTURE, WHICH */
-/*        IS STORED IN THE QUOTIENT GRAPH FORMAT.  IT ALSO */
-/*        TRANSFORMS THE QUOTIENT GRAPH REPRESENTATION OF THE */
-/*        ELIMINATION GRAPH. */
-
-/*     INPUT PARAMETERS - */
-/*        MDNODE - NODE OF MINIMUM DEGREE. */
-/*        MAXINT - ESTIMATE OF MAXIMUM REPRESENTABLE (SHORT) */
-/*                 INT. */
-/*        TAG    - TAG VALUE. */
-
-/*     UPDATED PARAMETERS - */
-/*        (XADJ,ADJNCY) - UPDATED ADJACENCY STRUCTURE. */
-/*        (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */
-/*        QSIZE  - SIZE OF SUPERNODE. */
-/*        MARKER - MARKER VECTOR. */
-/*        LLIST  - TEMPORARY LINKED LIST OF ELIMINATED NABORS. */
-
-/* *************************************************************** */
-
-/* Subroutine */ int mmdelm_dist(int *mdnode, int *xadj, shortint *adjncy,
-	 shortint *dhead, shortint *dforw, shortint *dbakw, shortint *qsize, 
- 	 shortint *llist, shortint *marker, int *maxint, int *tag)
-{
-    /* System generated locals */
-    int i__1, i__2;
-
-    /* Local variables */
-    static int node, link, rloc, rlmt, i, j, nabor, rnode, elmnt, xqnbr, 
-	    istop, jstop, istrt, jstrt, nxnode, pvnode, nqnbrs, npv;
-
-
-/* *************************************************************** */
-
-
-/* *************************************************************** */
-
-/*        ----------------------------------------------- */
-/*        FIND REACHABLE SET AND PLACE IN DATA STRUCTURE. */
-/*        ----------------------------------------------- */
-    /* Parameter adjustments */
-    --marker;
-    --llist;
-    --qsize;
-    --dbakw;
-    --dforw;
-    --dhead;
-    --adjncy;
-    --xadj;
-
-    /* Function Body */
-    marker[*mdnode] = *tag;
-    istrt = xadj[*mdnode];
-    istop = xadj[*mdnode + 1] - 1;
-/*        ------------------------------------------------------- */
-/*        ELMNT POINTS TO THE BEGINNING OF THE LIST OF ELIMINATED */
-/*        NABORS OF MDNODE, AND RLOC GIVES THE STORAGE LOCATION */
-/*        FOR THE NEXT REACHABLE NODE. */
-/*        ------------------------------------------------------- */
-    elmnt = 0;
-    rloc = istrt;
-    rlmt = istop;
-    i__1 = istop;
-    for (i = istrt; i <= i__1; ++i) {
-	nabor = adjncy[i];
-	if (nabor == 0) {
-	    goto L300;
-	}
-	if (marker[nabor] >= *tag) {
-	    goto L200;
-	}
-	marker[nabor] = *tag;
-	if (dforw[nabor] < 0) {
-	    goto L100;
-	}
-	adjncy[rloc] = nabor;
-	++rloc;
-	goto L200;
-L100:
-	llist[nabor] = elmnt;
-	elmnt = nabor;
-L200:
-	;
-    }
-L300:
-/*            ----------------------------------------------------- */
-/*            MERGE WITH REACHABLE NODES FROM GENERALIZED ELEMENTS. */
-/*            ----------------------------------------------------- */
-    if (elmnt <= 0) {
-	goto L1000;
-    }
-    adjncy[rlmt] = -elmnt;
-    link = elmnt;
-L400:
-    jstrt = xadj[link];
-    jstop = xadj[link + 1] - 1;
-    i__1 = jstop;
-    for (j = jstrt; j <= i__1; ++j) {
-	node = adjncy[j];
-	link = -node;
-	if (node < 0) {
-	    goto L400;
-	} else if (node == 0) {
-	    goto L900;
-	} else {
-	    goto L500;
-	}
-L500:
-	if (marker[node] >= *tag || dforw[node] < 0) {
-	    goto L800;
-	}
-	marker[node] = *tag;
-/*                            --------------------------------- */
-/*                            USE STORAGE FROM ELIMINATED NODES */
-/*                            IF NECESSARY. */
-/*                            --------------------------------- */
-L600:
-	if (rloc < rlmt) {
-	    goto L700;
-	}
-	link = -adjncy[rlmt];
-	rloc = xadj[link];
-	rlmt = xadj[link + 1] - 1;
-	goto L600;
-L700:
-	adjncy[rloc] = node;
-	++rloc;
-L800:
-	;
-    }
-L900:
-    elmnt = llist[elmnt];
-    goto L300;
-L1000:
-    if (rloc <= rlmt) {
-	adjncy[rloc] = 0;
-    }
-/*        -------------------------------------------------------- */
-/*        FOR EACH NODE IN THE REACHABLE SET, DO THE FOLLOWING ... */
-/*        -------------------------------------------------------- */
-    link = *mdnode;
-L1100:
-    istrt = xadj[link];
-    istop = xadj[link + 1] - 1;
-    i__1 = istop;
-    for (i = istrt; i <= i__1; ++i) {
-	rnode = adjncy[i];
-	link = -rnode;
-	if (rnode < 0) {
-	    goto L1100;
-	} else if (rnode == 0) {
-	    goto L1800;
-	} else {
-	    goto L1200;
-	}
-L1200:
-/*                -------------------------------------------- */
-/*                IF RNODE IS IN THE DEGREE LIST STRUCTURE ... */
-/*                -------------------------------------------- */
-	pvnode = dbakw[rnode];
-	if (pvnode == 0 || pvnode == -(*maxint)) {
-	    goto L1300;
-	}
-/*                    ------------------------------------- */
-/*                    THEN REMOVE RNODE FROM THE STRUCTURE. */
-/*                    ------------------------------------- */
-	nxnode = dforw[rnode];
-	if (nxnode > 0) {
-	    dbakw[nxnode] = pvnode;
-	}
-	if (pvnode > 0) {
-	    dforw[pvnode] = nxnode;
-	}
-	npv = -pvnode;
-	if (pvnode < 0) {
-	    dhead[npv] = nxnode;
-	}
-L1300:
-/*                ---------------------------------------- */
-/*                PURGE INACTIVE QUOTIENT NABORS OF RNODE. */
-/*                ---------------------------------------- */
-	jstrt = xadj[rnode];
-	jstop = xadj[rnode + 1] - 1;
-	xqnbr = jstrt;
-	i__2 = jstop;
-	for (j = jstrt; j <= i__2; ++j) {
-	    nabor = adjncy[j];
-	    if (nabor == 0) {
-		goto L1500;
-	    }
-	    if (marker[nabor] >= *tag) {
-		goto L1400;
-	    }
-	    adjncy[xqnbr] = nabor;
-	    ++xqnbr;
-L1400:
-	    ;
-	}
-L1500:
-/*                ---------------------------------------- */
-/*                IF NO ACTIVE NABOR AFTER THE PURGING ... */
-/*                ---------------------------------------- */
-	nqnbrs = xqnbr - jstrt;
-	if (nqnbrs > 0) {
-	    goto L1600;
-	}
-/*                    ----------------------------- */
-/*                    THEN MERGE RNODE WITH MDNODE. */
-/*                    ----------------------------- */
-	qsize[*mdnode] += qsize[rnode];
-	qsize[rnode] = 0;
-	marker[rnode] = *maxint;
-	dforw[rnode] = -(*mdnode);
-	dbakw[rnode] = -(*maxint);
-	goto L1700;
-L1600:
-/*                -------------------------------------- */
-/*                ELSE FLAG RNODE FOR DEGREE UPDATE, AND */
-/*                ADD MDNODE AS A NABOR OF RNODE. */
-/*                -------------------------------------- */
-	dforw[rnode] = nqnbrs + 1;
-	dbakw[rnode] = 0;
-	adjncy[xqnbr] = *mdnode;
-	++xqnbr;
-	if (xqnbr <= jstop) {
-	    adjncy[xqnbr] = 0;
-	}
-
-L1700:
-	;
-    }
-L1800:
-    return 0;
-
-} /* mmdelm_dist */
-
-/* *************************************************************** */
-/* *************************************************************** */
-/* *****     MMDUPD ..... MULTIPLE MINIMUM DEGREE UPDATE     ***** */
-/* *************************************************************** */
-/* *************************************************************** */
-
-/*     AUTHOR - JOSEPH W.H. LIU */
-/*              DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */
-
-/*     PURPOSE - THIS ROUTINE UPDATES THE DEGREES OF NODES */
-/*        AFTER A MULTIPLE ELIMINATION STEP. */
-
-/*     INPUT PARAMETERS - */
-/*        EHEAD  - THE BEGINNING OF THE LIST OF ELIMINATED */
-/*                 NODES (I.E., NEWLY FORMED ELEMENTS). */
-/*        NEQNS  - NUMBER OF EQUATIONS. */
-/*        (XADJ,ADJNCY) - ADJACENCY STRUCTURE. */
-/*        DELTA  - TOLERANCE VALUE FOR MULTIPLE ELIMINATION. */
-/*        MAXINT - MAXIMUM MACHINE REPRESENTABLE (SHORT) */
-/*                 INTEGER. */
-
-/*     UPDATED PARAMETERS - */
-/*        MDEG   - NEW MINIMUM DEGREE AFTER DEGREE UPDATE. */
-/*        (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */
-/*        QSIZE  - SIZE OF SUPERNODE. */
-/*        LLIST  - WORKING LINKED LIST. */
-/*        MARKER - MARKER VECTOR FOR DEGREE UPDATE. */
-/*        TAG    - TAG VALUE. */
-
-/* *************************************************************** */
-
-/* Subroutine */ int mmdupd_dist(int *ehead, int *neqns, int *xadj, 
-	shortint *adjncy, int *delta, int *mdeg, shortint *dhead, 
-	shortint *dforw, shortint *dbakw, shortint *qsize, shortint *llist, 
-	shortint *marker, int *maxint, int *tag)
-{
-    /* System generated locals */
-    int i__1, i__2;
-
-    /* Local variables */
-    static int node, mtag, link, mdeg0, i, j, enode, fnode, nabor, elmnt, 
-	    istop, jstop, q2head, istrt, jstrt, qxhead, iq2, deg, deg0;
-
-
-/* *************************************************************** */
-
-
-/* *************************************************************** */
-
-    /* Parameter adjustments */
-    --marker;
-    --llist;
-    --qsize;
-    --dbakw;
-    --dforw;
-    --dhead;
-    --adjncy;
-    --xadj;
-
-    /* Function Body */
-    mdeg0 = *mdeg + *delta;
-    elmnt = *ehead;
-L100:
-/*            ------------------------------------------------------- */
-/*            FOR EACH OF THE NEWLY FORMED ELEMENT, DO THE FOLLOWING. */
-/*            (RESET TAG VALUE IF NECESSARY.) */
-/*            ------------------------------------------------------- */
-    if (elmnt <= 0) {
-	return 0;
-    }
-    mtag = *tag + mdeg0;
-    if (mtag < *maxint) {
-	goto L300;
-    }
-    *tag = 1;
-    i__1 = *neqns;
-    for (i = 1; i <= i__1; ++i) {
-	if (marker[i] < *maxint) {
-	    marker[i] = 0;
-	}
-/* L200: */
-    }
-    mtag = *tag + mdeg0;
-L300:
-/*            --------------------------------------------- */
-/*            CREATE TWO LINKED LISTS FROM NODES ASSOCIATED */
-/*            WITH ELMNT: ONE WITH TWO NABORS (Q2HEAD) IN */
-/*            ADJACENCY STRUCTURE, AND THE OTHER WITH MORE */
-/*            THAN TWO NABORS (QXHEAD).  ALSO COMPUTE DEG0, */
-/*            NUMBER OF NODES IN THIS ELEMENT. */
-/*            --------------------------------------------- */
-    q2head = 0;
-    qxhead = 0;
-    deg0 = 0;
-    link = elmnt;
-L400:
-    istrt = xadj[link];
-    istop = xadj[link + 1] - 1;
-    i__1 = istop;
-    for (i = istrt; i <= i__1; ++i) {
-	enode = adjncy[i];
-	link = -enode;
-	if (enode < 0) {
-	    goto L400;
-	} else if (enode == 0) {
-	    goto L800;
-	} else {
-	    goto L500;
-	}
-
-L500:
-	if (qsize[enode] == 0) {
-	    goto L700;
-	}
-	deg0 += qsize[enode];
-	marker[enode] = mtag;
-/*                        ---------------------------------- */
-/*                        IF ENODE REQUIRES A DEGREE UPDATE, */
-/*                        THEN DO THE FOLLOWING. */
-/*                        ---------------------------------- */
-	if (dbakw[enode] != 0) {
-	    goto L700;
-	}
-/*                            --------------------------------------- 
-*/
-/*                            PLACE EITHER IN QXHEAD OR Q2HEAD LISTS. 
-*/
-/*                            --------------------------------------- 
-*/
-	if (dforw[enode] == 2) {
-	    goto L600;
-	}
-	llist[enode] = qxhead;
-	qxhead = enode;
-	goto L700;
-L600:
-	llist[enode] = q2head;
-	q2head = enode;
-L700:
-	;
-    }
-L800:
-/*            -------------------------------------------- */
-/*            FOR EACH ENODE IN Q2 LIST, DO THE FOLLOWING. */
-/*            -------------------------------------------- */
-    enode = q2head;
-    iq2 = 1;
-L900:
-    if (enode <= 0) {
-	goto L1500;
-    }
-    if (dbakw[enode] != 0) {
-	goto L2200;
-    }
-    ++(*tag);
-    deg = deg0;
-/*                    ------------------------------------------ */
-/*                    IDENTIFY THE OTHER ADJACENT ELEMENT NABOR. */
-/*                    ------------------------------------------ */
-    istrt = xadj[enode];
-    nabor = adjncy[istrt];
-    if (nabor == elmnt) {
-	nabor = adjncy[istrt + 1];
-    }
-/*                    ------------------------------------------------ */
-/*                    IF NABOR IS UNELIMINATED, INCREASE DEGREE COUNT. */
-/*                    ------------------------------------------------ */
-    link = nabor;
-    if (dforw[nabor] < 0) {
-	goto L1000;
-    }
-    deg += qsize[nabor];
-    goto L2100;
-L1000:
-/*                        -------------------------------------------- */
-/*                        OTHERWISE, FOR EACH NODE IN THE 2ND ELEMENT, */
-/*                        DO THE FOLLOWING. */
-/*                        -------------------------------------------- */
-    istrt = xadj[link];
-    istop = xadj[link + 1] - 1;
-    i__1 = istop;
-    for (i = istrt; i <= i__1; ++i) {
-	node = adjncy[i];
-	link = -node;
-	if (node == enode) {
-	    goto L1400;
-	}
-	if (node < 0) {
-	    goto L1000;
-	} else if (node == 0) {
-	    goto L2100;
-	} else {
-	    goto L1100;
-	}
-
-L1100:
-	if (qsize[node] == 0) {
-	    goto L1400;
-	}
-	if (marker[node] >= *tag) {
-	    goto L1200;
-	}
-/*                                -----------------------------------
--- */
-/*                                CASE WHEN NODE IS NOT YET CONSIDERED
-. */
-/*                                -----------------------------------
--- */
-	marker[node] = *tag;
-	deg += qsize[node];
-	goto L1400;
-L1200:
-/*                            ----------------------------------------
- */
-/*                            CASE WHEN NODE IS INDISTINGUISHABLE FROM
- */
-/*                            ENODE.  MERGE THEM INTO A NEW SUPERNODE.
- */
-/*                            ----------------------------------------
- */
-	if (dbakw[node] != 0) {
-	    goto L1400;
-	}
-	if (dforw[node] != 2) {
-	    goto L1300;
-	}
-	qsize[enode] += qsize[node];
-	qsize[node] = 0;
-	marker[node] = *maxint;
-	dforw[node] = -enode;
-	dbakw[node] = -(*maxint);
-	goto L1400;
-L1300:
-/*                            -------------------------------------- 
-*/
-/*                            CASE WHEN NODE IS OUTMATCHED BY ENODE. 
-*/
-/*                            -------------------------------------- 
-*/
-	if (dbakw[node] == 0) {
-	    dbakw[node] = -(*maxint);
-	}
-L1400:
-	;
-    }
-    goto L2100;
-L1500:
-/*                ------------------------------------------------ */
-/*                FOR EACH ENODE IN THE QX LIST, DO THE FOLLOWING. */
-/*                ------------------------------------------------ */
-    enode = qxhead;
-    iq2 = 0;
-L1600:
-    if (enode <= 0) {
-	goto L2300;
-    }
-    if (dbakw[enode] != 0) {
-	goto L2200;
-    }
-    ++(*tag);
-    deg = deg0;
-/*                        --------------------------------- */
-/*                        FOR EACH UNMARKED NABOR OF ENODE, */
-/*                        DO THE FOLLOWING. */
-/*                        --------------------------------- */
-    istrt = xadj[enode];
-    istop = xadj[enode + 1] - 1;
-    i__1 = istop;
-    for (i = istrt; i <= i__1; ++i) {
-	nabor = adjncy[i];
-	if (nabor == 0) {
-	    goto L2100;
-	}
-	if (marker[nabor] >= *tag) {
-	    goto L2000;
-	}
-	marker[nabor] = *tag;
-	link = nabor;
-/*                                ------------------------------ */
-/*                                IF UNELIMINATED, INCLUDE IT IN */
-/*                                DEG COUNT. */
-/*                                ------------------------------ */
-	if (dforw[nabor] < 0) {
-	    goto L1700;
-	}
-	deg += qsize[nabor];
-	goto L2000;
-L1700:
-/*                                    ------------------------------- 
-*/
-/*                                    IF ELIMINATED, INCLUDE UNMARKED 
-*/
-/*                                    NODES IN THIS ELEMENT INTO THE 
-*/
-/*                                    DEGREE COUNT. */
-/*                                    ------------------------------- 
-*/
-	jstrt = xadj[link];
-	jstop = xadj[link + 1] - 1;
-	i__2 = jstop;
-	for (j = jstrt; j <= i__2; ++j) {
-	    node = adjncy[j];
-	    link = -node;
-	    if (node < 0) {
-		goto L1700;
-	    } else if (node == 0) {
-		goto L2000;
-	    } else {
-		goto L1800;
-	    }
-
-L1800:
-	    if (marker[node] >= *tag) {
-		goto L1900;
-	    }
-	    marker[node] = *tag;
-	    deg += qsize[node];
-L1900:
-	    ;
-	}
-L2000:
-	;
-    }
-L2100:
-/*                    ------------------------------------------- */
-/*                    UPDATE EXTERNAL DEGREE OF ENODE IN DEGREE */
-/*                    STRUCTURE, AND MDEG (MIN DEG) IF NECESSARY. */
-/*                    ------------------------------------------- */
-    deg = deg - qsize[enode] + 1;
-    fnode = dhead[deg];
-    dforw[enode] = fnode;
-    dbakw[enode] = -deg;
-    if (fnode > 0) {
-	dbakw[fnode] = enode;
-    }
-    dhead[deg] = enode;
-    if (deg < *mdeg) {
-	*mdeg = deg;
-    }
-L2200:
-/*                    ---------------------------------- */
-/*                    GET NEXT ENODE IN CURRENT ELEMENT. */
-/*                    ---------------------------------- */
-    enode = llist[enode];
-    if (iq2 == 1) {
-	goto L900;
-    }
-    goto L1600;
-L2300:
-/*            ----------------------------- */
-/*            GET NEXT ELEMENT IN THE LIST. */
-/*            ----------------------------- */
-    *tag = mtag;
-    elmnt = llist[elmnt];
-    goto L100;
-
-} /* mmdupd_dist */
-
-/* *************************************************************** */
-/* *************************************************************** */
-/* *****     MMDNUM ..... MULTI MINIMUM DEGREE NUMBERING     ***** */
-/* *************************************************************** */
-/* *************************************************************** */
-
-/*     AUTHOR - JOSEPH W.H. LIU */
-/*              DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */
-
-/*     PURPOSE - THIS ROUTINE PERFORMS THE FINAL STEP IN */
-/*        PRODUCING THE PERMUTATION AND INVERSE PERMUTATION */
-/*        VECTORS IN THE MULTIPLE ELIMINATION VERSION OF THE */
-/*        MINIMUM DEGREE ORDERING ALGORITHM. */
-
-/*     INPUT PARAMETERS - */
-/*        NEQNS  - NUMBER OF EQUATIONS. */
-/*        QSIZE  - SIZE OF SUPERNODES AT ELIMINATION. */
-
-/*     UPDATED PARAMETERS - */
-/*        INVP   - INVERSE PERMUTATION VECTOR.  ON INPUT, */
-/*                 IF QSIZE(NODE)=0, THEN NODE HAS BEEN MERGED */
-/*                 INTO THE NODE -INVP(NODE); OTHERWISE, */
-/*                 -INVP(NODE) IS ITS INVERSE LABELLING. */
-
-/*     OUTPUT PARAMETERS - */
-/*        PERM   - THE PERMUTATION VECTOR. */
-
-/* *************************************************************** */
-
-/* Subroutine */ int mmdnum_dist(int *neqns, shortint *perm, shortint *invp,
-				 shortint *qsize)
-{
-    /* System generated locals */
-    int i__1;
-
-    /* Local variables */
-    static int node, root, nextf, father, nqsize, num;
-
-
-/* *************************************************************** */
-
-
-/* *************************************************************** */
-
-    /* Parameter adjustments */
-    --qsize;
-    --invp;
-    --perm;
-
-    /* Function Body */
-    i__1 = *neqns;
-    for (node = 1; node <= i__1; ++node) {
-	nqsize = qsize[node];
-	if (nqsize <= 0) {
-	    perm[node] = invp[node];
-	}
-	if (nqsize > 0) {
-	    perm[node] = -invp[node];
-	}
-/* L100: */
-    }
-/*        ------------------------------------------------------ */
-/*        FOR EACH NODE WHICH HAS BEEN MERGED, DO THE FOLLOWING. */
-/*        ------------------------------------------------------ */
-    i__1 = *neqns;
-    for (node = 1; node <= i__1; ++node) {
-	if (perm[node] > 0) {
-	    goto L500;
-	}
-/*                ----------------------------------------- */
-/*                TRACE THE MERGED TREE UNTIL ONE WHICH HAS */
-/*                NOT BEEN MERGED, CALL IT ROOT. */
-/*                ----------------------------------------- */
-	father = node;
-L200:
-	if (perm[father] > 0) {
-	    goto L300;
-	}
-	father = -perm[father];
-	goto L200;
-L300:
-/*                ----------------------- */
-/*                NUMBER NODE AFTER ROOT. */
-/*                ----------------------- */
-	root = father;
-	num = perm[root] + 1;
-	invp[node] = -num;
-	perm[root] = num;
-/*                ------------------------ */
-/*                SHORTEN THE MERGED TREE. */
-/*                ------------------------ */
-	father = node;
-L400:
-	nextf = -perm[father];
-	if (nextf <= 0) {
-	    goto L500;
-	}
-	perm[father] = -root;
-	father = nextf;
-	goto L400;
-L500:
-	;
-    }
-/*        ---------------------- */
-/*        READY TO COMPUTE PERM. */
-/*        ---------------------- */
-    i__1 = *neqns;
-    for (node = 1; node <= i__1; ++node) {
-	num = -invp[node];
-	invp[node] = num;
-	perm[num] = node;
-/* L600: */
-    }
-    return 0;
-
-} /* mmdnum_dist */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/old_colamd.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/old_colamd.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/old_colamd.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/old_colamd.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,2583 +0,0 @@
-/* ========================================================================== */
-/* === colamd - a sparse matrix column ordering algorithm =================== */
-/* ========================================================================== */
-
-/*
-    colamd:  An approximate minimum degree column ordering algorithm.
-
-    Purpose:
-
-	Colamd computes a permutation Q such that the Cholesky factorization of
-	(AQ)'(AQ) has less fill-in and requires fewer floating point operations
-	than A'A.  This also provides a good ordering for sparse partial
-	pivoting methods, P(AQ) = LU, where Q is computed prior to numerical
-	factorization, and P is computed during numerical factorization via
-	conventional partial pivoting with row interchanges.  Colamd is the
-	column ordering method used in SuperLU, part of the ScaLAPACK library.
-	It is also available as user-contributed software for Matlab 5.2,
-	available from MathWorks, Inc. (http://www.mathworks.com).  This
-	routine can be used in place of COLMMD in Matlab.  By default, the \
-	and / operators in Matlab perform a column ordering (using COLMMD)
-	prior to LU factorization using sparse partial pivoting, in the
-	built-in Matlab LU(A) routine.
-
-    Authors:
-
-	The authors of the code itself are Stefan I. Larimore and Timothy A.
-	Davis (davis@cise.ufl.edu), University of Florida.  The algorithm was
-	developed in collaboration with John Gilbert, Xerox PARC, and Esmond
-	Ng, Oak Ridge National Laboratory.
-
-    Date:
-
-	August 3, 1998.  Version 1.0.
-
-    Acknowledgements:
-
-	This work was supported by the National Science Foundation, under
-	grants DMS-9504974 and DMS-9803599.
-
-    Notice:
-
-	Copyright (c) 1998 by the University of Florida.  All Rights Reserved.
-
-	THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-	EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
-
-	Permission is hereby granted to use or copy this program for any
-	purpose, provided the above notices are retained on all copies.
-	User documentation of any code that uses this code must cite the
-	Authors, the Copyright, and "Used by permission."  If this code is
-	accessible from within Matlab, then typing "help colamd" or "colamd"
-	(with no arguments) must cite the Authors.  Permission to modify the
-	code and to distribute modified code is granted, provided the above
-	notices are retained, and a notice that the code was modified is
-	included with the above copyright notice.  You must also retain the
-	Availability information below, of the original version.
-
-	This software is provided free of charge.
-
-    Availability:
-
-	This file is located at
-
-		http://www.cise.ufl.edu/~davis/colamd/colamd.c
-
-	The colamd.h file is required, located in the same directory.
-	The colamdmex.c file provides a Matlab interface for colamd.
-	The symamdmex.c file provides a Matlab interface for symamd, which is
-	a symmetric ordering based on this code, colamd.c.  All codes are
-	purely ANSI C compliant (they use no Unix-specific routines, include
-	files, etc.).
-*/
-
-/* ========================================================================== */
-/* === Description of user-callable routines ================================ */
-/* ========================================================================== */
-
-/*
-    Each user-callable routine (declared as PUBLIC) is briefly described below.
-    Refer to the comments preceding each routine for more details.
-
-    ----------------------------------------------------------------------------
-    colamd_recommended:
-    ----------------------------------------------------------------------------
-
-	Usage:
-
-	    Alen = colamd_recommended (nnz, n_row, n_col) ;
-
-	Purpose:
-
-	    Returns recommended value of Alen for use by colamd.  Returns -1
-	    if any input argument is negative.
-
-	Arguments:
-
-	    int nnz ;		Number of nonzeros in the matrix A.  This must
-				be the same value as p [n_col] in the call to
-				colamd - otherwise you will get a wrong value
-				of the recommended memory to use.
-	    int n_row ;		Number of rows in the matrix A.
-	    int n_col ;		Number of columns in the matrix A.
-
-    ----------------------------------------------------------------------------
-    colamd_set_defaults:
-    ----------------------------------------------------------------------------
-
-	Usage:
-
-	    colamd_set_defaults (knobs) ;
-
-	Purpose:
-
-	    Sets the default parameters.
-
-	Arguments:
-
-	    double knobs [COLAMD_KNOBS] ;	Output only.
-
-		Rows with more than (knobs [COLAMD_DENSE_ROW] * n_col) entries
-		are removed prior to ordering.  Columns with more than
-		(knobs [COLAMD_DENSE_COL] * n_row) entries are removed
-		prior to ordering, and placed last in the output column
-		ordering.  Default values of these two knobs are both 0.5.
-		Currently, only knobs [0] and knobs [1] are used, but future
-		versions may use more knobs.  If so, they will be properly set
-		to their defaults by the future version of colamd_set_defaults,
-		so that the code that calls colamd will not need to change,
-		assuming that you either use colamd_set_defaults, or pass a
-		(double *) NULL pointer as the knobs array to colamd.
-
-    ----------------------------------------------------------------------------
-    colamd:
-    ----------------------------------------------------------------------------
-
-	Usage:
-
-	    colamd (n_row, n_col, Alen, A, p, knobs) ;
-
-	Purpose:
-
-	    Computes a column ordering (Q) of A such that P(AQ)=LU or
-	    (AQ)'AQ=LL' have less fill-in and require fewer floating point
-	    operations than factorizing the unpermuted matrix A or A'A,
-	    respectively.
-
-	Arguments:
-
-	    int n_row ;
-
-		Number of rows in the matrix A.
-		Restriction:  n_row >= 0.
-		Colamd returns FALSE if n_row is negative.
-
-	    int n_col ;
-
-		Number of columns in the matrix A.
-		Restriction:  n_col >= 0.
-		Colamd returns FALSE if n_col is negative.
-
-	    int Alen ;
-
-		Restriction (see note):
-		Alen >= 2*nnz + 6*(n_col+1) + 4*(n_row+1) + n_col + COLAMD_STATS
-		Colamd returns FALSE if these conditions are not met.
-
-		Note:  this restriction makes an modest assumption regarding
-		the size of the two typedef'd structures, below.  We do,
-		however, guarantee that
-		Alen >= colamd_recommended (nnz, n_row, n_col)
-		will be sufficient.
-
-	    int A [Alen] ;	Input argument, stats on output.
-
-		A is an integer array of size Alen.  Alen must be at least as
-		large as the bare minimum value given above, but this is very
-		low, and can result in excessive run time.  For best
-		performance, we recommend that Alen be greater than or equal to
-		colamd_recommended (nnz, n_row, n_col), which adds
-		nnz/5 to the bare minimum value given above.
-
-		On input, the row indices of the entries in column c of the
-		matrix are held in A [(p [c]) ... (p [c+1]-1)].  The row indices
-		in a given column c need not be in ascending order, and
-		duplicate row indices may be be present.  However, colamd will
-		work a little faster if both of these conditions are met
-		(Colamd puts the matrix into this format, if it finds that the
-		the conditions are not met).
-
-		The matrix is 0-based.  That is, rows are in the range 0 to
-		n_row-1, and columns are in the range 0 to n_col-1.  Colamd
-		returns FALSE if any row index is out of range.
-
-		The contents of A are modified during ordering, and are thus
-		undefined on output with the exception of a few statistics
-		about the ordering (A [0..COLAMD_STATS-1]):
-		A [0]:  number of dense or empty rows ignored.
-		A [1]:  number of dense or empty columns ignored (and ordered
-			last in the output permutation p)
-		A [2]:  number of garbage collections performed.
-		A [3]:  0, if all row indices in each column were in sorted
-			  order, and no duplicates were present.
-			1, otherwise (in which case colamd had to do more work)
-		Note that a row can become "empty" if it contains only
-		"dense" and/or "empty" columns, and similarly a column can
-		become "empty" if it only contains "dense" and/or "empty" rows.
-		Future versions may return more statistics in A, but the usage
-		of these 4 entries in A will remain unchanged.
-
-	    int p [n_col+1] ;	Both input and output argument.
-
-		p is an integer array of size n_col+1.  On input, it holds the
-		"pointers" for the column form of the matrix A.  Column c of
-		the matrix A is held in A [(p [c]) ... (p [c+1]-1)].  The first
-		entry, p [0], must be zero, and p [c] <= p [c+1] must hold
-		for all c in the range 0 to n_col-1.  The value p [n_col] is
-		thus the total number of entries in the pattern of the matrix A.
-		Colamd returns FALSE if these conditions are not met.
-
-		On output, if colamd returns TRUE, the array p holds the column
-		permutation (Q, for P(AQ)=LU or (AQ)'(AQ)=LL'), where p [0] is
-		the first column index in the new ordering, and p [n_col-1] is
-		the last.  That is, p [k] = j means that column j of A is the
-		kth pivot column, in AQ, where k is in the range 0 to n_col-1
-		(p [0] = j means that column j of A is the first column in AQ).
-
-		If colamd returns FALSE, then no permutation is returned, and
-		p is undefined on output.
-
-	    double knobs [COLAMD_KNOBS] ;	Input only.
-
-		See colamd_set_defaults for a description.  If the knobs array
-		is not present (that is, if a (double *) NULL pointer is passed
-		in its place), then the default values of the parameters are
-		used instead.
-
-*/
-
-
-/* ========================================================================== */
-/* === Include files ======================================================== */
-/* ========================================================================== */
-
-/* limits.h:  the largest positive integer (INT_MAX) */
-#include <limits.h>
-
-/* colamd.h:  knob array size, stats output size, and global prototypes */
-#include "colamd.h"
-
-/* ========================================================================== */
-/* === Scaffolding code definitions  ======================================== */
-/* ========================================================================== */
-
-/* Ensure that debugging is turned off: */
-#ifndef NDEBUG
-#define NDEBUG
-#endif
-
-/* assert.h:  the assert macro (no debugging if NDEBUG is defined) */
-#include <assert.h>
-
-/*
-   Our "scaffolding code" philosophy:  In our opinion, well-written library
-   code should keep its "debugging" code, and just normally have it turned off
-   by the compiler so as not to interfere with performance.  This serves
-   several purposes:
-
-   (1) assertions act as comments to the reader, telling you what the code
-	expects at that point.  All assertions will always be true (unless
-	there really is a bug, of course).
-
-   (2) leaving in the scaffolding code assists anyone who would like to modify
-	the code, or understand the algorithm (by reading the debugging output,
-	one can get a glimpse into what the code is doing).
-
-   (3) (gasp!) for actually finding bugs.  This code has been heavily tested
-	and "should" be fully functional and bug-free ... but you never know...
-
-    To enable debugging, comment out the "#define NDEBUG" above.  The code will
-    become outrageously slow when debugging is enabled.  To control the level of
-    debugging output, set an environment variable D to 0 (little), 1 (some),
-    2, 3, or 4 (lots).
-*/
-
-/* ========================================================================== */
-/* === Row and Column structures ============================================ */
-/* ========================================================================== */
-
-typedef struct ColInfo_struct
-{
-    int start ;		/* index for A of first row in this column, or DEAD */
-			/* if column is dead */
-    int length ;	/* number of rows in this column */
-    union
-    {
-	int thickness ;	/* number of original columns represented by this */
-			/* col, if the column is alive */
-	int parent ;	/* parent in parent tree super-column structure, if */
-			/* the column is dead */
-    } shared1 ;
-    union
-    {
-	int score ;	/* the score used to maintain heap, if col is alive */
-	int order ;	/* pivot ordering of this column, if col is dead */
-    } shared2 ;
-    union
-    {
-	int headhash ;	/* head of a hash bucket, if col is at the head of */
-			/* a degree list */
-	int hash ;	/* hash value, if col is not in a degree list */
-	int prev ;	/* previous column in degree list, if col is in a */
-			/* degree list (but not at the head of a degree list) */
-    } shared3 ;
-    union
-    {
-	int degree_next ;	/* next column, if col is in a degree list */
-	int hash_next ;		/* next column, if col is in a hash list */
-    } shared4 ;
-
-} ColInfo ;
-
-typedef struct RowInfo_struct
-{
-    int start ;		/* index for A of first col in this row */
-    int length ;	/* number of principal columns in this row */
-    union
-    {
-	int degree ;	/* number of principal & non-principal columns in row */
-	int p ;		/* used as a row pointer in init_rows_cols () */
-    } shared1 ;
-    union
-    {
-	int mark ;	/* for computing set differences and marking dead rows*/
-	int first_column ;/* first column in row (used in garbage collection) */
-    } shared2 ;
-
-} RowInfo ;
-
-/* ========================================================================== */
-/* === Definitions ========================================================== */
-/* ========================================================================== */
-
-#define MAX(a,b) (((a) > (b)) ? (a) : (b))
-#define MIN(a,b) (((a) < (b)) ? (a) : (b))
-
-#define ONES_COMPLEMENT(r) (-(r)-1)
-
-#define TRUE	(1)
-#define FALSE	(0)
-#define EMPTY	(-1)
-
-/* Row and column status */
-#define ALIVE	(0)
-#define DEAD	(-1)
-
-/* Column status */
-#define DEAD_PRINCIPAL		(-1)
-#define DEAD_NON_PRINCIPAL	(-2)
-
-/* Macros for row and column status update and checking. */
-#define ROW_IS_DEAD(r)			ROW_IS_MARKED_DEAD (Row[r].shared2.mark)
-#define ROW_IS_MARKED_DEAD(row_mark)	(row_mark < ALIVE)
-#define ROW_IS_ALIVE(r)			(Row [r].shared2.mark >= ALIVE)
-#define COL_IS_DEAD(c)			(Col [c].start < ALIVE)
-#define COL_IS_ALIVE(c)			(Col [c].start >= ALIVE)
-#define COL_IS_DEAD_PRINCIPAL(c)	(Col [c].start == DEAD_PRINCIPAL)
-#define KILL_ROW(r)			{ Row [r].shared2.mark = DEAD ; }
-#define KILL_PRINCIPAL_COL(c)		{ Col [c].start = DEAD_PRINCIPAL ; }
-#define KILL_NON_PRINCIPAL_COL(c)	{ Col [c].start = DEAD_NON_PRINCIPAL ; }
-
-/* Routines are either PUBLIC (user-callable) or PRIVATE (not user-callable) */
-#define PUBLIC
-#define PRIVATE static
-
-/* ========================================================================== */
-/* === Prototypes of PRIVATE routines ======================================= */
-/* ========================================================================== */
-
-PRIVATE int init_rows_cols
-(
-    int n_row,
-    int n_col,
-    RowInfo Row [],
-    ColInfo Col [],
-    int A [],
-    int p []
-) ;
-
-PRIVATE void init_scoring
-(
-    int n_row,
-    int n_col,
-    RowInfo Row [],
-    ColInfo Col [],
-    int A [],
-    int head [],
-    double knobs [COLAMD_KNOBS],
-    int *p_n_row2,
-    int *p_n_col2,
-    int *p_max_deg
-) ;
-
-PRIVATE int find_ordering
-(
-    int n_row,
-    int n_col,
-    int Alen,
-    RowInfo Row [],
-    ColInfo Col [],
-    int A [],
-    int head [],
-    int n_col2,
-    int max_deg,
-    int pfree
-) ;
-
-PRIVATE void order_children
-(
-    int n_col,
-    ColInfo Col [],
-    int p []
-) ;
-
-PRIVATE void detect_super_cols
-(
-#ifndef NDEBUG
-    int n_col,
-    RowInfo Row [],
-#endif
-    ColInfo Col [],
-    int A [],
-    int head [],
-    int row_start,
-    int row_length
-) ;
-
-PRIVATE int garbage_collection
-(
-    int n_row,
-    int n_col,
-    RowInfo Row [],
-    ColInfo Col [],
-    int A [],
-    int *pfree
-) ;
-
-PRIVATE int clear_mark
-(
-    int n_row,
-    RowInfo Row []
-) ;
-
-/* ========================================================================== */
-/* === Debugging definitions ================================================ */
-/* ========================================================================== */
-
-#ifndef NDEBUG
-
-/* === With debugging ======================================================= */
-
-/* stdlib.h: for getenv and atoi, to get debugging level from environment */
-#include <stdlib.h>
-
-/* stdio.h:  for printf (no printing if debugging is turned off) */
-#include <stdio.h>
-
-PRIVATE void debug_deg_lists
-(
-    int n_row,
-    int n_col,
-    RowInfo Row [],
-    ColInfo Col [],
-    int head [],
-    int min_score,
-    int should,
-    int max_deg
-) ;
-
-PRIVATE void debug_mark
-(
-    int n_row,
-    RowInfo Row [],
-    int tag_mark,
-    int max_mark
-) ;
-
-PRIVATE void debug_matrix
-(
-    int n_row,
-    int n_col,
-    RowInfo Row [],
-    ColInfo Col [],
-    int A []
-) ;
-
-PRIVATE void debug_structures
-(
-    int n_row,
-    int n_col,
-    RowInfo Row [],
-    ColInfo Col [],
-    int A [],
-    int n_col2
-) ;
-
-/* the following is the *ONLY* global variable in this file, and is only */
-/* present when debugging */
-
-PRIVATE int debug_colamd ;	/* debug print level */
-
-#define DEBUG0(params) { (void) printf params ; }
-#define DEBUG1(params) { if (debug_colamd >= 1) (void) printf params ; }
-#define DEBUG2(params) { if (debug_colamd >= 2) (void) printf params ; }
-#define DEBUG3(params) { if (debug_colamd >= 3) (void) printf params ; }
-#define DEBUG4(params) { if (debug_colamd >= 4) (void) printf params ; }
-
-#else
-
-/* === No debugging ========================================================= */
-
-#define DEBUG0(params) ;
-#define DEBUG1(params) ;
-#define DEBUG2(params) ;
-#define DEBUG3(params) ;
-#define DEBUG4(params) ;
-
-#endif
-
-/* ========================================================================== */
-
-
-/* ========================================================================== */
-/* === USER-CALLABLE ROUTINES: ============================================== */
-/* ========================================================================== */
-
-
-/* ========================================================================== */
-/* === colamd_recommended =================================================== */
-/* ========================================================================== */
-
-/*
-    The colamd_recommended routine returns the suggested size for Alen.  This
-    value has been determined to provide good balance between the number of
-    garbage collections and the memory requirements for colamd.
-*/
-
-PUBLIC int colamd_recommended	/* returns recommended value of Alen. */
-(
-    /* === Parameters ======================================================= */
-
-    int nnz,			/* number of nonzeros in A */
-    int n_row,			/* number of rows in A */
-    int n_col			/* number of columns in A */
-)
-{
-    /* === Local variables ================================================== */
-
-    int minimum ;		/* bare minimum requirements */
-    int recommended ;		/* recommended value of Alen */
-
-    if (nnz < 0 || n_row < 0 || n_col < 0)
-    {
-	/* return -1 if any input argument is corrupted */
-	DEBUG0 (("colamd_recommended error!")) ;
-	DEBUG0 ((" nnz: %d, n_row: %d, n_col: %d\n", nnz, n_row, n_col)) ;
-	return (-1) ;
-    }
-
-    minimum =
-	2 * (nnz)		/* for A */
-	+ (((n_col) + 1) * sizeof (ColInfo) / sizeof (int))	/* for Col */
-	+ (((n_row) + 1) * sizeof (RowInfo) / sizeof (int))	/* for Row */
-	+ n_col			/* minimum elbow room to guarrantee success */
-	+ COLAMD_STATS ;	/* for output statistics */
-
-    /* recommended is equal to the minumum plus enough memory to keep the */
-    /* number garbage collections low */
-    recommended = minimum + nnz/5 ;
-
-    return (recommended) ;
-}
-
-
-/* ========================================================================== */
-/* === colamd_set_defaults ================================================== */
-/* ========================================================================== */
-
-/*
-    The colamd_set_defaults routine sets the default values of the user-
-    controllable parameters for colamd:
-
-	knobs [0]	rows with knobs[0]*n_col entries or more are removed
-			prior to ordering.
-
-	knobs [1]	columns with knobs[1]*n_row entries or more are removed
-			prior to ordering, and placed last in the column
-			permutation.
-
-	knobs [2..19]	unused, but future versions might use this
-*/
-
-PUBLIC void colamd_set_defaults
-(
-    /* === Parameters ======================================================= */
-
-    double knobs [COLAMD_KNOBS]		/* knob array */
-)
-{
-    /* === Local variables ================================================== */
-
-    int i ;
-
-    if (!knobs)
-    {
-	return ;			/* no knobs to initialize */
-    }
-    for (i = 0 ; i < COLAMD_KNOBS ; i++)
-    {
-	knobs [i] = 0 ;
-    }
-    knobs [COLAMD_DENSE_ROW] = 0.5 ;	/* ignore rows over 50% dense */
-    knobs [COLAMD_DENSE_COL] = 0.5 ;	/* ignore columns over 50% dense */
-}
-
-
-/* ========================================================================== */
-/* === colamd =============================================================== */
-/* ========================================================================== */
-
-/*
-    The colamd routine computes a column ordering Q of a sparse matrix
-    A such that the LU factorization P(AQ) = LU remains sparse, where P is
-    selected via partial pivoting.   The routine can also be viewed as
-    providing a permutation Q such that the Cholesky factorization
-    (AQ)'(AQ) = LL' remains sparse.
-
-    On input, the nonzero patterns of the columns of A are stored in the
-    array A, in order 0 to n_col-1.  A is held in 0-based form (rows in the
-    range 0 to n_row-1 and columns in the range 0 to n_col-1).  Row indices
-    for column c are located in A [(p [c]) ... (p [c+1]-1)], where p [0] = 0,
-    and thus p [n_col] is the number of entries in A.  The matrix is
-    destroyed on output.  The row indices within each column do not have to
-    be sorted (from small to large row indices), and duplicate row indices
-    may be present.  However, colamd will work a little faster if columns are
-    sorted and no duplicates are present.  Matlab 5.2 always passes the matrix
-    with sorted columns, and no duplicates.
-
-    The integer array A is of size Alen.  Alen must be at least of size
-    (where nnz is the number of entries in A):
-
-	nnz			for the input column form of A
-	+ nnz			for a row form of A that colamd generates
-	+ 6*(n_col+1)		for a ColInfo Col [0..n_col] array
-				(this assumes sizeof (ColInfo) is 6 int's).
-	+ 4*(n_row+1)		for a RowInfo Row [0..n_row] array
-				(this assumes sizeof (RowInfo) is 4 int's).
-	+ elbow_room		must be at least n_col.  We recommend at least
-				nnz/5 in addition to that.  If sufficient,
-				changes in the elbow room affect the ordering
-				time only, not the ordering itself.
-	+ COLAMD_STATS		for the output statistics
-
-    Colamd returns FALSE is memory is insufficient, or TRUE otherwise.
-
-    On input, the caller must specify:
-
-	n_row			the number of rows of A
-	n_col			the number of columns of A
-	Alen			the size of the array A
-	A [0 ... nnz-1]		the row indices, where nnz = p [n_col]
-	A [nnz ... Alen-1]	(need not be initialized by the user)
-	p [0 ... n_col]		the column pointers,  p [0] = 0, and p [n_col]
-				is the number of entries in A.  Column c of A
-				is stored in A [p [c] ... p [c+1]-1].
-	knobs [0 ... 19]	a set of parameters that control the behavior
-				of colamd.  If knobs is a NULL pointer the
-				defaults are used.  The user-callable
-				colamd_set_defaults routine sets the default
-				parameters.  See that routine for a description
-				of the user-controllable parameters.
-
-    If the return value of Colamd is TRUE, then on output:
-
-	p [0 ... n_col-1]	the column permutation. p [0] is the first
-				column index, and p [n_col-1] is the last.
-				That is, p [k] = j means that column j of A
-				is the kth column of AQ.
-
-	A			is undefined on output (the matrix pattern is
-				destroyed), except for the following statistics:
-
-	A [0]			the number of dense (or empty) rows ignored
-	A [1]			the number of dense (or empty) columms.  These
-				are ordered last, in their natural order.
-	A [2]			the number of garbage collections performed.
-				If this is excessive, then you would have
-				gotten your results faster if Alen was larger.
-	A [3]			0, if all row indices in each column were in
-				sorted order and no duplicates were present.
-				1, if there were unsorted or duplicate row
-				indices in the input.  You would have gotten
-				your results faster if A [3] was returned as 0.
-
-    If the return value of Colamd is FALSE, then A and p are undefined on
-    output.
-*/
-
-PUBLIC int colamd		/* returns TRUE if successful */
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,			/* number of rows in A */
-    int n_col,			/* number of columns in A */
-    int Alen,			/* length of A */
-    int A [],			/* row indices of A */
-    int p [],			/* pointers to columns in A */
-    double knobs [COLAMD_KNOBS]	/* parameters (uses defaults if NULL) */
-)
-{
-    /* === Local variables ================================================== */
-
-    int i ;			/* loop index */
-    int nnz ;			/* nonzeros in A */
-    int Row_size ;		/* size of Row [], in integers */
-    int Col_size ;		/* size of Col [], in integers */
-    int elbow_room ;		/* remaining free space */
-    RowInfo *Row ;		/* pointer into A of Row [0..n_row] array */
-    ColInfo *Col ;		/* pointer into A of Col [0..n_col] array */
-    int n_col2 ;		/* number of non-dense, non-empty columns */
-    int n_row2 ;		/* number of non-dense, non-empty rows */
-    int ngarbage ;		/* number of garbage collections performed */
-    int max_deg ;		/* maximum row degree */
-    double default_knobs [COLAMD_KNOBS] ;	/* default knobs knobs array */
-    int init_result ;		/* return code from initialization */
-
-#ifndef NDEBUG
-    debug_colamd = 0 ;		/* no debug printing */
-    /* get "D" environment variable, which gives the debug printing level */
-    if (getenv ("D")) debug_colamd = atoi (getenv ("D")) ;
-    DEBUG0 (("debug version, D = %d (THIS WILL BE SLOOOOW!)\n", debug_colamd)) ;
-#endif
-
-    /* === Check the input arguments ======================================== */
-
-    if (n_row < 0 || n_col < 0 || !A || !p)
-    {
-	/* n_row and n_col must be non-negative, A and p must be present */
-	DEBUG0 (("colamd error! %d %d %d\n", n_row, n_col, Alen)) ;
-	return (FALSE) ;
-    }
-    nnz = p [n_col] ;
-    if (nnz < 0 || p [0] != 0)
-    {
-	/* nnz must be non-negative, and p [0] must be zero */
-	DEBUG0 (("colamd error! %d %d\n", nnz, p [0])) ;
-	return (FALSE) ;
-    }
-
-    /* === If no knobs, set default parameters ============================== */
-
-    if (!knobs)
-    {
-	knobs = default_knobs ;
-	colamd_set_defaults (knobs) ;
-    }
-
-    /* === Allocate the Row and Col arrays from array A ===================== */
-
-    Col_size = (n_col + 1) * sizeof (ColInfo) / sizeof (int) ;
-    Row_size = (n_row + 1) * sizeof (RowInfo) / sizeof (int) ;
-    elbow_room = Alen - (2*nnz + Col_size + Row_size) ;
-    if (elbow_room < n_col + COLAMD_STATS)
-    {
-	/* not enough space in array A to perform the ordering */
-	DEBUG0 (("colamd error! elbow_room %d, %d\n", elbow_room,n_col)) ;
-	return (FALSE) ;
-    }
-    Alen = 2*nnz + elbow_room ;
-    Col  = (ColInfo *) &A [Alen] ;
-    Row  = (RowInfo *) &A [Alen + Col_size] ;
-
-    /* === Construct the row and column data structures ===================== */
-
-    init_result = init_rows_cols (n_row, n_col, Row, Col, A, p) ;
-    if (init_result == -1)
-    {
-	/* input matrix is invalid */
-	DEBUG0 (("colamd error! matrix invalid\n")) ;
-	return (FALSE) ;
-    }
-
-    /* === Initialize scores, kill dense rows/columns ======================= */
-
-    init_scoring (n_row, n_col, Row, Col, A, p, knobs,
-	&n_row2, &n_col2, &max_deg) ;
-
-    /* === Order the supercolumns =========================================== */
-
-    ngarbage = find_ordering (n_row, n_col, Alen, Row, Col, A, p,
-	n_col2, max_deg, 2*nnz) ;
-
-    /* === Order the non-principal columns ================================== */
-
-    order_children (n_col, Col, p) ;
-
-    /* === Return statistics in A =========================================== */
-
-    for (i = 0 ; i < COLAMD_STATS ; i++)
-    {
-	A [i] = 0 ;
-    }
-    A [COLAMD_DENSE_ROW] = n_row - n_row2 ;
-    A [COLAMD_DENSE_COL] = n_col - n_col2 ;
-    A [COLAMD_DEFRAG_COUNT] = ngarbage ;
-    A [COLAMD_JUMBLED_COLS] = init_result ;
-
-    return (TRUE) ;
-}
-
-
-/* ========================================================================== */
-/* === NON-USER-CALLABLE ROUTINES: ========================================== */
-/* ========================================================================== */
-
-/* There are no user-callable routines beyond this point in the file */
-
-
-/* ========================================================================== */
-/* === init_rows_cols ======================================================= */
-/* ========================================================================== */
-
-/*
-    Takes the column form of the matrix in A and creates the row form of the
-    matrix.  Also, row and column attributes are stored in the Col and Row
-    structs.  If the columns are un-sorted or contain duplicate row indices,
-    this routine will also sort and remove duplicate row indices from the
-    column form of the matrix.  Returns -1 on error, 1 if columns jumbled,
-    or 0 if columns not jumbled.  Not user-callable.
-*/
-
-PRIVATE int init_rows_cols	/* returns status code */
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,			/* number of rows of A */
-    int n_col,			/* number of columns of A */
-    RowInfo Row [],		/* of size n_row+1 */
-    ColInfo Col [],		/* of size n_col+1 */
-    int A [],			/* row indices of A, of size Alen */
-    int p []			/* pointers to columns in A, of size n_col+1 */
-)
-{
-    /* === Local variables ================================================== */
-
-    int col ;			/* a column index */
-    int row ;			/* a row index */
-    int *cp ;			/* a column pointer */
-    int *cp_end ;		/* a pointer to the end of a column */
-    int *rp ;			/* a row pointer */
-    int *rp_end ;		/* a pointer to the end of a row */
-    int last_start ;		/* start index of previous column in A */
-    int start ;			/* start index of column in A */
-    int last_row ;		/* previous row */
-    int jumbled_columns ;	/* indicates if columns are jumbled */
-
-    /* === Initialize columns, and check column pointers ==================== */
-
-    last_start = 0 ;
-    for (col = 0 ; col < n_col ; col++)
-    {
-	start = p [col] ;
-	if (start < last_start)
-	{
-	    /* column pointers must be non-decreasing */
-	    DEBUG0 (("colamd error!  last p %d p [col] %d\n",last_start,start));
-	    return (-1) ;
-	}
-	Col [col].start = start ;
-	Col [col].length = p [col+1] - start ;
-	Col [col].shared1.thickness = 1 ;
-	Col [col].shared2.score = 0 ;
-	Col [col].shared3.prev = EMPTY ;
-	Col [col].shared4.degree_next = EMPTY ;
-	last_start = start ;
-    }
-    /* must check the end pointer for last column */
-    if (p [n_col] < last_start)
-    {
-	/* column pointers must be non-decreasing */
-	DEBUG0 (("colamd error!  last p %d p [n_col] %d\n",p[col],last_start)) ;
-	return (-1) ;
-    }
-
-    /* p [0..n_col] no longer needed, used as "head" in subsequent routines */
-
-    /* === Scan columns, compute row degrees, and check row indices ========= */
-
-    jumbled_columns = FALSE ;
-
-    for (row = 0 ; row < n_row ; row++)
-    {
-	Row [row].length = 0 ;
-	Row [row].shared2.mark = -1 ;
-    }
-
-    for (col = 0 ; col < n_col ; col++)
-    {
-	last_row = -1 ;
-
-	cp = &A [p [col]] ;
-	cp_end = &A [p [col+1]] ;
-
-	while (cp < cp_end)
-	{
-	    row = *cp++ ;
-
-	    /* make sure row indices within range */
-	    if (row < 0 || row >= n_row)
-	    {
-		DEBUG0 (("colamd error!  col %d row %d last_row %d\n",
-			 col, row, last_row)) ;
-		return (-1) ;
-	    }
-	    else if (row <= last_row)
-	    {
-		/* row indices are not sorted or repeated, thus cols */
-		/* are jumbled */
-		jumbled_columns = TRUE ;
-	    }
-	    /* prevent repeated row from being counted */
-	    if (Row [row].shared2.mark != col)
-	    {
-		Row [row].length++ ;
-		Row [row].shared2.mark = col ;
-		last_row = row ;
-	    }
-	    else
-	    {
-		/* this is a repeated entry in the column, */
-		/* it will be removed */
-		Col [col].length-- ;
-	    }
-	}
-    }
-
-    /* === Compute row pointers ============================================= */
-
-    /* row form of the matrix starts directly after the column */
-    /* form of matrix in A */
-    Row [0].start = p [n_col] ;
-    Row [0].shared1.p = Row [0].start ;
-    Row [0].shared2.mark = -1 ;
-    for (row = 1 ; row < n_row ; row++)
-    {
-	Row [row].start = Row [row-1].start + Row [row-1].length ;
-	Row [row].shared1.p = Row [row].start ;
-	Row [row].shared2.mark = -1 ;
-    }
-
-    /* === Create row form ================================================== */
-
-    if (jumbled_columns)
-    {
-	/* if cols jumbled, watch for repeated row indices */
-	for (col = 0 ; col < n_col ; col++)
-	{
-	    cp = &A [p [col]] ;
-	    cp_end = &A [p [col+1]] ;
-	    while (cp < cp_end)
-	    {
-		row = *cp++ ;
-		if (Row [row].shared2.mark != col)
-		{
-		    A [(Row [row].shared1.p)++] = col ;
-		    Row [row].shared2.mark = col ;
-		}
-	    }
-	}
-    }
-    else
-    {
-	/* if cols not jumbled, we don't need the mark (this is faster) */
-	for (col = 0 ; col < n_col ; col++)
-	{
-	    cp = &A [p [col]] ;
-	    cp_end = &A [p [col+1]] ;
-	    while (cp < cp_end)
-	    {
-		A [(Row [*cp++].shared1.p)++] = col ;
-	    }
-	}
-    }
-
-    /* === Clear the row marks and set row degrees ========================== */
-
-    for (row = 0 ; row < n_row ; row++)
-    {
-	Row [row].shared2.mark = 0 ;
-	Row [row].shared1.degree = Row [row].length ;
-    }
-
-    /* === See if we need to re-create columns ============================== */
-
-    if (jumbled_columns)
-    {
-
-#ifndef NDEBUG
-	/* make sure column lengths are correct */
-	for (col = 0 ; col < n_col ; col++)
-	{
-	    p [col] = Col [col].length ;
-	}
-	for (row = 0 ; row < n_row ; row++)
-	{
-	    rp = &A [Row [row].start] ;
-	    rp_end = rp + Row [row].length ;
-	    while (rp < rp_end)
-	    {
-		p [*rp++]-- ;
-	    }
-	}
-	for (col = 0 ; col < n_col ; col++)
-	{
-	    assert (p [col] == 0) ;
-	}
-	/* now p is all zero (different than when debugging is turned off) */
-#endif
-
-	/* === Compute col pointers ========================================= */
-
-	/* col form of the matrix starts at A [0]. */
-	/* Note, we may have a gap between the col form and the row */
-	/* form if there were duplicate entries, if so, it will be */
-	/* removed upon the first garbage collection */
-	Col [0].start = 0 ;
-	p [0] = Col [0].start ;
-	for (col = 1 ; col < n_col ; col++)
-	{
-	    /* note that the lengths here are for pruned columns, i.e. */
-	    /* no duplicate row indices will exist for these columns */
-	    Col [col].start = Col [col-1].start + Col [col-1].length ;
-	    p [col] = Col [col].start ;
-	}
-
-	/* === Re-create col form =========================================== */
-
-	for (row = 0 ; row < n_row ; row++)
-	{
-	    rp = &A [Row [row].start] ;
-	    rp_end = rp + Row [row].length ;
-	    while (rp < rp_end)
-	    {
-		A [(p [*rp++])++] = row ;
-	    }
-	}
-	return (1) ;
-    }
-    else
-    {
-	/* no columns jumbled (this is faster) */
-	return (0) ;
-    }
-}
-
-
-/* ========================================================================== */
-/* === init_scoring ========================================================= */
-/* ========================================================================== */
-
-/*
-    Kills dense or empty columns and rows, calculates an initial score for
-    each column, and places all columns in the degree lists.  Not user-callable.
-*/
-
-PRIVATE void init_scoring
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,			/* number of rows of A */
-    int n_col,			/* number of columns of A */
-    RowInfo Row [],		/* of size n_row+1 */
-    ColInfo Col [],		/* of size n_col+1 */
-    int A [],			/* column form and row form of A */
-    int head [],		/* of size n_col+1 */
-    double knobs [COLAMD_KNOBS],/* parameters */
-    int *p_n_row2,		/* number of non-dense, non-empty rows */
-    int *p_n_col2,		/* number of non-dense, non-empty columns */
-    int *p_max_deg		/* maximum row degree */
-)
-{
-    /* === Local variables ================================================== */
-
-    int c ;			/* a column index */
-    int r, row ;		/* a row index */
-    int *cp ;			/* a column pointer */
-    int deg ;			/* degree (# entries) of a row or column */
-    int *cp_end ;		/* a pointer to the end of a column */
-    int *new_cp ;		/* new column pointer */
-    int col_length ;		/* length of pruned column */
-    int score ;			/* current column score */
-    int n_col2 ;		/* number of non-dense, non-empty columns */
-    int n_row2 ;		/* number of non-dense, non-empty rows */
-    int dense_row_count ;	/* remove rows with more entries than this */
-    int dense_col_count ;	/* remove cols with more entries than this */
-    int min_score ;		/* smallest column score */
-    int max_deg ;		/* maximum row degree */
-    int next_col ;		/* Used to add to degree list.*/
-#ifndef NDEBUG
-    int debug_count ;		/* debug only. */
-#endif
-
-    /* === Extract knobs ==================================================== */
-
-    dense_row_count = MAX (0, MIN (knobs [COLAMD_DENSE_ROW] * n_col, n_col)) ;
-    dense_col_count = MAX (0, MIN (knobs [COLAMD_DENSE_COL] * n_row, n_row)) ;
-    DEBUG0 (("densecount: %d %d\n", dense_row_count, dense_col_count)) ;
-    max_deg = 0 ;
-    n_col2 = n_col ;
-    n_row2 = n_row ;
-
-    /* === Kill empty columns =============================================== */
-
-    /* Put the empty columns at the end in their natural, so that LU */
-    /* factorization can proceed as far as possible. */
-    for (c = n_col-1 ; c >= 0 ; c--)
-    {
-	deg = Col [c].length ;
-	if (deg == 0)
-	{
-	    /* this is a empty column, kill and order it last */
-	    Col [c].shared2.order = --n_col2 ;
-	    KILL_PRINCIPAL_COL (c) ;
-	}
-    }
-    DEBUG0 (("null columns killed: %d\n", n_col - n_col2)) ;
-
-    /* === Kill dense columns =============================================== */
-
-    /* Put the dense columns at the end, in their natural order */
-    for (c = n_col-1 ; c >= 0 ; c--)
-    {
-	/* skip any dead columns */
-	if (COL_IS_DEAD (c))
-	{
-	    continue ;
-	}
-	deg = Col [c].length ;
-	if (deg > dense_col_count)
-	{
-	    /* this is a dense column, kill and order it last */
-	    Col [c].shared2.order = --n_col2 ;
-	    /* decrement the row degrees */
-	    cp = &A [Col [c].start] ;
-	    cp_end = cp + Col [c].length ;
-	    while (cp < cp_end)
-	    {
-		Row [*cp++].shared1.degree-- ;
-	    }
-	    KILL_PRINCIPAL_COL (c) ;
-	}
-    }
-    DEBUG0 (("Dense and null columns killed: %d\n", n_col - n_col2)) ;
-
-    /* === Kill dense and empty rows ======================================== */
-
-    for (r = 0 ; r < n_row ; r++)
-    {
-	deg = Row [r].shared1.degree ;
-	assert (deg >= 0 && deg <= n_col) ;
-	if (deg > dense_row_count || deg == 0)
-	{
-	    /* kill a dense or empty row */
-	    KILL_ROW (r) ;
-	    --n_row2 ;
-	}
-	else
-	{
-	    /* keep track of max degree of remaining rows */
-	    max_deg = MAX (max_deg, deg) ;
-	}
-    }
-    DEBUG0 (("Dense and null rows killed: %d\n", n_row - n_row2)) ;
-
-    /* === Compute initial column scores ==================================== */
-
-    /* At this point the row degrees are accurate.  They reflect the number */
-    /* of "live" (non-dense) columns in each row.  No empty rows exist. */
-    /* Some "live" columns may contain only dead rows, however.  These are */
-    /* pruned in the code below. */
-
-    /* now find the initial matlab score for each column */
-    for (c = n_col-1 ; c >= 0 ; c--)
-    {
-	/* skip dead column */
-	if (COL_IS_DEAD (c))
-	{
-	    continue ;
-	}
-	score = 0 ;
-	cp = &A [Col [c].start] ;
-	new_cp = cp ;
-	cp_end = cp + Col [c].length ;
-	while (cp < cp_end)
-	{
-	    /* get a row */
-	    row = *cp++ ;
-	    /* skip if dead */
-	    if (ROW_IS_DEAD (row))
-	    {
-		continue ;
-	    }
-	    /* compact the column */
-	    *new_cp++ = row ;
-	    /* add row's external degree */
-	    score += Row [row].shared1.degree - 1 ;
-	    /* guard against integer overflow */
-	    score = MIN (score, n_col) ;
-	}
-	/* determine pruned column length */
-	col_length = (int) (new_cp - &A [Col [c].start]) ;
-	if (col_length == 0)
-	{
-	    /* a newly-made null column (all rows in this col are "dense" */
-	    /* and have already been killed) */
-	    DEBUG0 (("Newly null killed: %d\n", c)) ;
-	    Col [c].shared2.order = --n_col2 ;
-	    KILL_PRINCIPAL_COL (c) ;
-	}
-	else
-	{
-	    /* set column length and set score */
-	    assert (score >= 0) ;
-	    assert (score <= n_col) ;
-	    Col [c].length = col_length ;
-	    Col [c].shared2.score = score ;
-	}
-    }
-    DEBUG0 (("Dense, null, and newly-null columns killed: %d\n",n_col-n_col2)) ;
-
-    /* At this point, all empty rows and columns are dead.  All live columns */
-    /* are "clean" (containing no dead rows) and simplicial (no supercolumns */
-    /* yet).  Rows may contain dead columns, but all live rows contain at */
-    /* least one live column. */
-
-#ifndef NDEBUG
-    debug_structures (n_row, n_col, Row, Col, A, n_col2) ;
-#endif
-
-    /* === Initialize degree lists ========================================== */
-
-#ifndef NDEBUG
-    debug_count = 0 ;
-#endif
-
-    /* clear the hash buckets */
-    for (c = 0 ; c <= n_col ; c++)
-    {
-	head [c] = EMPTY ;
-    }
-    min_score = n_col ;
-    /* place in reverse order, so low column indices are at the front */
-    /* of the lists.  This is to encourage natural tie-breaking */
-    for (c = n_col-1 ; c >= 0 ; c--)
-    {
-	/* only add principal columns to degree lists */
-	if (COL_IS_ALIVE (c))
-	{
-	    DEBUG4 (("place %d score %d minscore %d ncol %d\n",
-		c, Col [c].shared2.score, min_score, n_col)) ;
-
-	    /* === Add columns score to DList =============================== */
-
-	    score = Col [c].shared2.score ;
-
-	    assert (min_score >= 0) ;
-	    assert (min_score <= n_col) ;
-	    assert (score >= 0) ;
-	    assert (score <= n_col) ;
-	    assert (head [score] >= EMPTY) ;
-
-	    /* now add this column to dList at proper score location */
-	    next_col = head [score] ;
-	    Col [c].shared3.prev = EMPTY ;
-	    Col [c].shared4.degree_next = next_col ;
-
-	    /* if there already was a column with the same score, set its */
-	    /* previous pointer to this new column */
-	    if (next_col != EMPTY)
-	    {
-		Col [next_col].shared3.prev = c ;
-	    }
-	    head [score] = c ;
-
-	    /* see if this score is less than current min */
-	    min_score = MIN (min_score, score) ;
-
-#ifndef NDEBUG
-	    debug_count++ ;
-#endif
-	}
-    }
-
-#ifndef NDEBUG
-    DEBUG0 (("Live cols %d out of %d, non-princ: %d\n",
-	debug_count, n_col, n_col-debug_count)) ;
-    assert (debug_count == n_col2) ;
-    debug_deg_lists (n_row, n_col, Row, Col, head, min_score, n_col2, max_deg) ;
-#endif
-
-    /* === Return number of remaining columns, and max row degree =========== */
-
-    *p_n_col2 = n_col2 ;
-    *p_n_row2 = n_row2 ;
-    *p_max_deg = max_deg ;
-}
-
-
-/* ========================================================================== */
-/* === find_ordering ======================================================== */
-/* ========================================================================== */
-
-/*
-    Order the principal columns of the supercolumn form of the matrix
-    (no supercolumns on input).  Uses a minimum approximate column minimum
-    degree ordering method.  Not user-callable.
-*/
-
-PRIVATE int find_ordering	/* return the number of garbage collections */
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,			/* number of rows of A */
-    int n_col,			/* number of columns of A */
-    int Alen,			/* size of A, 2*nnz + elbow_room or larger */
-    RowInfo Row [],		/* of size n_row+1 */
-    ColInfo Col [],		/* of size n_col+1 */
-    int A [],			/* column form and row form of A */
-    int head [],		/* of size n_col+1 */
-    int n_col2,			/* Remaining columns to order */
-    int max_deg,		/* Maximum row degree */
-    int pfree			/* index of first free slot (2*nnz on entry) */
-)
-{
-    /* === Local variables ================================================== */
-
-    int k ;			/* current pivot ordering step */
-    int pivot_col ;		/* current pivot column */
-    int *cp ;			/* a column pointer */
-    int *rp ;			/* a row pointer */
-    int pivot_row ;		/* current pivot row */
-    int *new_cp ;		/* modified column pointer */
-    int *new_rp ;		/* modified row pointer */
-    int pivot_row_start ;	/* pointer to start of pivot row */
-    int pivot_row_degree ;	/* # of columns in pivot row */
-    int pivot_row_length ;	/* # of supercolumns in pivot row */
-    int pivot_col_score ;	/* score of pivot column */
-    int needed_memory ;		/* free space needed for pivot row */
-    int *cp_end ;		/* pointer to the end of a column */
-    int *rp_end ;		/* pointer to the end of a row */
-    int row ;			/* a row index */
-    int col ;			/* a column index */
-    int max_score ;		/* maximum possible score */
-    int cur_score ;		/* score of current column */
-    unsigned int hash ;		/* hash value for supernode detection */
-    int head_column ;		/* head of hash bucket */
-    int first_col ;		/* first column in hash bucket */
-    int tag_mark ;		/* marker value for mark array */
-    int row_mark ;		/* Row [row].shared2.mark */
-    int set_difference ;	/* set difference size of row with pivot row */
-    int min_score ;		/* smallest column score */
-    int col_thickness ;		/* "thickness" (# of columns in a supercol) */
-    int max_mark ;		/* maximum value of tag_mark */
-    int pivot_col_thickness ;	/* number of columns represented by pivot col */
-    int prev_col ;		/* Used by Dlist operations. */
-    int next_col ;		/* Used by Dlist operations. */
-    int ngarbage ;		/* number of garbage collections performed */
-#ifndef NDEBUG
-    int debug_d ;		/* debug loop counter */
-    int debug_step = 0 ;	/* debug loop counter */
-#endif
-
-    /* === Initialization and clear mark ==================================== */
-
-    max_mark = INT_MAX - n_col ;	/* INT_MAX defined in <limits.h> */
-    tag_mark = clear_mark (n_row, Row) ;
-    min_score = 0 ;
-    ngarbage = 0 ;
-    DEBUG0 (("Ordering.. n_col2=%d\n", n_col2)) ;
-
-    /* === Order the columns ================================================ */
-
-    for (k = 0 ; k < n_col2 ; /* 'k' is incremented below */)
-    {
-
-#ifndef NDEBUG
-	if (debug_step % 100 == 0)
-	{
-	    DEBUG0 (("\n...       Step k: %d out of n_col2: %d\n", k, n_col2)) ;
-	}
-	else
-	{
-	    DEBUG1 (("\n----------Step k: %d out of n_col2: %d\n", k, n_col2)) ;
-	}
-	debug_step++ ;
-	debug_deg_lists (n_row, n_col, Row, Col, head,
-		min_score, n_col2-k, max_deg) ;
-	debug_matrix (n_row, n_col, Row, Col, A) ;
-#endif
-
-	/* === Select pivot column, and order it ============================ */
-
-	/* make sure degree list isn't empty */
-	assert (min_score >= 0) ;
-	assert (min_score <= n_col) ;
-	assert (head [min_score] >= EMPTY) ;
-
-#ifndef NDEBUG
-	for (debug_d = 0 ; debug_d < min_score ; debug_d++)
-	{
-	    assert (head [debug_d] == EMPTY) ;
-	}
-#endif
-
-	/* get pivot column from head of minimum degree list */
-	while (head [min_score] == EMPTY && min_score < n_col)
-	{
-	    min_score++ ;
-	}
-	pivot_col = head [min_score] ;
-	assert (pivot_col >= 0 && pivot_col <= n_col) ;
-	next_col = Col [pivot_col].shared4.degree_next ;
-	head [min_score] = next_col ;
-	if (next_col != EMPTY)
-	{
-	    Col [next_col].shared3.prev = EMPTY ;
-	}
-
-	assert (COL_IS_ALIVE (pivot_col)) ;
-	DEBUG3 (("Pivot col: %d\n", pivot_col)) ;
-
-	/* remember score for defrag check */
-	pivot_col_score = Col [pivot_col].shared2.score ;
-
-	/* the pivot column is the kth column in the pivot order */
-	Col [pivot_col].shared2.order = k ;
-
-	/* increment order count by column thickness */
-	pivot_col_thickness = Col [pivot_col].shared1.thickness ;
-	k += pivot_col_thickness ;
-	assert (pivot_col_thickness > 0) ;
-
-	/* === Garbage_collection, if necessary ============================= */
-
-	needed_memory = MIN (pivot_col_score, n_col - k) ;
-	if (pfree + needed_memory >= Alen)
-	{
-	    pfree = garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ;
-	    ngarbage++ ;
-	    /* after garbage collection we will have enough */
-	    assert (pfree + needed_memory < Alen) ;
-	    /* garbage collection has wiped out the Row[].shared2.mark array */
-	    tag_mark = clear_mark (n_row, Row) ;
-#ifndef NDEBUG
-	    debug_matrix (n_row, n_col, Row, Col, A) ;
-#endif
-	}
-
-	/* === Compute pivot row pattern ==================================== */
-
-	/* get starting location for this new merged row */
-	pivot_row_start = pfree ;
-
-	/* initialize new row counts to zero */
-	pivot_row_degree = 0 ;
-
-	/* tag pivot column as having been visited so it isn't included */
-	/* in merged pivot row */
-	Col [pivot_col].shared1.thickness = -pivot_col_thickness ;
-
-	/* pivot row is the union of all rows in the pivot column pattern */
-	cp = &A [Col [pivot_col].start] ;
-	cp_end = cp + Col [pivot_col].length ;
-	while (cp < cp_end)
-	{
-	    /* get a row */
-	    row = *cp++ ;
-	    DEBUG4 (("Pivot col pattern %d %d\n", ROW_IS_ALIVE (row), row)) ;
-	    /* skip if row is dead */
-	    if (ROW_IS_DEAD (row))
-	    {
-		continue ;
-	    }
-	    rp = &A [Row [row].start] ;
-	    rp_end = rp + Row [row].length ;
-	    while (rp < rp_end)
-	    {
-		/* get a column */
-		col = *rp++ ;
-		/* add the column, if alive and untagged */
-		col_thickness = Col [col].shared1.thickness ;
-		if (col_thickness > 0 && COL_IS_ALIVE (col))
-		{
-		    /* tag column in pivot row */
-		    Col [col].shared1.thickness = -col_thickness ;
-		    assert (pfree < Alen) ;
-		    /* place column in pivot row */
-		    A [pfree++] = col ;
-		    pivot_row_degree += col_thickness ;
-		}
-	    }
-	}
-
-	/* clear tag on pivot column */
-	Col [pivot_col].shared1.thickness = pivot_col_thickness ;
-	max_deg = MAX (max_deg, pivot_row_degree) ;
-
-#ifndef NDEBUG
-	DEBUG3 (("check2\n")) ;
-	debug_mark (n_row, Row, tag_mark, max_mark) ;
-#endif
-
-	/* === Kill all rows used to construct pivot row ==================== */
-
-	/* also kill pivot row, temporarily */
-	cp = &A [Col [pivot_col].start] ;
-	cp_end = cp + Col [pivot_col].length ;
-	while (cp < cp_end)
-	{
-	    /* may be killing an already dead row */
-	    row = *cp++ ;
-	    DEBUG2 (("Kill row in pivot col: %d\n", row)) ;
-	    KILL_ROW (row) ;
-	}
-
-	/* === Select a row index to use as the new pivot row =============== */
-
-	pivot_row_length = pfree - pivot_row_start ;
-	if (pivot_row_length > 0)
-	{
-	    /* pick the "pivot" row arbitrarily (first row in col) */
-	    pivot_row = A [Col [pivot_col].start] ;
-	    DEBUG2 (("Pivotal row is %d\n", pivot_row)) ;
-	}
-	else
-	{
-	    /* there is no pivot row, since it is of zero length */
-	    pivot_row = EMPTY ;
-	    assert (pivot_row_length == 0) ;
-	}
-	assert (Col [pivot_col].length > 0 || pivot_row_length == 0) ;
-
-	/* === Approximate degree computation =============================== */
-
-	/* Here begins the computation of the approximate degree.  The column */
-	/* score is the sum of the pivot row "length", plus the size of the */
-	/* set differences of each row in the column minus the pattern of the */
-	/* pivot row itself.  The column ("thickness") itself is also */
-	/* excluded from the column score (we thus use an approximate */
-	/* external degree). */
-
-	/* The time taken by the following code (compute set differences, and */
-	/* add them up) is proportional to the size of the data structure */
-	/* being scanned - that is, the sum of the sizes of each column in */
-	/* the pivot row.  Thus, the amortized time to compute a column score */
-	/* is proportional to the size of that column (where size, in this */
-	/* context, is the column "length", or the number of row indices */
-	/* in that column).  The number of row indices in a column is */
-	/* monotonically non-decreasing, from the length of the original */
-	/* column on input to colamd. */
-
-	/* === Compute set differences ====================================== */
-
-	DEBUG1 (("** Computing set differences phase. **\n")) ;
-
-	/* pivot row is currently dead - it will be revived later. */
-
-	DEBUG2 (("Pivot row: ")) ;
-	/* for each column in pivot row */
-	rp = &A [pivot_row_start] ;
-	rp_end = rp + pivot_row_length ;
-	while (rp < rp_end)
-	{
-	    col = *rp++ ;
-	    assert (COL_IS_ALIVE (col) && col != pivot_col) ;
-	    DEBUG2 (("Col: %d\n", col)) ;
-
-	    /* clear tags used to construct pivot row pattern */
-	    col_thickness = -Col [col].shared1.thickness ;
-	    assert (col_thickness > 0) ;
-	    Col [col].shared1.thickness = col_thickness ;
-
-	    /* === Remove column from degree list =========================== */
-
-	    cur_score = Col [col].shared2.score ;
-	    prev_col = Col [col].shared3.prev ;
-	    next_col = Col [col].shared4.degree_next ;
-	    assert (cur_score >= 0) ;
-	    assert (cur_score <= n_col) ;
-	    assert (cur_score >= EMPTY) ;
-	    if (prev_col == EMPTY)
-	    {
-		head [cur_score] = next_col ;
-	    }
-	    else
-	    {
-		Col [prev_col].shared4.degree_next = next_col ;
-	    }
-	    if (next_col != EMPTY)
-	    {
-		Col [next_col].shared3.prev = prev_col ;
-	    }
-
-	    /* === Scan the column ========================================== */
-
-	    cp = &A [Col [col].start] ;
-	    cp_end = cp + Col [col].length ;
-	    while (cp < cp_end)
-	    {
-		/* get a row */
-		row = *cp++ ;
-		row_mark = Row [row].shared2.mark ;
-		/* skip if dead */
-		if (ROW_IS_MARKED_DEAD (row_mark))
-		{
-		    continue ;
-		}
-		assert (row != pivot_row) ;
-		set_difference = row_mark - tag_mark ;
-		/* check if the row has been seen yet */
-		if (set_difference < 0)
-		{
-		    assert (Row [row].shared1.degree <= max_deg) ;
-		    set_difference = Row [row].shared1.degree ;
-		}
-		/* subtract column thickness from this row's set difference */
-		set_difference -= col_thickness ;
-		assert (set_difference >= 0) ;
-		/* absorb this row if the set difference becomes zero */
-		if (set_difference == 0)
-		{
-		    DEBUG1 (("aggressive absorption. Row: %d\n", row)) ;
-		    KILL_ROW (row) ;
-		}
-		else
-		{
-		    /* save the new mark */
-		    Row [row].shared2.mark = set_difference + tag_mark ;
-		}
-	    }
-	}
-
-#ifndef NDEBUG
-	debug_deg_lists (n_row, n_col, Row, Col, head,
-		min_score, n_col2-k-pivot_row_degree, max_deg) ;
-#endif
-
-	/* === Add up set differences for each column ======================= */
-
-	DEBUG1 (("** Adding set differences phase. **\n")) ;
-
-	/* for each column in pivot row */
-	rp = &A [pivot_row_start] ;
-	rp_end = rp + pivot_row_length ;
-	while (rp < rp_end)
-	{
-	    /* get a column */
-	    col = *rp++ ;
-	    assert (COL_IS_ALIVE (col) && col != pivot_col) ;
-	    hash = 0 ;
-	    cur_score = 0 ;
-	    cp = &A [Col [col].start] ;
-	    /* compact the column */
-	    new_cp = cp ;
-	    cp_end = cp + Col [col].length ;
-
-	    DEBUG2 (("Adding set diffs for Col: %d.\n", col)) ;
-
-	    while (cp < cp_end)
-	    {
-		/* get a row */
-		row = *cp++ ;
-		assert(row >= 0 && row < n_row) ;
-		row_mark = Row [row].shared2.mark ;
-		/* skip if dead */
-		if (ROW_IS_MARKED_DEAD (row_mark))
-		{
-		    continue ;
-		}
-		assert (row_mark > tag_mark) ;
-		/* compact the column */
-		*new_cp++ = row ;
-		/* compute hash function */
-		hash += row ;
-		/* add set difference */
-		cur_score += row_mark - tag_mark ;
-		/* integer overflow... */
-		cur_score = MIN (cur_score, n_col) ;
-	    }
-
-	    /* recompute the column's length */
-	    Col [col].length = (int) (new_cp - &A [Col [col].start]) ;
-
-	    /* === Further mass elimination ================================= */
-
-	    if (Col [col].length == 0)
-	    {
-		DEBUG1 (("further mass elimination. Col: %d\n", col)) ;
-		/* nothing left but the pivot row in this column */
-		KILL_PRINCIPAL_COL (col) ;
-		pivot_row_degree -= Col [col].shared1.thickness ;
-		assert (pivot_row_degree >= 0) ;
-		/* order it */
-		Col [col].shared2.order = k ;
-		/* increment order count by column thickness */
-		k += Col [col].shared1.thickness ;
-	    }
-	    else
-	    {
-		/* === Prepare for supercolumn detection ==================== */
-
-		DEBUG2 (("Preparing supercol detection for Col: %d.\n", col)) ;
-
-		/* save score so far */
-		Col [col].shared2.score = cur_score ;
-
-		/* add column to hash table, for supercolumn detection */
-		hash %= n_col + 1 ;
-
-		DEBUG2 ((" Hash = %d, n_col = %d.\n", hash, n_col)) ;
-		assert (hash <= n_col) ;
-
-		head_column = head [hash] ;
-		if (head_column > EMPTY)
-		{
-		    /* degree list "hash" is non-empty, use prev (shared3) of */
-		    /* first column in degree list as head of hash bucket */
-		    first_col = Col [head_column].shared3.headhash ;
-		    Col [head_column].shared3.headhash = col ;
-		}
-		else
-		{
-		    /* degree list "hash" is empty, use head as hash bucket */
-		    first_col = - (head_column + 2) ;
-		    head [hash] = - (col + 2) ;
-		}
-		Col [col].shared4.hash_next = first_col ;
-
-		/* save hash function in Col [col].shared3.hash */
-		Col [col].shared3.hash = (int) hash ;
-		assert (COL_IS_ALIVE (col)) ;
-	    }
-	}
-
-	/* The approximate external column degree is now computed.  */
-
-	/* === Supercolumn detection ======================================== */
-
-	DEBUG1 (("** Supercolumn detection phase. **\n")) ;
-
-	detect_super_cols (
-#ifndef NDEBUG
-		n_col, Row,
-#endif
-		Col, A, head, pivot_row_start, pivot_row_length) ;
-
-	/* === Kill the pivotal column ====================================== */
-
-	KILL_PRINCIPAL_COL (pivot_col) ;
-
-	/* === Clear mark =================================================== */
-
-	tag_mark += (max_deg + 1) ;
-	if (tag_mark >= max_mark)
-	{
-	    DEBUG1 (("clearing tag_mark\n")) ;
-	    tag_mark = clear_mark (n_row, Row) ;
-	}
-#ifndef NDEBUG
-	DEBUG3 (("check3\n")) ;
-	debug_mark (n_row, Row, tag_mark, max_mark) ;
-#endif
-
-	/* === Finalize the new pivot row, and column scores ================ */
-
-	DEBUG1 (("** Finalize scores phase. **\n")) ;
-
-	/* for each column in pivot row */
-	rp = &A [pivot_row_start] ;
-	/* compact the pivot row */
-	new_rp = rp ;
-	rp_end = rp + pivot_row_length ;
-	while (rp < rp_end)
-	{
-	    col = *rp++ ;
-	    /* skip dead columns */
-	    if (COL_IS_DEAD (col))
-	    {
-		continue ;
-	    }
-	    *new_rp++ = col ;
-	    /* add new pivot row to column */
-	    A [Col [col].start + (Col [col].length++)] = pivot_row ;
-
-	    /* retrieve score so far and add on pivot row's degree. */
-	    /* (we wait until here for this in case the pivot */
-	    /* row's degree was reduced due to mass elimination). */
-	    cur_score = Col [col].shared2.score + pivot_row_degree ;
-
-	    /* calculate the max possible score as the number of */
-	    /* external columns minus the 'k' value minus the */
-	    /* columns thickness */
-	    max_score = n_col - k - Col [col].shared1.thickness ;
-
-	    /* make the score the external degree of the union-of-rows */
-	    cur_score -= Col [col].shared1.thickness ;
-
-	    /* make sure score is less or equal than the max score */
-	    cur_score = MIN (cur_score, max_score) ;
-	    assert (cur_score >= 0) ;
-
-	    /* store updated score */
-	    Col [col].shared2.score = cur_score ;
-
-	    /* === Place column back in degree list ========================= */
-
-	    assert (min_score >= 0) ;
-	    assert (min_score <= n_col) ;
-	    assert (cur_score >= 0) ;
-	    assert (cur_score <= n_col) ;
-	    assert (head [cur_score] >= EMPTY) ;
-	    next_col = head [cur_score] ;
-	    Col [col].shared4.degree_next = next_col ;
-	    Col [col].shared3.prev = EMPTY ;
-	    if (next_col != EMPTY)
-	    {
-		Col [next_col].shared3.prev = col ;
-	    }
-	    head [cur_score] = col ;
-
-	    /* see if this score is less than current min */
-	    min_score = MIN (min_score, cur_score) ;
-
-	}
-
-#ifndef NDEBUG
-	debug_deg_lists (n_row, n_col, Row, Col, head,
-		min_score, n_col2-k, max_deg) ;
-#endif
-
-	/* === Resurrect the new pivot row ================================== */
-
-	if (pivot_row_degree > 0)
-	{
-	    /* update pivot row length to reflect any cols that were killed */
-	    /* during super-col detection and mass elimination */
-	    Row [pivot_row].start  = pivot_row_start ;
-	    Row [pivot_row].length = (int) (new_rp - &A[pivot_row_start]) ;
-	    Row [pivot_row].shared1.degree = pivot_row_degree ;
-	    Row [pivot_row].shared2.mark = 0 ;
-	    /* pivot row is no longer dead */
-	}
-    }
-
-    /* === All principal columns have now been ordered ====================== */
-
-    return (ngarbage) ;
-}
-
-
-/* ========================================================================== */
-/* === order_children ======================================================= */
-/* ========================================================================== */
-
-/*
-    The find_ordering routine has ordered all of the principal columns (the
-    representatives of the supercolumns).  The non-principal columns have not
-    yet been ordered.  This routine orders those columns by walking up the
-    parent tree (a column is a child of the column which absorbed it).  The
-    final permutation vector is then placed in p [0 ... n_col-1], with p [0]
-    being the first column, and p [n_col-1] being the last.  It doesn't look
-    like it at first glance, but be assured that this routine takes time linear
-    in the number of columns.  Although not immediately obvious, the time
-    taken by this routine is O (n_col), that is, linear in the number of
-    columns.  Not user-callable.
-*/
-
-PRIVATE void order_children
-(
-    /* === Parameters ======================================================= */
-
-    int n_col,			/* number of columns of A */
-    ColInfo Col [],		/* of size n_col+1 */
-    int p []			/* p [0 ... n_col-1] is the column permutation*/
-)
-{
-    /* === Local variables ================================================== */
-
-    int i ;			/* loop counter for all columns */
-    int c ;			/* column index */
-    int parent ;		/* index of column's parent */
-    int order ;			/* column's order */
-
-    /* === Order each non-principal column ================================== */
-
-    for (i = 0 ; i < n_col ; i++)
-    {
-	/* find an un-ordered non-principal column */
-	assert (COL_IS_DEAD (i)) ;
-	if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == EMPTY)
-	{
-	    parent = i ;
-	    /* once found, find its principal parent */
-	    do
-	    {
-		parent = Col [parent].shared1.parent ;
-	    } while (!COL_IS_DEAD_PRINCIPAL (parent)) ;
-
-	    /* now, order all un-ordered non-principal columns along path */
-	    /* to this parent.  collapse tree at the same time */
-	    c = i ;
-	    /* get order of parent */
-	    order = Col [parent].shared2.order ;
-
-	    do
-	    {
-		assert (Col [c].shared2.order == EMPTY) ;
-
-		/* order this column */
-		Col [c].shared2.order = order++ ;
-		/* collaps tree */
-		Col [c].shared1.parent = parent ;
-
-		/* get immediate parent of this column */
-		c = Col [c].shared1.parent ;
-
-		/* continue until we hit an ordered column.  There are */
-		/* guarranteed not to be anymore unordered columns */
-		/* above an ordered column */
-	    } while (Col [c].shared2.order == EMPTY) ;
-
-	    /* re-order the super_col parent to largest order for this group */
-	    Col [parent].shared2.order = order ;
-	}
-    }
-
-    /* === Generate the permutation ========================================= */
-
-    for (c = 0 ; c < n_col ; c++)
-    {
-	p [Col [c].shared2.order] = c ;
-    }
-}
-
-
-/* ========================================================================== */
-/* === detect_super_cols ==================================================== */
-/* ========================================================================== */
-
-/*
-    Detects supercolumns by finding matches between columns in the hash buckets.
-    Check amongst columns in the set A [row_start ... row_start + row_length-1].
-    The columns under consideration are currently *not* in the degree lists,
-    and have already been placed in the hash buckets.
-
-    The hash bucket for columns whose hash function is equal to h is stored
-    as follows:
-
-	if head [h] is >= 0, then head [h] contains a degree list, so:
-
-		head [h] is the first column in degree bucket h.
-		Col [head [h]].headhash gives the first column in hash bucket h.
-
-	otherwise, the degree list is empty, and:
-
-		-(head [h] + 2) is the first column in hash bucket h.
-
-    For a column c in a hash bucket, Col [c].shared3.prev is NOT a "previous
-    column" pointer.  Col [c].shared3.hash is used instead as the hash number
-    for that column.  The value of Col [c].shared4.hash_next is the next column
-    in the same hash bucket.
-
-    Assuming no, or "few" hash collisions, the time taken by this routine is
-    linear in the sum of the sizes (lengths) of each column whose score has
-    just been computed in the approximate degree computation.
-    Not user-callable.
-*/
-
-PRIVATE void detect_super_cols
-(
-    /* === Parameters ======================================================= */
-
-#ifndef NDEBUG
-    /* these two parameters are only needed when debugging is enabled: */
-    int n_col,			/* number of columns of A */
-    RowInfo Row [],		/* of size n_row+1 */
-#endif
-    ColInfo Col [],		/* of size n_col+1 */
-    int A [],			/* row indices of A */
-    int head [],		/* head of degree lists and hash buckets */
-    int row_start,		/* pointer to set of columns to check */
-    int row_length		/* number of columns to check */
-)
-{
-    /* === Local variables ================================================== */
-
-    int hash ;			/* hash # for a column */
-    int *rp ;			/* pointer to a row */
-    int c ;			/* a column index */
-    int super_c ;		/* column index of the column to absorb into */
-    int *cp1 ;			/* column pointer for column super_c */
-    int *cp2 ;			/* column pointer for column c */
-    int length ;		/* length of column super_c */
-    int prev_c ;		/* column preceding c in hash bucket */
-    int i ;			/* loop counter */
-    int *rp_end ;		/* pointer to the end of the row */
-    int col ;			/* a column index in the row to check */
-    int head_column ;		/* first column in hash bucket or degree list */
-    int first_col ;		/* first column in hash bucket */
-
-    /* === Consider each column in the row ================================== */
-
-    rp = &A [row_start] ;
-    rp_end = rp + row_length ;
-    while (rp < rp_end)
-    {
-	col = *rp++ ;
-	if (COL_IS_DEAD (col))
-	{
-	    continue ;
-	}
-
-	/* get hash number for this column */
-	hash = Col [col].shared3.hash ;
-	assert (hash <= n_col) ;
-
-	/* === Get the first column in this hash bucket ===================== */
-
-	head_column = head [hash] ;
-	if (head_column > EMPTY)
-	{
-	    first_col = Col [head_column].shared3.headhash ;
-	}
-	else
-	{
-	    first_col = - (head_column + 2) ;
-	}
-
-	/* === Consider each column in the hash bucket ====================== */
-
-	for (super_c = first_col ; super_c != EMPTY ;
-	    super_c = Col [super_c].shared4.hash_next)
-	{
-	    assert (COL_IS_ALIVE (super_c)) ;
-	    assert (Col [super_c].shared3.hash == hash) ;
-	    length = Col [super_c].length ;
-
-	    /* prev_c is the column preceding column c in the hash bucket */
-	    prev_c = super_c ;
-
-	    /* === Compare super_c with all columns after it ================ */
-
-	    for (c = Col [super_c].shared4.hash_next ;
-		 c != EMPTY ; c = Col [c].shared4.hash_next)
-	    {
-		assert (c != super_c) ;
-		assert (COL_IS_ALIVE (c)) ;
-		assert (Col [c].shared3.hash == hash) ;
-
-		/* not identical if lengths or scores are different */
-		if (Col [c].length != length ||
-		    Col [c].shared2.score != Col [super_c].shared2.score)
-		{
-		    prev_c = c ;
-		    continue ;
-		}
-
-		/* compare the two columns */
-		cp1 = &A [Col [super_c].start] ;
-		cp2 = &A [Col [c].start] ;
-
-		for (i = 0 ; i < length ; i++)
-		{
-		    /* the columns are "clean" (no dead rows) */
-		    assert (ROW_IS_ALIVE (*cp1))  ;
-		    assert (ROW_IS_ALIVE (*cp2))  ;
-		    /* row indices will same order for both supercols, */
-		    /* no gather scatter nessasary */
-		    if (*cp1++ != *cp2++)
-		    {
-			break ;
-		    }
-		}
-
-		/* the two columns are different if the for-loop "broke" */
-		if (i != length)
-		{
-		    prev_c = c ;
-		    continue ;
-		}
-
-		/* === Got it!  two columns are identical =================== */
-
-		assert (Col [c].shared2.score == Col [super_c].shared2.score) ;
-
-		Col [super_c].shared1.thickness += Col [c].shared1.thickness ;
-		Col [c].shared1.parent = super_c ;
-		KILL_NON_PRINCIPAL_COL (c) ;
-		/* order c later, in order_children() */
-		Col [c].shared2.order = EMPTY ;
-		/* remove c from hash bucket */
-		Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ;
-	    }
-	}
-
-	/* === Empty this hash bucket ======================================= */
-
-	if (head_column > EMPTY)
-	{
-	    /* corresponding degree list "hash" is not empty */
-	    Col [head_column].shared3.headhash = EMPTY ;
-	}
-	else
-	{
-	    /* corresponding degree list "hash" is empty */
-	    head [hash] = EMPTY ;
-	}
-    }
-}
-
-
-/* ========================================================================== */
-/* === garbage_collection =================================================== */
-/* ========================================================================== */
-
-/*
-    Defragments and compacts columns and rows in the workspace A.  Used when
-    all avaliable memory has been used while performing row merging.  Returns
-    the index of the first free position in A, after garbage collection.  The
-    time taken by this routine is linear is the size of the array A, which is
-    itself linear in the number of nonzeros in the input matrix.
-    Not user-callable.
-*/
-
-PRIVATE int garbage_collection  /* returns the new value of pfree */
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,			/* number of rows */
-    int n_col,			/* number of columns */
-    RowInfo Row [],		/* row info */
-    ColInfo Col [],		/* column info */
-    int A [],			/* A [0 ... Alen-1] holds the matrix */
-    int *pfree			/* &A [0] ... pfree is in use */
-)
-{
-    /* === Local variables ================================================== */
-
-    int *psrc ;			/* source pointer */
-    int *pdest ;		/* destination pointer */
-    int j ;			/* counter */
-    int r ;			/* a row index */
-    int c ;			/* a column index */
-    int length ;		/* length of a row or column */
-
-#ifndef NDEBUG
-    int debug_rows ;
-    DEBUG0 (("Defrag..\n")) ;
-    for (psrc = &A[0] ; psrc < pfree ; psrc++) assert (*psrc >= 0) ;
-    debug_rows = 0 ;
-#endif
-
-    /* === Defragment the columns =========================================== */
-
-    pdest = &A[0] ;
-    for (c = 0 ; c < n_col ; c++)
-    {
-	if (COL_IS_ALIVE (c))
-	{
-	    psrc = &A [Col [c].start] ;
-
-	    /* move and compact the column */
-	    assert (pdest <= psrc) ;
-	    Col [c].start = (int) (pdest - &A [0]) ;
-	    length = Col [c].length ;
-	    for (j = 0 ; j < length ; j++)
-	    {
-		r = *psrc++ ;
-		if (ROW_IS_ALIVE (r))
-		{
-		    *pdest++ = r ;
-		}
-	    }
-	    Col [c].length = (int) (pdest - &A [Col [c].start]) ;
-	}
-    }
-
-    /* === Prepare to defragment the rows =================================== */
-
-    for (r = 0 ; r < n_row ; r++)
-    {
-	if (ROW_IS_ALIVE (r))
-	{
-	    if (Row [r].length == 0)
-	    {
-		/* this row is of zero length.  cannot compact it, so kill it */
-		DEBUG0 (("Defrag row kill\n")) ;
-		KILL_ROW (r) ;
-	    }
-	    else
-	    {
-		/* save first column index in Row [r].shared2.first_column */
-		psrc = &A [Row [r].start] ;
-		Row [r].shared2.first_column = *psrc ;
-		assert (ROW_IS_ALIVE (r)) ;
-		/* flag the start of the row with the one's complement of row */
-		*psrc = ONES_COMPLEMENT (r) ;
-#ifndef NDEBUG
-		debug_rows++ ;
-#endif
-	    }
-	}
-    }
-
-    /* === Defragment the rows ============================================== */
-
-    psrc = pdest ;
-    while (psrc < pfree)
-    {
-	/* find a negative number ... the start of a row */
-	if (*psrc++ < 0)
-	{
-	    psrc-- ;
-	    /* get the row index */
-	    r = ONES_COMPLEMENT (*psrc) ;
-	    assert (r >= 0 && r < n_row) ;
-	    /* restore first column index */
-	    *psrc = Row [r].shared2.first_column ;
-	    assert (ROW_IS_ALIVE (r)) ;
-
-	    /* move and compact the row */
-	    assert (pdest <= psrc) ;
-	    Row [r].start = (int) (pdest - &A [0]) ;
-	    length = Row [r].length ;
-	    for (j = 0 ; j < length ; j++)
-	    {
-		c = *psrc++ ;
-		if (COL_IS_ALIVE (c))
-		{
-		    *pdest++ = c ;
-		}
-	    }
-	    Row [r].length = (int) (pdest - &A [Row [r].start]) ;
-#ifndef NDEBUG
-	    debug_rows-- ;
-#endif
-	}
-    }
-    /* ensure we found all the rows */
-    assert (debug_rows == 0) ;
-
-    /* === Return the new value of pfree ==================================== */
-
-    return ((int) (pdest - &A [0])) ;
-}
-
-
-/* ========================================================================== */
-/* === clear_mark =========================================================== */
-/* ========================================================================== */
-
-/*
-    Clears the Row [].shared2.mark array, and returns the new tag_mark.
-    Return value is the new tag_mark.  Not user-callable.
-*/
-
-PRIVATE int clear_mark	/* return the new value for tag_mark */
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,		/* number of rows in A */
-    RowInfo Row []	/* Row [0 ... n_row-1].shared2.mark is set to zero */
-)
-{
-    /* === Local variables ================================================== */
-
-    int r ;
-
-    DEBUG0 (("Clear mark\n")) ;
-    for (r = 0 ; r < n_row ; r++)
-    {
-	if (ROW_IS_ALIVE (r))
-	{
-	    Row [r].shared2.mark = 0 ;
-	}
-    }
-    return (1) ;
-}
-
-
-/* ========================================================================== */
-/* === debugging routines =================================================== */
-/* ========================================================================== */
-
-/* When debugging is disabled, the remainder of this file is ignored. */
-
-#ifndef NDEBUG
-
-
-/* ========================================================================== */
-/* === debug_structures ===================================================== */
-/* ========================================================================== */
-
-/*
-    At this point, all empty rows and columns are dead.  All live columns
-    are "clean" (containing no dead rows) and simplicial (no supercolumns
-    yet).  Rows may contain dead columns, but all live rows contain at
-    least one live column.
-*/
-
-PRIVATE void debug_structures
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,
-    int n_col,
-    RowInfo Row [],
-    ColInfo Col [],
-    int A [],
-    int n_col2
-)
-{
-    /* === Local variables ================================================== */
-
-    int i ;
-    int c ;
-    int *cp ;
-    int *cp_end ;
-    int len ;
-    int score ;
-    int r ;
-    int *rp ;
-    int *rp_end ;
-    int deg ;
-
-    /* === Check A, Row, and Col ============================================ */
-
-    for (c = 0 ; c < n_col ; c++)
-    {
-	if (COL_IS_ALIVE (c))
-	{
-	    len = Col [c].length ;
-	    score = Col [c].shared2.score ;
-	    DEBUG4 (("initial live col %5d %5d %5d\n", c, len, score)) ;
-	    assert (len > 0) ;
-	    assert (score >= 0) ;
-	    assert (Col [c].shared1.thickness == 1) ;
-	    cp = &A [Col [c].start] ;
-	    cp_end = cp + len ;
-	    while (cp < cp_end)
-	    {
-		r = *cp++ ;
-		assert (ROW_IS_ALIVE (r)) ;
-	    }
-	}
-	else
-	{
-	    i = Col [c].shared2.order ;
-	    assert (i >= n_col2 && i < n_col) ;
-	}
-    }
-
-    for (r = 0 ; r < n_row ; r++)
-    {
-	if (ROW_IS_ALIVE (r))
-	{
-	    i = 0 ;
-	    len = Row [r].length ;
-	    deg = Row [r].shared1.degree ;
-	    assert (len > 0) ;
-	    assert (deg > 0) ;
-	    rp = &A [Row [r].start] ;
-	    rp_end = rp + len ;
-	    while (rp < rp_end)
-	    {
-		c = *rp++ ;
-		if (COL_IS_ALIVE (c))
-		{
-		    i++ ;
-		}
-	    }
-	    assert (i > 0) ;
-	}
-    }
-}
-
-
-/* ========================================================================== */
-/* === debug_deg_lists ====================================================== */
-/* ========================================================================== */
-
-/*
-    Prints the contents of the degree lists.  Counts the number of columns
-    in the degree list and compares it to the total it should have.  Also
-    checks the row degrees.
-*/
-
-PRIVATE void debug_deg_lists
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,
-    int n_col,
-    RowInfo Row [],
-    ColInfo Col [],
-    int head [],
-    int min_score,
-    int should,
-    int max_deg
-)
-{
-    /* === Local variables ================================================== */
-
-    int deg ;
-    int col ;
-    int have ;
-    int row ;
-
-    /* === Check the degree lists =========================================== */
-
-    if (n_col > 10000 && debug_colamd <= 0)
-    {
-	return ;
-    }
-    have = 0 ;
-    DEBUG4 (("Degree lists: %d\n", min_score)) ;
-    for (deg = 0 ; deg <= n_col ; deg++)
-    {
-	col = head [deg] ;
-	if (col == EMPTY)
-	{
-	    continue ;
-	}
-	DEBUG4 (("%d:", deg)) ;
-	while (col != EMPTY)
-	{
-	    DEBUG4 ((" %d", col)) ;
-	    have += Col [col].shared1.thickness ;
-	    assert (COL_IS_ALIVE (col)) ;
-	    col = Col [col].shared4.degree_next ;
-	}
-	DEBUG4 (("\n")) ;
-    }
-    DEBUG4 (("should %d have %d\n", should, have)) ;
-    assert (should == have) ;
-
-    /* === Check the row degrees ============================================ */
-
-    if (n_row > 10000 && debug_colamd <= 0)
-    {
-	return ;
-    }
-    for (row = 0 ; row < n_row ; row++)
-    {
-	if (ROW_IS_ALIVE (row))
-	{
-	    assert (Row [row].shared1.degree <= max_deg) ;
-	}
-    }
-}
-
-
-/* ========================================================================== */
-/* === debug_mark =========================================================== */
-/* ========================================================================== */
-
-/*
-    Ensures that the tag_mark is less that the maximum and also ensures that
-    each entry in the mark array is less than the tag mark.
-*/
-
-PRIVATE void debug_mark
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,
-    RowInfo Row [],
-    int tag_mark,
-    int max_mark
-)
-{
-    /* === Local variables ================================================== */
-
-    int r ;
-
-    /* === Check the Row marks ============================================== */
-
-    assert (tag_mark > 0 && tag_mark <= max_mark) ;
-    if (n_row > 10000 && debug_colamd <= 0)
-    {
-	return ;
-    }
-    for (r = 0 ; r < n_row ; r++)
-    {
-	assert (Row [r].shared2.mark < tag_mark) ;
-    }
-}
-
-
-/* ========================================================================== */
-/* === debug_matrix ========================================================= */
-/* ========================================================================== */
-
-/*
-    Prints out the contents of the columns and the rows.
-*/
-
-PRIVATE void debug_matrix
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,
-    int n_col,
-    RowInfo Row [],
-    ColInfo Col [],
-    int A []
-)
-{
-    /* === Local variables ================================================== */
-
-    int r ;
-    int c ;
-    int *rp ;
-    int *rp_end ;
-    int *cp ;
-    int *cp_end ;
-
-    /* === Dump the rows and columns of the matrix ========================== */
-
-    if (debug_colamd < 3)
-    {
-	return ;
-    }
-    DEBUG3 (("DUMP MATRIX:\n")) ;
-    for (r = 0 ; r < n_row ; r++)
-    {
-	DEBUG3 (("Row %d alive? %d\n", r, ROW_IS_ALIVE (r))) ;
-	if (ROW_IS_DEAD (r))
-	{
-	    continue ;
-	}
-	DEBUG3 (("start %d length %d degree %d\n",
-		Row [r].start, Row [r].length, Row [r].shared1.degree)) ;
-	rp = &A [Row [r].start] ;
-	rp_end = rp + Row [r].length ;
-	while (rp < rp_end)
-	{
-	    c = *rp++ ;
-	    DEBUG3 (("	%d col %d\n", COL_IS_ALIVE (c), c)) ;
-	}
-    }
-
-    for (c = 0 ; c < n_col ; c++)
-    {
-	DEBUG3 (("Col %d alive? %d\n", c, COL_IS_ALIVE (c))) ;
-	if (COL_IS_DEAD (c))
-	{
-	    continue ;
-	}
-	DEBUG3 (("start %d length %d shared1 %d shared2 %d\n",
-		Col [c].start, Col [c].length,
-		Col [c].shared1.thickness, Col [c].shared2.score)) ;
-	cp = &A [Col [c].start] ;
-	cp_end = cp + Col [c].length ;
-	while (cp < cp_end)
-	{
-	    r = *cp++ ;
-	    DEBUG3 (("	%d row %d\n", ROW_IS_ALIVE (r), r)) ;
-	}
-    }
-}
-
-#endif
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/old_colamd.h hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/old_colamd.h
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/old_colamd.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/old_colamd.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,73 +0,0 @@
-/* ========================================================================== */
-/* === colamd prototypes and definitions ==================================== */
-/* ========================================================================== */
-
-/*
-    This is the colamd include file,
-
-	http://www.cise.ufl.edu/~davis/colamd/colamd.h
-
-    for use in the colamd.c, colamdmex.c, and symamdmex.c files located at
-
-	http://www.cise.ufl.edu/~davis/colamd/
-
-    See those files for a description of colamd and symamd, and for the
-    copyright notice, which also applies to this file.
-
-    August 3, 1998.  Version 1.0.
-*/
-
-/* ========================================================================== */
-/* === Definitions ========================================================== */
-/* ========================================================================== */
-
-/* size of the knobs [ ] array.  Only knobs [0..1] are currently used. */
-#define COLAMD_KNOBS 20
-
-/* number of output statistics.  Only A [0..2] are currently used. */
-#define COLAMD_STATS 20
-
-/* knobs [0] and A [0]: dense row knob and output statistic. */
-#define COLAMD_DENSE_ROW 0
-
-/* knobs [1] and A [1]: dense column knob and output statistic. */
-#define COLAMD_DENSE_COL 1
-
-/* A [2]: memory defragmentation count output statistic */
-#define COLAMD_DEFRAG_COUNT 2
-
-/* A [3]: whether or not the input columns were jumbled or had duplicates */
-#define COLAMD_JUMBLED_COLS 3
-
-/* ========================================================================== */
-/* === Prototypes of user-callable routines ================================= */
-/* ========================================================================== */
-
-#ifdef _CRAY
-#define int short
-#elif defined (_LONGINT)
-#define int long
-#endif
-
-int colamd_recommended		/* returns recommended value of Alen */
-(
-    int nnz,			/* nonzeros in A */
-    int n_row,			/* number of rows in A */
-    int n_col			/* number of columns in A */
-) ;
-
-void colamd_set_defaults	/* sets default parameters */
-(				/* knobs argument is modified on output */
-    double knobs [COLAMD_KNOBS]	/* parameter settings for colamd */
-) ;
-
-int colamd			/* returns TRUE if successful, FALSE otherwise*/
-(				/* A and p arguments are modified on output */
-    int n_row,			/* number of rows in A */
-    int n_col,			/* number of columns in A */
-    int Alen,			/* size of the array A */
-    int A [],			/* row indices of A, of size Alen */
-    int p [],			/* column pointers of A, of size n_col+1 */
-    double knobs [COLAMD_KNOBS]	/* parameter settings for colamd */
-) ;
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pddistribute.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pddistribute.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pddistribute.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pddistribute.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,1044 +0,0 @@
-
-
-#include "superlu_ddefs.h"
-
-int_t
-dReDistribute_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct,
-                Glu_freeable_t *Glu_freeable, int_t *xsup, int_t *supno,
-                gridinfo_t *grid, int_t *colptr[], int_t *rowind[],
-                double *a[])
-{
-/*
- * -- Distributed SuperLU routine (version 2.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * March 15, 2003
- *
- * Purpose
- * =======
- *   Re-distribute A on the 2D process mesh.
- * 
- * Arguments
- * =========
- * 
- * A      (input) SuperMatrix*
- *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
- *        A may be overwritten by diag(R)*A*diag(C)*Pc^T.
- *        The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
- *
- * ScalePermstruct (input) ScalePermstruct_t*
- *        The data structure to store the scaling and permutation vectors
- *        describing the transformations performed to the original matrix A.
- *
- * Glu_freeable (input) *Glu_freeable_t
- *        The global structure describing the graph of L and U.
- * 
- * grid   (input) gridinfo_t*
- *        The 2D process mesh.
- *
- * colptr (output) int*
- *
- * rowind (output) int*
- *
- * a      (output) double*
- *
- * Return value
- * ============
- *
- */
-    NRformat_loc *Astore;
-    int_t  *perm_r; /* row permutation vector */
-    int_t  *perm_c; /* column permutation vector */
-    int_t  i, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize;
-    int_t  nnz_loc;    /* number of local nonzeros */
-    int_t  nnz_remote; /* number of remote nonzeros to be sent */
-    int_t  SendCnt; /* number of remote nonzeros to be sent */
-    int_t  RecvCnt; /* number of remote nonzeros to be sent */
-    int_t  *nnzToSend, *nnzToRecv, maxnnzToRecv;
-    int_t  *ia, *ja, **ia_send, *index, *itemp;
-    int_t  *ptr_to_send;
-    double *aij, **aij_send, *nzval, *dtemp;
-    double *nzval_a;
-    int    iam, it, p, procs;
-    MPI_Request *send_req;
-    MPI_Status  status;
-    
-
-    /* ------------------------------------------------------------
-       INITIALIZATION.
-       ------------------------------------------------------------*/
-    iam = grid->iam;
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter dReDistribute_A()");
-#endif
-    perm_r = ScalePermstruct->perm_r;
-    perm_c = ScalePermstruct->perm_c;
-    procs = grid->nprow * grid->npcol;
-    Astore = (NRformat_loc *) A->Store;
-    n = A->ncol;
-    m_loc = Astore->m_loc;
-    fst_row = Astore->fst_row;
-    nnzToRecv = intCalloc_dist(2*procs);
-    nnzToSend = nnzToRecv + procs;
-
-
-    /* ------------------------------------------------------------
-       COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
-       THEN ALLOCATE SPACE.
-       THIS ACCOUNTS FOR THE FIRST PASS OF A.
-       ------------------------------------------------------------*/
-    for (i = 0; i < m_loc; ++i) {
-        for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
-  	    irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
-	    jcol = Astore->colind[j];
-	    gbi = BlockNum( irow );
-	    gbj = BlockNum( jcol );
-	    p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
-	    ++nnzToSend[p]; 
-	}
-    }
-
-    /* All-to-all communication */
-    MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t,
-		  grid->comm);
-
-    maxnnzToRecv = 0;
-    nnz_loc = SendCnt = RecvCnt = 0;
-
-    for (p = 0; p < procs; ++p) {
-	if ( p != iam ) {
-	    SendCnt += nnzToSend[p];
-	    RecvCnt += nnzToRecv[p];
-	    maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv );
-	} else {
-	    nnz_loc += nnzToRecv[p];
-	    /*assert(nnzToSend[p] == nnzToRecv[p]);*/
-	}
-    }
-    k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */
-
-    /* Allocate space for storing the triplets after redistribution. */
-    if ( k ) { /* count can be zero. */
-        if ( !(ia = intMalloc_dist(2*k)) )
-            ABORT("Malloc fails for ia[].");
-        if ( !(aij = doubleMalloc_dist(k)) )
-            ABORT("Malloc fails for aij[].");
-    }
-    ja = ia + k;
-
-    /* Allocate temporary storage for sending/receiving the A triplets. */
-    if ( procs > 1 ) {
-      if ( !(send_req = (MPI_Request *)
-	     SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) )
-	ABORT("Malloc fails for send_req[].");
-      if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) )
-        ABORT("Malloc fails for ia_send[].");
-      if ( !(aij_send = (double **)SUPERLU_MALLOC(procs*sizeof(double*))) )
-        ABORT("Malloc fails for aij_send[].");
-      if ( SendCnt ) { /* count can be zero */
-          if ( !(index = intMalloc_dist(2*SendCnt)) )
-              ABORT("Malloc fails for index[].");
-          if ( !(nzval = doubleMalloc_dist(SendCnt)) )
-              ABORT("Malloc fails for nzval[].");
-      }
-      if ( !(ptr_to_send = intCalloc_dist(procs)) )
-        ABORT("Malloc fails for ptr_to_send[].");
-      if ( maxnnzToRecv ) { /* count can be zero */
-          if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) )
-              ABORT("Malloc fails for itemp[].");
-          if ( !(dtemp = doubleMalloc_dist(maxnnzToRecv)) )
-              ABORT("Malloc fails for dtemp[].");
-      }
-
-      for (i = 0, j = 0, p = 0; p < procs; ++p) {
-          if ( p != iam ) {
-	      ia_send[p] = &index[i];
-	      i += 2 * nnzToSend[p]; /* ia/ja indices alternate */
-	      aij_send[p] = &nzval[j];
-	      j += nnzToSend[p];
-	  }
-      }
-    } /* if procs > 1 */
-      
-    if ( !(*colptr = intCalloc_dist(n+1)) )
-        ABORT("Malloc fails for *colptr[].");
-
-    /* ------------------------------------------------------------
-       LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND.
-       THIS ACCOUNTS FOR THE SECOND PASS OF A.
-       ------------------------------------------------------------*/
-    nnz_loc = 0; /* Reset the local nonzero count. */
-    nzval_a = Astore->nzval;
-    for (i = 0; i < m_loc; ++i) {
-        for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
-  	    irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
-	    jcol = Astore->colind[j];
-	    gbi = BlockNum( irow );
-	    gbj = BlockNum( jcol );
-	    p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
-
-	    if ( p != iam ) { /* remote */
-	        k = ptr_to_send[p];
-	        ia_send[p][k] = irow;
-	        ia_send[p][k + nnzToSend[p]] = jcol;
-		aij_send[p][k] = nzval_a[j];
-		++ptr_to_send[p]; 
-	    } else {          /* local */
-	        ia[nnz_loc] = irow;
-	        ja[nnz_loc] = jcol;
-		aij[nnz_loc] = nzval_a[j];
-		++nnz_loc;
-		++(*colptr)[jcol]; /* Count nonzeros in each column */
-	    }
-	}
-    }
-
-    /* ------------------------------------------------------------
-       PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION.
-       NOTE: Can possibly use MPI_Alltoallv.
-       ------------------------------------------------------------*/
-    for (p = 0; p < procs; ++p) {
-        if ( p != iam ) {
-	    it = 2*nnzToSend[p];
-	    MPI_Isend( ia_send[p], it, mpi_int_t,
-		       p, iam, grid->comm, &send_req[p] );
-	    it = nnzToSend[p];
-	    MPI_Isend( aij_send[p], it, MPI_DOUBLE,
-	               p, iam+procs, grid->comm, &send_req[procs+p] ); 
-	}
-    }
-
-    for (p = 0; p < procs; ++p) {
-        if ( p != iam ) {
-	    it = 2*nnzToRecv[p];
-	    MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); 
-	    it = nnzToRecv[p];
-            MPI_Recv( dtemp, it, MPI_DOUBLE, p, p+procs,
-		      grid->comm, &status );
-	    for (i = 0; i < nnzToRecv[p]; ++i) {
-	        ia[nnz_loc] = itemp[i];
-		jcol = itemp[i + nnzToRecv[p]];
-		/*assert(jcol<n);*/
-	        ja[nnz_loc] = jcol;
-		aij[nnz_loc] = dtemp[i];
-		++nnz_loc;
-		++(*colptr)[jcol]; /* Count nonzeros in each column */ 
-	    }
-	}
-    }
-
-    for (p = 0; p < procs; ++p) {
-        if ( p != iam ) {
-	    MPI_Wait( &send_req[p], &status);
-	    MPI_Wait( &send_req[procs+p], &status);
-	}
-    }
-
-    /* ------------------------------------------------------------
-       DEALLOCATE TEMPORARY STORAGE
-       ------------------------------------------------------------*/
-
-    SUPERLU_FREE(nnzToRecv);
-
-    if ( procs > 1 ) {
-	SUPERLU_FREE(send_req);
-	SUPERLU_FREE(ia_send);
-	SUPERLU_FREE(aij_send);
-	if ( SendCnt ) {
-            SUPERLU_FREE(index);
-            SUPERLU_FREE(nzval);
-        }
-	SUPERLU_FREE(ptr_to_send);
-        if ( maxnnzToRecv ) {
-            SUPERLU_FREE(itemp);
-            SUPERLU_FREE(dtemp);
-        }
-    }
-
-    /* ------------------------------------------------------------
-       CONVERT THE TRIPLET FORMAT INTO THE CCS FORMAT.
-       ------------------------------------------------------------*/
-    if ( nnz_loc ) { /* nnz_loc can be zero */
-        if ( !(*rowind = intMalloc_dist(nnz_loc)) )
-            ABORT("Malloc fails for *rowind[].");
-        if ( !(*a = doubleMalloc_dist(nnz_loc)) )
-            ABORT("Malloc fails for *a[].");
-    }
-
-    /* Initialize the array of column pointers */
-    k = 0;
-    jsize = (*colptr)[0];
-    (*colptr)[0] = 0;
-    for (j = 1; j < n; ++j) {
-	k += jsize;
-	jsize = (*colptr)[j];
-	(*colptr)[j] = k;
-    }
-    
-    /* Copy the triplets into the column oriented storage */
-    for (i = 0; i < nnz_loc; ++i) {
-	j = ja[i];
-	k = (*colptr)[j];
-	(*rowind)[k] = ia[i];
-	(*a)[k] = aij[i];
-	++(*colptr)[j];
-    }
-
-    /* Reset the column pointers to the beginning of each column */
-    for (j = n; j > 0; --j) (*colptr)[j] = (*colptr)[j-1];
-    (*colptr)[0] = 0;
-
-    if ( nnz_loc ) {
-        SUPERLU_FREE(ia);
-        SUPERLU_FREE(aij);
-    }
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit dReDistribute_A()");
-#endif
- 
-} /* dReDistribute_A */
-
-int_t
-pddistribute(fact_t fact, int_t n, SuperMatrix *A,
-	     ScalePermstruct_t *ScalePermstruct,
-	     Glu_freeable_t *Glu_freeable, LUstruct_t *LUstruct,
-	     gridinfo_t *grid)
-/*
- * -- Distributed SuperLU routine (version 2.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * March 15, 2003
- *
- *
- * Purpose
- * =======
- *   Distribute the matrix onto the 2D process mesh.
- * 
- * Arguments
- * =========
- * 
- * fact (input) fact_t
- *        Specifies whether or not the L and U structures will be re-used.
- *        = SamePattern_SameRowPerm: L and U structures are input, and
- *                                   unchanged on exit.
- *        = DOFACT or SamePattern: L and U structures are computed and output.
- *
- * n      (input) int
- *        Dimension of the matrix.
- *
- * A      (input) SuperMatrix*
- *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
- *        A may be overwritten by diag(R)*A*diag(C)*Pc^T. The type of A can be:
- *        Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
- *
- * ScalePermstruct (input) ScalePermstruct_t*
- *        The data structure to store the scaling and permutation vectors
- *        describing the transformations performed to the original matrix A.
- *
- * Glu_freeable (input) *Glu_freeable_t
- *        The global structure describing the graph of L and U.
- * 
- * LUstruct (input) LUstruct_t*
- *        Data structures for L and U factors.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh.
- *
- * Return value
- * ============
- *   > 0, working storage required (in bytes).
- *
- */
-{
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-    int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, jb, jj, k, 
-          len, len1, nsupc;
-    int_t ljb;  /* local block column number */
-    int_t nrbl; /* number of L blocks in current block column */
-    int_t nrbu; /* number of U blocks in current block column */
-    int_t gb;   /* global block number; 0 < gb <= nsuper */
-    int_t lb;   /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
-    int iam, jbrow, kcol, mycol, myrow, pc, pr;
-    int_t mybufmax[NBUFFERS];
-    NRformat_loc *Astore;
-    double *a;
-    int_t *asub, *xa;
-    int_t *xsup = Glu_persist->xsup;    /* supernode and column mapping */
-    int_t *supno = Glu_persist->supno;   
-    int_t *lsub, *xlsub, *usub, *xusub;
-    int_t nsupers;
-    int_t next_lind;      /* next available position in index[*] */
-    int_t next_lval;      /* next available position in nzval[*] */
-    int_t *index;         /* indices consist of headers and row subscripts */
-    double *lusup, *uval; /* nonzero values in L and U */
-    double **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
-    int_t  **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
-    double **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
-    int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
-
-    /*-- Counts to be used in factorization. --*/
-    int_t  *ToRecv, *ToSendD, **ToSendR;
-
-    /*-- Counts to be used in lower triangular solve. --*/
-    int_t  *fmod;          /* Modification count for L-solve.        */
-    int_t  **fsendx_plist; /* Column process list to send down Xk.   */
-    int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
-    int_t  nfsendx = 0;    /* Number of Xk I will send               */
-    int_t  kseen;
-
-    /*-- Counts to be used in upper triangular solve. --*/
-    int_t  *bmod;          /* Modification count for U-solve.        */
-    int_t  **bsendx_plist; /* Column process list to send down Xk.   */
-    int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
-    int_t  nbsendx = 0;    /* Number of Xk I will send               */
-    int_t  *ilsum;         /* starting position of each supernode in 
-			      the full array (local)                 */
-
-    /*-- Auxiliary arrays; freed on return --*/
-    int_t *rb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
-    int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr)             */
-    int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr)      */
-    int_t *Urb_fstnz;  /* # of fstnz in a block row; size ceil(NSUPERS/Pr)  */
-    int_t *Ucbs;       /* number of column blocks in a block row            */
-    int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr)             */
-    int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr)        */
-    int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr)      */
-    int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr)      */
-    double *dense, *dense_col; /* SPA */
-    double zero = 0.0;
-    int_t ldaspa;     /* LDA of SPA */
-    int_t mem_use = 0, iword, dword;
-
-#if ( PRNTlevel>=1 )
-    int_t nLblocks = 0, nUblocks = 0;
-#endif
-#if ( PROFlevel>=1 ) 
-    double t, t_u, t_l;
-    int_t u_blks;
-#endif
-
-    /* Initialization. */
-    iam = grid->iam;
-    myrow = MYROW( iam, grid );
-    mycol = MYCOL( iam, grid );
-    for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0;
-    nsupers  = supno[n-1] + 1;
-    Astore   = (NRformat_loc *) A->Store;
-
-#if ( PRNTlevel>=1 )
-    iword = sizeof(int_t);
-    dword = sizeof(double);
-#endif
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pddistribute()");
-#endif
-#if ( PROFlevel>=1 )
-    t = SuperLU_timer_();
-#endif
-
-    dReDistribute_A(A, ScalePermstruct, Glu_freeable, xsup, supno,
-		      grid, &xa, &asub, &a);
-
-#if ( PROFlevel>=1 )
-    t = SuperLU_timer_() - t;
-    if ( !iam ) printf("--------\n"
-		       ".. Phase 1 - ReDistribute_A time: %.2f\t\n", t);
-#endif
-
-    if ( fact == SamePattern_SameRowPerm ) {
-
-#if ( PROFlevel>=1 )
-	t_l = t_u = 0; u_blks = 0;
-#endif
-	/* We can propagate the new values of A into the existing
-	   L and U data structures.            */
-	ilsum = Llu->ilsum;
-	ldaspa = Llu->ldalsum;
-	if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) )
-	    ABORT("Calloc fails for SPA dense[].");
-	nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */
-	if ( !(Urb_length = intCalloc_dist(nrbu)) )
-	    ABORT("Calloc fails for Urb_length[].");
-	if ( !(Urb_indptr = intMalloc_dist(nrbu)) )
-	    ABORT("Malloc fails for Urb_indptr[].");
-	Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-	Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
-	Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
-	Unzval_br_ptr = Llu->Unzval_br_ptr;
-#if ( PRNTlevel>=1 )
-	mem_use += 2*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword;
-#endif
-#if ( PROFlevel>=1 )
-	t = SuperLU_timer_();
-#endif
-
-	/* Initialize Uval to zero. */
-	for (lb = 0; lb < nrbu; ++lb) {
-	    Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
-	    index = Ufstnz_br_ptr[lb];
-	    if ( index ) {
-		uval = Unzval_br_ptr[lb];
-		len = index[1];
-		for (i = 0; i < len; ++i) uval[i] = zero;
-	    } /* if index != NULL */
-	} /* for lb ... */
-
-	for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */
-	    pc = PCOL( jb, grid );
-	    if ( mycol == pc ) { /* Block column jb in my process column */
-		fsupc = FstBlockC( jb );
-		nsupc = SuperSize( jb );
-
- 		/* Scatter A into SPA (for L), or into U directly. */
-		for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) {
-		    for (i = xa[j]; i < xa[j+1]; ++i) {
-			irow = asub[i];
-			gb = BlockNum( irow );
-			if ( myrow == PROW( gb, grid ) ) {
-			    lb = LBi( gb, grid );
- 			    if ( gb < jb ) { /* in U */
- 				index = Ufstnz_br_ptr[lb];
- 				uval = Unzval_br_ptr[lb];
- 				while (  (k = index[Urb_indptr[lb]]) < jb ) {
- 				    /* Skip nonzero values in this block */
- 				    Urb_length[lb] += index[Urb_indptr[lb]+1];
- 				    /* Move pointer to the next block */
- 				    Urb_indptr[lb] += UB_DESCRIPTOR
- 					+ SuperSize( k );
- 				}
- 				/*assert(k == jb);*/
- 				/* start fstnz */
- 				istart = Urb_indptr[lb] + UB_DESCRIPTOR;
- 				len = Urb_length[lb];
- 				fsupc1 = FstBlockC( gb+1 );
- 				k = j - fsupc;
- 				/* Sum the lengths of the leading columns */
- 				for (jj = 0; jj < k; ++jj)
-				    len += fsupc1 - index[istart++];
-				/*assert(irow>=index[istart]);*/
-				uval[len + irow - index[istart]] = a[i];
-			    } else { /* in L; put in SPA first */
-  				irow = ilsum[lb] + irow - FstBlockC( gb );
-  				dense_col[irow] = a[i];
-  			    }
-  			}
-		    } /* for i ... */
-  		    dense_col += ldaspa;
-		} /* for j ... */
-
-#if ( PROFlevel>=1 )
-		t_u += SuperLU_timer_() - t;
-		t = SuperLU_timer_();
-#endif
-
-		/* Gather the values of A from SPA into Lnzval[]. */
-		ljb = LBj( jb, grid ); /* Local block number */
-		index = Lrowind_bc_ptr[ljb];
-		if ( index ) {
-		    nrbl = index[0];   /* Number of row blocks. */
-		    len = index[1];    /* LDA of lusup[]. */
-		    lusup = Lnzval_bc_ptr[ljb];
-		    next_lind = BC_HEADER;
-		    next_lval = 0;
-		    for (jj = 0; jj < nrbl; ++jj) {
-			gb = index[next_lind++];
-			len1 = index[next_lind++]; /* Rows in the block. */
-			lb = LBi( gb, grid );
-			for (bnnz = 0; bnnz < len1; ++bnnz) {
-			    irow = index[next_lind++]; /* Global index. */
-			    irow = ilsum[lb] + irow - FstBlockC( gb );
-			    k = next_lval++;
-			    for (j = 0, dense_col = dense; j < nsupc; ++j) {
-				lusup[k] = dense_col[irow];
-				dense_col[irow] = zero;
-				k += len;
-				dense_col += ldaspa;
-			    }
-			} /* for bnnz ... */
-		    } /* for jj ... */
-		} /* if index ... */
-#if ( PROFlevel>=1 )
-		t_l += SuperLU_timer_() - t;
-#endif
-	    } /* if mycol == pc */
-	} /* for jb ... */
-
-	SUPERLU_FREE(dense);
-	SUPERLU_FREE(Urb_length);
-	SUPERLU_FREE(Urb_indptr);
-#if ( PROFlevel>=1 )
-	if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n",
-			   t_l, t_u, u_blks, nrbu);
-#endif
-
-    } else {
-        /* ------------------------------------------------------------
-	   FIRST TIME CREATING THE L AND U DATA STRUCTURES.
-	   ------------------------------------------------------------*/
-
-#if ( PROFlevel>=1 )
-	t_l = t_u = 0; u_blks = 0;
-#endif
-	/* We first need to set up the L and U data structures and then
-	 * propagate the values of A into them.
-	 */
-	lsub = Glu_freeable->lsub;    /* compressed L subscripts */
-	xlsub = Glu_freeable->xlsub;
-	usub = Glu_freeable->usub;    /* compressed U subscripts */
-	xusub = Glu_freeable->xusub;
-    
-	if ( !(ToRecv = intCalloc_dist(nsupers)) )
-	    ABORT("Calloc fails for ToRecv[].");
-
-	k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */
-	if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
-	    ABORT("Malloc fails for ToSendR[].");
-	j = k * grid->npcol;
-	if ( !(index = intMalloc_dist(j)) )
-	    ABORT("Malloc fails for index[].");
-#if ( PRNTlevel>=1 )
-	mem_use += k*sizeof(int_t*) + (j + nsupers)*iword;
-#endif
-	for (i = 0; i < j; ++i) index[i] = EMPTY;
-	for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j];
-	k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
-
-	/* Pointers to the beginning of each block row of U. */
-	if ( !(Unzval_br_ptr = 
-              (double**)SUPERLU_MALLOC(k * sizeof(double*))) )
-	    ABORT("Malloc fails for Unzval_br_ptr[].");
-	if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
-	    ABORT("Malloc fails for Ufstnz_br_ptr[].");
-	
-	if ( !(ToSendD = intCalloc_dist(k)) )
-	    ABORT("Malloc fails for ToSendD[].");
-	if ( !(ilsum = intMalloc_dist(k+1)) )
-	    ABORT("Malloc fails for ilsum[].");
-
-	/* Auxiliary arrays used to set up U block data structures.
-	   They are freed on return. */
-	if ( !(rb_marker = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for rb_marker[].");
-	if ( !(Urb_length = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for Urb_length[].");
-	if ( !(Urb_indptr = intMalloc_dist(k)) )
-	    ABORT("Malloc fails for Urb_indptr[].");
-	if ( !(Urb_fstnz = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for Urb_fstnz[].");
-	if ( !(Ucbs = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for Ucbs[].");
-#if ( PRNTlevel>=1 )	
-	mem_use += 2*k*sizeof(int_t*) + (7*k+1)*iword;
-#endif
-	/* Compute ldaspa and ilsum[]. */
-	ldaspa = 0;
-	ilsum[0] = 0;
-	for (gb = 0; gb < nsupers; ++gb) {
-	    if ( myrow == PROW( gb, grid ) ) {
-		i = SuperSize( gb );
-		ldaspa += i;
-		lb = LBi( gb, grid );
-		ilsum[lb + 1] = ilsum[lb] + i;
-	    }
-	}
-	
-#if ( PROFlevel>=1 )
-	t = SuperLU_timer_();
-#endif
-	/* ------------------------------------------------------------
-	   COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U.
-	   THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U).
-	   ------------------------------------------------------------*/
-	
-	/* Loop through each supernode column. */
-	for (jb = 0; jb < nsupers; ++jb) {
-	    pc = PCOL( jb, grid );
-	    fsupc = FstBlockC( jb );
-	    nsupc = SuperSize( jb );
-	    /* Loop through each column in the block. */
-	    for (j = fsupc; j < fsupc + nsupc; ++j) {
-		/* usub[*] contains only "first nonzero" in each segment. */
-		for (i = xusub[j]; i < xusub[j+1]; ++i) {
-		    irow = usub[i]; /* First nonzero of the segment. */
-		    gb = BlockNum( irow );
-		    kcol = PCOL( gb, grid );
-		    ljb = LBj( gb, grid );
-		    if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES;
-		    pr = PROW( gb, grid );
-		    lb = LBi( gb, grid );
-		    if ( mycol == pc ) {
-			if  ( myrow == pr ) {
-			    ToSendD[lb] = YES;
-			    /* Count nonzeros in entire block row. */
-			    Urb_length[lb] += FstBlockC( gb+1 ) - irow;
-			    if (rb_marker[lb] <= jb) {/* First see the block */
-				rb_marker[lb] = jb + 1;
-				Urb_fstnz[lb] += nsupc;
-				++Ucbs[lb]; /* Number of column blocks
-					       in block row lb. */
-#if ( PRNTlevel>=1 )
-				++nUblocks;
-#endif
-			    }
-			    ToRecv[gb] = 1;
-			} else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */
-		    }
-		} /* for i ... */
-	    } /* for j ... */
-	} /* for jb ... */
-	
-	/* Set up the initial pointers for each block row in U. */
-	nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-	for (lb = 0; lb < nrbu; ++lb) {
-	    len = Urb_length[lb];
-	    rb_marker[lb] = 0; /* Reset block marker. */
-	    if ( len ) {
-		/* Add room for descriptors */
-		len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR;
-		if ( !(index = intMalloc_dist(len1+1)) )
-		    ABORT("Malloc fails for Uindex[].");
-		Ufstnz_br_ptr[lb] = index;
-		if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) )
-		    ABORT("Malloc fails for Unzval_br_ptr[*][].");
-		mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
-		mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
-		index[0] = Ucbs[lb]; /* Number of column blocks */
-		index[1] = len;      /* Total length of nzval[] */
-		index[2] = len1;     /* Total length of index[] */
-		index[len1] = -1;    /* End marker */
-	    } else {
-		Ufstnz_br_ptr[lb] = NULL;
-		Unzval_br_ptr[lb] = NULL;
-	    }
-	    Urb_length[lb] = 0; /* Reset block length. */
-	    Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
- 	    Urb_fstnz[lb] = BR_HEADER;
-	} /* for lb ... */
-
-	SUPERLU_FREE(Ucbs);
-
-#if ( PROFlevel>=1 )
-	t = SuperLU_timer_() - t;
-	if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t);
-#endif
-#if ( PRNTlevel>=1 )
-        mem_use -= 2*k * iword;
-#endif
-	/* Auxiliary arrays used to set up L block data structures.
-	   They are freed on return.
-	   k is the number of local row blocks.   */
-	if ( !(Lrb_length = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for Lrb_length[].");
-	if ( !(Lrb_number = intMalloc_dist(k)) )
-	    ABORT("Malloc fails for Lrb_number[].");
-	if ( !(Lrb_indptr = intMalloc_dist(k)) )
-	    ABORT("Malloc fails for Lrb_indptr[].");
-	if ( !(Lrb_valptr = intMalloc_dist(k)) )
-	    ABORT("Malloc fails for Lrb_valptr[].");
-	if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) )
-	    ABORT("Calloc fails for SPA dense[].");
-
-	/* These counts will be used for triangular solves. */
-	if ( !(fmod = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for fmod[].");
-	if ( !(bmod = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for bmod[].");
-	/* ------------------------------------------------ */
-#if ( PRNTlevel>=1 )	
-	mem_use += 6*k*iword + ldaspa*sp_ienv_dist(3)*dword;
-#endif
-	k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
-
-	/* Pointers to the beginning of each block column of L. */
-	if ( !(Lnzval_bc_ptr = 
-              (double**)SUPERLU_MALLOC(k * sizeof(double*))) )
-	    ABORT("Malloc fails for Lnzval_bc_ptr[].");
-	if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
-	    ABORT("Malloc fails for Lrowind_bc_ptr[].");
-	Lrowind_bc_ptr[k-1] = NULL;
-
-	/* These lists of processes will be used for triangular solves. */
-	if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
-	    ABORT("Malloc fails for fsendx_plist[].");
-	len = k * grid->nprow;
-	if ( !(index = intMalloc_dist(len)) )
-	    ABORT("Malloc fails for fsendx_plist[0]");
-	for (i = 0; i < len; ++i) index[i] = EMPTY;
-	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
-	    fsendx_plist[i] = &index[j];
-	if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
-	    ABORT("Malloc fails for bsendx_plist[].");
-	if ( !(index = intMalloc_dist(len)) )
-	    ABORT("Malloc fails for bsendx_plist[0]");
-	for (i = 0; i < len; ++i) index[i] = EMPTY;
-	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
-	    bsendx_plist[i] = &index[j];
-	/* -------------------------------------------------------------- */
-#if ( PRNTlevel>=1 )
-	mem_use += 4*k*sizeof(int_t*) + 2*len*iword;
-#endif
-
-	/*------------------------------------------------------------
-	  PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
-	  THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
-	  ------------------------------------------------------------*/
-
-	for (jb = 0; jb < nsupers; ++jb) {
-	    pc = PCOL( jb, grid );
-	    if ( mycol == pc ) { /* Block column jb in my process column */
-		fsupc = FstBlockC( jb );
-		nsupc = SuperSize( jb );
-		ljb = LBj( jb, grid ); /* Local block number */
-		
-		/* Scatter A into SPA. */
-		for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) {
-		    for (i = xa[j]; i < xa[j+1]; ++i) {
-			irow = asub[i];
-			gb = BlockNum( irow );
-			if ( myrow == PROW( gb, grid ) ) {
-			    lb = LBi( gb, grid );
-			    irow = ilsum[lb] + irow - FstBlockC( gb );
-			    dense_col[irow] = a[i];
-			}
-		    }
-		    dense_col += ldaspa;
-		} /* for j ... */
-
-		jbrow = PROW( jb, grid );
-
-		/*------------------------------------------------
-		 * SET UP U BLOCKS.
-		 *------------------------------------------------*/
-#if ( PROFlevel>=1 )
-		t = SuperLU_timer_();
-#endif
-		kseen = 0;
-		dense_col = dense;
-		/* Loop through each column in the block column. */
-		for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
-		    istart = xusub[j];
-		    /* NOTE: Only the first nonzero index of the segment
-		       is stored in usub[]. */
-		    for (i = istart; i < xusub[j+1]; ++i) {
-			irow = usub[i]; /* First nonzero in the segment. */
-			gb = BlockNum( irow );
-			pr = PROW( gb, grid );
-			if ( pr != jbrow &&
-			     myrow == jbrow &&  /* diag. proc. owning jb */
-			     bsendx_plist[ljb][pr] == EMPTY ) {
-			    bsendx_plist[ljb][pr] = YES;
-			    ++nbsendx;
-                        }
-			if ( myrow == pr ) {
-			    lb = LBi( gb, grid ); /* Local block number */
-			    index = Ufstnz_br_ptr[lb];
-			    uval = Unzval_br_ptr[lb];
-			    fsupc1 = FstBlockC( gb+1 );
-			    if (rb_marker[lb] <= jb) { /* First time see 
-							  the block       */
-				rb_marker[lb] = jb + 1;
-				Urb_indptr[lb] = Urb_fstnz[lb];;
-				index[Urb_indptr[lb]] = jb; /* Descriptor */
-				Urb_indptr[lb] += UB_DESCRIPTOR;
-				/* Record the first location in index[] of the
-				   next block */
-				Urb_fstnz[lb] = Urb_indptr[lb] + nsupc;
-				len = Urb_indptr[lb];/* Start fstnz in index */
-				index[len-1] = 0;
-				for (k = 0; k < nsupc; ++k)
-				    index[len+k] = fsupc1;
-				if ( gb != jb )/* Exclude diagonal block. */
-				    ++bmod[lb];/* Mod. count for back solve */
-				if ( kseen == 0 && myrow != jbrow ) {
-				    ++nbrecvx;
-				    kseen = 1;
-				}
-			    } else { /* Already saw the block */
-				len = Urb_indptr[lb];/* Start fstnz in index */
-			    }
-			    jj = j - fsupc;
-			    index[len+jj] = irow;
-			    /* Load the numerical values */
-			    k = fsupc1 - irow; /* No. of nonzeros in segment */
-			    index[len-1] += k; /* Increment block length in
-						  Descriptor */
-			    irow = ilsum[lb] + irow - FstBlockC( gb );
-			    for (ii = 0; ii < k; ++ii) {
-				uval[Urb_length[lb]++] = dense_col[irow + ii];
-				dense_col[irow + ii] = zero;
-			    }
-			} /* if myrow == pr ... */
-		    } /* for i ... */
-                    dense_col += ldaspa;
-		} /* for j ... */
-
-#if ( PROFlevel>=1 )
-		t_u += SuperLU_timer_() - t;
-		t = SuperLU_timer_();
-#endif		
-		/*------------------------------------------------
-		 * SET UP L BLOCKS.
-		 *------------------------------------------------*/
-
-		/* Count number of blocks and length of each block. */
-		nrbl = 0;
-		len = 0; /* Number of row subscripts I own. */
-		kseen = 0;
-		istart = xlsub[fsupc];
-		for (i = istart; i < xlsub[fsupc+1]; ++i) {
-		    irow = lsub[i];
-		    gb = BlockNum( irow ); /* Global block number */
-		    pr = PROW( gb, grid ); /* Process row owning this block */
-		    if ( pr != jbrow &&
-			 myrow == jbrow &&  /* diag. proc. owning jb */
-			 fsendx_plist[ljb][pr] == EMPTY /* first time */ ) {
-			fsendx_plist[ljb][pr] = YES;
-			++nfsendx;
-                    }
-		    if ( myrow == pr ) {
-			lb = LBi( gb, grid );  /* Local block number */
-			if (rb_marker[lb] <= jb) { /* First see this block */
-			    rb_marker[lb] = jb + 1;
-			    Lrb_length[lb] = 1;
-			    Lrb_number[nrbl++] = gb;
-			    if ( gb != jb ) /* Exclude diagonal block. */
-				++fmod[lb]; /* Mod. count for forward solve */
-			    if ( kseen == 0 && myrow != jbrow ) {
-				++nfrecvx;
-				kseen = 1;
-			    }
-#if ( PRNTlevel>=1 )
-			    ++nLblocks;
-#endif
-			} else {
-			    ++Lrb_length[lb];
-			}
-			++len;
-		    }
-		} /* for i ... */
-
-		if ( nrbl ) { /* Do not ensure the blocks are sorted! */
-		    /* Set up the initial pointers for each block in 
-		       index[] and nzval[]. */
-		    /* Add room for descriptors */
-		    len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
-		    if ( !(index = intMalloc_dist(len1)) ) 
-			ABORT("Malloc fails for index[]");
-		    Lrowind_bc_ptr[ljb] = index;
-		    if (!(Lnzval_bc_ptr[ljb] = 
-                         doubleMalloc_dist(len*nsupc))) {
-			fprintf(stderr, "col block %d ", jb);
-			ABORT("Malloc fails for Lnzval_bc_ptr[*][]");
-		    }
-		    mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
-		    mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
-		    mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
-		    index[0] = nrbl;  /* Number of row blocks */
-		    index[1] = len;   /* LDA of the nzval[] */
-		    next_lind = BC_HEADER;
-		    next_lval = 0;
-		    for (k = 0; k < nrbl; ++k) {
-			gb = Lrb_number[k];
-			lb = LBi( gb, grid );
-			len = Lrb_length[lb];
-			Lrb_length[lb] = 0;  /* Reset vector of block length */
-			index[next_lind++] = gb; /* Descriptor */
-			index[next_lind++] = len; 
-			Lrb_indptr[lb] = next_lind;
-			Lrb_valptr[lb] = next_lval;
-			next_lind += len;
-			next_lval += len;
-		    }
-		    /* Propagate the compressed row subscripts to Lindex[],
-                       and the initial values of A from SPA into Lnzval[]. */
-		    lusup = Lnzval_bc_ptr[ljb];
-		    len = index[1];  /* LDA of lusup[] */
-		    for (i = istart; i < xlsub[fsupc+1]; ++i) {
-			irow = lsub[i];
-			gb = BlockNum( irow );
-			if ( myrow == PROW( gb, grid ) ) {
-			    lb = LBi( gb, grid );
-			    k = Lrb_indptr[lb]++; /* Random access a block */
-			    index[k] = irow;
-			    k = Lrb_valptr[lb]++;
-			    irow = ilsum[lb] + irow - FstBlockC( gb );
-			    for (j = 0, dense_col = dense; j < nsupc; ++j) {
-				lusup[k] = dense_col[irow];
-				dense_col[irow] = zero;
-				k += len;
-				dense_col += ldaspa;
-			    }
-			}
-		    } /* for i ... */
-		} else {
-		    Lrowind_bc_ptr[ljb] = NULL;
-		    Lnzval_bc_ptr[ljb] = NULL;
-		} /* if nrbl ... */
-#if ( PROFlevel>=1 )
-		t_l += SuperLU_timer_() - t;
-#endif
-	    } /* if mycol == pc */
-
-	} /* for jb ... */
-
-	Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
-	Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
-	Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
-	Llu->Unzval_br_ptr = Unzval_br_ptr;
-	Llu->ToRecv = ToRecv;
-	Llu->ToSendD = ToSendD;
-	Llu->ToSendR = ToSendR;
-	Llu->fmod = fmod;
-	Llu->fsendx_plist = fsendx_plist;
-	Llu->nfrecvx = nfrecvx;
-	Llu->nfsendx = nfsendx;
-	Llu->bmod = bmod;
-	Llu->bsendx_plist = bsendx_plist;
-	Llu->nbrecvx = nbrecvx;
-	Llu->nbsendx = nbsendx;
-	Llu->ilsum = ilsum;
-	Llu->ldalsum = ldaspa;
-	
-#if ( PRNTlevel>=1 )
-	if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n",
-			   nLblocks, nUblocks);
-#endif
-
-	SUPERLU_FREE(rb_marker);
-	SUPERLU_FREE(Urb_fstnz);
-	SUPERLU_FREE(Urb_length);
-	SUPERLU_FREE(Urb_indptr);
-	SUPERLU_FREE(Lrb_length);
-	SUPERLU_FREE(Lrb_number);
-	SUPERLU_FREE(Lrb_indptr);
-	SUPERLU_FREE(Lrb_valptr);
-	SUPERLU_FREE(dense);
-
-	/* Find the maximum buffer size. */
-	MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, 
-		      MPI_MAX, grid->comm);
-
-#if ( PROFlevel>=1 )
-	if ( !iam ) printf(".. 1st distribute time:\n "
-			   "\tL\t%.2f\n\tU\t%.2f\n"
-			   "\tu_blks %d\tnrbu %d\n--------\n",
-  			   t_l, t_u, u_blks, nrbu);
-#endif
-
-    } /* else fact != SamePattern_SameRowPerm */
-
-    if ( xa[A->ncol] > 0 ) { /* may not have any entries on this process. */
-        SUPERLU_FREE(asub);
-        SUPERLU_FREE(a);
-    }
-    SUPERLU_FREE(xa);
-
-#if ( DEBUGlevel>=1 )
-    /* Memory allocated but not freed:
-       ilsum, fmod, fsendx_plist, bmod, bsendx_plist  */
-    CHECK_MALLOC(iam, "Exit pddistribute()");
-#endif
-    
-    return (mem_use);
-} /* PDDISTRIBUTE */
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsequ.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsequ.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsequ.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsequ.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,231 +0,0 @@
-
-
-/*
- * File name:	pdgsequ.c
- * History:     Modified from LAPACK routine DGEEQU
- */
-#include <math.h>
-#include "superlu_ddefs.h"
-
-void
-pdgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd,
-	double *colcnd, double *amax, int_t *info, gridinfo_t *grid)
-{
-/*    
-    Purpose   
-    =======   
-
-    PDGSEQU computes row and column scalings intended to equilibrate an   
-    M-by-N sparse matrix A and reduce its condition number. R returns the row
-    scale factors and C the column scale factors, chosen to try to make   
-    the largest element in each row and column of the matrix B with   
-    elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.   
-
-    R(i) and C(j) are restricted to be between SMLNUM = smallest safe   
-    number and BIGNUM = largest safe number.  Use of these scaling   
-    factors is not guaranteed to reduce the condition number of A but   
-    works well in practice.   
-
-    See supermatrix.h for the definition of 'SuperMatrix' structure.
- 
-    Arguments   
-    =========   
-
-    A       (input) SuperMatrix*
-            The matrix of dimension (A->nrow, A->ncol) whose equilibration
-            factors are to be computed. The type of A can be:
-            Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
-	    
-    R       (output) double*, size A->nrow
-            If INFO = 0 or INFO > M, R contains the row scale factors   
-            for A.
-	    
-    C       (output) double*, size A->ncol
-            If INFO = 0,  C contains the column scale factors for A.
-	    
-    ROWCND  (output) double*
-            If INFO = 0 or INFO > M, ROWCND contains the ratio of the   
-            smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and   
-            AMAX is neither too large nor too small, it is not worth   
-            scaling by R.
-	    
-    COLCND  (output) double*
-            If INFO = 0, COLCND contains the ratio of the smallest   
-            C(i) to the largest C(i).  If COLCND >= 0.1, it is not   
-            worth scaling by C.
-	    
-    AMAX    (output) double*
-            Absolute value of largest matrix element.  If AMAX is very   
-            close to overflow or very close to underflow, the matrix   
-            should be scaled.
-	    
-    INFO    (output) int*
-            = 0:  successful exit   
-            < 0:  if INFO = -i, the i-th argument had an illegal value   
-            > 0:  if INFO = i,  and i is   
-                  <= M:  the i-th row of A is exactly zero   
-                  >  M:  the (i-M)-th column of A is exactly zero   
-
-    GRID    (input) gridinof_t*
-            The 2D process mesh.
-    ===================================================================== 
-*/
-
-    /* Local variables */
-    NRformat_loc *Astore;
-    double *Aval;
-    int i, j, irow, jcol, m_loc;
-    double rcmin, rcmax;
-    double bignum, smlnum;
-    extern double hypre_F90_NAME_LAPACK(dlamch,DLAMCH)(char *);
-    /* extern double dlamch_(char *); */
-    double tempmax, tempmin;
-    double *loc_max;
-    int *r_sizes, *displs;
-    double *loc_r;
-    int_t  procs;
-    
-    /* Test the input parameters. */
-    *info = 0;
-    if ( A->nrow < 0 || A->ncol < 0 ||
-	 A->Stype != SLU_NR_loc || A->Dtype != SLU_D || A->Mtype != SLU_GE )
-	*info = -1;
-    if (*info != 0) {
-	i = -(*info);
-	xerbla_("pdgsequ", &i);
-	return;
-    }
-
-    /* Quick return if possible */
-    if ( A->nrow == 0 || A->ncol == 0 ) {
-	*rowcnd = 1.;
-	*colcnd = 1.;
-	*amax = 0.;
-	return;
-    }
-
-    Astore = A->Store;
-    Aval = Astore->nzval;
-    m_loc = Astore->m_loc;
-    
-    /* Get machine constants. */
-    smlnum = hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("S");
-    /* smlnum = dlamch_("S"); */
-    bignum = 1. / smlnum;
-
-    /* Compute row scale factors. */
-    for (i = 0; i < A->nrow; ++i) r[i] = 0.;
-
-    /* Find the maximum element in each row. */
-    irow = Astore->fst_row;
-    for (i = 0; i < m_loc; ++i) {
-	for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j)
-	    r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[j]) );
-	++irow;
-    }
-
-    /* Find the maximum and minimum scale factors. */
-    rcmin = bignum;
-    rcmax = 0.;
-    for (i = Astore->fst_row; i < Astore->fst_row + m_loc; ++i) {
-	rcmax = SUPERLU_MAX(rcmax, r[i]);
-	rcmin = SUPERLU_MIN(rcmin, r[i]);
-    }
-  
-    /* Get the global MAX and MIN for R */
-    tempmax = rcmax;
-    tempmin = rcmin;
-    MPI_Allreduce( &tempmax, &rcmax, 
-		1, MPI_DOUBLE, MPI_MAX, grid->comm);
-    MPI_Allreduce( &tempmin, &rcmin, 
-		1, MPI_DOUBLE, MPI_MIN, grid->comm);
-
-    *amax = rcmax;
-
-    if (rcmin == 0.) {
-	/* Find the first zero scale factor and return an error code. */
-	for (i = 0; i < A->nrow; ++i)
-	    if (r[i] == 0.) {
-		*info = i + 1;
-		return;
-	    }
-    } else {
-	/* Invert the scale factors. */
-	for (i = 0; i < A->nrow; ++i)
-	    r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum );
-	/* Compute ROWCND = min(R(I)) / max(R(I)) */
-	*rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
-    }
-
-    /* Compute column scale factors */
-    for (j = 0; j < A->ncol; ++j) c[j] = 0.;
-
-    /* Find the maximum element in each column, assuming the row
-       scalings computed above. */
-    irow = Astore->fst_row;
-    for (i = 0; i < m_loc; ++i) {
-        for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
-	    jcol = Astore->colind[j];
-	    c[jcol] = SUPERLU_MAX( c[jcol], fabs(Aval[j]) * r[irow] );
-	}
-	++irow;
-    }
-
-    /* Find the global maximum for c[j] */
-    if ( !(loc_max = doubleMalloc_dist(A->ncol)))
-      ABORT("Malloc fails for loc_max[].");
-    for (j = 0; j < A->ncol; ++j) loc_max[j] = c[j];
-    MPI_Allreduce(loc_max, c, A->ncol, MPI_DOUBLE, MPI_MAX, grid->comm);
-    SUPERLU_FREE(loc_max);
-
-    /* Find the maximum and minimum scale factors. */
-    rcmin = bignum;
-    rcmax = 0.;
-    for (j = 0; j < A->ncol; ++j) {
-	rcmax = SUPERLU_MAX(rcmax, c[j]);
-	rcmin = SUPERLU_MIN(rcmin, c[j]);
-    }
-
-    if (rcmin == 0.) {
-	/* Find the first zero scale factor and return an error code. */
-	for (j = 0; j < A->ncol; ++j)
-	    if ( c[j] == 0. ) {
-		*info = A->nrow + j + 1;
-		return;
-	    }
-    } else {
-	/* Invert the scale factors. */
-	for (j = 0; j < A->ncol; ++j)
-	    c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum);
-	/* Compute COLCND = min(C(J)) / max(C(J)) */
-	*colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
-    }
-
-    /* gather R from each process to get the global R.  */
-
-    procs = grid->nprow * grid->npcol;
-    if ( !(r_sizes = SUPERLU_MALLOC(2 * procs * sizeof(int))))
-      ABORT("Malloc fails for r_sizes[].");
-    displs = r_sizes + procs;
-    if ( !(loc_r = doubleMalloc_dist(m_loc)))
-      ABORT("Malloc fails for loc_r[].");
-    j = Astore->fst_row;
-    for (i = 0; i < m_loc; ++i) loc_r[i] = r[j++];
-
-    /* First gather the size of each piece. */
-    MPI_Allgather(&m_loc, 1, MPI_INT, r_sizes, 1, MPI_INT, grid->comm);
-      
-    /* Set up the displacements for allgatherv */
-    displs[0] = 0;
-    for (i = 1; i < procs; ++i) displs[i] = displs[i-1] + r_sizes[i-1];
-
-    /* Now gather the actual data */
-    MPI_Allgatherv(loc_r, m_loc, MPI_DOUBLE, r, r_sizes, displs,
-                MPI_DOUBLE, grid->comm);
-      
-    SUPERLU_FREE(r_sizes);
-    SUPERLU_FREE(loc_r);
-
-    return;
-
-} /* pdgsequ */
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsmv_AXglobal.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsmv_AXglobal.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsmv_AXglobal.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsmv_AXglobal.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,301 +0,0 @@
-
-
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-
-#include <math.h>
-#include "superlu_ddefs.h"
-
-
-static void dcreate_msr_matrix(SuperMatrix *, int_t [], int_t,
-			      double **, int_t **);
-static void dPrintMSRmatrix(int, double [], int_t [], gridinfo_t *);
-
-
-int pdgsmv_AXglobal_setup
-(
- SuperMatrix *A,       /* Matrix A permuted by columns (input).
-			  The type of A can be:
-			  Stype = SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE. */
- Glu_persist_t *Glu_persist, /* input */
- gridinfo_t *grid,     /* input */
- int_t *m,             /* output */
- int_t *update[],      /* output */
- double *val[],        /* output */
- int_t *bindx[],       /* output */
- int_t *mv_sup_to_proc /* output */
- )
-{
-    int n;
-    int input_option;
-    int N_update;    /* Number of variables updated on this process (output) */
-    int iam = grid->iam;
-    int nprocs = grid->nprow * grid->npcol;
-    int_t *xsup = Glu_persist->xsup;
-    int_t *supno = Glu_persist->supno;
-    int_t nsupers;
-    int i, nsup, p, t1, t2, t3;
-
-
-    /* Initialize the list of global indices.
-     * NOTE: the list of global indices must be in ascending order.
-     */
-    n = A->nrow;
-    input_option = SUPER_LINEAR;
-    nsupers = supno[n-1] + 1;
-
-#if ( DEBUGlevel>=2 )
-    if ( !iam ) {
-	PrintInt10("xsup", supno[n-1]+1, xsup);
-	PrintInt10("supno", n, supno);
-    }
-#endif
-
-    if ( input_option == SUPER_LINEAR ) { /* Block partitioning based on
-					     individual rows.  */
-	/* Figure out mv_sup_to_proc[] on all processes. */
-	for (p = 0; p < nprocs; ++p) {
-	    t1 = n / nprocs;       /* Number of rows */
-	    t2 = n - t1 * nprocs;  /* left-over, which will be assigned
-				      to the first t2 processes.  */
-	    if ( p >= t2 ) t2 += (p * t1); /* Starting row number */
-	    else { /* First t2 processes will get one more row. */
- 	        ++t1;              /* Number of rows. */
-		t2 = p * t1;       /* Starting row. */
-	    }
-	    /* Make sure the starting and ending rows are at the
-	       supernode boundaries. */
-	    t3 = t2 + t1;      /* Ending row. */
-	    nsup = supno[t2];
-	    if ( t2 > xsup[nsup] ) { /* Round up the starting row. */
-		t1 -= xsup[nsup+1] - t2;
-		t2 = xsup[nsup+1];
-	    }
-	    nsup = supno[t3];
-	    if ( t3 > xsup[nsup] ) /* Round up the ending row. */
-		t1 += xsup[nsup+1] - t3;
-	    t3 = t2 + t1 - 1;
-	    if ( t1 ) {
-		for (i = supno[t2]; i <= supno[t3]; ++i) {
-		    mv_sup_to_proc[i] = p;
-#if ( DEBUGlevel>=3 )
-		    if ( mv_sup_to_proc[i] == p-1 ) {
-			fprintf(stderr, 
-				"mv_sup_to_proc conflicts at supno %d\n", i);
-			exit(-1);
-		    }
-#endif
-		}
-	    }
-	    
-	    if ( iam == p ) {
-		N_update = t1;
-		if ( N_update ) {
-		    if ( !(*update = intMalloc_dist(N_update)) )
-			ABORT("Malloc fails for update[]");
-		}
-		for (i = 0; i < N_update; ++i) (*update)[i] = t2 + i;
-#if ( DEBUGlevel>=3 )
-		printf("(%2d) N_update = %4d\t"
-		       "supers %4d to %4d\trows %4d to %4d\n",
-		       iam, N_update, supno[t2], supno[t3], t2, t3);
-#endif
-	    }
-	} /* for p ... */
-    } else if ( input_option == SUPER_BLOCK ) { /* Block partitioning based on
-						   individual supernodes.  */
-	/* This may cause bad load balance, because the blocks are usually
-	   small in the beginning and large toward the end.   */
-	t1 = nsupers / nprocs;
-	t2 = nsupers - t1 * nprocs; /* left-over */
-	if ( iam >= t2 ) t2 += (iam * t1);
-	else {
-	    ++t1;          /* Number of blocks. */
-	    t2 = iam * t1; /* Starting block. */
-	}
-	N_update = xsup[t2+t1] - xsup[t2];
-	if ( !(*update = intMalloc_dist(N_update)) )
-	    ABORT("Malloc fails for update[]");
-	for (i = 0; i < N_update; ++i) (*update)[i] = xsup[t2] + i;
-    }
-
-
-    /* Create an MSR matrix in val/bindx to be used by pdgsmv(). */
-    dcreate_msr_matrix(A, *update, N_update, val, bindx);
-
-#if ( DEBUGlevel>=2 )
-    PrintInt10("mv_sup_to_proc", nsupers, mv_sup_to_proc);
-    dPrintMSRmatrix(N_update, *val, *bindx, grid);
-#endif
-
-    *m = N_update;
-    return 0;
-} /* PDGSMV_AXglobal_SETUP */
-
-
-/* Create the distributed modified sparse row (MSR) matrix: bindx/val.
- * For a submatrix of size m-by-n, the MSR arrays are as follows:
- *    bindx[0]      = m + 1
- *    bindx[0..m]   = pointer to start of each row
- *    bindx[ks..ke] = column indices of the off-diagonal nonzeros in row k,
- *                    where, ks = bindx[k], ke = bindx[k+1]-1
- *    val[k]        = A(k,k), k < m, diagonal elements
- *    val[m]        = not used
- *    val[ki]       = A(k, bindx[ki]), where ks <= ki <= ke
- * Both arrays are of length nnz + 1.
- */
-static void dcreate_msr_matrix
-(
- SuperMatrix *A,       /* Matrix A permuted by columns (input).
-			  The type of A can be:
-			  Stype = SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE. */
- int_t update[],       /* input (local) */
- int_t N_update,       /* input (local) */
- double **val,         /* output */
- int_t **bindx         /* output */
-)
-{
-    int hi, i, irow, j, k, lo, n, nnz_local, nnz_diag;
-    NCPformat *Astore;
-    double *nzval;
-    int_t *rowcnt;
-    double zero = 0.0;
-    
-    if ( !N_update ) return;
-
-    n = A->ncol;
-    Astore = A->Store;
-    nzval = Astore->nzval;
-
-    /* One pass of original matrix A to count nonzeros of each row. */
-    if ( !(rowcnt = (int_t *) intCalloc_dist(N_update)) )
-	ABORT("Malloc fails for rowcnt[]");
-    lo = update[0];
-    hi = update[N_update-1];
-    nnz_local = 0;
-    nnz_diag = 0;
-    for (j = 0; j < n; ++j) {
-	for (i = Astore->colbeg[j]; i < Astore->colend[j]; ++i) {
-	    irow = Astore->rowind[i];
-	    if ( irow >= lo && irow <= hi ) {
-		if ( irow != j ) /* Exclude diagonal */
-		    ++rowcnt[irow - lo];
-		else ++nnz_diag; /* Count nonzero diagonal entries */
-		++nnz_local;
-	    }
-	}
-    }
-
-    /* Add room for the logical diagonal zeros which are not counted
-       in nnz_local. */
-    nnz_local += (N_update - nnz_diag);
-
-    /* Allocate storage for bindx[] and val[]. */
-    if ( !(*val = (double *) doubleMalloc_dist(nnz_local+1)) )
-	ABORT("Malloc fails for val[]");
-    for (i = 0; i < N_update; ++i) (*val)[i] = zero; /* Initialize diagonal */
-    if ( !(*bindx = (int_t *) SUPERLU_MALLOC((nnz_local+1) * sizeof(int_t))) )
-	ABORT("Malloc fails for bindx[]");
-
-    /* Set up row pointers. */
-    (*bindx)[0] = N_update + 1;
-    for (j = 1; j <= N_update; ++j) {
-	(*bindx)[j] = (*bindx)[j-1] + rowcnt[j-1];
-	rowcnt[j-1] = (*bindx)[j-1];
-    }
-
-    /* One pass of original matrix A to fill in matrix entries. */
-    for (j = 0; j < n; ++j) {
-	for (i = Astore->colbeg[j]; i < Astore->colend[j]; ++i) {
-	    irow = Astore->rowind[i];
-	    if ( irow >= lo && irow <= hi ) {
-		if ( irow == j ) /* Diagonal */
-		    (*val)[irow - lo] = nzval[i];
-		else {
-		    irow -= lo;
-		    k = rowcnt[irow];
-		    (*bindx)[k] = j;
-		    (*val)[k] = nzval[i];
-		    ++rowcnt[irow];
-		}
-	    }
-	}
-    }
-
-    SUPERLU_FREE(rowcnt);
-}
-
-/*
- * Performs sparse matrix-vector multiplication.
- *   - val/bindx stores the distributed MSR matrix A
- *   - X is global
- *   - ax product is distributed the same way as A
- */
-int
-pdgsmv_AXglobal(int_t m, int_t update[], double val[], int_t bindx[],
-                double X[], double ax[])
-{
-    int_t i, j, k;
-
-    if ( m <= 0 ) return; /* number of rows (local) */
-
-    for (i = 0; i < m; ++i) {
-	ax[i] = 0.0;
-
-	for (k = bindx[i]; k < bindx[i+1]; ++k) {
-	    j = bindx[k];       /* column index */
-	    ax[i] += val[k] * X[j];
-	}
-	ax[i] += val[i] * X[update[i]]; /* diagonal */
-    }
-} /* PDGSMV_AXglobal */
- 
-/*
- * Performs sparse matrix-vector multiplication.
- *   - val/bindx stores the distributed MSR matrix A
- *   - X is global
- *   - ax product is distributed the same way as A
- */
-int
-pdgsmv_AXglobal_abs(int_t m, int_t update[], double val[], int_t bindx[],
-	            double X[], double ax[])
-{
-    int_t i, j, k;
-
-    if ( m <= 0 ) return; /* number of rows (local) */
-
-    for (i = 0; i < m; ++i) {
-	ax[i] = 0.0;
-	for (k = bindx[i]; k < bindx[i+1]; ++k) {
-	    j = bindx[k];       /* column index */
-	    ax[i] += fabs(val[k]) * fabs(X[j]);
-	}
-	ax[i] += fabs(val[i]) * fabs(X[update[i]]); /* diagonal */
-    }
-} /* PDGSMV_AXglobal_ABS */
-
-/*
- * Print the local MSR matrix
- */
-static void dPrintMSRmatrix
-(
- int m,       /* Number of rows of the submatrix. */
- double val[],
- int_t bindx[],
- gridinfo_t *grid
-)
-{
-    int iam, nnzp1;
-
-    if ( !m ) return;
-
-    iam = grid->iam;
-    nnzp1 = bindx[m];
-    printf("(%2d) MSR submatrix has %d rows -->\n", iam, m);
-    PrintDouble5("val", nnzp1, val);
-    PrintInt10("bindx", nnzp1, bindx);
-}
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsmv.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsmv.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsmv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsmv.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,370 +0,0 @@
-
-
-/*
- * -- Distributed SuperLU routine (version 2.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * March 15, 2003
- *
- */
-
-#include <math.h>
-#include "superlu_ddefs.h"
-
-void pdgsmv_init
-(
- SuperMatrix *A,       /* Matrix A permuted by columns (input/output).
-			  The type of A can be:
-			  Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. */
- int_t *row_to_proc,   /* Input. Mapping between rows and processes. */
- gridinfo_t *grid,     /* Input */
- pdgsmv_comm_t *gsmv_comm /* Output. The data structure for communication. */
- )
-{
-    NRformat_loc *Astore;
-    int iam, p, procs;
-    int *SendCounts, *RecvCounts;
-    int_t i, j, k, l, m, m_loc, n, fst_row, jcol;
-    int_t TotalIndSend, TotalValSend;
-    int_t *colind, *rowptr;
-    int_t *ind_tosend = NULL, *ind_torecv = NULL;
-    int_t *ptr_ind_tosend, *ptr_ind_torecv;
-    int_t *extern_start, *spa, *itemp;
-    double *nzval, *val_tosend = NULL, *val_torecv = NULL, t;
-    MPI_Request *send_req, *recv_req;
-    MPI_Status status;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(grid->iam, "Enter pdgsmv_init()");
-#endif
-
-    /* ------------------------------------------------------------
-       INITIALIZATION.
-       ------------------------------------------------------------*/
-    iam = grid->iam;
-    procs = grid->nprow * grid->npcol;
-    Astore = (NRformat_loc *) A->Store;
-    m = A->nrow;
-    n = A->ncol;
-    m_loc = Astore->m_loc;
-    fst_row = Astore->fst_row;
-    colind = Astore->colind;
-    rowptr = Astore->rowptr;
-    nzval = Astore->nzval;
-    if ( !(SendCounts = SUPERLU_MALLOC(2*procs * sizeof(int))) )
-        ABORT("Malloc fails for SendCounts[]");
-    /*for (i = 0; i < 2*procs; ++i) SendCounts[i] = 0;*/
-    RecvCounts = SendCounts + procs;
-    if ( !(ptr_ind_tosend = intMalloc_dist(2*(procs+1))) )
-        ABORT("Malloc fails for ptr_ind_tosend[]");
-    ptr_ind_torecv = ptr_ind_tosend + procs + 1;
-    if ( !(extern_start = intMalloc_dist(m_loc)) )
-        ABORT("Malloc fails for extern_start[]");
-    for (i = 0; i < m_loc; ++i) extern_start[i] = rowptr[i];
-
-    /* ------------------------------------------------------------
-       COUNT THE NUMBER OF X ENTRIES TO BE SENT TO EACH PROCESS.
-       THIS IS THE UNION OF THE COLUMN INDICES OF MY ROWS.
-       SWAP TO THE BEGINNING THE PART OF A CORRESPONDING TO THE
-       LOCAL PART OF X.
-       THIS ACCOUNTS FOR THE FIRST PASS OF ACCESSING MATRIX A.
-       ------------------------------------------------------------*/
-    if ( !(spa = intCalloc_dist(n)) ) /* Aid in global to local translation */
-        ABORT("Malloc fails for spa[]");
-    for (p = 0; p < procs; ++p) SendCounts[p] = 0;
-    for (i = 0; i < m_loc; ++i) { /* Loop through each row */
-        k = extern_start[i];
-        for (j = rowptr[i]; j < rowptr[i+1]; ++j) {/* Each nonzero in row i */
-	    jcol = colind[j];
-            p = row_to_proc[jcol];
-	    if ( p != iam ) { /* External */
-	        if ( spa[jcol] == 0 ) { /* First time see this index */
-		    ++SendCounts[p];
-		    spa[jcol] = 1;
-                }
-	    } else { /* Swap to beginning the part of A corresponding
-			to the local part of X */
-		l = colind[k];
-		t = nzval[k];
-		colind[k] = jcol;
-		nzval[k] = nzval[j];
-		colind[j] = l;
-		nzval[j] = t;
-		++k;
-	    }
-	}
-	extern_start[i] = k;
-    }
-
-    /* ------------------------------------------------------------
-       LOAD THE X-INDICES TO BE SENT TO THE OTHER PROCESSES.
-       THIS ACCOUNTS FOR THE SECOND PASS OF ACCESSING MATRIX A.
-       ------------------------------------------------------------*/
-    /* Build pointers to ind_tosend[]. */
-    ptr_ind_tosend[0] = 0;
-    for (p = 0, TotalIndSend = 0; p < procs; ++p) {
-        TotalIndSend += SendCounts[p]; /* Total to send. */
-	ptr_ind_tosend[p+1] = ptr_ind_tosend[p] + SendCounts[p];
-    }
-#if 0
-    ptr_ind_tosend[iam] = 0; /* Local part of X */
-#endif
-    if ( TotalIndSend ) {
-        if ( !(ind_tosend = intMalloc_dist(TotalIndSend)) )
-	    ABORT("Malloc fails for ind_tosend[]"); /* Exclude local part of X */
-    }
-
-    /* Build SPA to aid global to local translation. */
-    for (i = 0; i < n; ++i) spa[i] = EMPTY;
-    for (i = 0; i < m_loc; ++i) { /* Loop through each row of A */
-        for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
-	    jcol = colind[j];
-	    if ( spa[jcol] == EMPTY ) { /* First time see this index */
-	        p = row_to_proc[jcol];
-		if ( p == iam ) { /* Local */
-		  /*assert(jcol>=fst_row);*/
-		  spa[jcol] = jcol - fst_row; /* Relative position in local X */
-		} else {          /* External */
-		  ind_tosend[ptr_ind_tosend[p]] = jcol; /* Still global */
-		  spa[jcol] = ptr_ind_tosend[p]; /* Position in ind_tosend[] */
-		  ++ptr_ind_tosend[p];
-		}
-	    }
-	}
-    }
-    
-    /* ------------------------------------------------------------
-       TRANSFORM THE COLUMN INDICES OF MATRIX A INTO LOCAL INDICES.
-       THIS ACCOUNTS FOR THE THIRD PASS OF ACCESSING MATRIX A.
-       ------------------------------------------------------------*/
-    for (i = 0; i < m_loc; ++i) {
-        for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
-	    jcol = colind[j];
-	    colind[j] = spa[jcol];
-	}
-    }
-
-    /* ------------------------------------------------------------
-       COMMUNICATE THE EXTERNAL INDICES OF X.
-       ------------------------------------------------------------*/
-    MPI_Alltoall(SendCounts, 1, MPI_INT, RecvCounts, 1, MPI_INT,
-		 grid->comm);
-
-    /* Build pointers to ind_torecv[]. */
-    ptr_ind_torecv[0] = 0;
-    for (p = 0, TotalValSend = 0; p < procs; ++p) {
-        TotalValSend += RecvCounts[p]; /* Total to receive. */
-	ptr_ind_torecv[p+1] = ptr_ind_torecv[p] + RecvCounts[p];
-    }
-    if ( TotalValSend ) {
-        if ( !(ind_torecv = intMalloc_dist(TotalValSend)) )
-	    ABORT("Malloc fails for ind_torecv[]");
-    }
-
-    if ( !(send_req = (MPI_Request *)
-	   SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))))
-        ABORT("Malloc fails for recv_req[].");
-    recv_req = send_req + procs;
-    for (p = 0; p < procs; ++p) {
-        ptr_ind_tosend[p] -= SendCounts[p]; /* Reset pointer to beginning */
-        if ( SendCounts[p] ) {
-	    MPI_Isend(&ind_tosend[ptr_ind_tosend[p]], SendCounts[p],
-		      mpi_int_t, p, iam, grid->comm, &send_req[p]);
-	}
-	if ( RecvCounts[p] ) {
-	    MPI_Irecv(&ind_torecv[ptr_ind_torecv[p]], RecvCounts[p],
-		      mpi_int_t, p, p, grid->comm, &recv_req[p]);
-	}
-    }
-    for (p = 0; p < procs; ++p) {
-        if ( SendCounts[p] ) MPI_Wait(&send_req[p], &status);
-	if ( RecvCounts[p] ) MPI_Wait(&recv_req[p], &status);
-    }
-
-    /* Allocate storage for the X values to to transferred. */
-    if ( TotalIndSend &&
-         !(val_torecv = doubleMalloc_dist(TotalIndSend)) )
-        ABORT("Malloc fails for val_torecv[].");
-    if ( TotalValSend &&
-         !(val_tosend = doubleMalloc_dist(TotalValSend)) )
-        ABORT("Malloc fails for val_tosend[].");
-
-    gsmv_comm->extern_start = extern_start;
-    gsmv_comm->ind_tosend = ind_tosend;
-    gsmv_comm->ind_torecv = ind_torecv;
-    gsmv_comm->ptr_ind_tosend = ptr_ind_tosend;
-    gsmv_comm->ptr_ind_torecv = ptr_ind_torecv;
-    gsmv_comm->SendCounts = SendCounts;
-    gsmv_comm->RecvCounts = RecvCounts;
-    gsmv_comm->val_tosend = val_tosend;
-    gsmv_comm->val_torecv = val_torecv;
-    gsmv_comm->TotalIndSend = TotalIndSend;
-    gsmv_comm->TotalValSend = TotalValSend;
-    
-    SUPERLU_FREE(spa);
-    SUPERLU_FREE(send_req);
-
-#if ( DEBUGlevel>=2 )
-    PrintInt10("pdgsmv_init::rowptr", m_loc+1, rowptr);
-    PrintInt10("pdgsmv_init::extern_start", m_loc, extern_start);
-#endif
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pdgsmv_init()");
-#endif
-
-} /* PDGSMV_INIT */
-
-
-/*
- * Performs sparse matrix-vector multiplication.
- */
-void
-pdgsmv
-(
- int_t  abs,               /* Input. Do abs(A)*abs(x). */
- SuperMatrix *A_internal,  /* Input. Matrix A permuted by columns.
-			      The column indices are translated into
-			      the relative positions in the gathered x-vector.
-			      The type of A can be:
-			      Stype = NR_loc; Dtype = SLU_D; Mtype = GE. */
- gridinfo_t *grid,         /* Input */
- pdgsmv_comm_t *gsmv_comm, /* Input. The data structure for communication. */
- double x[],       /* Input. The distributed source vector */
- double ax[]       /* Output. The distributed destination vector */
-)
-{
-    NRformat_loc *Astore;
-    int iam, procs;
-    int_t i, j, p, m, m_loc, n, fst_row, jcol;
-    int_t *colind, *rowptr;
-    int   *SendCounts, *RecvCounts;
-    int_t *ind_tosend, *ind_torecv, *ptr_ind_tosend, *ptr_ind_torecv;
-    int_t *extern_start, TotalValSend;
-    double *nzval, *val_tosend, *val_torecv;
-    double zero = 0.0;
-    MPI_Request *send_req, *recv_req;
-    MPI_Status status;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(grid->iam, "Enter pdgsmv()");
-#endif
-
-    /* ------------------------------------------------------------
-       INITIALIZATION.
-       ------------------------------------------------------------*/
-    iam = grid->iam;
-    procs = grid->nprow * grid->npcol;
-    Astore = (NRformat_loc *) A_internal->Store;
-    m = A_internal->nrow;
-    n = A_internal->ncol;
-    m_loc = Astore->m_loc;
-    fst_row = Astore->fst_row;
-    colind = Astore->colind;
-    rowptr = Astore->rowptr;
-    nzval = (double *) Astore->nzval;
-    extern_start = gsmv_comm->extern_start;
-    ind_torecv = gsmv_comm->ind_torecv;
-    ptr_ind_tosend = gsmv_comm->ptr_ind_tosend;
-    ptr_ind_torecv = gsmv_comm->ptr_ind_torecv;
-    SendCounts = gsmv_comm->SendCounts;
-    RecvCounts = gsmv_comm->RecvCounts;
-    val_tosend = (double *) gsmv_comm->val_tosend;
-    val_torecv = (double *) gsmv_comm->val_torecv;
-    TotalValSend = gsmv_comm->TotalValSend;
-
-    /* ------------------------------------------------------------
-       COPY THE X VALUES INTO THE SEND BUFFER.
-       ------------------------------------------------------------*/
-    for (i = 0; i < TotalValSend; ++i) {
-        j = ind_torecv[i] - fst_row; /* Relative index in x[] */
-	val_tosend[i] = x[j];
-    }
-
-    /* ------------------------------------------------------------
-       COMMUNICATE THE X VALUES.
-       ------------------------------------------------------------*/
-    if ( !(send_req = (MPI_Request *)
-	   SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))))
-        ABORT("Malloc fails for recv_req[].");
-    recv_req = send_req + procs;
-    for (p = 0; p < procs; ++p) {
-        if ( RecvCounts[p] ) {
-	    MPI_Isend(&val_tosend[ptr_ind_torecv[p]], RecvCounts[p],
-                      MPI_DOUBLE, p, iam,
-                      grid->comm, &send_req[p]);
-	}
-	if ( SendCounts[p] ) {
-	    MPI_Irecv(&val_torecv[ptr_ind_tosend[p]], SendCounts[p],
-                      MPI_DOUBLE, p, p,
-                      grid->comm, &recv_req[p]);
-	}
-    }
-    
-    /* ------------------------------------------------------------
-       PERFORM THE ACTUAL MULTIPLICATION.
-       ------------------------------------------------------------*/
-    if ( abs ) { /* Perform abs(A)*abs(x) */
-        /* Multiply the local part. */
-        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
-	    ax[i] = 0.0;
-	    for (j = rowptr[i]; j < extern_start[i]; ++j) {
-	        jcol = colind[j];
-		ax[i] += fabs(nzval[j]) * fabs(x[jcol]);
-	    }
-        }
-
-        for (p = 0; p < procs; ++p) {
-            if ( RecvCounts[p] ) MPI_Wait(&send_req[p], &status);
-	    if ( SendCounts[p] ) MPI_Wait(&recv_req[p], &status);
-        }
-
-        /* Multiply the external part. */
-        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
-	    for (j = extern_start[i]; j < rowptr[i+1]; ++j) {
-	        jcol = colind[j];
-	        ax[i] += fabs(nzval[j]) * fabs(val_torecv[jcol]);
-	    }
-	}
-    } else {
-        /* Multiply the local part. */
-        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
-	    ax[i] = zero;
-	    for (j = rowptr[i]; j < extern_start[i]; ++j) {
-	        jcol = colind[j];
-		ax[i] += nzval[j] * x[jcol];
-	    }
-        }
-
-        for (p = 0; p < procs; ++p) {
-            if ( RecvCounts[p] ) MPI_Wait(&send_req[p], &status);
-	    if ( SendCounts[p] ) MPI_Wait(&recv_req[p], &status);
-        }
-
-        /* Multiply the external part. */
-        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
-	    for (j = extern_start[i]; j < rowptr[i+1]; ++j) {
-	        jcol = colind[j];
-	        ax[i] += nzval[j] * val_torecv[jcol];
-	    }
-	}
-    }
-
-    SUPERLU_FREE(send_req);
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pdgsmv()");
-#endif
-
-} /* PDGSMV */
-
-void pdgsmv_finalize(pdgsmv_comm_t *gsmv_comm)
-{
-    int_t *it;
-    double *dt;
-    SUPERLU_FREE(gsmv_comm->extern_start);
-    if ( it = gsmv_comm->ind_tosend ) SUPERLU_FREE(it);
-    if ( it = gsmv_comm->ind_torecv ) SUPERLU_FREE(it);
-    SUPERLU_FREE(gsmv_comm->ptr_ind_tosend);
-    SUPERLU_FREE(gsmv_comm->SendCounts);
-    if ( dt = gsmv_comm->val_tosend ) SUPERLU_FREE(dt);
-    if ( dt = gsmv_comm->val_torecv ) SUPERLU_FREE(dt);
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsrfs_ABXglobal.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsrfs_ABXglobal.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsrfs_ABXglobal.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsrfs_ABXglobal.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,440 +0,0 @@
-
-
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-
-#include <math.h>
-#include "superlu_ddefs.h"
-
-/*-- Function prototypes --*/
-static void gather_1rhs_diag_to_all(int_t, double [], Glu_persist_t *,
-                                    LocalLU_t *, gridinfo_t *, int_t, int_t [],
-				    int_t [], double [], double []);
-static void redist_all_to_diag(int_t, double [], Glu_persist_t *,
-                               LocalLU_t *, gridinfo_t *, int_t [], double []);
-
-void
-pdgsrfs_ABXglobal(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct,
-		  gridinfo_t *grid, double *B, int_t ldb, double *X, int_t ldx,
-		  int nrhs, double *berr, SuperLUStat_t *stat, int *info)
-{
-/* 
- * Purpose
- * =======
- *
- * pdgsrfs_ABXglobal improves the computed solution to a system of linear   
- * equations and provides error bounds and backward error estimates
- * for the solution. 
- *
- * Arguments
- * =========
- *
- * n      (input) int (global)
- *        The order of the system of linear equations.
- *
- * A      (input) SuperMatrix*
- *	  The original matrix A, or the scaled A if equilibration was done.
- *        A is also permuted into the form Pc*Pr*A*Pc', where Pr and Pc
- *        are permutation matrices. The type of A can be:
- *        Stype = SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE.
- *
- *        NOTE: Currently, A must reside in all processes when calling
- *              this routine.
- *
- * anorm  (input) double
- *        The norm of the original matrix A, or the scaled A if
- *        equilibration was done.
- *
- * LUstruct (input) LUstruct_t*
- *        The distributed data structures storing L and U factors.
- *        The L and U factors are obtained from pdgstrf for
- *        the possibly scaled and permuted matrix A.
- *        See superlu_ddefs.h for the definition of 'LUstruct_t'.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh. It contains the MPI communicator, the number
- *        of process rows (NPROW), the number of process columns (NPCOL),
- *        and my process rank. It is an input argument to all the
- *        parallel routines.
- *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
- *        See superlu_ddefs.h for the definition of 'gridinfo_t'.
- *
- * B      (input) double* (global)
- *        The N-by-NRHS right-hand side matrix of the possibly equilibrated
- *        and row permuted system.
- *       
- *        NOTE: Currently, B must reside on all processes when calling
- *              this routine.
- *
- * ldb    (input) int (global)
- *        Leading dimension of matrix B.
- *
- * X      (input/output) double* (global)
- *        On entry, the solution matrix X, as computed by PDGSTRS.
- *        On exit, the improved solution matrix X.
- *        If DiagScale = COL or BOTH, X should be premultiplied by diag(C)
- *        in order to obtain the solution to the original system.
- *
- *        NOTE: Currently, X must reside on all processes when calling
- *              this routine.
- *
- * ldx    (input) int (global)
- *        Leading dimension of matrix X.
- *
- * nrhs   (input) int
- *        Number of right-hand sides.
- *
- * berr   (output) double*, dimension (nrhs)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
- *         any element of A or B that makes X(j) an exact solution).
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics about the refinement steps.
- *        See util.h for the definition of SuperLUStat_t.
- *
- * info   (output) int*
- *        = 0: successful exit
- *        < 0: if info = -i, the i-th argument had an illegal value
- *        
- * Internal Parameters   
- * ===================   
- *
- * ITMAX is the maximum number of steps of iterative refinement.   
- *
- */
-
-#define ITMAX 20
-    
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-    /* 
-     * Data structures used by matrix-vector multiply routine.
-     */
-    int_t  N_update; /* Number of variables updated on this process */
-    int_t  *update;  /* vector elements (global index) updated 
-			on this processor.                     */
-    int_t  *bindx;
-    double *val;
-    int_t *mv_sup_to_proc;  /* Supernode to process mapping in
-			       matrix-vector multiply.  */
-    /*-- end data structures for matrix-vector multiply --*/
-    double *b, *ax, *R, *B_col, *temp, *work, *X_col,
-           *x_trs, *dx_trs;
-    int_t count, ii, j, jj, k, knsupc, lk, lwork,
-          nprow, nsupers, nz, p;
-    int   i, iam, pkk;
-    int_t *ilsum, *xsup;
-    double eps, lstres;
-    double s, safmin, safe1, safe2;
-
-    /* NEW STUFF */
-    int_t num_diag_procs, *diag_procs; /* Record diagonal process numbers. */
-    int_t *diag_len; /* Length of the X vector on diagonal processes. */
-
-    /*-- Function prototypes --*/
-    extern void pdgstrs1(int_t, LUstruct_t *, gridinfo_t *,
-			 double *, int, SuperLUStat_t *, int *);
-    extern double dlamch_(char *);
-    
-    /* Test the input parameters. */
-    *info = 0;
-    if ( n < 0 ) *info = -1;
-    else if ( A->nrow != A->ncol || A->nrow < 0 ||
-	      A->Stype != SLU_NCP || A->Dtype != SLU_D || A->Mtype != SLU_GE )
-	*info = -2;
-    else if ( ldb < SUPERLU_MAX(0, n) ) *info = -10;
-    else if ( ldx < SUPERLU_MAX(0, n) )	*info = -12;
-    else if ( nrhs < 0 ) *info = -13;
-    if (*info != 0) {
-	i = -(*info);
-	xerbla_("pdgsrfs_ABXglobal", &i);
-	return;
-    }
-
-    /* Quick return if possible. */
-    if ( n == 0 || nrhs == 0 ) {
-	return;
-    }
-
-    /* Initialization. */
-    iam = grid->iam;
-    nprow = grid->nprow;
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    ilsum = Llu->ilsum;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pdgsrfs_ABXglobal()");
-#endif
-
-    get_diag_procs(n, Glu_persist, grid, &num_diag_procs,
-		   &diag_procs, &diag_len);
-#if ( PRNTlevel>=1 )
-    if ( !iam ) {
-	printf(".. number of diag processes = %d\n", num_diag_procs);
-	PrintInt10("diag_procs", num_diag_procs, diag_procs);
-	PrintInt10("diag_len", num_diag_procs, diag_len);
-    }
-#endif
-
-    if ( !(mv_sup_to_proc = intCalloc_dist(nsupers)) )
-	ABORT("Calloc fails for mv_sup_to_proc[]");
-
-    pdgsmv_AXglobal_setup(A, Glu_persist, grid, &N_update, &update,
-		          &val, &bindx, mv_sup_to_proc);
-
-    i = CEILING( nsupers, nprow ); /* Number of local block rows */
-    ii = Llu->ldalsum + i * XK_H;
-    k = SUPERLU_MAX(N_update, sp_ienv_dist(3));
-    jj = diag_len[0];
-    for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] );
-    jj = SUPERLU_MAX( jj, N_update );
-    lwork = N_update         /* For ax and R */
-	  + ii               /* For dx_trs */
-	  + ii               /* For x_trs */
-          + k                /* For b */
-	  + jj;              /* for temp */
-    if ( !(work = doubleMalloc_dist(lwork)) )
-	ABORT("Malloc fails for work[]");
-    ax = R = work;
-    dx_trs = work + N_update;
-    x_trs  = dx_trs + ii;
-    b      = x_trs + ii;
-    temp   = b + k;
-
-#if ( DEBUGlevel>=2 )
-    {
-	double *dwork = doubleMalloc_dist(n);
-	for (i = 0; i < n; ++i) {
-	    if ( i & 1 ) dwork[i] = 1.;
-	    else dwork[i] = 2.;
-        }
-	/* Check correctness of matrix-vector multiply. */
-	pdgsmv_AXglobal(N_update, update, val, bindx, dwork, ax);
-	PrintDouble5("Mult A*x", N_update, ax);
-	SUPERLU_FREE(dwork);
-    }
-#endif
-
-
-    /* NZ = maximum number of nonzero elements in each row of A, plus 1 */
-    nz     = A->ncol + 1;
-    eps    = dlamch_("Epsilon");
-    safmin = dlamch_("Safe minimum");
-
-    /* Set SAFE1 essentially to be the underflow threshold times the
-       number of additions in each row. */
-    safe1  = nz * safmin;
-    safe2  = safe1 / eps;
-
-#if ( DEBUGlevel>=1 )
-    if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n",
-		       eps, anorm, safe1, safe2);
-#endif
-
-    /* Do for each right-hand side ... */
-    for (j = 0; j < nrhs; ++j) {
-	count = 0;
-	lstres = 3.;
-
-	/* Copy X into x on the diagonal processes. */
-	B_col = &B[j*ldb];
-	X_col = &X[j*ldx];
-	for (p = 0; p < num_diag_procs; ++p) {
-	    pkk = diag_procs[p];
-	    if ( iam == pkk ) {
-		for (k = p; k < nsupers; k += num_diag_procs) {
-		    knsupc = SuperSize( k );
-		    lk = LBi( k, grid );
-		    ii = ilsum[lk] + (lk+1)*XK_H;
-		    jj = FstBlockC( k );
-		    for (i = 0; i < knsupc; ++i) x_trs[i+ii] = X_col[i+jj];
-		    dx_trs[ii-XK_H] = k;/* Block number prepended in header. */
-		}
-	    }
-	}
-	/* Copy B into b distributed the same way as matrix-vector product. */
-        if ( N_update ) ii = update[0];
-	for (i = 0; i < N_update; ++i) b[i] = B_col[i + ii];
-
-	while (1) { /* Loop until stopping criterion is satisfied. */
-
-	    /* Compute residual R = B - op(A) * X,   
-	       where op(A) = A, A**T, or A**H, depending on TRANS. */
-
-	    /* Matrix-vector multiply. */
-	    pdgsmv_AXglobal(N_update, update, val, bindx, X_col, ax);
-	    
-	    /* Compute residual. */
-	    for (i = 0; i < N_update; ++i) R[i] = b[i] - ax[i];
-
-	    /* Compute abs(op(A))*abs(X) + abs(B). */
-	    pdgsmv_AXglobal_abs(N_update, update, val, bindx, X_col, temp);
-	    for (i = 0; i < N_update; ++i) temp[i] += fabs(b[i]);
-	    
-	    s = 0.0;
-	    for (i = 0; i < N_update; ++i) {
-		if ( temp[i] > safe2 ) {
-		    s = SUPERLU_MAX(s, fabs(R[i]) / temp[i]);
-		} else if ( temp[i] != 0.0 ) {
-                    /* Adding SAFE1 to the numerator guards against
-                       spuriously zero residuals (underflow). */
-		    s = SUPERLU_MAX(s, (safe1 + fabs(R[i])) / temp[i]);
-                }
-                /* If temp[i] is exactly 0.0 (computed by PxGSMV), then
-                   we know the true residual also must be exactly 0.0. */
-	    }
-	    MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm );
-		
-#if ( PRNTlevel>= 1 )
-	    if ( !iam )
-		printf("(%2d) .. Step %2d: berr[j] = %e\n", iam, count, berr[j]);
-#endif
-	    if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) {
-		/* Compute new dx. */
-		redist_all_to_diag(n, R, Glu_persist, Llu, grid,
-				   mv_sup_to_proc, dx_trs);
-		pdgstrs1(n, LUstruct, grid, dx_trs, 1, stat, info);
-
-		/* Update solution. */
-		for (p = 0; p < num_diag_procs; ++p) 
-		    if ( iam == diag_procs[p] )
-			for (k = p; k < nsupers; k += num_diag_procs) {
-			    lk = LBi( k, grid );
-			    ii = ilsum[lk] + (lk+1)*XK_H;
-			    knsupc = SuperSize( k );
-			    for (i = 0; i < knsupc; ++i)
-				x_trs[i + ii] += dx_trs[i + ii];
-			}
-		lstres = berr[j];
-		++count;
-		/* Transfer x_trs (on diagonal processes) into X
-		   (on all processes). */
-		gather_1rhs_diag_to_all(n, x_trs, Glu_persist, Llu, grid, 
-					num_diag_procs, diag_procs, diag_len,
-					X_col, temp);
-	    } else {
-		break;
-	    }
-	} /* end while */
-
-	stat->RefineSteps = count;
-
-    } /* for j ... */
-
-
-    /* Deallocate storage used by matrix-vector multiplication. */
-    SUPERLU_FREE(diag_procs);
-    SUPERLU_FREE(diag_len);
-    if ( N_update ) {
-	SUPERLU_FREE(update);
-	SUPERLU_FREE(bindx);
-	SUPERLU_FREE(val);
-    }
-    SUPERLU_FREE(mv_sup_to_proc);
-    SUPERLU_FREE(work);
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pdgsrfs_ABXglobal()");
-#endif
-
-} /* PDGSRFS_ABXGLOBAL */
-
-
-/*
- * r[] is the residual vector distributed the same way as
- * matrix-vector product.
- */
-static void
-redist_all_to_diag(int_t n, double r[], Glu_persist_t *Glu_persist,
-		   LocalLU_t *Llu, gridinfo_t *grid, int_t mv_sup_to_proc[],
-		   double work[])
-{
-    int_t i, ii, k, lk, lr, nsupers;
-    int_t *ilsum, *xsup;
-    int iam, knsupc, psrc, pkk;
-    MPI_Status status;
-    
-    iam = grid->iam;
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    ilsum = Llu->ilsum;
-    lr = 0;
-
-    for (k = 0; k < nsupers; ++k) {
-	pkk = PNUM( PROW( k, grid ), PCOL( k, grid ), grid );
-	psrc = mv_sup_to_proc[k];
-	knsupc = SuperSize( k );
-	lk = LBi( k, grid );
-	ii = ilsum[lk] + (lk+1)*XK_H;
-	if ( iam == psrc ) {
-	    if ( iam != pkk ) { /* Send X component. */
-		MPI_Send( &r[lr], knsupc, MPI_DOUBLE, pkk, Xk,
-			 grid->comm );
-	    } else { /* Local copy. */
-		for (i = 0; i < knsupc; ++i)
-		    work[i + ii] = r[i + lr];
-	    }
-	    lr += knsupc;
-	} else {
-	    if ( iam == pkk ) { /* Recv X component. */
-		MPI_Recv( &work[ii], knsupc, MPI_DOUBLE, psrc, Xk,
-			 grid->comm, &status );
-	    }
-	}
-    }
-} /* REDIST_ALL_TO_DIAG */
-
-
-/*
- * Gather the components of x vector on the diagonal processes
- * onto all processes, and combine them into the global vector y.
- */
-static void
-gather_1rhs_diag_to_all(int_t n, double x[],
-			Glu_persist_t *Glu_persist, LocalLU_t *Llu,
-			gridinfo_t *grid, int_t num_diag_procs,
-			int_t diag_procs[], int_t diag_len[],
-			double y[], double work[])
-{
-    int_t i, ii, k, lk, lwork, nsupers, p;
-    int_t *ilsum, *xsup;
-    int iam, knsupc, pkk;
-    
-    iam = grid->iam;
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    ilsum = Llu->ilsum;
-
-    for (p = 0; p < num_diag_procs; ++p) {
-	pkk = diag_procs[p];
-	if ( iam == pkk ) {
-	    /* Copy x vector into a buffer. */
-	    lwork = 0;
-	    for (k = p; k < nsupers; k += num_diag_procs) {
-		knsupc = SuperSize( k );
-		lk = LBi( k, grid );
-		ii = ilsum[lk] + (lk+1)*XK_H;
-		for (i = 0; i < knsupc; ++i) work[i+lwork] = x[i+ii];
-		lwork += knsupc;
-	    }
-	    MPI_Bcast( work, lwork, MPI_DOUBLE, pkk, grid->comm );
-	} else {
-	    MPI_Bcast( work, diag_len[p], MPI_DOUBLE, pkk, grid->comm );
-	}
-	/* Scatter work[] into global y vector. */
-	lwork = 0;
-	for (k = p; k < nsupers; k += num_diag_procs) {
-	    knsupc = SuperSize( k );
-	    ii = FstBlockC( k );
-	    for (i = 0; i < knsupc; ++i) y[i+ii] = work[i+lwork];
-	    lwork += knsupc;
-	}
-    }
-} /* GATHER_1RHS_DIAG_TO_ALL */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsrfs_ABXglobal.c.old hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsrfs_ABXglobal.c.old
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsrfs_ABXglobal.c.old	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsrfs_ABXglobal.c.old	1970-01-01 00:00:00.000000000 +0000
@@ -1,432 +0,0 @@
-
-
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-
-#include <math.h>
-#include "superlu_ddefs.h"
-
-/*-- Function prototypes --*/
-static void gather_1rhs_diag_to_all(int_t, double [], Glu_persist_t *,
-                                    LocalLU_t *, gridinfo_t *, int_t, int_t [],
-				    int_t [], double [], double []);
-static void redist_all_to_diag(int_t, double [], Glu_persist_t *,
-                               LocalLU_t *, gridinfo_t *, int_t [], double []);
-
-void
-pdgsrfs_ABXglobal(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct,
-		  gridinfo_t *grid, double *B, int_t ldb, double *X, int_t ldx,
-		  int nrhs, double *berr, SuperLUStat_t *stat, int *info)
-{
-/* 
- * Purpose
- * =======
- *
- * pdgsrfs_ABXglobal improves the computed solution to a system of linear   
- * equations and provides error bounds and backward error estimates
- * for the solution. 
- *
- * Arguments
- * =========
- *
- * n      (input) int (global)
- *        The order of the system of linear equations.
- *
- * A      (input) SuperMatrix*
- *	  The original matrix A, or the scaled A if equilibration was done.
- *        A is also permuted into the form Pc*Pr*A*Pc', where Pr and Pc
- *        are permutation matrices. The type of A can be:
- *        Stype = SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE.
- *
- *        NOTE: Currently, A must reside in all processes when calling
- *              this routine.
- *
- * anorm  (input) double
- *        The norm of the original matrix A, or the scaled A if
- *        equilibration was done.
- *
- * LUstruct (input) LUstruct_t*
- *        The distributed data structures storing L and U factors.
- *        The L and U factors are obtained from pdgstrf for
- *        the possibly scaled and permuted matrix A.
- *        See superlu_ddefs.h for the definition of 'LUstruct_t'.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh. It contains the MPI communicator, the number
- *        of process rows (NPROW), the number of process columns (NPCOL),
- *        and my process rank. It is an input argument to all the
- *        parallel routines.
- *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
- *        See superlu_ddefs.h for the definition of 'gridinfo_t'.
- *
- * B      (input) double* (global)
- *        The N-by-NRHS right-hand side matrix of the possibly equilibrated
- *        and row permuted system.
- *       
- *        NOTE: Currently, B must reside on all processes when calling
- *              this routine.
- *
- * ldb    (input) int (global)
- *        Leading dimension of matrix B.
- *
- * X      (input/output) double* (global)
- *        On entry, the solution matrix X, as computed by PDGSTRS.
- *        On exit, the improved solution matrix X.
- *        If DiagScale = COL or BOTH, X should be premultiplied by diag(C)
- *        in order to obtain the solution to the original system.
- *
- *        NOTE: Currently, X must reside on all processes when calling
- *              this routine.
- *
- * ldx    (input) int (global)
- *        Leading dimension of matrix X.
- *
- * nrhs   (input) int
- *        Number of right-hand sides.
- *
- * berr   (output) double*, dimension (nrhs)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
- *         any element of A or B that makes X(j) an exact solution).
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics about the refinement steps.
- *        See util.h for the definition of SuperLUStat_t.
- *
- * info   (output) int*
- *        = 0: successful exit
- *        < 0: if info = -i, the i-th argument had an illegal value
- *        
- * Internal Parameters   
- * ===================   
- *
- * ITMAX is the maximum number of steps of iterative refinement.   
- *
- */
-
-#define ITMAX 20
-    
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-    /* 
-     * Data structures used by matrix-vector multiply routine.
-     */
-    int_t  N_update; /* Number of variables updated on this process */
-    int_t  *update;  /* vector elements (global index) updated 
-			on this processor.                     */
-    int_t  *bindx;
-    double *val;
-    int_t *mv_sup_to_proc;  /* Supernode to process mapping in
-			       matrix-vector multiply.  */
-    /*-- end data structures for matrix-vector multiply --*/
-    double *b, *ax, *R, *B_col, *temp, *work, *X_col,
-           *x_trs, *dx_trs;
-    int_t count, ii, j, jj, k, knsupc, lk, lwork,
-          nprow, nsupers, nz, p;
-    int   i, iam, pkk;
-    int_t *ilsum, *xsup;
-    double eps, lstres;
-    double s, safmin, safe1, safe2;
-
-    /* NEW STUFF */
-    int_t num_diag_procs, *diag_procs; /* Record diagonal process numbers. */
-    int_t *diag_len; /* Length of the X vector on diagonal processes. */
-
-    /*-- Function prototypes --*/
-    extern void pdgstrs1(int_t, LUstruct_t *, gridinfo_t *,
-			 double *, int, SuperLUStat_t *, int *);
-    extern double dlamch_(char *);
-    
-    /* Test the input parameters. */
-    *info = 0;
-    if ( n < 0 ) *info = -1;
-    else if ( A->nrow != A->ncol || A->nrow < 0 ||
-	      A->Stype != SLU_NCP || A->Dtype != SLU_D || A->Mtype != SLU_GE )
-	*info = -2;
-    else if ( ldb < SUPERLU_MAX(0, n) ) *info = -10;
-    else if ( ldx < SUPERLU_MAX(0, n) )	*info = -12;
-    else if ( nrhs < 0 ) *info = -13;
-    if (*info != 0) {
-	i = -(*info);
-	xerbla_("pdgsrfs_ABXglobal", &i);
-	return;
-    }
-
-    /* Quick return if possible. */
-    if ( n == 0 || nrhs == 0 ) {
-	return;
-    }
-
-    /* Initialization. */
-    iam = grid->iam;
-    nprow = grid->nprow;
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    ilsum = Llu->ilsum;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pdgsrfs_ABXglobal()");
-#endif
-
-    get_diag_procs(n, Glu_persist, grid, &num_diag_procs,
-		   &diag_procs, &diag_len);
-#if ( PRNTlevel>=1 )
-    if ( !iam ) {
-	printf(".. number of diag processes = %d\n", num_diag_procs);
-	PrintInt10("diag_procs", num_diag_procs, diag_procs);
-	PrintInt10("diag_len", num_diag_procs, diag_len);
-    }
-#endif
-
-    if ( !(mv_sup_to_proc = intCalloc_dist(nsupers)) )
-	ABORT("Calloc fails for mv_sup_to_proc[]");
-
-    pdgsmv_AXglobal_setup(A, Glu_persist, grid, &N_update, &update,
-		          &val, &bindx, mv_sup_to_proc);
-
-    i = CEILING( nsupers, nprow ); /* Number of local block rows */
-    ii = Llu->ldalsum + i * XK_H;
-    k = SUPERLU_MAX(N_update, sp_ienv_dist(3));
-    jj = diag_len[0];
-    for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] );
-    jj = SUPERLU_MAX( jj, N_update );
-    lwork = N_update         /* For ax and R */
-	  + ii               /* For dx_trs */
-	  + ii               /* For x_trs */
-          + k                /* For b */
-	  + jj;              /* for temp */
-    if ( !(work = doubleMalloc_dist(lwork)) )
-	ABORT("Malloc fails for work[]");
-    ax = R = work;
-    dx_trs = work + N_update;
-    x_trs  = dx_trs + ii;
-    b      = x_trs + ii;
-    temp   = b + k;
-
-#if ( DEBUGlevel>=2 )
-    {
-	double *dwork = doubleMalloc_dist(n);
-	for (i = 0; i < n; ++i) {
-	    if ( i & 1 ) dwork[i] = 1.;
-	    else dwork[i] = 2.;
-        }
-	/* Check correctness of matrix-vector multiply. */
-	pdgsmv_AXglobal(N_update, update, val, bindx, dwork, ax);
-	PrintDouble5("Mult A*x", N_update, ax);
-	SUPERLU_FREE(dwork);
-    }
-#endif
-
-
-    /* NZ = maximum number of nonzero elements in each row of A, plus 1 */
-    nz     = A->ncol + 1;
-    eps    = dlamch_("Epsilon");
-    safmin = dlamch_("Safe minimum");
-    safe1  = nz * safmin;
-    safe2  = safe1 / eps;
-
-#if ( DEBUGlevel>=1 )
-    if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n",
-		       eps, anorm, safe1, safe2);
-#endif
-
-    /* Do for each right-hand side ... */
-    for (j = 0; j < nrhs; ++j) {
-	count = 0;
-	lstres = 3.;
-
-	/* Copy X into x on the diagonal processes. */
-	B_col = &B[j*ldb];
-	X_col = &X[j*ldx];
-	for (p = 0; p < num_diag_procs; ++p) {
-	    pkk = diag_procs[p];
-	    if ( iam == pkk ) {
-		for (k = p; k < nsupers; k += num_diag_procs) {
-		    knsupc = SuperSize( k );
-		    lk = LBi( k, grid );
-		    ii = ilsum[lk] + (lk+1)*XK_H;
-		    jj = FstBlockC( k );
-		    for (i = 0; i < knsupc; ++i) x_trs[i+ii] = X_col[i+jj];
-		    dx_trs[ii-XK_H] = k;/* Block number prepended in header. */
-		}
-	    }
-	}
-	/* Copy B into b distributed the same way as matrix-vector product. */
-        if ( N_update ) ii = update[0];
-	for (i = 0; i < N_update; ++i) b[i] = B_col[i + ii];
-
-	while (1) { /* Loop until stopping criterion is satisfied. */
-
-	    /* Compute residual R = B - op(A) * X,   
-	       where op(A) = A, A**T, or A**H, depending on TRANS. */
-
-	    /* Matrix-vector multiply. */
-	    pdgsmv_AXglobal(N_update, update, val, bindx, X_col, ax);
-	    
-	    /* Compute residual. */
-	    for (i = 0; i < N_update; ++i) R[i] = b[i] - ax[i];
-
-	    /* Compute abs(op(A))*abs(X) + abs(B). */
-	    pdgsmv_AXglobal_abs(N_update, update, val, bindx, X_col, temp);
-	    for (i = 0; i < N_update; ++i) temp[i] += fabs(b[i]);
-	    
-	    s = 0.0;
-	    for (i = 0; i < N_update; ++i) {
-		if ( temp[i] > safe2 )
-		    s = SUPERLU_MAX(s, fabs(R[i]) / temp[i]);
-		else
-		    s = SUPERLU_MAX(s, (fabs(R[i])+safe1)/(temp[i]+safe1));
-	    }
-	    MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm );
-		
-#if ( PRNTlevel>= 1 )
-	    if ( !iam )
-		printf("(%2d) .. Step %2d: berr[j] = %e\n", iam, count, berr[j]);
-#endif
-	    if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) {
-		/* Compute new dx. */
-		redist_all_to_diag(n, R, Glu_persist, Llu, grid,
-				   mv_sup_to_proc, dx_trs);
-		pdgstrs1(n, LUstruct, grid, dx_trs, 1, stat, info);
-
-		/* Update solution. */
-		for (p = 0; p < num_diag_procs; ++p) 
-		    if ( iam == diag_procs[p] )
-			for (k = p; k < nsupers; k += num_diag_procs) {
-			    lk = LBi( k, grid );
-			    ii = ilsum[lk] + (lk+1)*XK_H;
-			    knsupc = SuperSize( k );
-			    for (i = 0; i < knsupc; ++i)
-				x_trs[i + ii] += dx_trs[i + ii];
-			}
-		lstres = berr[j];
-		++count;
-		/* Transfer x_trs (on diagonal processes) into X
-		   (on all processes). */
-		gather_1rhs_diag_to_all(n, x_trs, Glu_persist, Llu, grid, 
-					num_diag_procs, diag_procs, diag_len,
-					X_col, temp);
-	    } else {
-		break;
-	    }
-	} /* end while */
-
-	stat->RefineSteps = count;
-
-    } /* for j ... */
-
-
-    /* Deallocate storage used by matrix-vector multiplication. */
-    SUPERLU_FREE(diag_procs);
-    SUPERLU_FREE(diag_len);
-    if ( N_update ) {
-	SUPERLU_FREE(update);
-	SUPERLU_FREE(bindx);
-	SUPERLU_FREE(val);
-    }
-    SUPERLU_FREE(mv_sup_to_proc);
-    SUPERLU_FREE(work);
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pdgsrfs_ABXglobal()");
-#endif
-
-} /* PDGSRFS_ABXGLOBAL */
-
-
-/*
- * r[] is the residual vector distributed the same way as
- * matrix-vector product.
- */
-static void
-redist_all_to_diag(int_t n, double r[], Glu_persist_t *Glu_persist,
-		   LocalLU_t *Llu, gridinfo_t *grid, int_t mv_sup_to_proc[],
-		   double work[])
-{
-    int_t i, ii, k, lk, lr, nsupers;
-    int_t *ilsum, *xsup;
-    int iam, knsupc, psrc, pkk;
-    MPI_Status status;
-    
-    iam = grid->iam;
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    ilsum = Llu->ilsum;
-    lr = 0;
-
-    for (k = 0; k < nsupers; ++k) {
-	pkk = PNUM( PROW( k, grid ), PCOL( k, grid ), grid );
-	psrc = mv_sup_to_proc[k];
-	knsupc = SuperSize( k );
-	lk = LBi( k, grid );
-	ii = ilsum[lk] + (lk+1)*XK_H;
-	if ( iam == psrc ) {
-	    if ( iam != pkk ) { /* Send X component. */
-		MPI_Send( &r[lr], knsupc, MPI_DOUBLE, pkk, Xk,
-			 grid->comm );
-	    } else { /* Local copy. */
-		for (i = 0; i < knsupc; ++i)
-		    work[i + ii] = r[i + lr];
-	    }
-	    lr += knsupc;
-	} else {
-	    if ( iam == pkk ) { /* Recv X component. */
-		MPI_Recv( &work[ii], knsupc, MPI_DOUBLE, psrc, Xk,
-			 grid->comm, &status );
-	    }
-	}
-    }
-} /* REDIST_ALL_TO_DIAG */
-
-
-/*
- * Gather the components of x vector on the diagonal processes
- * onto all processes, and combine them into the global vector y.
- */
-static void
-gather_1rhs_diag_to_all(int_t n, double x[],
-			Glu_persist_t *Glu_persist, LocalLU_t *Llu,
-			gridinfo_t *grid, int_t num_diag_procs,
-			int_t diag_procs[], int_t diag_len[],
-			double y[], double work[])
-{
-    int_t i, ii, k, lk, lwork, nsupers, p;
-    int_t *ilsum, *xsup;
-    int iam, knsupc, pkk;
-    
-    iam = grid->iam;
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    ilsum = Llu->ilsum;
-
-    for (p = 0; p < num_diag_procs; ++p) {
-	pkk = diag_procs[p];
-	if ( iam == pkk ) {
-	    /* Copy x vector into a buffer. */
-	    lwork = 0;
-	    for (k = p; k < nsupers; k += num_diag_procs) {
-		knsupc = SuperSize( k );
-		lk = LBi( k, grid );
-		ii = ilsum[lk] + (lk+1)*XK_H;
-		for (i = 0; i < knsupc; ++i) work[i+lwork] = x[i+ii];
-		lwork += knsupc;
-	    }
-	    MPI_Bcast( work, lwork, MPI_DOUBLE, pkk, grid->comm );
-	} else {
-	    MPI_Bcast( work, diag_len[p], MPI_DOUBLE, pkk, grid->comm );
-	}
-	/* Scatter work[] into global y vector. */
-	lwork = 0;
-	for (k = p; k < nsupers; k += num_diag_procs) {
-	    knsupc = SuperSize( k );
-	    ii = FstBlockC( k );
-	    for (i = 0; i < knsupc; ++i) y[i+ii] = work[i+lwork];
-	    lwork += knsupc;
-	}
-    }
-} /* GATHER_1RHS_DIAG_TO_ALL */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsrfs.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsrfs.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsrfs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsrfs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,248 +0,0 @@
-
-
-/*
- * -- Distributed SuperLU routine (version 2.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * March 15, 2003
- *
- */
-
-#include <math.h>
-#include "superlu_ddefs.h"
-
-void
-pdgsrfs(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct,
-	ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid,
-	double *B, int_t ldb, double *X, int_t ldx, int nrhs, 
-	SOLVEstruct_t *SOLVEstruct,
-	double *berr, SuperLUStat_t *stat, int *info)
-{
-/* 
- * Purpose
- * =======
- *
- * PDGSRFS improves the computed solution to a system of linear   
- * equations and provides error bounds and backward error estimates
- * for the solution. 
- *
- * Arguments
- * =========
- *
- * n      (input) int (global)
- *        The order of the system of linear equations.
- *
- * A      (input) SuperMatrix*
- *	  The original matrix A, or the scaled A if equilibration was done.
- *        A is also permuted into diag(R)*A*diag(C)*Pc'. The type of A can be:
- *        Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
- *
- * anorm  (input) double
- *        The norm of the original matrix A, or the scaled A if
- *        equilibration was done.
- *
- * LUstruct (input) LUstruct_t*
- *        The distributed data structures storing L and U factors.
- *        The L and U factors are obtained from pdgstrf for
- *        the possibly scaled and permuted matrix A.
- *        See superlu_ddefs.h for the definition of 'LUstruct_t'.
- *
- * ScalePermstruct (input) ScalePermstruct_t* (global)
- *         The data structure to store the scaling and permutation vectors
- *         describing the transformations performed to the matrix A.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh. It contains the MPI communicator, the number
- *        of process rows (NPROW), the number of process columns (NPCOL),
- *        and my process rank. It is an input argument to all the
- *        parallel routines.
- *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
- *        See superlu_defs.h for the definition of 'gridinfo_t'.
- *
- * B      (input) double* (local)
- *        The m_loc-by-NRHS right-hand side matrix of the possibly
- *        equilibrated system. That is, B may be overwritten by diag(R)*B.
- *       
- * ldb    (input) int (local)
- *        Leading dimension of matrix B.
- *
- * X      (input/output) double* (local)
- *        On entry, the solution matrix Y, as computed by PDGSTRS, of the
- *            transformed system A1*Y = Pc*Pr*B. where
- *            A1 = Pc*Pr*diag(R)*A*diag(C)*Pc' and Y = Pc*diag(C)^(-1)*X.
- *        On exit, the improved solution matrix Y.
- *
- *        In order to obtain the solution X to the original system,
- *        Y should be permutated by Pc^T, and premultiplied by diag(C)
- *        if DiagScale = COL or BOTH.
- *        This must be done after this routine is called.
- *
- * ldx    (input) int (local)
- *        Leading dimension of matrix X.
- *
- * nrhs   (input) int
- *        Number of right-hand sides.
- *
- * SOLVEstruct (output) SOLVEstruct_t* (global)
- *        Contains the information for the communication during the
- *        solution phase.
- *
- * berr   (output) double*, dimension (nrhs)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
- *         any element of A or B that makes X(j) an exact solution).
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics about the refinement steps.
- *        See util.h for the definition of SuperLUStat_t.
- *
- * info   (output) int*
- *        = 0: successful exit
- *        < 0: if info = -i, the i-th argument had an illegal value
- *        
- * Internal Parameters   
- * ===================   
- *
- * ITMAX is the maximum number of steps of iterative refinement.   
- *
- */
-
-#define ITMAX 20
-    
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-    double *ax, *R, *dx, *temp, *work, *B_col, *X_col;
-    int_t count, i, j, lwork, nz;
-    int   iam;
-    double eps, lstres;
-    double s, safmin, safe1, safe2;
-    extern double hypre_F90_NAME_LAPACK(dlamch,DLAMCH)(char *);
-
-    /* Data structures used by matrix-vector multiply routine. */
-    pdgsmv_comm_t *gsmv_comm = SOLVEstruct->gsmv_comm;
-    NRformat_loc *Astore;
-    int_t        m_loc, fst_row;
-
-
-    /* Initialization. */
-    Astore = (NRformat_loc *) A->Store;
-    m_loc = Astore->m_loc;
-    fst_row = Astore->fst_row;
-    iam = grid->iam;
-
-    /* Test the input parameters. */
-    *info = 0;
-    if ( n < 0 ) *info = -1;
-    else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
-	      || A->Dtype != SLU_D || A->Mtype != SLU_GE )
-	*info = -2;
-    else if ( ldb < SUPERLU_MAX(0, m_loc) ) *info = -10;
-    else if ( ldx < SUPERLU_MAX(0, m_loc) ) *info = -12;
-    else if ( nrhs < 0 ) *info = -13;
-    if (*info != 0) {
-	i = -(*info);
-	pxerbla("PDGSRFS", grid, i);
-	return;
-    }
-
-    /* Quick return if possible. */
-    if ( n == 0 || nrhs == 0 ) {
-	return;
-    }
-
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pdgsrfs()");
-#endif
-
-    lwork = 2 * m_loc;  /* For ax/R/dx and temp */
-    if ( !(work = doubleMalloc_dist(lwork)) )
-	ABORT("Malloc fails for work[]");
-    ax = R = dx = work;
-    temp = ax + m_loc;
-
-    /* NZ = maximum number of nonzero elements in each row of A, plus 1 */
-    nz     = A->ncol + 1;
-    /* eps    = dlamch_("Epsilon");
-       safmin = dlamch_("Safe minimum"); */
-    eps    = hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("Epsilon");
-    safmin = hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("Safe minimum");
-
-    /* Set SAFE1 essentially to be the underflow threshold times the
-       number of additions in each row. */
-    safe1  = nz * safmin;
-    safe2  = safe1 / eps;
-
-#if ( DEBUGlevel>=1 )
-    if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n",
-		       eps, anorm, safe1, safe2);
-#endif
-
-    /* Do for each right-hand side ... */
-    for (j = 0; j < nrhs; ++j) {
-	count = 0;
-	lstres = 3.;
-	B_col = &B[j*ldb];
-	X_col = &X[j*ldx];
-
-	while (1) { /* Loop until stopping criterion is satisfied. */
-
-	    /* Compute residual R = B - op(A) * X,   
-	       where op(A) = A, A**T, or A**H, depending on TRANS. */
-
-	    /* Matrix-vector multiply. */
-	    pdgsmv(0, A, grid, gsmv_comm, X_col, ax);
-	    
-	    /* Compute residual, stored in R[]. */
-	    for (i = 0; i < m_loc; ++i) R[i] = B_col[i] - ax[i];
-
-	    /* Compute abs(op(A))*abs(X) + abs(B), stored in temp[]. */
-	    pdgsmv(1, A, grid, gsmv_comm, X_col, temp);
-	    for (i = 0; i < m_loc; ++i) temp[i] += fabs(B_col[i]);
-	    
-	    s = 0.0;
-	    for (i = 0; i < m_loc; ++i) {
-		if ( temp[i] > safe2 ) {
-		    s = SUPERLU_MAX(s, fabs(R[i]) / temp[i]);
-		} else if ( temp[i] != 0.0 ) {
-                    /* Adding SAFE1 to the numerator guards against
-                       spuriously zero residuals (underflow). */
-                    s = SUPERLU_MAX(s, (safe1 + fabs(R[i])) /temp[i]);
-                }
-                /* If temp[i] is exactly 0.0 (computed by PxGSMV), then
-                   we know the true residual also must be exactly 0.0. */
-	    }
-	    MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm );
-		
-#if ( PRNTlevel>= 1 )
-	    if ( !iam )
-		printf("(%2d) .. Step %2d: berr[j] = %e\n", iam, count, berr[j]);
-#endif
-	    if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) {
-		/* Compute new dx. */
-		pdgstrs(n, LUstruct, ScalePermstruct, grid,
-			dx, m_loc, fst_row, m_loc, 1, 
-			SOLVEstruct, stat, info);
-
-		/* Update solution. */
-		for (i = 0; i < m_loc; ++i) X_col[i] += dx[i];
-
-		lstres = berr[j];
-		++count;
-	    } else {
-		break;
-	    }
-	} /* end while */
-
-	stat->RefineSteps = count;
-
-    } /* for j ... */
-
-    /* Deallocate storage. */
-    SUPERLU_FREE(work);
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pdgsrfs()");
-#endif
-
-} /* PDGSRFS */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsrfs.c.old hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsrfs.c.old
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgsrfs.c.old	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgsrfs.c.old	1970-01-01 00:00:00.000000000 +0000
@@ -1,237 +0,0 @@
-
-
-/*
- * -- Distributed SuperLU routine (version 2.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * March 15, 2003
- *
- */
-
-#include <math.h>
-#include "superlu_ddefs.h"
-
-void
-pdgsrfs(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct,
-	ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid,
-	double *B, int_t ldb, double *X, int_t ldx, int nrhs, 
-	SOLVEstruct_t *SOLVEstruct,
-	double *berr, SuperLUStat_t *stat, int *info)
-{
-/* 
- * Purpose
- * =======
- *
- * PDGSRFS improves the computed solution to a system of linear   
- * equations and provides error bounds and backward error estimates
- * for the solution. 
- *
- * Arguments
- * =========
- *
- * n      (input) int (global)
- *        The order of the system of linear equations.
- *
- * A      (input) SuperMatrix*
- *	  The original matrix A, or the scaled A if equilibration was done.
- *        A is also permuted into diag(R)*A*diag(C)*Pc'. The type of A can be:
- *        Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
- *
- * anorm  (input) double
- *        The norm of the original matrix A, or the scaled A if
- *        equilibration was done.
- *
- * LUstruct (input) LUstruct_t*
- *        The distributed data structures storing L and U factors.
- *        The L and U factors are obtained from pdgstrf for
- *        the possibly scaled and permuted matrix A.
- *        See superlu_ddefs.h for the definition of 'LUstruct_t'.
- *
- * ScalePermstruct (input) ScalePermstruct_t* (global)
- *         The data structure to store the scaling and permutation vectors
- *         describing the transformations performed to the matrix A.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh. It contains the MPI communicator, the number
- *        of process rows (NPROW), the number of process columns (NPCOL),
- *        and my process rank. It is an input argument to all the
- *        parallel routines.
- *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
- *        See superlu_defs.h for the definition of 'gridinfo_t'.
- *
- * B      (input) double* (local)
- *        The m_loc-by-NRHS right-hand side matrix of the possibly
- *        equilibrated system. That is, B may be overwritten by diag(R)*B.
- *       
- * ldb    (input) int (local)
- *        Leading dimension of matrix B.
- *
- * X      (input/output) double* (local)
- *        On entry, the solution matrix Y, as computed by PDGSTRS, of the
- *            transformed system A1*Y = Pc*Pr*B. where
- *            A1 = Pc*Pr*diag(R)*A*diag(C)*Pc' and Y = Pc*diag(C)^(-1)*X.
- *        On exit, the improved solution matrix Y.
- *
- *        In order to obtain the solution X to the original system,
- *        Y should be permutated by Pc^T, and premultiplied by diag(C)
- *        if DiagScale = COL or BOTH.
- *        This must be done after this routine is called.
- *
- * ldx    (input) int (local)
- *        Leading dimension of matrix X.
- *
- * nrhs   (input) int
- *        Number of right-hand sides.
- *
- * SOLVEstruct (output) SOLVEstruct_t* (global)
- *        Contains the information for the communication during the
- *        solution phase.
- *
- * berr   (output) double*, dimension (nrhs)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
- *         any element of A or B that makes X(j) an exact solution).
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics about the refinement steps.
- *        See util.h for the definition of SuperLUStat_t.
- *
- * info   (output) int*
- *        = 0: successful exit
- *        < 0: if info = -i, the i-th argument had an illegal value
- *        
- * Internal Parameters   
- * ===================   
- *
- * ITMAX is the maximum number of steps of iterative refinement.   
- *
- */
-
-#define ITMAX 20
-    
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-    double *ax, *R, *dx, *temp, *work, *B_col, *X_col;
-    int_t count, i, j, lwork, nz;
-    int   iam;
-    double eps, lstres;
-    double s, safmin, safe1, safe2;
-
-    /* Data structures used by matrix-vector multiply routine. */
-    pdgsmv_comm_t *gsmv_comm = SOLVEstruct->gsmv_comm;
-    NRformat_loc *Astore;
-    int_t        m_loc, fst_row;
-
-
-    /* Initialization. */
-    Astore = (NRformat_loc *) A->Store;
-    m_loc = Astore->m_loc;
-    fst_row = Astore->fst_row;
-    iam = grid->iam;
-
-    /* Test the input parameters. */
-    *info = 0;
-    if ( n < 0 ) *info = -1;
-    else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
-	      || A->Dtype != SLU_D || A->Mtype != SLU_GE )
-	*info = -2;
-    else if ( ldb < SUPERLU_MAX(0, m_loc) ) *info = -10;
-    else if ( ldx < SUPERLU_MAX(0, m_loc) ) *info = -12;
-    else if ( nrhs < 0 ) *info = -13;
-    if (*info != 0) {
-	i = -(*info);
-	pxerbla("PDGSRFS", grid, i);
-	return;
-    }
-
-    /* Quick return if possible. */
-    if ( n == 0 || nrhs == 0 ) {
-	return;
-    }
-
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pdgsrfs()");
-#endif
-
-    lwork = 2 * m_loc;  /* For ax/R/dx and temp */
-    if ( !(work = doubleMalloc_dist(lwork)) )
-	ABORT("Malloc fails for work[]");
-    ax = R = dx = work;
-    temp = ax + m_loc;
-
-    /* NZ = maximum number of nonzero elements in each row of A, plus 1 */
-    nz     = A->ncol + 1;
-    eps    = dlamch_("Epsilon");
-    safmin = dlamch_("Safe minimum");
-    safe1  = nz * safmin;
-    safe2  = safe1 / eps;
-
-#if ( DEBUGlevel>=1 )
-    if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n",
-		       eps, anorm, safe1, safe2);
-#endif
-
-    /* Do for each right-hand side ... */
-    for (j = 0; j < nrhs; ++j) {
-	count = 0;
-	lstres = 3.;
-	B_col = &B[j*ldb];
-	X_col = &X[j*ldx];
-
-	while (1) { /* Loop until stopping criterion is satisfied. */
-
-	    /* Compute residual R = B - op(A) * X,   
-	       where op(A) = A, A**T, or A**H, depending on TRANS. */
-
-	    /* Matrix-vector multiply. */
-	    pdgsmv(0, A, grid, gsmv_comm, X_col, ax);
-	    
-	    /* Compute residual, stored in R[]. */
-	    for (i = 0; i < m_loc; ++i) R[i] = B_col[i] - ax[i];
-
-	    /* Compute abs(op(A))*abs(X) + abs(B), stored in temp[]. */
-	    pdgsmv(1, A, grid, gsmv_comm, X_col, temp);
-	    for (i = 0; i < m_loc; ++i) temp[i] += fabs(B_col[i]);
-	    
-	    s = 0.0;
-	    for (i = 0; i < m_loc; ++i) {
-		if ( temp[i] > safe2 )
-		    s = SUPERLU_MAX(s, fabs(R[i]) / temp[i]);
-		else
-		    s = SUPERLU_MAX(s, (fabs(R[i]) + safe1)/(temp[i]+safe1));
-	    }
-	    MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm );
-		
-#if ( PRNTlevel>= 1 )
-	    if ( !iam )
-		printf("(%2d) .. Step %2d: berr[j] = %e\n", iam, count, berr[j]);
-#endif
-	    if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) {
-		/* Compute new dx. */
-		pdgstrs(n, LUstruct, ScalePermstruct, grid,
-			dx, m_loc, fst_row, m_loc, 1, 
-			SOLVEstruct, stat, info);
-
-		/* Update solution. */
-		for (i = 0; i < m_loc; ++i) X_col[i] += dx[i];
-
-		lstres = berr[j];
-		++count;
-	    } else {
-		break;
-	    }
-	} /* end while */
-
-	stat->RefineSteps = count;
-
-    } /* for j ... */
-
-    /* Deallocate storage. */
-    SUPERLU_FREE(work);
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pdgsrfs()");
-#endif
-
-} /* PDGSRFS */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgssvx_ABglobal.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgssvx_ABglobal.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgssvx_ABglobal.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgssvx_ABglobal.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,1065 +0,0 @@
-
-
-#include <math.h>
-#include "superlu_ddefs.h"
-
-
-void
-pdgssvx_ABglobal(superlu_options_t *options, SuperMatrix *A, 
-		 ScalePermstruct_t *ScalePermstruct,
-		 double B[], int ldb, int nrhs, gridinfo_t *grid,
-		 LUstruct_t *LUstruct, double *berr,
-		 SuperLUStat_t *stat, int *info)
-{
-/* 
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- *
- * Purpose
- * =======
- *
- * pdgssvx_ABglobal solves a system of linear equations A*X=B,
- * by using Gaussian elimination with "static pivoting" to
- * compute the LU factorization of A.
- *
- * Static pivoting is a technique that combines the numerical stability
- * of partial pivoting with the scalability of Cholesky (no pivoting),
- * to run accurately and efficiently on large numbers of processors.
- *
- * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
- * description of the parallel algorithms.
- *
- * Here are the options for using this code:
- *
- *   1. Independent of all the other options specified below, the
- *      user must supply
- *
- *      -  B, the matrix of right hand sides, and its dimensions ldb and nrhs
- *      -  grid, a structure describing the 2D processor mesh
- *      -  options->IterRefine, which determines whether or not to
- *            improve the accuracy of the computed solution using 
- *            iterative refinement
- *
- *      On output, B is overwritten with the solution X.
- *
- *   2. Depending on options->Fact, the user has several options
- *      for solving A*X=B. The standard option is for factoring
- *      A "from scratch". (The other options, described below,
- *      are used when A is sufficiently similar to a previously 
- *      solved problem to save time by reusing part or all of 
- *      the previous factorization.)
- *
- *      -  options->Fact = DOFACT: A is factored "from scratch"
- *
- *      In this case the user must also supply
- *
- *      -  A, the input matrix
- *
- *      as well as the following options, which are described in more 
- *      detail below:
- *
- *      -  options->Equil,   to specify how to scale the rows and columns
- *                           of A to "equilibrate" it (to try to reduce its
- *                           condition number and so improve the
- *                           accuracy of the computed solution)
- *
- *      -  options->RowPerm, to specify how to permute the rows of A
- *                           (typically to control numerical stability)
- *
- *      -  options->ColPerm, to specify how to permute the columns of A
- *                           (typically to control fill-in and enhance
- *                           parallelism during factorization)
- *
- *      -  options->ReplaceTinyPivot, to specify how to deal with tiny
- *                           pivots encountered during factorization
- *                           (to control numerical stability)
- *
- *      The outputs returned include
- *         
- *      -  ScalePermstruct,  modified to describe how the input matrix A
- *                           was equilibrated and permuted:
- *         -  ScalePermstruct->DiagScale, indicates whether the rows and/or
- *                                        columns of A were scaled
- *         -  ScalePermstruct->R, array of row scale factors
- *         -  ScalePermstruct->C, array of column scale factors
- *         -  ScalePermstruct->perm_r, row permutation vector
- *         -  ScalePermstruct->perm_c, column permutation vector
- *
- *            (part of ScalePermstruct may also need to be supplied on input,
- *             depending on options->RowPerm and options->ColPerm as described 
- *             later).
- *
- *      -  A, the input matrix A overwritten by the scaled and permuted matrix
- *                Pc*Pr*diag(R)*A*diag(C)
- *             where 
- *                Pr and Pc are row and columns permutation matrices determined
- *                  by ScalePermstruct->perm_r and ScalePermstruct->perm_c, 
- *                  respectively, and 
- *                diag(R) and diag(C) are diagonal scaling matrices determined
- *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and 
- *                  ScalePermstruct->C
- *
- *      -  LUstruct, which contains the L and U factorization of A1 where
- *
- *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
- *
- *              (Note that A1 = Aout * Pc^T, where Aout is the matrix stored
- *               in A on output.)
- *
- *   3. The second value of options->Fact assumes that a matrix with the same
- *      sparsity pattern as A has already been factored:
- *     
- *      -  options->Fact = SamePattern: A is factored, assuming that it has
- *            the same nonzero pattern as a previously factored matrix. In this
- *            case the algorithm saves time by reusing the previously computed
- *            column permutation vector stored in ScalePermstruct->perm_c
- *            and the "elimination tree" of A stored in LUstruct->etree.
- *
- *      In this case the user must still specify the following options
- *      as before:
- *
- *      -  options->Equil
- *      -  options->RowPerm
- *      -  options->ReplaceTinyPivot
- *
- *      but not options->ColPerm, whose value is ignored. This is because the
- *      previous column permutation from ScalePermstruct->perm_c is used as
- *      input. The user must also supply 
- *
- *      -  A, the input matrix
- *      -  ScalePermstruct->perm_c, the column permutation
- *      -  LUstruct->etree, the elimination tree
- *
- *      The outputs returned include
- *         
- *      -  A, the input matrix A overwritten by the scaled and permuted matrix
- *            as described above
- *      -  ScalePermstruct,  modified to describe how the input matrix A was
- *                           equilibrated and row permuted
- *      -  LUstruct, modified to contain the new L and U factors
- *
- *   4. The third value of options->Fact assumes that a matrix B with the same
- *      sparsity pattern as A has already been factored, and where the
- *      row permutation of B can be reused for A. This is useful when A and B
- *      have similar numerical values, so that the same row permutation
- *      will make both factorizations numerically stable. This lets us reuse
- *      all of the previously computed structure of L and U.
- *
- *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
- *            assuming not only the same nonzero pattern as the previously
- *            factored matrix B, but reusing B's row permutation.
- *
- *      In this case the user must still specify the following options
- *      as before:
- *
- *      -  options->Equil
- *      -  options->ReplaceTinyPivot
- *
- *      but not options->RowPerm or options->ColPerm, whose values are ignored.
- *      This is because the permutations from ScalePermstruct->perm_r and
- *      ScalePermstruct->perm_c are used as input.
- *
- *      The user must also supply 
- *
- *      -  A, the input matrix
- *      -  ScalePermstruct->DiagScale, how the previous matrix was row and/or
- *                                     column scaled
- *      -  ScalePermstruct->R, the row scalings of the previous matrix, if any
- *      -  ScalePermstruct->C, the columns scalings of the previous matrix, 
- *                             if any
- *      -  ScalePermstruct->perm_r, the row permutation of the previous matrix
- *      -  ScalePermstruct->perm_c, the column permutation of the previous 
- *                                  matrix
- *      -  all of LUstruct, the previously computed information about L and U
- *                (the actual numerical values of L and U stored in
- *                 LUstruct->Llu are ignored)
- *
- *      The outputs returned include
- *         
- *      -  A, the input matrix A overwritten by the scaled and permuted matrix
- *            as described above
- *      -  ScalePermstruct,  modified to describe how the input matrix A was
- *                           equilibrated 
- *                  (thus ScalePermstruct->DiagScale, R and C may be modified)
- *      -  LUstruct, modified to contain the new L and U factors
- *
- *   5. The fourth and last value of options->Fact assumes that A is
- *      identical to a matrix that has already been factored on a previous 
- *      call, and reuses its entire LU factorization
- *
- *      -  options->Fact = Factored: A is identical to a previously
- *            factorized matrix, so the entire previous factorization
- *            can be reused.
- *
- *      In this case all the other options mentioned above are ignored
- *      (options->Equil, options->RowPerm, options->ColPerm, 
- *       options->ReplaceTinyPivot)
- *
- *      The user must also supply 
- *
- *      -  A, the unfactored matrix, only in the case that iterative refinment
- *            is to be done (specifically A must be the output A from 
- *            the previous call, so that it has been scaled and permuted)
- *      -  all of ScalePermstruct
- *      -  all of LUstruct, including the actual numerical values of L and U
- *
- *      all of which are unmodified on output.
- *         
- * Arguments
- * =========
- *
- * options (input) superlu_options_t*
- *         The structure defines the input parameters to control
- *         how the LU decomposition will be performed.
- *         The following fields should be defined for this structure:
- *         
- *         o Fact (fact_t)
- *           Specifies whether or not the factored form of the matrix
- *           A is supplied on entry, and if not, how the matrix A should
- *           be factorized based on the previous history.
- *
- *           = DOFACT: The matrix A will be factorized from scratch.
- *                 Inputs:  A
- *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
- *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
- *                              permuted)
- *                          all of ScalePermstruct
- *                          all of LUstruct
- *
- *           = SamePattern: the matrix A will be factorized assuming
- *             that a factorization of a matrix with the same sparsity
- *             pattern was performed prior to this one. Therefore, this
- *             factorization will reuse column permutation vector 
- *             ScalePermstruct->perm_c and the elimination tree
- *             LUstruct->etree
- *                 Inputs:  A
- *                          options->Equil, RowPerm, ReplaceTinyPivot
- *                          ScalePermstruct->perm_c
- *                          LUstruct->etree
- *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
- *                              permuted)
- *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
- *                          rest of LUstruct (GLU_persist, Llu)
- *
- *           = SamePattern_SameRowPerm: the matrix A will be factorized
- *             assuming that a factorization of a matrix with the same
- *             sparsity	pattern and similar numerical values was performed
- *             prior to this one. Therefore, this factorization will reuse
- *             both row and column scaling factors R and C, and the
- *             both row and column permutation vectors perm_r and perm_c,
- *             distributed data structure set up from the previous symbolic
- *             factorization.
- *                 Inputs:  A
- *                          options->Equil, ReplaceTinyPivot
- *                          all of ScalePermstruct
- *                          all of LUstruct
- *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
- *                              permuted)
- *                          modified LUstruct->Llu
- *           = FACTORED: the matrix A is already factored.
- *                 Inputs:  all of ScalePermstruct
- *                          all of LUstruct
- *
- *         o Equil (yes_no_t)
- *           Specifies whether to equilibrate the system.
- *           = NO:  no equilibration.
- *           = YES: scaling factors are computed to equilibrate the system:
- *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
- *                  Whether or not the system will be equilibrated depends
- *                  on the scaling of the matrix A, but if equilibration is
- *                  used, A is overwritten by diag(R)*A*diag(C) and B by
- *                  diag(R)*B.
- *
- *         o RowPerm (rowperm_t)
- *           Specifies how to permute rows of the matrix A.
- *           = NATURAL:   use the natural ordering.
- *           = LargeDiag: use the Duff/Koster algorithm to permute rows of
- *                        the original matrix to make the diagonal large
- *                        relative to the off-diagonal.
- *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
- *                        input by the user.
- *           
- *         o ColPerm (colperm_t)
- *           Specifies what type of column permutation to use to reduce fill.
- *           = NATURAL:       natural ordering.
- *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
- *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
- *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
- *         
- *         o ReplaceTinyPivot (yes_no_t)
- *           = NO:  do not modify pivots
- *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during 
- *                  LU factorization.
- *
- *         o IterRefine (IterRefine_t)
- *           Specifies how to perform iterative refinement.
- *           = NO:     no iterative refinement.
- *           = DOUBLE: accumulate residual in double precision.
- *           = EXTRA:  accumulate residual in extra precision.
- *
- *         NOTE: all options must be indentical on all processes when
- *               calling this routine.
- *
- * A (input/output) SuperMatrix*
- *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
- *         The number of linear equations is A->nrow. The type of A must be:
- *         Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE. That is, A is stored in
- *         compressed column format (also known as Harwell-Boeing format).
- *         See supermatrix.h for the definition of 'SuperMatrix'.
- *         This routine only handles square A, however, the LU factorization
- *         routine pdgstrf can factorize rectangular matrices.
- *         On exit, A may be overwritten by Pc*Pr*diag(R)*A*diag(C),
- *         depending on ScalePermstruct->DiagScale, options->RowPerm and
- *         options->colpem:
- *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
- *                diag(R)*A*diag(C).
- *             if options->RowPerm != NATURAL, A is further overwritten by
- *                Pr*diag(R)*A*diag(C).
- *             if options->ColPerm != NATURAL, A is further overwritten by
- *                Pc*Pr*diag(R)*A*diag(C).
- *         If all the above condition are true, the LU decomposition is
- *         performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
- *
- *         NOTE: Currently, A must reside in all processes when calling
- *               this routine.
- *
- * ScalePermstruct (input/output) ScalePermstruct_t*
- *         The data structure to store the scaling and permutation vectors
- *         describing the transformations performed to the matrix A.
- *         It contains the following fields:
- *
- *         o DiagScale (DiagScale_t)
- *           Specifies the form of equilibration that was done.
- *           = NOEQUIL: no equilibration.
- *           = ROW:     row equilibration, i.e., A was premultiplied by
- *                      diag(R).
- *           = COL:     Column equilibration, i.e., A was postmultiplied
- *                      by diag(C).
- *           = BOTH:    both row and column equilibration, i.e., A was 
- *                      replaced by diag(R)*A*diag(C).
- *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
- *           DiagScale is an input argument; otherwise it is an output
- *           argument.
- *
- *         o perm_r (int*)
- *           Row permutation vector, which defines the permutation matrix Pr;
- *           perm_r[i] = j means row i of A is in position j in Pr*A.
- *           If options->RowPerm = MY_PERMR, or
- *           options->Fact = SamePattern_SameRowPerm, perm_r is an
- *           input argument; otherwise it is an output argument.
- *
- *         o perm_c (int*)
- *           Column permutation vector, which defines the 
- *           permutation matrix Pc; perm_c[i] = j means column i of A is 
- *           in position j in A*Pc.
- *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
- *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
- *           input argument; otherwise, it is an output argument.
- *           On exit, perm_c may be overwritten by the product of the input
- *           perm_c and a permutation that postorders the elimination tree
- *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
- *           is already in postorder.
- *
- *         o R (double*) dimension (A->nrow)
- *           The row scale factors for A.
- *           If DiagScale = ROW or BOTH, A is multiplied on the left by 
- *                          diag(R).
- *           If DiagScale = NOEQUIL or COL, R is not defined.
- *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
- *           an input argument; otherwise, R is an output argument.
- *
- *         o C (double*) dimension (A->ncol)
- *           The column scale factors for A.
- *           If DiagScale = COL or BOTH, A is multiplied on the right by 
- *                          diag(C).
- *           If DiagScale = NOEQUIL or ROW, C is not defined.
- *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
- *           an input argument; otherwise, C is an output argument.
- *         
- * B       (input/output) double*
- *         On entry, the right-hand side matrix of dimension (A->nrow, nrhs).
- *         On exit, the solution matrix if info = 0;
- *
- *         NOTE: Currently, B must reside in all processes when calling
- *               this routine.
- *
- * ldb     (input) int (global)
- *         The leading dimension of matrix B.
- *
- * nrhs    (input) int (global)
- *         The number of right-hand sides.
- *         If nrhs = 0, only LU decomposition is performed, the forward
- *         and back substitutions are skipped.
- *
- * grid    (input) gridinfo_t*
- *         The 2D process mesh. It contains the MPI communicator, the number
- *         of process rows (NPROW), the number of process columns (NPCOL),
- *         and my process rank. It is an input argument to all the
- *         parallel routines.
- *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
- *         See superlu_ddefs.h for the definition of 'gridinfo_t'.
- *
- * LUstruct (input/output) LUstruct_t*
- *         The data structures to store the distributed L and U factors.
- *         It contains the following fields:
- *
- *         o etree (int*) dimension (A->ncol)
- *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc', dimension A->ncol.
- *           It is computed in sp_colorder() during the first factorization,
- *           and is reused in the subsequent factorizations of the matrices
- *           with the same nonzero pattern.
- *           On exit of sp_colorder(), the columns of A are permuted so that
- *           the etree is in a certain postorder. This postorder is reflected
- *           in ScalePermstruct->perm_c.
- *           NOTE:
- *           Etree is a vector of parent pointers for a forest whose vertices
- *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
- *
- *         o Glu_persist (Glu_persist_t*)
- *           Global data structure (xsup, supno) replicated on all processes,
- *           describing the supernode partition in the factored matrices
- *           L and U:
- *	       xsup[s] is the leading column of the s-th supernode,
- *             supno[i] is the supernode number to which column i belongs.
- *
- *         o Llu (LocalLU_t*)
- *           The distributed data structures to store L and U factors.
- *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
- *
- * berr    (output) double*, dimension (nrhs)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
- *         any element of A or B that makes X(j) an exact solution).
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics on runtime and floating-point operation count.
- *        See util.h for the definition of 'SuperLUStat_t'.
- *
- * info    (output) int*
- *         = 0: successful exit
- *         > 0: if info = i, and i is
- *             <= A->ncol: U(i,i) is exactly zero. The factorization has
- *                been completed, but the factor U is exactly singular,
- *                so the solution could not be computed.
- *             > A->ncol: number of bytes allocated when memory allocation
- *                failure occurred, plus A->ncol.
- *
- *
- * See superlu_ddefs.h for the definitions of various data types.
- *
- */
-    SuperMatrix AC;
-    NCformat *Astore;
-    NCPformat *ACstore;
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    Glu_freeable_t *Glu_freeable;
-            /* The nonzero structures of L and U factors, which are
-	       replicated on all processrs.
-	           (lsub, xlsub) contains the compressed subscript of
-		                 supernodes in L.
-          	   (usub, xusub) contains the compressed subscript of
-		                 nonzero segments in U.
-	      If options->Fact != SamePattern_SameRowPerm, they are 
-	      computed by SYMBFACT routine, and then used by DDISTRIBUTE
-	      routine. They will be freed after DDISTRIBUTE routine.
-	      If options->Fact == SamePattern_SameRowPerm, these
-	      structures are not used.                                  */
-    fact_t   Fact;
-    double   *a;
-    int_t    *perm_r; /* row permutations from partial pivoting */
-    int_t    *perm_c; /* column permutation vector */
-    int_t    *etree;  /* elimination tree */
-    int_t    *colptr, *rowind;
-    int_t    colequ, Equil, factored, job, notran, rowequ;
-    int_t    i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use;
-    int      iam;
-    int      ldx;  /* LDA for matrix X (global). */
-    char     equed[1], norm[1];
-    double   *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
-    double   *X, *b_col, *b_work, *x_col;
-    double   t;
-    static mem_usage_t num_mem_usage, symb_mem_usage;
-#if ( PRNTlevel>= 2 )
-    double   dmin, dsum, dprod;
-#endif
-
-    /* Test input parameters. */
-    *info = 0;
-    Fact = options->Fact;
-    if ( Fact < 0 || Fact > FACTORED )
-	*info = -1;
-    else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR )
-	*info = -1;
-    else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC )
-	*info = -1;
-    else if ( options->IterRefine < 0 || options->IterRefine > EXTRA )
-	*info = -1;
-    else if ( options->IterRefine == EXTRA ) {
-	*info = -1;
-	fprintf(stderr, "Extra precise iterative refinement yet to support.");
-    } else if ( A->nrow != A->ncol || A->nrow < 0 ||
-         A->Stype != SLU_NC || A->Dtype != SLU_D || A->Mtype != SLU_GE )
-	*info = -2;
-    else if ( ldb < A->nrow )
-	*info = -5;
-    else if ( nrhs < 0 )
-	*info = -6;
-    if ( *info ) {
-	i = -(*info);
-	pxerbla("pdgssvx_ABglobal", grid, -*info);
-	return;
-    }
-
-    /* Initialization */
-    factored = (Fact == FACTORED);
-    Equil = (!factored && options->Equil == YES);
-    notran = (options->Trans == NOTRANS);
-    iam = grid->iam;
-    job = 5;
-    m = A->nrow;
-    n = A->ncol;
-    Astore = A->Store;
-    nnz = Astore->nnz;
-    a = Astore->nzval;
-    colptr = Astore->colptr;
-    rowind = Astore->rowind;
-    if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) {
-	rowequ = (ScalePermstruct->DiagScale == ROW) ||
-	         (ScalePermstruct->DiagScale == BOTH);
-	colequ = (ScalePermstruct->DiagScale == COL) ||
-	         (ScalePermstruct->DiagScale == BOTH);
-    } else rowequ = colequ = FALSE;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pdgssvx_ABglobal()");
-#endif
-
-    perm_r = ScalePermstruct->perm_r;
-    perm_c = ScalePermstruct->perm_c;
-    etree = LUstruct->etree;
-    R = ScalePermstruct->R;
-    C = ScalePermstruct->C;
-    if ( Equil && Fact != SamePattern_SameRowPerm ) {
-	/* Allocate storage if not done so before. */
-	switch ( ScalePermstruct->DiagScale ) {
-	    case NOEQUIL:
-		if ( !(R = (double *) doubleMalloc_dist(m)) )
-		    ABORT("Malloc fails for R[].");
-	        if ( !(C = (double *) doubleMalloc_dist(n)) )
-		    ABORT("Malloc fails for C[].");
-		ScalePermstruct->R = R;
-		ScalePermstruct->C = C;
-		break;
-	    case ROW: 
-	        if ( !(C = (double *) doubleMalloc_dist(n)) )
-		    ABORT("Malloc fails for C[].");
-		ScalePermstruct->C = C;
-		break;
-	    case COL: 
-		if ( !(R = (double *) doubleMalloc_dist(m)) )
-		    ABORT("Malloc fails for R[].");
-		ScalePermstruct->R = R;
-		break;
-	}
-    }
-
-    /* ------------------------------------------------------------
-       Diagonal scaling to equilibrate the matrix.
-       ------------------------------------------------------------*/
-    if ( Equil ) {
-#if ( DEBUGlevel>=1 )
-	CHECK_MALLOC(iam, "Enter equil");
-#endif
-	t = SuperLU_timer_();
-
-	if ( Fact == SamePattern_SameRowPerm ) {
-	    /* Reuse R and C. */
-	    switch ( ScalePermstruct->DiagScale ) {
-	      case NOEQUIL:
-		break;
-	      case ROW:
-		for (j = 0; j < n; ++j) {
-		    for (i = colptr[j]; i < colptr[j+1]; ++i) {
-			irow = rowind[i];
-			a[i] *= R[irow];       /* Scale rows. */
-		    }
-		}
-		break;
-	      case COL:
-		for (j = 0; j < n; ++j)
-		    for (i = colptr[j]; i < colptr[j+1]; ++i)
-			a[i] *= C[j];          /* Scale columns. */
-		break;
-	      case BOTH: 
-		for (j = 0; j < n; ++j) {
-		    for (i = colptr[j]; i < colptr[j+1]; ++i) {
-			irow = rowind[i];
-			a[i] *= R[irow] * C[j]; /* Scale rows and columns. */
-		    }
-		}
-	        break;
-	    }
-	} else {
-	    if ( !iam ) {
-		/* Compute row and column scalings to equilibrate matrix A. */
-		dgsequ_dist(A, R, C, &rowcnd, &colcnd, &amax, &iinfo);
-	    
-		MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
-		if ( iinfo == 0 ) {
-		    MPI_Bcast( R,       m, MPI_DOUBLE, 0, grid->comm );
-		    MPI_Bcast( C,       n, MPI_DOUBLE, 0, grid->comm );
-		    MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm );
-		    MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm );
-		    MPI_Bcast( &amax,   1, MPI_DOUBLE, 0, grid->comm );
-		} else {
-		    if ( iinfo > 0 ) {
-			if ( iinfo <= m )
-			    fprintf(stderr, "The %d-th row of A is exactly zero\n", 
-				    iinfo);
-			else fprintf(stderr, "The %d-th column of A is exactly zero\n", 
-				     iinfo-n);
-			exit(-1);
-		    }
-		}
-	    } else {
-		MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
-		if ( iinfo == 0 ) {
-		    MPI_Bcast( R,       m, MPI_DOUBLE, 0, grid->comm );
-		    MPI_Bcast( C,       n, MPI_DOUBLE, 0, grid->comm );
-		    MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm );
-		    MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm );
-		    MPI_Bcast( &amax,   1, MPI_DOUBLE, 0, grid->comm );
-		} else {
-		    ABORT("DGSEQU failed\n");
-		}
-	    }
-	
-	    /* Equilibrate matrix A. */
-	    dlaqgs_dist(A, R, C, rowcnd, colcnd, amax, equed);
-	    if ( lsame_(equed, "R") ) {
-		ScalePermstruct->DiagScale = rowequ = ROW;
-	    } else if ( lsame_(equed, "C") ) {
-		ScalePermstruct->DiagScale = colequ = COL;
-	    } else if ( lsame_(equed, "B") ) {
-		ScalePermstruct->DiagScale = BOTH;
-		rowequ = ROW;
-		colequ = COL;
-	    } else ScalePermstruct->DiagScale = NOEQUIL;
-
-#if ( PRNTlevel>=1 )
-	    if ( !iam ) {
-		printf(".. equilibrated? *equed = %c\n", *equed);
-		/*fflush(stdout);*/
-	    }
-#endif
-	} /* if Fact ... */
-
-	stat->utime[EQUIL] = SuperLU_timer_() - t;
-#if ( DEBUGlevel>=1 )
-	CHECK_MALLOC(iam, "Exit equil");
-#endif
-    } /* end if Equil ... */
-    
-    /* ------------------------------------------------------------
-       Permute rows of A. 
-       ------------------------------------------------------------*/
-    if ( options->RowPerm != NO ) {
-	t = SuperLU_timer_();
-
-	if ( Fact == SamePattern_SameRowPerm /* Reuse perm_r. */
-	    || options->RowPerm == MY_PERMR ) { /* Use my perm_r. */
-	    for (i = 0; i < colptr[n]; ++i) {
-		    irow = rowind[i]; 
-		    rowind[i] = perm_r[irow];
-	    }
-	} else if ( !factored ) {
-	    if ( job == 5 ) {
-		/* Allocate storage for scaling factors. */
-		if ( !(R1 = (double *) SUPERLU_MALLOC(m * sizeof(double))) ) 
-		    ABORT("SUPERLU_MALLOC fails for R1[]");
-		if ( !(C1 = (double *) SUPERLU_MALLOC(n * sizeof(double))) )
-		    ABORT("SUPERLU_MALLOC fails for C1[]");
-	    }
-
-	    if ( !iam ) {
-		/* Process 0 finds a row permutation for large diagonal. */
-		dldperm(job, m, nnz, colptr, rowind, a, perm_r, R1, C1);
-		
-		MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
-		if ( job == 5 && Equil ) {
-		    MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
-		    MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
-		}
-	    } else {
-		MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
-		if ( job == 5 && Equil ) {
-		    MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
-		    MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
-		}
-	    }
-
-#if ( PRNTlevel>=2 )
-	    dmin = dlamch_("Overflow");
-	    dsum = 0.0;
-	    dprod = 1.0;
-#endif
-	    if ( job == 5 ) {
-		if ( Equil ) {
-		    for (i = 0; i < n; ++i) {
-			R1[i] = exp(R1[i]);
-			C1[i] = exp(C1[i]);
-		    }
-		    for (j = 0; j < n; ++j) {
-			for (i = colptr[j]; i < colptr[j+1]; ++i) {
-			    irow = rowind[i];
-			    a[i] *= R1[irow] * C1[j]; /* Scale the matrix. */
-			    rowind[i] = perm_r[irow];
-#if ( PRNTlevel>=2 )
-			    if ( rowind[i] == j ) /* New diagonal */
-				dprod *= fabs(a[i]);
-#endif
-			}
-		    }
-
-		    /* Multiply together the scaling factors. */
-		    if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i];
-		    else for (i = 0; i < m; ++i) R[i] = R1[i];
-		    if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i];
-		    else for (i = 0; i < n; ++i) C[i] = C1[i];
-		    
-		    ScalePermstruct->DiagScale = BOTH;
-		    rowequ = colequ = 1;
-		} else { /* No equilibration. */
-		    for (i = colptr[0]; i < colptr[n]; ++i) {
-		        irow = rowind[i];
-			rowind[i] = perm_r[irow];
-         	    }
-		}
-		SUPERLU_FREE (R1);
-		SUPERLU_FREE (C1);
-	    } else { /* job = 2,3,4 */
-		for (j = 0; j < n; ++j) {
-		    for (i = colptr[j]; i < colptr[j+1]; ++i) {
-			irow = rowind[i];
-			rowind[i] = perm_r[irow];
-#if ( PRNTlevel>=2 )
-			if ( rowind[i] == j ) { /* New diagonal */
-			    if ( job == 2 || job == 3 )
-				dmin = SUPERLU_MIN(dmin, fabs(a[i]));
-			    else if ( job == 4 )
-				dsum += fabs(a[i]);
-			    else if ( job == 5 )
-				dprod *= fabs(a[i]);
-			}
-#endif
-		    }
-		}
-	    }
-
-#if ( PRNTlevel>=2 )
-	    if ( job == 2 || job == 3 ) {
-		if ( !iam ) printf("\tsmallest diagonal %e\n", dmin);
-	    } else if ( job == 4 ) {
-		if ( !iam ) printf("\tsum of diagonal %e\n", dsum);
-	    } else if ( job == 5 ) {
-		if ( !iam ) printf("\t product of diagonal %e\n", dprod);
-	    }
-#endif
-	    
-        } /* else !factored */
-
-	t = SuperLU_timer_() - t;
-	stat->utime[ROWPERM] = t;
-#if ( PRNTlevel>=1 )
-	if ( !iam ) printf(".. LDPERM job %d\t time: %.2f\n", job, t);
-#endif
-    
-    } else { /* options->RowPerm == NOROWPERM */
-        for (i = 0; i < m; ++i) perm_r[i] = i;
-    }
-
-    if ( !factored || options->IterRefine ) {
-	/* Compute norm(A), which will be used to adjust small diagonal. */
-	if ( notran ) *(unsigned char *)norm = '1';
-	else *(unsigned char *)norm = 'I';
-	anorm = dlangs_dist(norm, A);
-#if ( PRNTlevel>=1 )
-	if ( !iam ) printf(".. anorm %e\n", anorm);
-#endif
-    }
-
-    /* ------------------------------------------------------------
-       Perform the LU factorization.
-       ------------------------------------------------------------*/
-    if ( !factored ) {
-	t = SuperLU_timer_();
-	/*
-	 * Get column permutation vector perm_c[], according to permc_spec:
-	 *   permc_spec = NATURAL:  natural ordering 
-	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
-	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
-	 *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
-	 */
-	permc_spec = options->ColPerm;
-	if ( permc_spec != MY_PERMC && Fact == DOFACT )
-	    /* Use an ordering provided by SuperLU */
-	    get_perm_c_dist(iam, permc_spec, A, perm_c);
-
-	/* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'
-	   (a.k.a. column etree), depending on the choice of ColPerm.
-	   Adjust perm_c[] to be consistent with a postorder of etree.
-	   Permute columns of A to form A*Pc'. */
-	sp_colorder(options, A, perm_c, etree, &AC);
-
-	/* Form Pc*A*Pc' to preserve the diagonal of the matrix Pr*A. */
-	ACstore = AC.Store;
-	for (j = 0; j < n; ++j) 
-	    for (i = ACstore->colbeg[j]; i < ACstore->colend[j]; ++i) {
-		irow = ACstore->rowind[i];
-		ACstore->rowind[i] = perm_c[irow];
-	    }
-	stat->utime[COLPERM] = SuperLU_timer_() - t;
-
-	/* Perform a symbolic factorization on matrix A and set up the
-	   nonzero data structures which are suitable for supernodal GENP. */
-	if ( Fact != SamePattern_SameRowPerm ) {
-#if ( PRNTlevel>=1 ) 
-	    if ( !iam ) 
-		printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n",
-		       sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
-#endif
-	    t = SuperLU_timer_();
-	    if ( !(Glu_freeable = (Glu_freeable_t *)
-		   SUPERLU_MALLOC(sizeof(Glu_freeable_t))) )
-		ABORT("Malloc fails for Glu_freeable.");
-
-	    iinfo = symbfact(options, iam, &AC, perm_c, etree, 
-			     Glu_persist, Glu_freeable);
-
-	    stat->utime[SYMBFAC] = SuperLU_timer_() - t;
-
-	    if ( iinfo < 0 ) {
-		QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage);
-#if ( PRNTlevel>=1 ) 
-		if ( !iam ) {
-		    printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1);
-		    printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]);
-		    printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]);
-		    printf("\tint %d, short %d, float %d, double %d\n", 
-			   sizeof(int_t), sizeof(short), sizeof(float),
-			   sizeof(double));
-		    printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n",
-			   symb_mem_usage.for_lu*1e-6, 
-			   symb_mem_usage.total*1e-6,
-			   symb_mem_usage.expansions);
-		}
-#endif
-	    } else {
-		if ( !iam ) {
-		    fprintf(stderr, "symbfact() error returns %d\n", iinfo);
-		    exit(-1);
-		}
-	    }
-	}
-
-	/* Distribute the L and U factors onto the process grid. */
-	t = SuperLU_timer_();
-	dist_mem_use = ddistribute(Fact, n, &AC, Glu_freeable, LUstruct, grid);
-	stat->utime[DIST] = SuperLU_timer_() - t;
-
-	/* Deallocate storage used in symbolic factor. */
-	if ( Fact != SamePattern_SameRowPerm ) {
-	    iinfo = symbfact_SubFree(Glu_freeable);
-	    SUPERLU_FREE(Glu_freeable);
-	}
-
-	/* Perform numerical factorization in parallel. */
-	t = SuperLU_timer_();
-	pdgstrf(options, m, n, anorm, LUstruct, grid, stat, info);
-	stat->utime[FACT] = SuperLU_timer_() - t;
-
-#if ( PRNTlevel>=1 )
-	{
-	    int_t TinyPivots;
-	    float for_lu, total, max, avg, temp;
-	    dQuerySpace_dist(n, LUstruct, grid, &num_mem_usage);
-	    MPI_Reduce( &num_mem_usage.for_lu, &for_lu,
-		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-	    MPI_Reduce( &num_mem_usage.total, &total,
-		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-	    temp = SUPERLU_MAX(symb_mem_usage.total,
-			       symb_mem_usage.for_lu +
-			       (float)dist_mem_use + num_mem_usage.for_lu);
-	    temp = SUPERLU_MAX(temp, num_mem_usage.total);
-	    MPI_Reduce( &temp, &max,
-		       1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
-	    MPI_Reduce( &temp, &avg,
-		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-	    MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t,
-			  MPI_SUM, grid->comm );
-	    stat->TinyPivots = TinyPivots;
-	    if ( !iam ) {
-		printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n",
-		       for_lu*1e-6, total*1e-6);
-		printf("\tAll space (MB):"
-		       "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n",
-		       avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6);
-		printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots);
-	    }
-	}
-#endif
-    
-#if ( PRNTlevel>=2 )
-	if ( !iam ) printf(".. pdgstrf INFO = %d\n", *info);
-#endif
-
-    } else if ( options->IterRefine ) { /* options->Fact==FACTORED */
-	/* Permute columns of A to form A*Pc' using the existing perm_c.
-	 * NOTE: rows of A were previously permuted to Pc*A.
-	 */
-	sp_colorder(options, A, perm_c, NULL, &AC);
-    } /* if !factored ... */
-	
-    /* ------------------------------------------------------------
-       Compute the solution matrix X.
-       ------------------------------------------------------------*/
-    if ( nrhs ) {
-
-	if ( !(b_work = doubleMalloc_dist(n)) )
-	    ABORT("Malloc fails for b_work[]");
-
-	/* ------------------------------------------------------------
-	   Scale the right-hand side if equilibration was performed. 
-	   ------------------------------------------------------------*/
-	if ( notran ) {
-	    if ( rowequ ) {
-		b_col = B;
-		for (j = 0; j < nrhs; ++j) {
-		    for (i = 0; i < m; ++i) b_col[i] *= R[i];
-		    b_col += ldb;
-		}
-	    }
-	} else if ( colequ ) {
-	    b_col = B;
-	    for (j = 0; j < nrhs; ++j) {
-		for (i = 0; i < m; ++i) b_col[i] *= C[i];
-		b_col += ldb;
-	    }
-	}
-
-	/* ------------------------------------------------------------
-	   Permute the right-hand side to form Pr*B.
-	   ------------------------------------------------------------*/
-	if ( options->RowPerm != NO ) {
-	    if ( notran ) {
-		b_col = B;
-		for (j = 0; j < nrhs; ++j) {
-		    for (i = 0; i < m; ++i) b_work[perm_r[i]] = b_col[i];
-		    for (i = 0; i < m; ++i) b_col[i] = b_work[i];
-		    b_col += ldb;
-		}
-	    }
-	}
-
-
-	/* ------------------------------------------------------------
-	   Permute the right-hand side to form Pc*B.
-	   ------------------------------------------------------------*/
-	if ( notran ) {
-	    b_col = B;
-	    for (j = 0; j < nrhs; ++j) {
-		for (i = 0; i < m; ++i) b_work[perm_c[i]] = b_col[i];
-		for (i = 0; i < m; ++i) b_col[i] = b_work[i];
-		b_col += ldb;
-	    }
-	}
-
-	/* Save a copy of the right-hand side. */
-	ldx = ldb;
-	if ( !(X = doubleMalloc_dist(((size_t)ldx) * nrhs)) )
-	    ABORT("Malloc fails for X[]");
-	x_col = X;  b_col = B;
-	for (j = 0; j < nrhs; ++j) {
-	    for (i = 0; i < ldb; ++i) x_col[i] = b_col[i];
-	    x_col += ldx;  b_col += ldb;
-	}
-
-	/* ------------------------------------------------------------
-	   Solve the linear system.
-	   ------------------------------------------------------------*/
-	pdgstrs_Bglobal(n, LUstruct, grid, X, ldb, nrhs, stat, info);
-
-	/* ------------------------------------------------------------
-	   Use iterative refinement to improve the computed solution and
-	   compute error bounds and backward error estimates for it.
-	   ------------------------------------------------------------*/
-	if ( options->IterRefine ) {
-	    /* Improve the solution by iterative refinement. */
-	    t = SuperLU_timer_();
-	    pdgsrfs_ABXglobal(n, &AC, anorm, LUstruct, grid, B, ldb,
-			      X, ldx, nrhs, berr, stat, info);
-	    stat->utime[REFINE] = SuperLU_timer_() - t;
-	}
-
-	/* Permute the solution matrix X <= Pc'*X. */
-	for (j = 0; j < nrhs; j++) {
-	    b_col = &B[j*ldb];
-	    x_col = &X[j*ldx];
-	    for (i = 0; i < n; ++i) b_col[i] = x_col[perm_c[i]];
-	}
-	
-	/* Transform the solution matrix X to a solution of the original system
-	   before the equilibration. */
-	if ( notran ) {
-	    if ( colequ ) {
-		b_col = B;
-		for (j = 0; j < nrhs; ++j) {
-		    for (i = 0; i < n; ++i) b_col[i] *= C[i];
-		    b_col += ldb;
-		}
-	    }
-	} else if ( rowequ ) {
-	    b_col = B;
-	    for (j = 0; j < nrhs; ++j) {
-		for (i = 0; i < n; ++i) b_col[i] *= R[i];
-		b_col += ldb;
-	    }
-	}
-
-	SUPERLU_FREE(b_work);
-	SUPERLU_FREE(X);
-
-    } /* end if nrhs != 0 */
-
-#if ( PRNTlevel>=1 )
-    if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale);
-#endif
-
-    /* Deallocate R and/or C if it is not used. */
-    if ( Equil && Fact != SamePattern_SameRowPerm ) {
-	switch ( ScalePermstruct->DiagScale ) {
-	    case NOEQUIL:
-	        SUPERLU_FREE(R);
-		SUPERLU_FREE(C);
-		break;
-	    case ROW: 
-		SUPERLU_FREE(C);
-		break;
-	    case COL: 
-		SUPERLU_FREE(R);
-		break;
-	}
-    }
-    if ( !factored || (factored && options->IterRefine) )
-	Destroy_CompCol_Permuted_dist(&AC);
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pdgssvx_ABglobal()");
-#endif
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgssvx.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgssvx.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgssvx.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgssvx.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,1283 +0,0 @@
-
-
-#include <math.h>
-#include "superlu_ddefs.h"
-
-void
-pdgssvx(superlu_options_t *options, SuperMatrix *A, 
-	ScalePermstruct_t *ScalePermstruct,
-	double B[], int ldb, int nrhs, gridinfo_t *grid,
-	LUstruct_t *LUstruct, SOLVEstruct_t *SOLVEstruct, double *berr,
-	SuperLUStat_t *stat, int *info)
-{
-/* 
- * -- Distributed SuperLU routine (version 2.2) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * November 1, 2007
- * Feburary 20, 2008
- *
- *
- * Purpose
- * =======
- *
- * PDGSSVX solves a system of linear equations A*X=B,
- * by using Gaussian elimination with "static pivoting" to
- * compute the LU factorization of A.
- *
- * Static pivoting is a technique that combines the numerical stability
- * of partial pivoting with the scalability of Cholesky (no pivoting),
- * to run accurately and efficiently on large numbers of processors.
- * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
- * description of the parallel algorithms.
- *
- * The input matrices A and B are distributed by block rows.
- * Here is a graphical illustration (0-based indexing):
- *
- *                        A                B
- *               0 ---------------       ------
- *                   |           |        |  |
- *                   |           |   P0   |  |
- *                   |           |        |  |
- *                 ---------------       ------
- *        - fst_row->|           |        |  |
- *        |          |           |        |  |
- *       m_loc       |           |   P1   |  |
- *        |          |           |        |  |
- *        -          |           |        |  |
- *                 ---------------       ------
- *                   |    .      |        |. |
- *                   |    .      |        |. |
- *                   |    .      |        |. |
- *                 ---------------       ------
- * 
- * where, fst_row is the row number of the first row,
- *        m_loc is the number of rows local to this processor
- * These are defined in the 'SuperMatrix' structure, see supermatrix.h.
- *
- *
- * Here are the options for using this code:
- *
- *   1. Independent of all the other options specified below, the
- *      user must supply
- *
- *      -  B, the matrix of right-hand sides, distributed by block rows,
- *            and its dimensions ldb (local) and nrhs (global)
- *      -  grid, a structure describing the 2D processor mesh
- *      -  options->IterRefine, which determines whether or not to
- *            improve the accuracy of the computed solution using 
- *            iterative refinement
- *
- *      On output, B is overwritten with the solution X.
- *
- *   2. Depending on options->Fact, the user has four options
- *      for solving A*X=B. The standard option is for factoring
- *      A "from scratch". (The other options, described below,
- *      are used when A is sufficiently similar to a previously 
- *      solved problem to save time by reusing part or all of 
- *      the previous factorization.)
- *
- *      -  options->Fact = DOFACT: A is factored "from scratch"
- *
- *      In this case the user must also supply
- *
- *        o  A, the input matrix
- *
- *        as well as the following options to determine what matrix to
- *        factorize.
- *
- *        o  options->Equil,   to specify how to scale the rows and columns
- *                             of A to "equilibrate" it (to try to reduce its
- *                             condition number and so improve the
- *                             accuracy of the computed solution)
- *
- *        o  options->RowPerm, to specify how to permute the rows of A
- *                             (typically to control numerical stability)
- *
- *        o  options->ColPerm, to specify how to permute the columns of A
- *                             (typically to control fill-in and enhance
- *                             parallelism during factorization)
- *
- *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
- *                             pivots encountered during factorization
- *                             (to control numerical stability)
- *
- *      The outputs returned include
- *         
- *        o  ScalePermstruct,  modified to describe how the input matrix A
- *                             was equilibrated and permuted:
- *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
- *                                         columns of A were scaled
- *          .  ScalePermstruct->R, array of row scale factors
- *          .  ScalePermstruct->C, array of column scale factors
- *          .  ScalePermstruct->perm_r, row permutation vector
- *          .  ScalePermstruct->perm_c, column permutation vector
- *
- *          (part of ScalePermstruct may also need to be supplied on input,
- *           depending on options->RowPerm and options->ColPerm as described 
- *           later).
- *
- *        o  A, the input matrix A overwritten by the scaled and permuted
- *              matrix diag(R)*A*diag(C)*Pc^T, where 
- *              Pc is the row permutation matrix determined by
- *                  ScalePermstruct->perm_c
- *              diag(R) and diag(C) are diagonal scaling matrices determined
- *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and 
- *                  ScalePermstruct->C
- *
- *        o  LUstruct, which contains the L and U factorization of A1 where
- *
- *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
- *
- *               (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
- *                in A on output.)
- *
- *   3. The second value of options->Fact assumes that a matrix with the same
- *      sparsity pattern as A has already been factored:
- *     
- *      -  options->Fact = SamePattern: A is factored, assuming that it has
- *            the same nonzero pattern as a previously factored matrix. In
- *            this case the algorithm saves time by reusing the previously
- *            computed column permutation vector stored in
- *            ScalePermstruct->perm_c and the "elimination tree" of A
- *            stored in LUstruct->etree
- *
- *      In this case the user must still specify the following options
- *      as before:
- *
- *        o  options->Equil
- *        o  options->RowPerm
- *        o  options->ReplaceTinyPivot
- *
- *      but not options->ColPerm, whose value is ignored. This is because the
- *      previous column permutation from ScalePermstruct->perm_c is used as
- *      input. The user must also supply 
- *
- *        o  A, the input matrix
- *        o  ScalePermstruct->perm_c, the column permutation
- *        o  LUstruct->etree, the elimination tree
- *
- *      The outputs returned include
- *         
- *        o  A, the input matrix A overwritten by the scaled and permuted
- *              matrix as described above
- *        o  ScalePermstruct, modified to describe how the input matrix A was
- *                            equilibrated and row permuted
- *        o  LUstruct, modified to contain the new L and U factors
- *
- *   4. The third value of options->Fact assumes that a matrix B with the same
- *      sparsity pattern as A has already been factored, and where the
- *      row permutation of B can be reused for A. This is useful when A and B
- *      have similar numerical values, so that the same row permutation
- *      will make both factorizations numerically stable. This lets us reuse
- *      all of the previously computed structure of L and U.
- *
- *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
- *            assuming not only the same nonzero pattern as the previously
- *            factored matrix B, but reusing B's row permutation.
- *
- *      In this case the user must still specify the following options
- *      as before:
- *
- *        o  options->Equil
- *        o  options->ReplaceTinyPivot
- *
- *      but not options->RowPerm or options->ColPerm, whose values are
- *      ignored. This is because the permutations from ScalePermstruct->perm_r
- *      and ScalePermstruct->perm_c are used as input.
- *
- *      The user must also supply 
- *
- *        o  A, the input matrix
- *        o  ScalePermstruct->DiagScale, how the previous matrix was row
- *                                       and/or column scaled
- *        o  ScalePermstruct->R, the row scalings of the previous matrix,
- *                               if any
- *        o  ScalePermstruct->C, the columns scalings of the previous matrix, 
- *                               if any
- *        o  ScalePermstruct->perm_r, the row permutation of the previous
- *                                    matrix
- *        o  ScalePermstruct->perm_c, the column permutation of the previous 
- *                                    matrix
- *        o  all of LUstruct, the previously computed information about
- *                            L and U (the actual numerical values of L and U
- *                            stored in LUstruct->Llu are ignored)
- *
- *      The outputs returned include
- *         
- *        o  A, the input matrix A overwritten by the scaled and permuted
- *              matrix as described above
- *        o  ScalePermstruct,  modified to describe how the input matrix A was
- *                             equilibrated (thus ScalePermstruct->DiagScale,
- *                             R and C may be modified)
- *        o  LUstruct, modified to contain the new L and U factors
- *
- *   5. The fourth and last value of options->Fact assumes that A is
- *      identical to a matrix that has already been factored on a previous 
- *      call, and reuses its entire LU factorization
- *
- *      -  options->Fact = Factored: A is identical to a previously
- *            factorized matrix, so the entire previous factorization
- *            can be reused.
- *
- *      In this case all the other options mentioned above are ignored
- *      (options->Equil, options->RowPerm, options->ColPerm, 
- *       options->ReplaceTinyPivot)
- *
- *      The user must also supply 
- *
- *        o  A, the unfactored matrix, only in the case that iterative
- *              refinment is to be done (specifically A must be the output
- *              A from the previous call, so that it has been scaled and permuted)
- *        o  all of ScalePermstruct
- *        o  all of LUstruct, including the actual numerical values of
- *           L and U
- *
- *      all of which are unmodified on output.
- *         
- * Arguments
- * =========
- *
- * options (input) superlu_options_t* (global)
- *         The structure defines the input parameters to control
- *         how the LU decomposition will be performed.
- *         The following fields should be defined for this structure:
- *         
- *         o Fact (fact_t)
- *           Specifies whether or not the factored form of the matrix
- *           A is supplied on entry, and if not, how the matrix A should
- *           be factorized based on the previous history.
- *
- *           = DOFACT: The matrix A will be factorized from scratch.
- *                 Inputs:  A
- *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
- *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
- *                              permuted)
- *                          all of ScalePermstruct
- *                          all of LUstruct
- *
- *           = SamePattern: the matrix A will be factorized assuming
- *             that a factorization of a matrix with the same sparsity
- *             pattern was performed prior to this one. Therefore, this
- *             factorization will reuse column permutation vector 
- *             ScalePermstruct->perm_c and the elimination tree
- *             LUstruct->etree
- *                 Inputs:  A
- *                          options->Equil, RowPerm, ReplaceTinyPivot
- *                          ScalePermstruct->perm_c
- *                          LUstruct->etree
- *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
- *                              permuted)
- *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
- *                          rest of LUstruct (GLU_persist, Llu)
- *
- *           = SamePattern_SameRowPerm: the matrix A will be factorized
- *             assuming that a factorization of a matrix with the same
- *             sparsity	pattern and similar numerical values was performed
- *             prior to this one. Therefore, this factorization will reuse
- *             both row and column scaling factors R and C, and the
- *             both row and column permutation vectors perm_r and perm_c,
- *             distributed data structure set up from the previous symbolic
- *             factorization.
- *                 Inputs:  A
- *                          options->Equil, ReplaceTinyPivot
- *                          all of ScalePermstruct
- *                          all of LUstruct
- *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
- *                              permuted)
- *                          modified LUstruct->Llu
- *           = FACTORED: the matrix A is already factored.
- *                 Inputs:  all of ScalePermstruct
- *                          all of LUstruct
- *
- *         o Equil (yes_no_t)
- *           Specifies whether to equilibrate the system.
- *           = NO:  no equilibration.
- *           = YES: scaling factors are computed to equilibrate the system:
- *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
- *                  Whether or not the system will be equilibrated depends
- *                  on the scaling of the matrix A, but if equilibration is
- *                  used, A is overwritten by diag(R)*A*diag(C) and B by
- *                  diag(R)*B.
- *
- *         o RowPerm (rowperm_t)
- *           Specifies how to permute rows of the matrix A.
- *           = NATURAL:   use the natural ordering.
- *           = LargeDiag: use the Duff/Koster algorithm to permute rows of
- *                        the original matrix to make the diagonal large
- *                        relative to the off-diagonal.
- *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
- *                        input by the user.
- *           
- *         o ColPerm (colperm_t)
- *           Specifies what type of column permutation to use to reduce fill.
- *           = NATURAL:       natural ordering.
- *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
- *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
- *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
- *         
- *         o ReplaceTinyPivot (yes_no_t)
- *           = NO:  do not modify pivots
- *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during 
- *                  LU factorization.
- *
- *         o IterRefine (IterRefine_t)
- *           Specifies how to perform iterative refinement.
- *           = NO:     no iterative refinement.
- *           = DOUBLE: accumulate residual in double precision.
- *           = EXTRA:  accumulate residual in extra precision.
- *
- *         NOTE: all options must be indentical on all processes when
- *               calling this routine.
- *
- * A (input/output) SuperMatrix* (local)
- *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
- *           The number of linear equations is A->nrow. The type of A must be:
- *           Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
- *           That is, A is stored in distributed compressed row format.
- *           See supermatrix.h for the definition of 'SuperMatrix'.
- *           This routine only handles square A, however, the LU factorization
- *           routine PDGSTRF can factorize rectangular matrices.
- *         On exit, A may be overwtirren by diag(R)*A*diag(C)*Pc^T,
- *           depending on ScalePermstruct->DiagScale and options->ColPerm:
- *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
- *                diag(R)*A*diag(C).
- *             if options->ColPerm != NATURAL, A is further overwritten by
- *                diag(R)*A*diag(C)*Pc^T.
- *           If all the above condition are true, the LU decomposition is
- *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
- *
- * ScalePermstruct (input/output) ScalePermstruct_t* (global)
- *         The data structure to store the scaling and permutation vectors
- *         describing the transformations performed to the matrix A.
- *         It contains the following fields:
- *
- *         o DiagScale (DiagScale_t)
- *           Specifies the form of equilibration that was done.
- *           = NOEQUIL: no equilibration.
- *           = ROW:     row equilibration, i.e., A was premultiplied by
- *                      diag(R).
- *           = COL:     Column equilibration, i.e., A was postmultiplied
- *                      by diag(C).
- *           = BOTH:    both row and column equilibration, i.e., A was 
- *                      replaced by diag(R)*A*diag(C).
- *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
- *           DiagScale is an input argument; otherwise it is an output
- *           argument.
- *
- *         o perm_r (int*)
- *           Row permutation vector, which defines the permutation matrix Pr;
- *           perm_r[i] = j means row i of A is in position j in Pr*A.
- *           If options->RowPerm = MY_PERMR, or
- *           options->Fact = SamePattern_SameRowPerm, perm_r is an
- *           input argument; otherwise it is an output argument.
- *
- *         o perm_c (int*)
- *           Column permutation vector, which defines the 
- *           permutation matrix Pc; perm_c[i] = j means column i of A is 
- *           in position j in A*Pc.
- *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
- *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
- *           input argument; otherwise, it is an output argument.
- *           On exit, perm_c may be overwritten by the product of the input
- *           perm_c and a permutation that postorders the elimination tree
- *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
- *           is already in postorder.
- *
- *         o R (double*) dimension (A->nrow)
- *           The row scale factors for A.
- *           If DiagScale = ROW or BOTH, A is multiplied on the left by 
- *                          diag(R).
- *           If DiagScale = NOEQUIL or COL, R is not defined.
- *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
- *           an input argument; otherwise, R is an output argument.
- *
- *         o C (double*) dimension (A->ncol)
- *           The column scale factors for A.
- *           If DiagScale = COL or BOTH, A is multiplied on the right by 
- *                          diag(C).
- *           If DiagScale = NOEQUIL or ROW, C is not defined.
- *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
- *           an input argument; otherwise, C is an output argument.
- *         
- * B       (input/output) double* (local)
- *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
- *           where, m_loc is the number of rows stored locally on my
- *           process and is defined in the data structure of matrix A.
- *         On exit, the solution matrix if info = 0;
- *
- * ldb     (input) int (local)
- *         The leading dimension of matrix B.
- *
- * nrhs    (input) int (global)
- *         The number of right-hand sides.
- *         If nrhs = 0, only LU decomposition is performed, the forward
- *         and back substitutions are skipped.
- *
- * grid    (input) gridinfo_t* (global)
- *         The 2D process mesh. It contains the MPI communicator, the number
- *         of process rows (NPROW), the number of process columns (NPCOL),
- *         and my process rank. It is an input argument to all the
- *         parallel routines.
- *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
- *         See superlu_ddefs.h for the definition of 'gridinfo_t'.
- *
- * LUstruct (input/output) LUstruct_t*
- *         The data structures to store the distributed L and U factors.
- *         It contains the following fields:
- *
- *         o etree (int*) dimension (A->ncol) (global)
- *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
- *           It is computed in sp_colorder() during the first factorization,
- *           and is reused in the subsequent factorizations of the matrices
- *           with the same nonzero pattern.
- *           On exit of sp_colorder(), the columns of A are permuted so that
- *           the etree is in a certain postorder. This postorder is reflected
- *           in ScalePermstruct->perm_c.
- *           NOTE:
- *           Etree is a vector of parent pointers for a forest whose vertices
- *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
- *
- *         o Glu_persist (Glu_persist_t*) (global)
- *           Global data structure (xsup, supno) replicated on all processes,
- *           describing the supernode partition in the factored matrices
- *           L and U:
- *	       xsup[s] is the leading column of the s-th supernode,
- *             supno[i] is the supernode number to which column i belongs.
- *
- *         o Llu (LocalLU_t*) (local)
- *           The distributed data structures to store L and U factors.
- *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
- *
- * SOLVEstruct (input/output) SOLVEstruct_t*
- *         The data structure to hold the communication pattern used
- *         in the phases of triangular solution and iterative refinement.
- *         This pattern should be intialized only once for repeated solutions.
- *         If options->SolveInitialized = YES, it is an input argument.
- *         If options->SolveInitialized = NO and nrhs != 0, it is an output
- *         argument. See superlu_ddefs.h for the definition of 'SOLVEstruct_t'.
- *
- * berr    (output) double*, dimension (nrhs) (global)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
- *         any element of A or B that makes X(j) an exact solution).
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics on runtime and floating-point operation count.
- *        See util.h for the definition of 'SuperLUStat_t'.
- *
- * info    (output) int*
- *         = 0: successful exit
- *         > 0: if info = i, and i is
- *             <= A->ncol: U(i,i) is exactly zero. The factorization has
- *                been completed, but the factor U is exactly singular,
- *                so the solution could not be computed.
- *             > A->ncol: number of bytes allocated when memory allocation
- *                failure occurred, plus A->ncol.
- *
- * See superlu_ddefs.h for the definitions of varioous data types.
- *
- */
-    NRformat_loc *Astore;
-    SuperMatrix GA;      /* Global A in NC format */
-    NCformat *GAstore;
-    double   *a_GA;
-    SuperMatrix GAC;      /* Global A in NCP format (add n end pointers) */
-    NCPformat *GACstore;
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    Glu_freeable_t *Glu_freeable;
-            /* The nonzero structures of L and U factors, which are
-	       replicated on all processrs.
-	           (lsub, xlsub) contains the compressed subscript of
-		                 supernodes in L.
-          	   (usub, xusub) contains the compressed subscript of
-		                 nonzero segments in U.
-	      If options->Fact != SamePattern_SameRowPerm, they are 
-	      computed by SYMBFACT routine, and then used by PDDISTRIBUTE
-	      routine. They will be freed after PDDISTRIBUTE routine.
-	      If options->Fact == SamePattern_SameRowPerm, these
-	      structures are not used.                                  */
-    fact_t   Fact;
-    double   *a;
-    int_t    *colptr, *rowind;
-    int_t    *perm_r; /* row permutations from partial pivoting */
-    int_t    *perm_c; /* column permutation vector */
-    int_t    *etree;  /* elimination tree */
-    int_t    *rowptr, *colind;  /* Local A in NR*/
-    int_t    *rowind_loc, *colptr_loc;
-    int_t    colequ, Equil, factored, job, notran, rowequ, need_value;
-    int_t    i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use;
-    int_t    nnz_loc, m_loc, fst_row, icol;
-    int      iam;
-    int      ldx;  /* LDA for matrix X (local). */
-    char     equed[1], norm[1];
-    double   *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
-    double   *X, *b_col, *b_work, *x_col;
-    double   t;
-    static mem_usage_t num_mem_usage, symb_mem_usage;
-#if ( PRNTlevel>= 2 )
-    double   dmin, dsum, dprod;
-#endif
-    int_t procs;
-
-    /* Structures needed for parallel symbolic factorization */
-    int_t *sizes, *fstVtxSep, parSymbFact;
-    int   noDomains, nprocs_num;
-    MPI_Comm symb_comm; /* communicator for symbolic factorization */
-    int   col, key; /* parameters for creating a new communicator */
-    Pslu_freeable_t Pslu_freeable;
-    float  flinfo;
-
-    /* Initialization. */
-    m       = A->nrow;
-    n       = A->ncol;
-    Astore  = (NRformat_loc *) A->Store;
-    nnz_loc = Astore->nnz_loc;
-    m_loc   = Astore->m_loc;
-    fst_row = Astore->fst_row;
-    a       = (double *) Astore->nzval;
-    rowptr  = Astore->rowptr;
-    colind  = Astore->colind;
-    sizes   = NULL;
-    fstVtxSep = NULL;
-    symb_comm = MPI_COMM_NULL;
-
-    /* Test the input parameters. */
-    *info = 0;
-    Fact = options->Fact;
-    if ( Fact < 0 || Fact > FACTORED )
-	*info = -1;
-    else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR )
-	*info = -1;
-    else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC )
-	*info = -1;
-    else if ( options->IterRefine < 0 || options->IterRefine > EXTRA )
-	*info = -1;
-    else if ( options->IterRefine == EXTRA ) {
-	*info = -1;
-	fprintf(stderr, "Extra precise iterative refinement yet to support.");
-    } else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
-		|| A->Dtype != SLU_D || A->Mtype != SLU_GE )
-	*info = -2;
-    else if ( ldb < m_loc )
-	*info = -5;
-    else if ( nrhs < 0 )
-	*info = -6;
-    if ( *info ) {
-	i = -(*info);
-	pxerbla("pdgssvx", grid, -*info);
-	return;
-    }
-
-    factored = (Fact == FACTORED);
-    Equil = (!factored && options->Equil == YES);
-    notran = (options->Trans == NOTRANS);
-    iam = grid->iam;
-    job = 5;
-    if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) {
-	rowequ = (ScalePermstruct->DiagScale == ROW) ||
-	         (ScalePermstruct->DiagScale == BOTH);
-	colequ = (ScalePermstruct->DiagScale == COL) ||
-	         (ScalePermstruct->DiagScale == BOTH);
-    } else rowequ = colequ = FALSE;
-
-    /* The following arrays are replicated on all processes. */
-    perm_r = ScalePermstruct->perm_r;
-    perm_c = ScalePermstruct->perm_c;
-    etree = LUstruct->etree;
-    R = ScalePermstruct->R;
-    C = ScalePermstruct->C;
-    /********/
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pdgssvx()");
-#endif
-
-    /* Not factored & ask for equilibration */
-    if ( Equil && Fact != SamePattern_SameRowPerm ) { 
-	/* Allocate storage if not done so before. */
-	switch ( ScalePermstruct->DiagScale ) {
-	    case NOEQUIL:
-		if ( !(R = (double *) doubleMalloc_dist(m)) )
-		    ABORT("Malloc fails for R[].");
-	        if ( !(C = (double *) doubleMalloc_dist(n)) )
-		    ABORT("Malloc fails for C[].");
-		ScalePermstruct->R = R;
-		ScalePermstruct->C = C;
-		break;
-	    case ROW: 
-	        if ( !(C = (double *) doubleMalloc_dist(n)) )
-		    ABORT("Malloc fails for C[].");
-		ScalePermstruct->C = C;
-		break;
-	    case COL: 
-		if ( !(R = (double *) doubleMalloc_dist(m)) )
-		    ABORT("Malloc fails for R[].");
-		ScalePermstruct->R = R;
-		break;
-	}
-    }
-
-    /* ------------------------------------------------------------
-       Diagonal scaling to equilibrate the matrix.
-       ------------------------------------------------------------*/
-    if ( Equil ) {
-#if ( DEBUGlevel>=1 )
-	CHECK_MALLOC(iam, "Enter equil");
-#endif
-	t = SuperLU_timer_();
-
-	if ( Fact == SamePattern_SameRowPerm ) {
-	    /* Reuse R and C. */
-	    switch ( ScalePermstruct->DiagScale ) {
-	      case NOEQUIL:
-		break;
-	      case ROW:
-		irow = fst_row;
-		for (j = 0; j < m_loc; ++j) {
-		    for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
-			a[i] *= R[irow];       /* Scale rows. */
-		    }
-		    ++irow;
-		}
-		break;
-	      case COL:
-		for (j = 0; j < m_loc; ++j)
-		    for (i = rowptr[j]; i < rowptr[j+1]; ++i){
-		        icol = colind[i];
-			a[i] *= C[icol];          /* Scale columns. */
-		    }
-		break;
-	      case BOTH:
-		irow = fst_row;
-		for (j = 0; j < m_loc; ++j) {
-		    for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
-			icol = colind[i];
-			a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */
-		    }
-		    ++irow;
-		}
-	        break;
-	    }
-	} else { /* Compute R & C from scratch */
-            /* Compute the row and column scalings. */
-	    pdgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid);
-
-	    /* Equilibrate matrix A if it is badly-scaled. */
-	    pdlaqgs(A, R, C, rowcnd, colcnd, amax, equed);
-
-	    if ( lsame_(equed, "R") ) {
-		ScalePermstruct->DiagScale = rowequ = ROW;
-	    } else if ( lsame_(equed, "C") ) {
-		ScalePermstruct->DiagScale = colequ = COL;
-	    } else if ( lsame_(equed, "B") ) {
-		ScalePermstruct->DiagScale = BOTH;
-		rowequ = ROW;
-		colequ = COL;
-	    } else ScalePermstruct->DiagScale = NOEQUIL;
-
-#if ( PRNTlevel>=1 )
-	    if ( !iam ) {
-		printf(".. equilibrated? *equed = %c\n", *equed);
-		/*fflush(stdout);*/
-	    }
-#endif
-	} /* if Fact ... */
-
-	stat->utime[EQUIL] = SuperLU_timer_() - t;
-#if ( DEBUGlevel>=1 )
-	CHECK_MALLOC(iam, "Exit equil");
-#endif
-    } /* if Equil ... */
-
-    if ( !factored ) { /* Skip this if already factored. */
-        /*
-         * Gather A from the distributed compressed row format to
-         * global A in compressed column format.
-         * Numerical values are gathered only when a row permutation
-         * for large diagonal is sought after.
-         */
-	if ( Fact != SamePattern_SameRowPerm ) {
-            need_value = (options->RowPerm == LargeDiag);
-            pdCompRow_loc_to_CompCol_global(need_value, A, grid, &GA);
-            GAstore = (NCformat *) GA.Store;
-            colptr = GAstore->colptr;
-            rowind = GAstore->rowind;
-            nnz = GAstore->nnz;
-            if ( need_value ) a_GA = (double *) GAstore->nzval;
-            else assert(GAstore->nzval == NULL);
-	}
-
-        /* ------------------------------------------------------------
-           Find the row permutation for A.
-           ------------------------------------------------------------*/
-        if ( options->RowPerm != NO ) {
-	    t = SuperLU_timer_();
-	    if ( Fact != SamePattern_SameRowPerm ) {
-	        if ( options->RowPerm == MY_PERMR ) { /* Use user's perm_r. */
-	            /* Permute the global matrix GA for symbfact() */
-	            for (i = 0; i < colptr[n]; ++i) {
-	            	irow = rowind[i]; 
-		    	rowind[i] = perm_r[irow];
-	            }
-	        } else { /* options->RowPerm == LargeDiag */
-	            /* Get a new perm_r[] */
-	            if ( job == 5 ) {
-		        /* Allocate storage for scaling factors. */
-		        if ( !(R1 = doubleMalloc_dist(m)) )
-		            ABORT("SUPERLU_MALLOC fails for R1[]");
-		    	if ( !(C1 = doubleMalloc_dist(n)) )
-		            ABORT("SUPERLU_MALLOC fails for C1[]");
-	            }
-
-	            if ( !iam ) {
-		        /* Process 0 finds a row permutation */
-		        dldperm(job, m, nnz, colptr, rowind, a_GA,
-		                perm_r, R1, C1);
-		
-		        MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
-		        if ( job == 5 && Equil ) {
-		            MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
-		            MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
-		        }
-	            } else {
-		        MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
-		        if ( job == 5 && Equil ) {
-		            MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
-		            MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
-		        }
-	            }
-
-#if ( PRNTlevel>=2 )
-	            dmin = dlamch_("Overflow");
-	            dsum = 0.0;
-	            dprod = 1.0;
-#endif
-	            if ( job == 5 ) {
-		        if ( Equil ) {
-		            for (i = 0; i < n; ++i) {
-			        R1[i] = exp(R1[i]);
-			        C1[i] = exp(C1[i]);
-		            }
-
-		            /* Scale the distributed matrix */
-		            irow = fst_row;
-		            for (j = 0; j < m_loc; ++j) {
-			        for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
-			            icol = colind[i];
-			            a[i] *= R1[irow] * C1[icol];
-#if ( PRNTlevel>=2 )
-			            if ( perm_r[irow] == icol ) { /* New diagonal */
-			              if ( job == 2 || job == 3 )
-				        dmin = SUPERLU_MIN(dmin, fabs(a[i]));
-			              else if ( job == 4 )
-				        dsum += fabs(a[i]);
-			              else if ( job == 5 )
-				        dprod *= fabs(a[i]);
-			            }
-#endif
-			        }
-			        ++irow;
-		            }
-
-		            /* Multiply together the scaling factors. */
-		            if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i];
-		            else for (i = 0; i < m; ++i) R[i] = R1[i];
-		            if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i];
-		            else for (i = 0; i < n; ++i) C[i] = C1[i];
-		    
-		            ScalePermstruct->DiagScale = BOTH;
-		            rowequ = colequ = 1;
-
-		        } /* end Equil */
-
-                        /* Now permute global A to prepare for symbfact() */
-                        for (j = 0; j < n; ++j) {
-		            for (i = colptr[j]; i < colptr[j+1]; ++i) {
-	                        irow = rowind[i];
-		                rowind[i] = perm_r[irow];
-		            }
-		        }
-		        SUPERLU_FREE (R1);
-		        SUPERLU_FREE (C1);
-	            } else { /* job = 2,3,4 */
-		        for (j = 0; j < n; ++j) {
-		            for (i = colptr[j]; i < colptr[j+1]; ++i) {
-			        irow = rowind[i];
-			        rowind[i] = perm_r[irow];
-		            } /* end for i ... */
-		        } /* end for j ... */
-	            } /* end else job ... */
-
-#if ( PRNTlevel>=2 )
-	            if ( job == 2 || job == 3 ) {
-		        if ( !iam ) printf("\tsmallest diagonal %e\n", dmin);
-	            } else if ( job == 4 ) {
-		        if ( !iam ) printf("\tsum of diagonal %e\n", dsum);
-	            } else if ( job == 5 ) {
-		        if ( !iam ) printf("\t product of diagonal %e\n", dprod);
-	            }
-#endif
-	    
-                } /* end if options->RowPerm ... */
-
-	        t = SuperLU_timer_() - t;
-	        stat->utime[ROWPERM] = t;
-#if ( PRNTlevel>=1 )
-	        if ( !iam ) printf(".. LDPERM job %d\t time: %.2f\n", job, t);
-#endif
-            } /* end if Fact ... */
-        } else { /* options->RowPerm == NOROWPERM */
-            for (i = 0; i < m; ++i) perm_r[i] = i;
-        }
-
-#if ( DEBUGlevel>=2 )
-        if ( !iam ) PrintInt10("perm_r",  m, perm_r);
-#endif
-    } /* end if (!factored) */
-
-    if ( !factored || options->IterRefine ) {
-	/* Compute norm(A), which will be used to adjust small diagonal. */
-	if ( notran ) *(unsigned char *)norm = '1';
-	else *(unsigned char *)norm = 'I';
-	anorm = pdlangs(norm, A, grid);
-#if ( PRNTlevel>=1 )
-	if ( !iam ) printf(".. anorm %e\n", anorm);
-#endif
-    }
-
-    /* ------------------------------------------------------------
-       Perform the LU factorization.
-       ------------------------------------------------------------*/
-    if ( !factored ) {
-	t = SuperLU_timer_();
-	/*
-	 * Get column permutation vector perm_c[], according to permc_spec:
-	 *   permc_spec = NATURAL:  natural ordering 
-	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
-	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
-	 *   permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A
-	 *   permc_spec = PARMETIS: parallel METIS on structure of A'+A
-	 *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
-	 */
-	permc_spec = options->ColPerm;
-	parSymbFact = options->ParSymbFact;
-
-#if ( PRNTlevel>=1 )
-	if ( parSymbFact && permc_spec != PARMETIS )
-	    if ( !iam ) printf(".. Parallel symbolic factorization"
-			       " only works wth ParMetis!\n");
-#endif
-
-	if ( parSymbFact == YES || permc_spec == PARMETIS ) {	
-	    nprocs_num = grid->nprow * grid->npcol;
-  	    noDomains = (int) ( pow(2, ((int) LOG2( nprocs_num ))));
-
-	    /* create a new communicator for the first noDomains processors in
-	       grid->comm */
-	    key = iam;
-    	    if (iam < noDomains) col = 0;
-	    else col = MPI_UNDEFINED;
-	    MPI_Comm_split (grid->comm, col, key, &symb_comm );
-
-	    permc_spec = PARMETIS; /* only works with PARMETIS */
-        }
-
-	if ( permc_spec != MY_PERMC && Fact == DOFACT ) {
-	  if ( permc_spec == PARMETIS ) {
-	      /* Get column permutation vector in perm_c.                    *
-	       * This routine takes as input the distributed input matrix A  *
-	       * and does not modify it.  It also allocates memory for       *
-	       * sizes[] and fstVtxSep[] arrays, that contain information    *
-	       * on the separator tree computed by ParMETIS.                 */
-	      flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num,
-                                  	   noDomains, &sizes, &fstVtxSep,
-                                           grid, &symb_comm);
-	      if (flinfo > 0)
-	          ABORT("ERROR in get perm_c parmetis.");
-	  } else {
-	      get_perm_c_dist(iam, permc_spec, &GA, perm_c);
-          }
-        }
-
-	stat->utime[COLPERM] = SuperLU_timer_() - t;
-
-	/* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'
-	   (a.k.a. column etree), depending on the choice of ColPerm.
-	   Adjust perm_c[] to be consistent with a postorder of etree.
-	   Permute columns of A to form A*Pc'. */
-	if ( Fact != SamePattern_SameRowPerm ) {
-	    if ( parSymbFact == NO ) {
-	        int_t *GACcolbeg, *GACcolend, *GACrowind;
-
-	        sp_colorder(options, &GA, perm_c, etree, &GAC); 
-
-	        /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */
-	        GACstore = (NCPformat *) GAC.Store;
-	        GACcolbeg = GACstore->colbeg;
-	        GACcolend = GACstore->colend;
-	        GACrowind = GACstore->rowind;
-	        for (j = 0; j < n; ++j) {
-	            for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) {
-		        irow = GACrowind[i];
-		        GACrowind[i] = perm_c[irow];
-	            }
-	        }
-
-	        /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up
-                   the nonzero data structures for L & U. */
-#if ( PRNTlevel>=1 ) 
-                if ( !iam )
-		  printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n",
-		          sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
-#endif
-  	        t = SuperLU_timer_();
-	        if ( !(Glu_freeable = (Glu_freeable_t *)
-		      SUPERLU_MALLOC(sizeof(Glu_freeable_t))) )
-		    ABORT("Malloc fails for Glu_freeable.");
-
-	    	/* Every process does this. */
-	    	iinfo = symbfact(options, iam, &GAC, perm_c, etree, 
-			     	 Glu_persist, Glu_freeable);
-
-	    	stat->utime[SYMBFAC] = SuperLU_timer_() - t;
-	    	if ( iinfo < 0 ) { /* Successful return */
-		    QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage);
-#if ( PRNTlevel>=1 )
-		    if ( !iam ) {
-		    	printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1);
-		    	printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]);
-		    	printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]);
-		    	printf("\tint %d, short %d, float %d, double %d\n", 
-			       sizeof(int_t), sizeof(short), sizeof(float),
-			       sizeof(double));
-		    	printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n",
-			   	symb_mem_usage.for_lu*1e-6, 
-			   	symb_mem_usage.total*1e-6,
-			   	symb_mem_usage.expansions);
-		    }
-#endif
-	    	} else {
-		    if ( !iam ) {
-		        fprintf(stderr,"symbfact() error returns %d\n",iinfo);
-		    	exit(-1);
-		    }
-	        }
-	    } /* end if serial symbolic factorization */
-	    else {  /* parallel symbolic factorization */
-	    	t = SuperLU_timer_();
-	    	flinfo = symbfact_dist(nprocs_num, noDomains, A, perm_c, perm_r,
-				       sizes, fstVtxSep, &Pslu_freeable, 
-				       &(grid->comm), &symb_comm,
-				       &symb_mem_usage); 
-	    	stat->utime[SYMBFAC] = SuperLU_timer_() - t;
-	    	if (flinfo > 0) 
-	      	    ABORT("Insufficient memory for parallel symbolic factorization.");
-	    }
-	} /* end if Fact ... */
-
-#if ( PRNTlevel>=1 )
-	if (!iam) printf("\tSYMBfact time: %.2f\n", stat->utime[SYMBFAC]);
-#endif
-        if (sizes) SUPERLU_FREE (sizes);
-        if (fstVtxSep) SUPERLU_FREE (fstVtxSep);
-	if (symb_comm != MPI_COMM_NULL)
-	  MPI_Comm_free (&symb_comm); 
-
-	if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) {
-  	    /* Apply column permutation to the original distributed A */
-	    for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]];
-
-	    /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. 
-	       NOTE: the row permutation Pc*Pr is applied internally in the
-  	       distribution routine. */
-	    t = SuperLU_timer_();
-	    dist_mem_use = pddistribute(Fact, n, A, ScalePermstruct,
-                                      Glu_freeable, LUstruct, grid);
-	    stat->utime[DIST] = SuperLU_timer_() - t;
-
-  	    /* Deallocate storage used in symbolic factorization. */
-	    if ( Fact != SamePattern_SameRowPerm ) {
-	        iinfo = symbfact_SubFree(Glu_freeable);
-	        SUPERLU_FREE(Glu_freeable);
-	    }
-	} else {
-	    /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. 
-	       NOTE: the row permutation Pc*Pr is applied internally in the
-	       distribution routine. */
-	    /* Apply column permutation to the original distributed A */
-	    for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]];
-
-    	    t = SuperLU_timer_();
-	    dist_mem_use = ddist_psymbtonum(Fact, n, A, ScalePermstruct,
-		  			   &Pslu_freeable, LUstruct, grid);
-	    if (dist_mem_use > 0)
-	        ABORT ("Not enough memory available for dist_psymbtonum\n");
-	    stat->utime[DIST] = SuperLU_timer_() - t;
-	}
-
-#if ( PRNTlevel>=1 )
-	if (!iam) printf ("\tDISTRIBUTE time    %8.2f\n", stat->utime[DIST]);
-#endif
-
-	/* Perform numerical factorization in parallel. */
-	t = SuperLU_timer_();
-	pdgstrf(options, m, n, anorm, LUstruct, grid, stat, info);
-	stat->utime[FACT] = SuperLU_timer_() - t;
-
-#if ( PRNTlevel>=1 )
-	{
-	    int_t TinyPivots;
-	    float for_lu, total, max, avg, temp;
-	    dQuerySpace_dist(n, LUstruct, grid, &num_mem_usage);
-	    MPI_Reduce( &num_mem_usage.for_lu, &for_lu,
-		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-	    MPI_Reduce( &num_mem_usage.total, &total,
-		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-	    temp = SUPERLU_MAX(symb_mem_usage.total,
-			       symb_mem_usage.for_lu +
-			       (float)dist_mem_use + num_mem_usage.for_lu);
-	    if (parSymbFact == TRUE)
-	      /* The memory used in the redistribution routine
-		 includes the memory used for storing the symbolic
-		 structure and the memory allocated for numerical
-		 factorization */
-	      temp = SUPERLU_MAX(symb_mem_usage.total,
-				 (float)dist_mem_use);
-	    temp = SUPERLU_MAX(temp, num_mem_usage.total);
-	    MPI_Reduce( &temp, &max,
-		       1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
-	    MPI_Reduce( &temp, &avg,
-		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-	    MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t,
-			  MPI_SUM, grid->comm );
-	    stat->TinyPivots = TinyPivots;
-	    if ( !iam ) {
-		printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n",
-		       for_lu*1e-6, total*1e-6);
-		printf("\tAll space (MB):"
-		       "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n",
-		       avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6);
-		printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots);
-	    }
-	}
-#endif
-    
-        /* Destroy GA */
-        if ( Fact != SamePattern_SameRowPerm )
-            Destroy_CompCol_Matrix_dist(&GA);
-    } /* end if (!factored) */
-	
-    /* ------------------------------------------------------------
-       Compute the solution matrix X.
-       ------------------------------------------------------------*/
-    if ( nrhs ) {
-
-	if ( !(b_work = doubleMalloc_dist(n)) )
-	    ABORT("Malloc fails for b_work[]");
-
-	/* ------------------------------------------------------------
-	   Scale the right-hand side if equilibration was performed. 
-	   ------------------------------------------------------------*/
-	if ( notran ) {
-	    if ( rowequ ) {
-		b_col = B;
-		for (j = 0; j < nrhs; ++j) {
-		    irow = fst_row;
-		    for (i = 0; i < m_loc; ++i) {
-		        b_col[i] *= R[irow];
-		        ++irow;
-		    }
-		    b_col += ldb;
-		}
-	    }
-	} else if ( colequ ) {
-	    b_col = B;
-	    for (j = 0; j < nrhs; ++j) {
-	        irow = fst_row;
-		for (i = 0; i < m_loc; ++i) {
-		    b_col[i] *= C[irow];
-		    ++irow;
-		}
-		b_col += ldb;
-	    }
-	}
-
-	/* Save a copy of the right-hand side. */
-	ldx = ldb;
-	if ( !(X = doubleMalloc_dist(((size_t)ldx) * nrhs)) )
-	    ABORT("Malloc fails for X[]");
-	x_col = X;  b_col = B;
-	for (j = 0; j < nrhs; ++j) {
-	    for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i];
-	    x_col += ldx;  b_col += ldb;
-	}
-
-	/* ------------------------------------------------------------
-	   Solve the linear system.
-	   ------------------------------------------------------------*/
-	if ( options->SolveInitialized == NO ) {
-	    dSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, grid,
-		       SOLVEstruct);
-	}
-
-	pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, 
-		fst_row, ldb, nrhs, SOLVEstruct, stat, info);
-
-	/* ------------------------------------------------------------
-	   Use iterative refinement to improve the computed solution and
-	   compute error bounds and backward error estimates for it.
-	   ------------------------------------------------------------*/
-	if ( options->IterRefine ) {
-	    /* Improve the solution by iterative refinement. */
-	    int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv;
-	    SOLVEstruct_t *SOLVEstruct1;  /* Used by refinement. */
-
-	    t = SuperLU_timer_();
-	    if ( options->RefineInitialized == NO || Fact == DOFACT ) {
-	        /* All these cases need to re-initialize gsmv structure */
-	        if ( options->RefineInitialized )
-		    pdgsmv_finalize(SOLVEstruct->gsmv_comm);
-	        pdgsmv_init(A, SOLVEstruct->row_to_proc, grid,
-			    SOLVEstruct->gsmv_comm);
-	       
-                /* Save a copy of the transformed local col indices
-		   in colind_gsmv[]. */
-	        if ( colind_gsmv ) SUPERLU_FREE(colind_gsmv);
-	        if ( !(it = intMalloc_dist(nnz_loc)) )
-		    ABORT("Malloc fails for colind_gsmv[]");
-	        colind_gsmv = SOLVEstruct->A_colind_gsmv = it;
-	        for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i];
-	        options->RefineInitialized = YES;
-	    } else if ( Fact == SamePattern ||
-			Fact == SamePattern_SameRowPerm ) {
-	        double at;
-	        int_t k, jcol, p;
-	        /* Swap to beginning the part of A corresponding to the
-		   local part of X, as was done in pdgsmv_init() */
-	        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
-		    k = rowptr[i];
-		    for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
-		        jcol = colind[j];
-		        p = SOLVEstruct->row_to_proc[jcol];
-		        if ( p == iam ) { /* Local */
-		            at = a[k]; a[k] = a[j]; a[j] = at;
-		            ++k;
-		        }
-		    }
-	        }
-	      
-	        /* Re-use the local col indices of A obtained from the
-		   previous call to pdgsmv_init() */
-	        for (i = 0; i < nnz_loc; ++i) colind[i] = colind_gsmv[i];
-	    }
-
-	    if ( nrhs == 1 ) { /* Use the existing solve structure */
-	        SOLVEstruct1 = SOLVEstruct;
-	    } else { /* For nrhs > 1, since refinement is performed for RHS
-			one at a time, the communication structure for pdgstrs
-			is different than the solve with nrhs RHS. 
-			So we use SOLVEstruct1 for the refinement step.
-		     */
-	        if ( !(SOLVEstruct1 = (SOLVEstruct_t *) 
-		                       SUPERLU_MALLOC(sizeof(SOLVEstruct_t))) )
-		    ABORT("Malloc fails for SOLVEstruct1");
-	        /* Copy the same stuff */
-	        SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc;
-	        SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c;
-	        SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs;
-	        SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs;
-	        SOLVEstruct1->diag_len = SOLVEstruct->diag_len;
-	        SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm;
-	        SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv;
-		
-		/* Initialize the *gstrs_comm for 1 RHS. */
-		if ( !(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *)
-		       SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) )
-		    ABORT("Malloc fails for gstrs_comm[]");
-		pxgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid, 
-			     Glu_persist, SOLVEstruct1);
-	    }
-
-	    pdgsrfs(n, A, anorm, LUstruct, ScalePermstruct, grid,
-		    B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info);
-
-            /* Deallocate the storage associated with SOLVEstruct1 */
-	    if ( nrhs > 1 ) {
-	        pxgstrs_finalize(SOLVEstruct1->gstrs_comm);
-	        SUPERLU_FREE(SOLVEstruct1);
-	    }
-
-	    stat->utime[REFINE] = SuperLU_timer_() - t;
-	}
-
-	/* Permute the solution matrix B <= Pc'*X. */
-	pdPermute_Dense_Matrix(fst_row, m_loc, SOLVEstruct->row_to_proc,
-			       SOLVEstruct->inv_perm_c,
-			       X, ldx, B, ldb, nrhs, grid);
-#if ( DEBUGlevel>=2 )
-	printf("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam);
-	for (i = 0; i < m_loc; ++i)
-	  printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, B[i]);
-#endif
-	
-	/* Transform the solution matrix X to a solution of the original
-	   system before the equilibration. */
-	if ( notran ) {
-	    if ( colequ ) {
-		b_col = B;
-		for (j = 0; j < nrhs; ++j) {
-		    irow = fst_row;
-		    for (i = 0; i < m_loc; ++i) {
-		        b_col[i] *= C[irow];
-		        ++irow;
-		    }
-		    b_col += ldb;
-		}
-	    }
-	} else if ( rowequ ) {
-	    b_col = B;
-	    for (j = 0; j < nrhs; ++j) {
-	        irow = fst_row;
-		for (i = 0; i < m_loc; ++i) {
-		    b_col[i] *= R[irow];
-		    ++irow;
-		}
-		b_col += ldb;
-	    }
-	}
-
-	SUPERLU_FREE(b_work);
-	SUPERLU_FREE(X);
-
-    } /* end if nrhs != 0 */
-
-#if ( PRNTlevel>=1 )
-    if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale);
-#endif
-
-    /* Deallocate R and/or C if it was not used. */
-    if ( Equil && Fact != SamePattern_SameRowPerm ) {
-	switch ( ScalePermstruct->DiagScale ) {
-	    case NOEQUIL:
-	        SUPERLU_FREE(R);
-		SUPERLU_FREE(C);
-		break;
-	    case ROW: 
-		SUPERLU_FREE(C);
-		break;
-	    case COL: 
-		SUPERLU_FREE(R);
-		break;
-	}
-    }
-    if ( !factored && Fact != SamePattern_SameRowPerm && !parSymbFact)
- 	Destroy_CompCol_Permuted_dist(&GAC);
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pdgssvx()");
-#endif
-
-}
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgssvx.c.old hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgssvx.c.old
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgssvx.c.old	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgssvx.c.old	1970-01-01 00:00:00.000000000 +0000
@@ -1,1280 +0,0 @@
-
-
-#include <math.h>
-#include "superlu_ddefs.h"
-
-void
-pdgssvx(superlu_options_t *options, SuperMatrix *A, 
-	ScalePermstruct_t *ScalePermstruct,
-	double B[], int ldb, int nrhs, gridinfo_t *grid,
-	LUstruct_t *LUstruct, SOLVEstruct_t *SOLVEstruct, double *berr,
-	SuperLUStat_t *stat, int *info)
-{
-/* 
- * -- Distributed SuperLU routine (version 2.1) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * November 1, 2007
- *
- * Last update: 2/18/2008
- *
- *
- * Purpose
- * =======
- *
- * PDGSSVX solves a system of linear equations A*X=B,
- * by using Gaussian elimination with "static pivoting" to
- * compute the LU factorization of A.
- *
- * Static pivoting is a technique that combines the numerical stability
- * of partial pivoting with the scalability of Cholesky (no pivoting),
- * to run accurately and efficiently on large numbers of processors.
- * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
- * description of the parallel algorithms.
- *
- * The input matrices A and B are distributed by block rows.
- * Here is a graphical illustration (0-based indexing):
- *
- *                        A                B
- *               0 ---------------       ------
- *                   |           |        |  |
- *                   |           |   P0   |  |
- *                   |           |        |  |
- *                 ---------------       ------
- *        - fst_row->|           |        |  |
- *        |          |           |        |  |
- *       m_loc       |           |   P1   |  |
- *        |          |           |        |  |
- *        -          |           |        |  |
- *                 ---------------       ------
- *                   |    .      |        |. |
- *                   |    .      |        |. |
- *                   |    .      |        |. |
- *                 ---------------       ------
- * 
- * where, fst_row is the row number of the first row,
- *        m_loc is the number of rows local to this processor
- * These are defined in the 'SuperMatrix' structure, see supermatrix.h.
- *
- *
- * Here are the options for using this code:
- *
- *   1. Independent of all the other options specified below, the
- *      user must supply
- *
- *      -  B, the matrix of right-hand sides, distributed by block rows,
- *            and its dimensions ldb (local) and nrhs (global)
- *      -  grid, a structure describing the 2D processor mesh
- *      -  options->IterRefine, which determines whether or not to
- *            improve the accuracy of the computed solution using 
- *            iterative refinement
- *
- *      On output, B is overwritten with the solution X.
- *
- *   2. Depending on options->Fact, the user has four options
- *      for solving A*X=B. The standard option is for factoring
- *      A "from scratch". (The other options, described below,
- *      are used when A is sufficiently similar to a previously 
- *      solved problem to save time by reusing part or all of 
- *      the previous factorization.)
- *
- *      -  options->Fact = DOFACT: A is factored "from scratch"
- *
- *      In this case the user must also supply
- *
- *        o  A, the input matrix
- *
- *        as well as the following options to determine what matrix to
- *        factorize.
- *
- *        o  options->Equil,   to specify how to scale the rows and columns
- *                             of A to "equilibrate" it (to try to reduce its
- *                             condition number and so improve the
- *                             accuracy of the computed solution)
- *
- *        o  options->RowPerm, to specify how to permute the rows of A
- *                             (typically to control numerical stability)
- *
- *        o  options->ColPerm, to specify how to permute the columns of A
- *                             (typically to control fill-in and enhance
- *                             parallelism during factorization)
- *
- *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
- *                             pivots encountered during factorization
- *                             (to control numerical stability)
- *
- *      The outputs returned include
- *         
- *        o  ScalePermstruct,  modified to describe how the input matrix A
- *                             was equilibrated and permuted:
- *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
- *                                         columns of A were scaled
- *          .  ScalePermstruct->R, array of row scale factors
- *          .  ScalePermstruct->C, array of column scale factors
- *          .  ScalePermstruct->perm_r, row permutation vector
- *          .  ScalePermstruct->perm_c, column permutation vector
- *
- *          (part of ScalePermstruct may also need to be supplied on input,
- *           depending on options->RowPerm and options->ColPerm as described 
- *           later).
- *
- *        o  A, the input matrix A overwritten by the scaled and permuted
- *              matrix diag(R)*A*diag(C)*Pc^T, where 
- *              Pc is the row permutation matrix determined by
- *                  ScalePermstruct->perm_c
- *              diag(R) and diag(C) are diagonal scaling matrices determined
- *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and 
- *                  ScalePermstruct->C
- *
- *        o  LUstruct, which contains the L and U factorization of A1 where
- *
- *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
- *
- *               (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
- *                in A on output.)
- *
- *   3. The second value of options->Fact assumes that a matrix with the same
- *      sparsity pattern as A has already been factored:
- *     
- *      -  options->Fact = SamePattern: A is factored, assuming that it has
- *            the same nonzero pattern as a previously factored matrix. In
- *            this case the algorithm saves time by reusing the previously
- *            computed column permutation vector stored in
- *            ScalePermstruct->perm_c and the "elimination tree" of A
- *            stored in LUstruct->etree
- *
- *      In this case the user must still specify the following options
- *      as before:
- *
- *        o  options->Equil
- *        o  options->RowPerm
- *        o  options->ReplaceTinyPivot
- *
- *      but not options->ColPerm, whose value is ignored. This is because the
- *      previous column permutation from ScalePermstruct->perm_c is used as
- *      input. The user must also supply 
- *
- *        o  A, the input matrix
- *        o  ScalePermstruct->perm_c, the column permutation
- *        o  LUstruct->etree, the elimination tree
- *
- *      The outputs returned include
- *         
- *        o  A, the input matrix A overwritten by the scaled and permuted
- *              matrix as described above
- *        o  ScalePermstruct, modified to describe how the input matrix A was
- *                            equilibrated and row permuted
- *        o  LUstruct, modified to contain the new L and U factors
- *
- *   4. The third value of options->Fact assumes that a matrix B with the same
- *      sparsity pattern as A has already been factored, and where the
- *      row permutation of B can be reused for A. This is useful when A and B
- *      have similar numerical values, so that the same row permutation
- *      will make both factorizations numerically stable. This lets us reuse
- *      all of the previously computed structure of L and U.
- *
- *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
- *            assuming not only the same nonzero pattern as the previously
- *            factored matrix B, but reusing B's row permutation.
- *
- *      In this case the user must still specify the following options
- *      as before:
- *
- *        o  options->Equil
- *        o  options->ReplaceTinyPivot
- *
- *      but not options->RowPerm or options->ColPerm, whose values are
- *      ignored. This is because the permutations from ScalePermstruct->perm_r
- *      and ScalePermstruct->perm_c are used as input.
- *
- *      The user must also supply 
- *
- *        o  A, the input matrix
- *        o  ScalePermstruct->DiagScale, how the previous matrix was row
- *                                       and/or column scaled
- *        o  ScalePermstruct->R, the row scalings of the previous matrix,
- *                               if any
- *        o  ScalePermstruct->C, the columns scalings of the previous matrix, 
- *                               if any
- *        o  ScalePermstruct->perm_r, the row permutation of the previous
- *                                    matrix
- *        o  ScalePermstruct->perm_c, the column permutation of the previous 
- *                                    matrix
- *        o  all of LUstruct, the previously computed information about
- *                            L and U (the actual numerical values of L and U
- *                            stored in LUstruct->Llu are ignored)
- *
- *      The outputs returned include
- *         
- *        o  A, the input matrix A overwritten by the scaled and permuted
- *              matrix as described above
- *        o  ScalePermstruct,  modified to describe how the input matrix A was
- *                             equilibrated (thus ScalePermstruct->DiagScale,
- *                             R and C may be modified)
- *        o  LUstruct, modified to contain the new L and U factors
- *
- *   5. The fourth and last value of options->Fact assumes that A is
- *      identical to a matrix that has already been factored on a previous 
- *      call, and reuses its entire LU factorization
- *
- *      -  options->Fact = Factored: A is identical to a previously
- *            factorized matrix, so the entire previous factorization
- *            can be reused.
- *
- *      In this case all the other options mentioned above are ignored
- *      (options->Equil, options->RowPerm, options->ColPerm, 
- *       options->ReplaceTinyPivot)
- *
- *      The user must also supply 
- *
- *        o  A, the unfactored matrix, only in the case that iterative
- *              refinment is to be done (specifically A must be the output
- *              A from the previous call, so that it has been scaled and permuted)
- *        o  all of ScalePermstruct
- *        o  all of LUstruct, including the actual numerical values of
- *           L and U
- *
- *      all of which are unmodified on output.
- *         
- * Arguments
- * =========
- *
- * options (input) superlu_options_t* (global)
- *         The structure defines the input parameters to control
- *         how the LU decomposition will be performed.
- *         The following fields should be defined for this structure:
- *         
- *         o Fact (fact_t)
- *           Specifies whether or not the factored form of the matrix
- *           A is supplied on entry, and if not, how the matrix A should
- *           be factorized based on the previous history.
- *
- *           = DOFACT: The matrix A will be factorized from scratch.
- *                 Inputs:  A
- *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
- *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
- *                              permuted)
- *                          all of ScalePermstruct
- *                          all of LUstruct
- *
- *           = SamePattern: the matrix A will be factorized assuming
- *             that a factorization of a matrix with the same sparsity
- *             pattern was performed prior to this one. Therefore, this
- *             factorization will reuse column permutation vector 
- *             ScalePermstruct->perm_c and the elimination tree
- *             LUstruct->etree
- *                 Inputs:  A
- *                          options->Equil, RowPerm, ReplaceTinyPivot
- *                          ScalePermstruct->perm_c
- *                          LUstruct->etree
- *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
- *                              permuted)
- *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
- *                          rest of LUstruct (GLU_persist, Llu)
- *
- *           = SamePattern_SameRowPerm: the matrix A will be factorized
- *             assuming that a factorization of a matrix with the same
- *             sparsity	pattern and similar numerical values was performed
- *             prior to this one. Therefore, this factorization will reuse
- *             both row and column scaling factors R and C, and the
- *             both row and column permutation vectors perm_r and perm_c,
- *             distributed data structure set up from the previous symbolic
- *             factorization.
- *                 Inputs:  A
- *                          options->Equil, ReplaceTinyPivot
- *                          all of ScalePermstruct
- *                          all of LUstruct
- *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
- *                              permuted)
- *                          modified LUstruct->Llu
- *           = FACTORED: the matrix A is already factored.
- *                 Inputs:  all of ScalePermstruct
- *                          all of LUstruct
- *
- *         o Equil (yes_no_t)
- *           Specifies whether to equilibrate the system.
- *           = NO:  no equilibration.
- *           = YES: scaling factors are computed to equilibrate the system:
- *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
- *                  Whether or not the system will be equilibrated depends
- *                  on the scaling of the matrix A, but if equilibration is
- *                  used, A is overwritten by diag(R)*A*diag(C) and B by
- *                  diag(R)*B.
- *
- *         o RowPerm (rowperm_t)
- *           Specifies how to permute rows of the matrix A.
- *           = NATURAL:   use the natural ordering.
- *           = LargeDiag: use the Duff/Koster algorithm to permute rows of
- *                        the original matrix to make the diagonal large
- *                        relative to the off-diagonal.
- *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
- *                        input by the user.
- *           
- *         o ColPerm (colperm_t)
- *           Specifies what type of column permutation to use to reduce fill.
- *           = NATURAL:       natural ordering.
- *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
- *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
- *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
- *         
- *         o ReplaceTinyPivot (yes_no_t)
- *           = NO:  do not modify pivots
- *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during 
- *                  LU factorization.
- *
- *         o IterRefine (IterRefine_t)
- *           Specifies how to perform iterative refinement.
- *           = NO:     no iterative refinement.
- *           = DOUBLE: accumulate residual in double precision.
- *           = EXTRA:  accumulate residual in extra precision.
- *
- *         NOTE: all options must be indentical on all processes when
- *               calling this routine.
- *
- * A (input/output) SuperMatrix* (local)
- *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
- *           The number of linear equations is A->nrow. The type of A must be:
- *           Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
- *           That is, A is stored in distributed compressed row format.
- *           See supermatrix.h for the definition of 'SuperMatrix'.
- *           This routine only handles square A, however, the LU factorization
- *           routine PDGSTRF can factorize rectangular matrices.
- *         On exit, A may be overwtirren by diag(R)*A*diag(C)*Pc^T,
- *           depending on ScalePermstruct->DiagScale and options->ColPerm:
- *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
- *                diag(R)*A*diag(C).
- *             if options->ColPerm != NATURAL, A is further overwritten by
- *                diag(R)*A*diag(C)*Pc^T.
- *           If all the above condition are true, the LU decomposition is
- *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
- *
- * ScalePermstruct (input/output) ScalePermstruct_t* (global)
- *         The data structure to store the scaling and permutation vectors
- *         describing the transformations performed to the matrix A.
- *         It contains the following fields:
- *
- *         o DiagScale (DiagScale_t)
- *           Specifies the form of equilibration that was done.
- *           = NOEQUIL: no equilibration.
- *           = ROW:     row equilibration, i.e., A was premultiplied by
- *                      diag(R).
- *           = COL:     Column equilibration, i.e., A was postmultiplied
- *                      by diag(C).
- *           = BOTH:    both row and column equilibration, i.e., A was 
- *                      replaced by diag(R)*A*diag(C).
- *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
- *           DiagScale is an input argument; otherwise it is an output
- *           argument.
- *
- *         o perm_r (int*)
- *           Row permutation vector, which defines the permutation matrix Pr;
- *           perm_r[i] = j means row i of A is in position j in Pr*A.
- *           If options->RowPerm = MY_PERMR, or
- *           options->Fact = SamePattern_SameRowPerm, perm_r is an
- *           input argument; otherwise it is an output argument.
- *
- *         o perm_c (int*)
- *           Column permutation vector, which defines the 
- *           permutation matrix Pc; perm_c[i] = j means column i of A is 
- *           in position j in A*Pc.
- *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
- *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
- *           input argument; otherwise, it is an output argument.
- *           On exit, perm_c may be overwritten by the product of the input
- *           perm_c and a permutation that postorders the elimination tree
- *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
- *           is already in postorder.
- *
- *         o R (double*) dimension (A->nrow)
- *           The row scale factors for A.
- *           If DiagScale = ROW or BOTH, A is multiplied on the left by 
- *                          diag(R).
- *           If DiagScale = NOEQUIL or COL, R is not defined.
- *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
- *           an input argument; otherwise, R is an output argument.
- *
- *         o C (double*) dimension (A->ncol)
- *           The column scale factors for A.
- *           If DiagScale = COL or BOTH, A is multiplied on the right by 
- *                          diag(C).
- *           If DiagScale = NOEQUIL or ROW, C is not defined.
- *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
- *           an input argument; otherwise, C is an output argument.
- *         
- * B       (input/output) double* (local)
- *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
- *           where, m_loc is the number of rows stored locally on my
- *           process and is defined in the data structure of matrix A.
- *         On exit, the solution matrix if info = 0;
- *
- * ldb     (input) int (local)
- *         The leading dimension of matrix B.
- *
- * nrhs    (input) int (global)
- *         The number of right-hand sides.
- *         If nrhs = 0, only LU decomposition is performed, the forward
- *         and back substitutions are skipped.
- *
- * grid    (input) gridinfo_t* (global)
- *         The 2D process mesh. It contains the MPI communicator, the number
- *         of process rows (NPROW), the number of process columns (NPCOL),
- *         and my process rank. It is an input argument to all the
- *         parallel routines.
- *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
- *         See superlu_ddefs.h for the definition of 'gridinfo_t'.
- *
- * LUstruct (input/output) LUstruct_t*
- *         The data structures to store the distributed L and U factors.
- *         It contains the following fields:
- *
- *         o etree (int*) dimension (A->ncol) (global)
- *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
- *           It is computed in sp_colorder() during the first factorization,
- *           and is reused in the subsequent factorizations of the matrices
- *           with the same nonzero pattern.
- *           On exit of sp_colorder(), the columns of A are permuted so that
- *           the etree is in a certain postorder. This postorder is reflected
- *           in ScalePermstruct->perm_c.
- *           NOTE:
- *           Etree is a vector of parent pointers for a forest whose vertices
- *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
- *
- *         o Glu_persist (Glu_persist_t*) (global)
- *           Global data structure (xsup, supno) replicated on all processes,
- *           describing the supernode partition in the factored matrices
- *           L and U:
- *	       xsup[s] is the leading column of the s-th supernode,
- *             supno[i] is the supernode number to which column i belongs.
- *
- *         o Llu (LocalLU_t*) (local)
- *           The distributed data structures to store L and U factors.
- *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
- *
- * SOLVEstruct (input/output) SOLVEstruct_t*
- *         The data structure to hold the communication pattern used
- *         in the phases of triangular solution and iterative refinement.
- *         This pattern should be intialized only once for repeated solutions.
- *         If options->SolveInitialized = YES, it is an input argument.
- *         If options->SolveInitialized = NO and nrhs != 0, it is an output
- *         argument. See superlu_ddefs.h for the definition of 'SOLVEstruct_t'.
- *
- * berr    (output) double*, dimension (nrhs) (global)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
- *         any element of A or B that makes X(j) an exact solution).
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics on runtime and floating-point operation count.
- *        See util.h for the definition of 'SuperLUStat_t'.
- *
- * info    (output) int*
- *         = 0: successful exit
- *         > 0: if info = i, and i is
- *             <= A->ncol: U(i,i) is exactly zero. The factorization has
- *                been completed, but the factor U is exactly singular,
- *                so the solution could not be computed.
- *             > A->ncol: number of bytes allocated when memory allocation
- *                failure occurred, plus A->ncol.
- *
- * See superlu_ddefs.h for the definitions of varioous data types.
- *
- */
-    NRformat_loc *Astore;
-    SuperMatrix GA;      /* Global A in NC format */
-    NCformat *GAstore;
-    double   *a_GA;
-    SuperMatrix GAC;      /* Global A in NCP format (add n end pointers) */
-    NCPformat *GACstore;
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    Glu_freeable_t *Glu_freeable;
-            /* The nonzero structures of L and U factors, which are
-	       replicated on all processrs.
-	           (lsub, xlsub) contains the compressed subscript of
-		                 supernodes in L.
-          	   (usub, xusub) contains the compressed subscript of
-		                 nonzero segments in U.
-	      If options->Fact != SamePattern_SameRowPerm, they are 
-	      computed by SYMBFACT routine, and then used by PDDISTRIBUTE
-	      routine. They will be freed after PDDISTRIBUTE routine.
-	      If options->Fact == SamePattern_SameRowPerm, these
-	      structures are not used.                                  */
-    fact_t   Fact;
-    double   *a;
-    int_t    *colptr, *rowind;
-    int_t    *perm_r; /* row permutations from partial pivoting */
-    int_t    *perm_c; /* column permutation vector */
-    int_t    *etree;  /* elimination tree */
-    int_t    *rowptr, *colind;  /* Local A in NR*/
-    int_t    *rowind_loc, *colptr_loc;
-    int_t    colequ, Equil, factored, job, notran, rowequ, need_value;
-    int_t    i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use;
-    int_t    nnz_loc, m_loc, fst_row, icol;
-    int      iam;
-    int      ldx;  /* LDA for matrix X (local). */
-    char     equed[1], norm[1];
-    double   *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
-    double   *X, *b_col, *b_work, *x_col;
-    double   t;
-    static mem_usage_t num_mem_usage, symb_mem_usage;
-#if ( PRNTlevel>= 2 )
-    double   dmin, dsum, dprod;
-#endif
-    int_t procs;
-
-    /* Structures needed for parallel symbolic factorization */
-    int_t *sizes, *fstVtxSep, parSymbFact;
-    int   noDomains, nprocs_num;
-    MPI_Comm symb_comm; /* communicator for symbolic factorization */
-    int   col, key; /* parameters for creating a new communicator */
-    Pslu_freeable_t Pslu_freeable;
-    float  flinfo;
-
-    /* Initialization. */
-    m       = A->nrow;
-    n       = A->ncol;
-    Astore  = (NRformat_loc *) A->Store;
-    nnz_loc = Astore->nnz_loc;
-    m_loc   = Astore->m_loc;
-    fst_row = Astore->fst_row;
-    a       = (double *) Astore->nzval;
-    rowptr  = Astore->rowptr;
-    colind  = Astore->colind;
-    sizes   = NULL;
-    fstVtxSep = NULL;
-    symb_comm = MPI_COMM_NULL;
-
-    /* Test the input parameters. */
-    *info = 0;
-    Fact = options->Fact;
-    if ( Fact < 0 || Fact > FACTORED )
-	*info = -1;
-    else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR )
-	*info = -1;
-    else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC )
-	*info = -1;
-    else if ( options->IterRefine < 0 || options->IterRefine > EXTRA )
-	*info = -1;
-    else if ( options->IterRefine == EXTRA ) {
-	*info = -1;
-	fprintf(stderr, "Extra precise iterative refinement yet to support.");
-    } else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
-		|| A->Dtype != SLU_D || A->Mtype != SLU_GE )
-	*info = -2;
-    else if ( ldb < m_loc )
-	*info = -5;
-    else if ( nrhs < 0 )
-	*info = -6;
-    if ( *info ) {
-	i = -(*info);
-	pxerbla("pdgssvx", grid, -*info);
-	return;
-    }
-
-    factored = (Fact == FACTORED);
-    Equil = (!factored && options->Equil == YES);
-    notran = (options->Trans == NOTRANS);
-    iam = grid->iam;
-    job = 5;
-    if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) {
-	rowequ = (ScalePermstruct->DiagScale == ROW) ||
-	         (ScalePermstruct->DiagScale == BOTH);
-	colequ = (ScalePermstruct->DiagScale == COL) ||
-	         (ScalePermstruct->DiagScale == BOTH);
-    } else rowequ = colequ = FALSE;
-
-    /* The following arrays are replicated on all processes. */
-    perm_r = ScalePermstruct->perm_r;
-    perm_c = ScalePermstruct->perm_c;
-    etree = LUstruct->etree;
-    R = ScalePermstruct->R;
-    C = ScalePermstruct->C;
-    /********/
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pdgssvx()");
-#endif
-
-    /* Not factored & ask for equilibration */
-    if ( Equil && Fact != SamePattern_SameRowPerm ) { 
-	/* Allocate storage if not done so before. */
-	switch ( ScalePermstruct->DiagScale ) {
-	    case NOEQUIL:
-		if ( !(R = (double *) doubleMalloc_dist(m)) )
-		    ABORT("Malloc fails for R[].");
-	        if ( !(C = (double *) doubleMalloc_dist(n)) )
-		    ABORT("Malloc fails for C[].");
-		ScalePermstruct->R = R;
-		ScalePermstruct->C = C;
-		break;
-	    case ROW: 
-	        if ( !(C = (double *) doubleMalloc_dist(n)) )
-		    ABORT("Malloc fails for C[].");
-		ScalePermstruct->C = C;
-		break;
-	    case COL: 
-		if ( !(R = (double *) doubleMalloc_dist(m)) )
-		    ABORT("Malloc fails for R[].");
-		ScalePermstruct->R = R;
-		break;
-	}
-    }
-
-    /* ------------------------------------------------------------
-       Diagonal scaling to equilibrate the matrix.
-       ------------------------------------------------------------*/
-    if ( Equil ) {
-#if ( DEBUGlevel>=1 )
-	CHECK_MALLOC(iam, "Enter equil");
-#endif
-	t = SuperLU_timer_();
-
-	if ( Fact == SamePattern_SameRowPerm ) {
-	    /* Reuse R and C. */
-	    switch ( ScalePermstruct->DiagScale ) {
-	      case NOEQUIL:
-		break;
-	      case ROW:
-		irow = fst_row;
-		for (j = 0; j < m_loc; ++j) {
-		    for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
-			a[i] *= R[irow];       /* Scale rows. */
-		    }
-		    ++irow;
-		}
-		break;
-	      case COL:
-		for (j = 0; j < m_loc; ++j)
-		    for (i = rowptr[j]; i < rowptr[j+1]; ++i){
-		        icol = colind[i];
-			a[i] *= C[icol];          /* Scale columns. */
-		    }
-		break;
-	      case BOTH:
-		irow = fst_row;
-		for (j = 0; j < m_loc; ++j) {
-		    for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
-			icol = colind[i];
-			a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */
-		    }
-		    ++irow;
-		}
-	        break;
-	    }
-	} else { /* Compute R & C from scratch */
-            /* Compute the row and column scalings. */
-	    pdgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid);
-
-	    /* Equilibrate matrix A if it is badly-scaled. */
-	    pdlaqgs(A, R, C, rowcnd, colcnd, amax, equed);
-
-	    if ( lsame_(equed, "R") ) {
-		ScalePermstruct->DiagScale = rowequ = ROW;
-	    } else if ( lsame_(equed, "C") ) {
-		ScalePermstruct->DiagScale = colequ = COL;
-	    } else if ( lsame_(equed, "B") ) {
-		ScalePermstruct->DiagScale = BOTH;
-		rowequ = ROW;
-		colequ = COL;
-	    } else ScalePermstruct->DiagScale = NOEQUIL;
-
-#if ( PRNTlevel>=1 )
-	    if ( !iam ) {
-		printf(".. equilibrated? *equed = %c\n", *equed);
-		/*fflush(stdout);*/
-	    }
-#endif
-	} /* if Fact ... */
-
-	stat->utime[EQUIL] = SuperLU_timer_() - t;
-#if ( DEBUGlevel>=1 )
-	CHECK_MALLOC(iam, "Exit equil");
-#endif
-    } /* if Equil ... */
-
-    if ( !factored ) { /* Skip this if already factored. */
-        /*
-         * Gather A from the distributed compressed row format to
-         * global A in compressed column format.
-         * Numerical values are gathered only when a row permutation
-         * for large diagonal is sought after.
-         */
-	if ( Fact != SamePattern_SameRowPerm ) {
-            need_value = (options->RowPerm == LargeDiag);
-            pdCompRow_loc_to_CompCol_global(need_value, A, grid, &GA);
-            GAstore = (NCformat *) GA.Store;
-            colptr = GAstore->colptr;
-            rowind = GAstore->rowind;
-            nnz = GAstore->nnz;
-            if ( need_value ) a_GA = (double *) GAstore->nzval;
-            else assert(GAstore->nzval == NULL);
-	}
-
-        /* ------------------------------------------------------------
-           Find the row permutation for A.
-           ------------------------------------------------------------*/
-        if ( options->RowPerm != NO ) {
-	    t = SuperLU_timer_();
-	    if ( Fact != SamePattern_SameRowPerm ) {
-	        if ( options->RowPerm == MY_PERMR ) { /* Use user's perm_r. */
-	            /* Permute the global matrix GA for symbfact() */
-	            for (i = 0; i < colptr[n]; ++i) {
-	            	irow = rowind[i]; 
-		    	rowind[i] = perm_r[irow];
-	            }
-	        } else { /* options->RowPerm == LargeDiag */
-	            /* Get a new perm_r[] */
-	            if ( job == 5 ) {
-		        /* Allocate storage for scaling factors. */
-		        if ( !(R1 = doubleMalloc_dist(m)) )
-		            ABORT("SUPERLU_MALLOC fails for R1[]");
-		    	if ( !(C1 = doubleMalloc_dist(n)) )
-		            ABORT("SUPERLU_MALLOC fails for C1[]");
-	            }
-
-	            if ( !iam ) {
-		        /* Process 0 finds a row permutation */
-		        dldperm(job, m, nnz, colptr, rowind, a_GA,
-		                perm_r, R1, C1);
-		
-		        MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
-		        if ( job == 5 && Equil ) {
-		            MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
-		            MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
-		        }
-	            } else {
-		        MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
-		        if ( job == 5 && Equil ) {
-		            MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
-		            MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
-		        }
-	            }
-
-#if ( PRNTlevel>=2 )
-	            dmin = dlamch_("Overflow");
-	            dsum = 0.0;
-	            dprod = 1.0;
-#endif
-	            if ( job == 5 ) {
-		        if ( Equil ) {
-		            for (i = 0; i < n; ++i) {
-			        R1[i] = exp(R1[i]);
-			        C1[i] = exp(C1[i]);
-		            }
-
-		            /* Scale the distributed matrix */
-		            irow = fst_row;
-		            for (j = 0; j < m_loc; ++j) {
-			        for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
-			            icol = colind[i];
-			            a[i] *= R1[irow] * C1[icol];
-#if ( PRNTlevel>=2 )
-			            if ( perm_r[irow] == icol ) { /* New diagonal */
-			              if ( job == 2 || job == 3 )
-				        dmin = SUPERLU_MIN(dmin, fabs(a[i]));
-			              else if ( job == 4 )
-				        dsum += fabs(a[i]);
-			              else if ( job == 5 )
-				        dprod *= fabs(a[i]);
-			            }
-#endif
-			        }
-			        ++irow;
-		            }
-
-		            /* Multiply together the scaling factors. */
-		            if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i];
-		            else for (i = 0; i < m; ++i) R[i] = R1[i];
-		            if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i];
-		            else for (i = 0; i < n; ++i) C[i] = C1[i];
-		    
-		            ScalePermstruct->DiagScale = BOTH;
-		            rowequ = colequ = 1;
-
-		        } /* end Equil */
-
-                        /* Now permute global A to prepare for symbfact() */
-                        for (j = 0; j < n; ++j) {
-		            for (i = colptr[j]; i < colptr[j+1]; ++i) {
-	                        irow = rowind[i];
-		                rowind[i] = perm_r[irow];
-		            }
-		        }
-		        SUPERLU_FREE (R1);
-		        SUPERLU_FREE (C1);
-	            } else { /* job = 2,3,4 */
-		        for (j = 0; j < n; ++j) {
-		            for (i = colptr[j]; i < colptr[j+1]; ++i) {
-			        irow = rowind[i];
-			        rowind[i] = perm_r[irow];
-		            } /* end for i ... */
-		        } /* end for j ... */
-	            } /* end else job ... */
-
-#if ( PRNTlevel>=2 )
-	            if ( job == 2 || job == 3 ) {
-		        if ( !iam ) printf("\tsmallest diagonal %e\n", dmin);
-	            } else if ( job == 4 ) {
-		        if ( !iam ) printf("\tsum of diagonal %e\n", dsum);
-	            } else if ( job == 5 ) {
-		        if ( !iam ) printf("\t product of diagonal %e\n", dprod);
-	            }
-#endif
-	    
-                } /* end if options->RowPerm ... */
-
-	        t = SuperLU_timer_() - t;
-	        stat->utime[ROWPERM] = t;
-#if ( PRNTlevel>=1 )
-	        if ( !iam ) printf(".. LDPERM job %d\t time: %.2f\n", job, t);
-#endif
-            } /* end if Fact ... */
-        } else { /* options->RowPerm == NOROWPERM */
-            for (i = 0; i < m; ++i) perm_r[i] = i;
-        }
-
-#if ( DEBUGlevel>=2 )
-        if ( !iam ) PrintInt10("perm_r",  m, perm_r);
-#endif
-    } /* end if (!factored) */
-
-    if ( !factored || options->IterRefine ) {
-	/* Compute norm(A), which will be used to adjust small diagonal. */
-	if ( notran ) *(unsigned char *)norm = '1';
-	else *(unsigned char *)norm = 'I';
-	anorm = pdlangs(norm, A, grid);
-#if ( PRNTlevel>=1 )
-	if ( !iam ) printf(".. anorm %e\n", anorm);
-#endif
-    }
-
-    /* ------------------------------------------------------------
-       Perform the LU factorization.
-       ------------------------------------------------------------*/
-    if ( !factored ) {
-	t = SuperLU_timer_();
-	/*
-	 * Get column permutation vector perm_c[], according to permc_spec:
-	 *   permc_spec = NATURAL:  natural ordering 
-	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
-	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
-	 *   permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A
-	 *   permc_spec = PARMETIS: parallel METIS on structure of A'+A
-	 *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
-	 */
-	permc_spec = options->ColPerm;
-	parSymbFact = options->ParSymbFact;
-
-#if ( PRNTlevel>=1 )
-	if ( parSymbFact && permc_spec != PARMETIS )
-	    if ( !iam ) printf(".. Parallel symbolic factorization"
-			       " only works wth ParMetis!\n");
-#endif
-
-	if ( parSymbFact == YES || permc_spec == PARMETIS ) {	
-	    nprocs_num = grid->nprow * grid->npcol;
-  	    noDomains = (int) ( pow(2, ((int) log2( (double)nprocs_num ))));
-
-	    /* create a new communicator for the first noDomains processors in
-	       grid->comm */
-	    key = iam;
-    	    if (iam < noDomains) col = 0;
-	    else col = MPI_UNDEFINED;
-	    MPI_Comm_split (grid->comm, col, key, &symb_comm );
-
-	    permc_spec = PARMETIS; /* only works with PARMETIS */
-        }
-
-	if ( permc_spec != MY_PERMC && Fact == DOFACT ) {
-	  if ( permc_spec == PARMETIS ) {
-	      /* Get column permutation vector in perm_c.                    *
-	       * This routine takes as input the distributed input matrix A  *
-	       * and does not modify it.  It also allocates memory for       *
-	       * sizes[] and fstVtxSep[] arrays, that contain information    *
-	       * on the separator tree computed by ParMETIS.                 */
-	      flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num,
-                                  	   noDomains, &sizes, &fstVtxSep,
-                                           grid, &symb_comm);
-	      if (flinfo > 0)
-	          ABORT("ERROR in get perm_c parmetis.");
-	  } else {
-	      get_perm_c_dist(iam, permc_spec, &GA, perm_c);
-          }
-        }
-
-	stat->utime[COLPERM] = SuperLU_timer_() - t;
-
-	/* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'
-	   (a.k.a. column etree), depending on the choice of ColPerm.
-	   Adjust perm_c[] to be consistent with a postorder of etree.
-	   Permute columns of A to form A*Pc'. */
-	if ( Fact != SamePattern_SameRowPerm ) {
-	    if ( parSymbFact == NO ) {
-	        int_t *GACcolbeg, *GACcolend, *GACrowind;
-
-	        sp_colorder(options, &GA, perm_c, etree, &GAC); 
-
-	        /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */
-	        GACstore = (NCPformat *) GAC.Store;
-	        GACcolbeg = GACstore->colbeg;
-	        GACcolend = GACstore->colend;
-	        GACrowind = GACstore->rowind;
-	        for (j = 0; j < n; ++j) {
-	            for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) {
-		        irow = GACrowind[i];
-		        GACrowind[i] = perm_c[irow];
-	            }
-	        }
-
-	        /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up
-                   the nonzero data structures for L & U. */
-#if ( PRNTlevel>=1 ) 
-                if ( !iam )
-		  printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n",
-		          sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
-#endif
-  	        t = SuperLU_timer_();
-	        if ( !(Glu_freeable = (Glu_freeable_t *)
-		      SUPERLU_MALLOC(sizeof(Glu_freeable_t))) )
-		    ABORT("Malloc fails for Glu_freeable.");
-
-	    	/* Every process does this. */
-	    	iinfo = symbfact(options, iam, &GAC, perm_c, etree, 
-			     	 Glu_persist, Glu_freeable);
-
-	    	stat->utime[SYMBFAC] = SuperLU_timer_() - t;
-	    	if ( iinfo < 0 ) { /* Successful return */
-		    QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage);
-#if ( PRNTlevel>=1 )
-		    if ( !iam ) {
-		    	printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1);
-		    	printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]);
-		    	printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]);
-		    	printf("\tint %d, short %d, float %d, double %d\n", 
-			       sizeof(int_t), sizeof(short), sizeof(float),
-			       sizeof(double));
-		    	printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n",
-			   	symb_mem_usage.for_lu*1e-6, 
-			   	symb_mem_usage.total*1e-6,
-			   	symb_mem_usage.expansions);
-		    }
-#endif
-	    	} else {
-		    if ( !iam ) {
-		        fprintf(stderr,"symbfact() error returns %d\n",iinfo);
-		    	exit(-1);
-		    }
-	        }
-	    } /* end if serial symbolic factorization */
-	    else {  /* parallel symbolic factorization */
-	    	t = SuperLU_timer_();
-	    	flinfo = symbfact_dist(nprocs_num, noDomains, A, perm_c, perm_r,
-				       sizes, fstVtxSep, &Pslu_freeable, 
-				       &(grid->comm), &symb_comm,
-				       &symb_mem_usage); 
-	    	stat->utime[SYMBFAC] = SuperLU_timer_() - t;
-	    	if (flinfo > 0) 
-	      	    ABORT("Insufficient memory for parallel symbolic factorization.");
-	    }
-	} /* end if Fact ... */
-
-	if (!iam) printf("\tSYMBfact time: %.2f\n", stat->utime[SYMBFAC]);
-        if (sizes) SUPERLU_FREE (sizes);
-        if (fstVtxSep) SUPERLU_FREE (fstVtxSep);
-	if (symb_comm != MPI_COMM_NULL)
-	  MPI_Comm_free (&symb_comm); 
-
-	if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) {
-  	    /* Apply column permutation to the original distributed A */
-	    for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]];
-
-	    /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. 
-	       NOTE: the row permutation Pc*Pr is applied internally in the
-  	       distribution routine. */
-	    t = SuperLU_timer_();
-	    dist_mem_use = pddistribute(Fact, n, A, ScalePermstruct,
-                                      Glu_freeable, LUstruct, grid);
-	    stat->utime[DIST] = SuperLU_timer_() - t;
-
-  	    /* Deallocate storage used in symbolic factorization. */
-	    if ( Fact != SamePattern_SameRowPerm ) {
-	        iinfo = symbfact_SubFree(Glu_freeable);
-	        SUPERLU_FREE(Glu_freeable);
-	    }
-	} else {
-	    /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. 
-	       NOTE: the row permutation Pc*Pr is applied internally in the
-	       distribution routine. */
-	    /* Apply column permutation to the original distributed A */
-	    for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]];
-
-    	    t = SuperLU_timer_();
-	    dist_mem_use = ddist_psymbtonum(Fact, n, A, ScalePermstruct,
-		  			   &Pslu_freeable, LUstruct, grid);
-	    if (dist_mem_use > 0)
-	        ABORT ("Not enough memory available for dist_psymbtonum\n");
-	    stat->utime[DIST] = SuperLU_timer_() - t;
-	}
-
-	if (!iam) printf ("\tDISTRIBUTE time    %8.2f\n", stat->utime[DIST]);
-
-	/* Perform numerical factorization in parallel. */
-	t = SuperLU_timer_();
-	pdgstrf(options, m, n, anorm, LUstruct, grid, stat, info);
-	stat->utime[FACT] = SuperLU_timer_() - t;
-
-#if ( PRNTlevel>=1 )
-	{
-	    int_t TinyPivots;
-	    float for_lu, total, max, avg, temp;
-	    dQuerySpace_dist(n, LUstruct, grid, &num_mem_usage);
-	    MPI_Reduce( &num_mem_usage.for_lu, &for_lu,
-		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-	    MPI_Reduce( &num_mem_usage.total, &total,
-		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-	    temp = SUPERLU_MAX(symb_mem_usage.total,
-			       symb_mem_usage.for_lu +
-			       (float)dist_mem_use + num_mem_usage.for_lu);
-	    if (parSymbFact == TRUE)
-	      /* The memory used in the redistribution routine
-		 includes the memory used for storing the symbolic
-		 structure and the memory allocated for numerical
-		 factorization */
-	      temp = SUPERLU_MAX(symb_mem_usage.total,
-				 (float)dist_mem_use);
-	    temp = SUPERLU_MAX(temp, num_mem_usage.total);
-	    MPI_Reduce( &temp, &max,
-		       1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
-	    MPI_Reduce( &temp, &avg,
-		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-	    MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t,
-			  MPI_SUM, grid->comm );
-	    stat->TinyPivots = TinyPivots;
-	    if ( !iam ) {
-		printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n",
-		       for_lu*1e-6, total*1e-6);
-		printf("\tAll space (MB):"
-		       "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n",
-		       avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6);
-		printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots);
-	    }
-	}
-#endif
-    
-        /* Destroy GA */
-        if ( Fact != SamePattern_SameRowPerm )
-            Destroy_CompCol_Matrix_dist(&GA);
-    } /* end if (!factored) */
-	
-    /* ------------------------------------------------------------
-       Compute the solution matrix X.
-       ------------------------------------------------------------*/
-    if ( nrhs ) {
-
-	if ( !(b_work = doubleMalloc_dist(n)) )
-	    ABORT("Malloc fails for b_work[]");
-
-	/* ------------------------------------------------------------
-	   Scale the right-hand side if equilibration was performed. 
-	   ------------------------------------------------------------*/
-	if ( notran ) {
-	    if ( rowequ ) {
-		b_col = B;
-		for (j = 0; j < nrhs; ++j) {
-		    irow = fst_row;
-		    for (i = 0; i < m_loc; ++i) {
-		        b_col[i] *= R[irow];
-		        ++irow;
-		    }
-		    b_col += ldb;
-		}
-	    }
-	} else if ( colequ ) {
-	    b_col = B;
-	    for (j = 0; j < nrhs; ++j) {
-	        irow = fst_row;
-		for (i = 0; i < m_loc; ++i) {
-		    b_col[i] *= C[irow];
-		    ++irow;
-		}
-		b_col += ldb;
-	    }
-	}
-
-	/* Save a copy of the right-hand side. */
-	ldx = ldb;
-	if ( !(X = doubleMalloc_dist(((size_t)ldx) * nrhs)) )
-	    ABORT("Malloc fails for X[]");
-	x_col = X;  b_col = B;
-	for (j = 0; j < nrhs; ++j) {
-	    for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i];
-	    x_col += ldx;  b_col += ldb;
-	}
-
-	/* ------------------------------------------------------------
-	   Solve the linear system.
-	   ------------------------------------------------------------*/
-	if ( options->SolveInitialized == NO ) {
-	    dSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, grid,
-		       SOLVEstruct);
-	}
-
-	pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, 
-		fst_row, ldb, nrhs, SOLVEstruct, stat, info);
-
-	/* ------------------------------------------------------------
-	   Use iterative refinement to improve the computed solution and
-	   compute error bounds and backward error estimates for it.
-	   ------------------------------------------------------------*/
-	if ( options->IterRefine ) {
-	    /* Improve the solution by iterative refinement. */
-	    int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv;
-	    SOLVEstruct_t *SOLVEstruct1;  /* Used by refinement. */
-
-	    t = SuperLU_timer_();
-	    if ( options->RefineInitialized == NO || Fact == DOFACT ) {
-	        /* All these cases need to re-initialize gsmv structure */
-	        if ( options->RefineInitialized )
-		    pdgsmv_finalize(SOLVEstruct->gsmv_comm);
-	        pdgsmv_init(A, SOLVEstruct->row_to_proc, grid,
-			    SOLVEstruct->gsmv_comm);
-	       
-                /* Save a copy of the transformed local col indices
-		   in colind_gsmv[]. */
-	        if ( colind_gsmv ) SUPERLU_FREE(colind_gsmv);
-	        if ( !(it = intMalloc_dist(nnz_loc)) )
-		    ABORT("Malloc fails for colind_gsmv[]");
-	        colind_gsmv = SOLVEstruct->A_colind_gsmv = it;
-	        for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i];
-	        options->RefineInitialized = YES;
-	    } else if ( Fact == SamePattern ||
-			Fact == SamePattern_SameRowPerm ) {
-	        double at;
-	        int_t k, jcol, p;
-	        /* Swap to beginning the part of A corresponding to the
-		   local part of X, as was done in pdgsmv_init() */
-	        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
-		    k = rowptr[i];
-		    for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
-		        jcol = colind[j];
-		        p = SOLVEstruct->row_to_proc[jcol];
-		        if ( p == iam ) { /* Local */
-		            at = a[k]; a[k] = a[j]; a[j] = at;
-		            ++k;
-		        }
-		    }
-	        }
-	      
-	        /* Re-use the local col indices of A obtained from the
-		   previous call to pdgsmv_init() */
-	        for (i = 0; i < nnz_loc; ++i) colind[i] = colind_gsmv[i];
-	    }
-
-	    if ( nrhs == 1 ) { /* Use the existing solve structure */
-	        SOLVEstruct1 = SOLVEstruct;
-	    } else { /* For nrhs > 1, since refinement is performed for RHS
-			one at a time, the communication structure for pdgstrs
-			is different than the solve with nrhs RHS. 
-			So we use SOLVEstruct1 for the refinement step.
-		     */
-	        if ( !(SOLVEstruct1 = (SOLVEstruct_t *) 
-		                       SUPERLU_MALLOC(sizeof(SOLVEstruct_t))) )
-		    ABORT("Malloc fails for SOLVEstruct1");
-	        /* Copy the same stuff */
-	        SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc;
-	        SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c;
-	        SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs;
-	        SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs;
-	        SOLVEstruct1->diag_len = SOLVEstruct->diag_len;
-	        SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm;
-	        SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv;
-		
-		/* Initialize the *gstrs_comm for 1 RHS. */
-		if ( !(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *)
-		       SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) )
-		    ABORT("Malloc fails for gstrs_comm[]");
-		pxgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid, 
-			     Glu_persist, SOLVEstruct1);
-	    }
-
-	    pdgsrfs(n, A, anorm, LUstruct, ScalePermstruct, grid,
-		    B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info);
-
-            /* Deallocate the storage associated with SOLVEstruct1 */
-	    if ( nrhs > 1 ) {
-	        pxgstrs_finalize(SOLVEstruct1->gstrs_comm);
-	        SUPERLU_FREE(SOLVEstruct1);
-	    }
-
-	    stat->utime[REFINE] = SuperLU_timer_() - t;
-	}
-
-	/* Permute the solution matrix B <= Pc'*X. */
-	pdPermute_Dense_Matrix(fst_row, m_loc, SOLVEstruct->row_to_proc,
-			       SOLVEstruct->inv_perm_c,
-			       X, ldx, B, ldb, nrhs, grid);
-#if ( DEBUGlevel>=2 )
-	printf("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam);
-	for (i = 0; i < m_loc; ++i)
-	  printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, B[i]);
-#endif
-	
-	/* Transform the solution matrix X to a solution of the original
-	   system before the equilibration. */
-	if ( notran ) {
-	    if ( colequ ) {
-		b_col = B;
-		for (j = 0; j < nrhs; ++j) {
-		    irow = fst_row;
-		    for (i = 0; i < m_loc; ++i) {
-		        b_col[i] *= C[irow];
-		        ++irow;
-		    }
-		    b_col += ldb;
-		}
-	    }
-	} else if ( rowequ ) {
-	    b_col = B;
-	    for (j = 0; j < nrhs; ++j) {
-	        irow = fst_row;
-		for (i = 0; i < m_loc; ++i) {
-		    b_col[i] *= R[irow];
-		    ++irow;
-		}
-		b_col += ldb;
-	    }
-	}
-
-	SUPERLU_FREE(b_work);
-	SUPERLU_FREE(X);
-
-    } /* end if nrhs != 0 */
-
-#if ( PRNTlevel>=1 )
-    if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale);
-#endif
-
-    /* Deallocate R and/or C if it was not used. */
-    if ( Equil && Fact != SamePattern_SameRowPerm ) {
-	switch ( ScalePermstruct->DiagScale ) {
-	    case NOEQUIL:
-	        SUPERLU_FREE(R);
-		SUPERLU_FREE(C);
-		break;
-	    case ROW: 
-		SUPERLU_FREE(C);
-		break;
-	    case COL: 
-		SUPERLU_FREE(R);
-		break;
-	}
-    }
-    if ( !factored && Fact != SamePattern_SameRowPerm && !parSymbFact)
- 	Destroy_CompCol_Permuted_dist(&GAC);
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pdgssvx()");
-#endif
-
-}
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrf_irecv.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrf_irecv.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrf_irecv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrf_irecv.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,1343 +0,0 @@
-
-
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- * Modified:
- *     Feburary 7, 2001    use MPI_Isend/MPI_Irecv
- */
-
-#include <math.h>
-#include "superlu_ddefs.h"
-
-#ifndef HYPRE_USING_HYPRE_BLAS
-#define USE_VENDOR_BLAS
-#endif
-
-#if ( VAMPIR>=1 )
-#include <VT.h>
-#endif
-
-/*
- * Internal prototypes
- */
-static void pdgstrf2(superlu_options_t *, int_t, double, Glu_persist_t *,
-		     gridinfo_t *, LocalLU_t *, SuperLUStat_t *, int *);
-#ifdef _CRAY
-static void pdgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *,
-		     LocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd);
-#else
-static void pdgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *,
-		     LocalLU_t *, SuperLUStat_t *);
-#endif
-
-/* 
- * Sketch of the algorithm
- * =======================
- *
- * The following relations hold:
- *     * A_kk = L_kk * U_kk
- *     * L_ik = Aik * U_kk^(-1)
- *     * U_kj = L_kk^(-1) * A_kj
- *
- *              ----------------------------------
- *              |   |                            |
- *              ----|-----------------------------
- *              |   | \ U_kk|                    |
- *              |   |   \   |        U_kj        |
- *              |   |L_kk \ |         ||         |
- *              ----|-------|---------||----------
- *              |   |       |         \/         |
- *              |   |       |                    |
- *              |   |       |                    |
- *              |   |       |                    |
- *              |   | L_ik ==>       A_ij        |
- *              |   |       |                    |
- *              |   |       |                    |
- *              |   |       |                    |
- *              ----------------------------------
- *
- * Handle the first block of columns separately.
- *     * Factor diagonal and subdiagonal blocks and test for exact
- *       singularity. ( pdgstrf2(0), one column at a time )
- *     * Compute block row of U
- *     * Update trailing matrix
- * 
- * Loop over the remaining blocks of columns.
- *   mycol = MYCOL( iam, grid );
- *   myrow = MYROW( iam, grid );
- *   N = nsupers;
- *   For (k = 1; k < N; ++k) {
- *       krow = PROW( k, grid );
- *       kcol = PCOL( k, grid );
- *       Pkk = PNUM( krow, kcol, grid );
- *
- *     * Factor diagonal and subdiagonal blocks and test for exact
- *       singularity.
- *       if ( mycol == kcol ) {
- *           pdgstrf2(k), one column at a time 
- *       }
- *
- *     * Parallel triangular solve
- *       if ( iam == Pkk ) multicast L_k,k to this process row;
- *       if ( myrow == krow && mycol != kcol ) {
- *          Recv L_k,k from process Pkk;
- *          for (j = k+1; j < N; ++j) 
- *              if ( PCOL( j, grid ) == mycol && A_k,j != 0 )
- *                 U_k,j = L_k,k \ A_k,j;
- *       }
- *
- *     * Parallel rank-k update
- *       if ( myrow == krow ) multicast U_k,k+1:N to this process column;
- *       if ( mycol == kcol ) multicast L_k+1:N,k to this process row;
- *       if ( myrow != krow ) {
- *          Pkj = PNUM( krow, mycol, grid );
- *          Recv U_k,k+1:N from process Pkj;
- *       }
- *       if ( mycol != kcol ) {
- *          Pik = PNUM( myrow, kcol, grid );
- *          Recv L_k+1:N,k from process Pik;
- *       }
- *       for (j = k+1; k < N; ++k) {
- *          for (i = k+1; i < N; ++i) 
- *              if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
- *                   && L_i,k != 0 && U_k,j != 0 )
- *                 A_i,j = A_i,j - L_i,k * U_k,j;
- *       }
- *  }
- *
- *
- * Remaining issues
- *   (1) Use local indices for L subscripts and SPA.  [DONE]
- *
- */
-/************************************************************************/
-int_t pdgstrf
-/************************************************************************/
-(
- superlu_options_t *options, int m, int n, double anorm,
- LUstruct_t *LUstruct, gridinfo_t *grid, SuperLUStat_t *stat, int *info
- )
-/* 
- * Purpose
- * =======
- *
- *  PDGSTRF performs the LU factorization in parallel.
- *
- * Arguments
- * =========
- * 
- * options (input) superlu_options_t*
- *         The structure defines the input parameters to control
- *         how the LU decomposition will be performed.
- *         The following field should be defined:
- *         o ReplaceTinyPivot (yes_no_t)
- *           Specifies whether to replace the tiny diagonals by
- *           sqrt(epsilon)*norm(A) during LU factorization.
- *
- * m      (input) int
- *        Number of rows in the matrix.
- *
- * n      (input) int
- *        Number of columns in the matrix.
- *
- * anorm  (input) double
- *        The norm of the original matrix A, or the scaled A if
- *        equilibration was done.
- *
- * LUstruct (input/output) LUstruct_t*
- *         The data structures to store the distributed L and U factors.
- *         The following fields should be defined:
- *
- *         o Glu_persist (input) Glu_persist_t*
- *           Global data structure (xsup, supno) replicated on all processes,
- *           describing the supernode partition in the factored matrices
- *           L and U:
- *	       xsup[s] is the leading column of the s-th supernode,
- *             supno[i] is the supernode number to which column i belongs.
- *
- *         o Llu (input/output) LocalLU_t*
- *           The distributed data structures to store L and U factors.
- *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh. It contains the MPI communicator, the number
- *        of process rows (NPROW), the number of process columns (NPCOL),
- *        and my process rank. It is an input argument to all the
- *        parallel routines.
- *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
- *        See superlu_ddefs.h for the definition of 'gridinfo_t'.
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics on runtime and floating-point operation count.
- *        See util.h for the definition of 'SuperLUStat_t'.
- *
- * info   (output) int*
- *        = 0: successful exit
- *        < 0: if info = -i, the i-th argument had an illegal value
- *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
- *             been completed, but the factor U is exactly singular,
- *             and division by zero will occur if it is used to solve a
- *             system of equations.
- *
- */
-{
-#ifdef _CRAY
-    _fcd ftcs = _cptofcd("N", strlen("N"));
-    _fcd ftcs1 = _cptofcd("L", strlen("L"));
-    _fcd ftcs2 = _cptofcd("N", strlen("N"));
-    _fcd ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-    double alpha = 1.0, beta = 0.0;
-    int_t *xsup;
-    int_t *lsub, *lsub1, *usub, *Usub_buf,
-          *Lsub_buf_2[2];  /* Need 2 buffers to implement Irecv. */
-    double *lusup, *lusup1, *uval, *Uval_buf,
-           *Lval_buf_2[2]; /* Need 2 buffers to implement Irecv. */
-    int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc,
-          lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj,
-          nlb, nub, nsupc, rel, rukp;
-    int_t Pc, Pr;
-    int   iam, kcol, krow, mycol, myrow, pi, pj;
-    int   j, k, lk, nsupers;
-    int   nsupr, nbrow, segsize;
-    int   msgcnt[4]; /* Count the size of the message xfer'd in each buffer:
-		      *     0 : transferred in Lsub_buf[]
-		      *     1 : transferred in Lval_buf[]
-		      *     2 : transferred in Usub_buf[] 
-		      *     3 : transferred in Uval_buf[]
-		      */
-    int_t  msg0, msg2;
-    int_t  **Ufstnz_br_ptr, **Lrowind_bc_ptr;
-    double **Unzval_br_ptr, **Lnzval_bc_ptr;
-    int_t  *index;
-    double *nzval;
-    int_t  *iuip, *ruip;/* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */
-    double *ucol;
-    int_t  *indirect;
-    double *tempv, *tempv2d;
-    int_t iinfo;
-    int_t *ToRecv, *ToSendD, **ToSendR;
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-    superlu_scope_t *scp;
-    float s_eps;
-    double thresh;
-    double *tempU2d, *tempu;
-    int    full, ldt, ldu, lead_zero, ncols;
-    MPI_Request recv_req[4], *send_req;
-    MPI_Status status;
-#if ( DEBUGlevel>=2 ) 
-    int_t num_copy=0, num_update=0;
-#endif
-#if ( PRNTlevel==3 )
-    int_t  zero_msg = 0, total_msg = 0;
-#endif
-#if ( PROFlevel>=1 )
-    double t1, t2;
-    float msg_vol = 0, msg_cnt = 0;
-    int_t iword = sizeof(int_t), dword = sizeof(double);
-#endif
-
-    /* Test the input parameters. */
-    *info = 0;
-    if ( m < 0 ) *info = -2;
-    else if ( n < 0 ) *info = -3;
-    if ( *info ) {
-	pxerbla("pdgstrf", grid, -*info);
-	return (-1);
-    }
-
-    /* Quick return if possible. */
-    if ( m == 0 || n == 0 ) return 0;
-
-    /*
-     * Initialization.
-     */
-    iam = grid->iam;
-    Pc = grid->npcol;
-    Pr = grid->nprow;
-    myrow = MYROW( iam, grid );
-    mycol = MYCOL( iam, grid );
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    s_eps = slamch_("Epsilon");
-    thresh = s_eps * anorm;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pdgstrf()");
-#endif
-
-    stat->ops[FACT] = 0.0;
-
-    if ( Pr*Pc > 1 ) {
-	i = Llu->bufmax[0];
-	if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist(2 * ((size_t)i))) )
-	    ABORT("Malloc fails for Lsub_buf.");
-	Llu->Lsub_buf_2[1] = Llu->Lsub_buf_2[0] + i;
-	i = Llu->bufmax[1];
-	if ( !(Llu->Lval_buf_2[0] = doubleMalloc_dist(2 * ((size_t)i))) )
-	    ABORT("Malloc fails for Lval_buf[].");
-	Llu->Lval_buf_2[1] = Llu->Lval_buf_2[0] + i;
-	if ( Llu->bufmax[2] != 0 ) 
-	    if ( !(Llu->Usub_buf = intMalloc_dist(Llu->bufmax[2])) )
-		ABORT("Malloc fails for Usub_buf[].");
-	if ( Llu->bufmax[3] != 0 ) 
-	    if ( !(Llu->Uval_buf = doubleMalloc_dist(Llu->bufmax[3])) )
-		ABORT("Malloc fails for Uval_buf[].");
-	if ( !(send_req =
-	       (MPI_Request *) SUPERLU_MALLOC(2*Pc*sizeof(MPI_Request))))
-	    ABORT("Malloc fails for send_req[].");
-    }
-    if ( !(Llu->ujrow = doubleMalloc_dist(sp_ienv_dist(3))) )
-	ABORT("Malloc fails for ujrow[].");
-
-#if ( PRNTlevel>=1 )
-    if ( !iam ) {
-	printf(".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, thresh);
-	printf(".. Buffer size: Lsub %d\tLval %d\tUsub %d\tUval %d\tLDA %d\n",
-	       Llu->bufmax[0], Llu->bufmax[1], 
-	       Llu->bufmax[2], Llu->bufmax[3], Llu->bufmax[4]);
-    }
-#endif
-
-    Lsub_buf_2[0] = Llu->Lsub_buf_2[0];
-    Lsub_buf_2[1] = Llu->Lsub_buf_2[1];
-    Lval_buf_2[0] = Llu->Lval_buf_2[0];
-    Lval_buf_2[1] = Llu->Lval_buf_2[1];
-    Usub_buf = Llu->Usub_buf;
-    Uval_buf = Llu->Uval_buf;
-    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
-    Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
-    Unzval_br_ptr = Llu->Unzval_br_ptr;
-    ToRecv = Llu->ToRecv;
-    ToSendD = Llu->ToSendD;
-    ToSendR = Llu->ToSendR;
-
-    ldt = sp_ienv_dist(3); /* Size of maximum supernode */
-    if ( !(tempv2d = doubleCalloc_dist(2*((size_t)ldt)*ldt)) )
-	ABORT("Calloc fails for tempv2d[].");
-    tempU2d = tempv2d + ldt*ldt;
-    if ( !(indirect = intMalloc_dist(ldt)) )
-	ABORT("Malloc fails for indirect[].");
-    k = CEILING( nsupers, Pr ); /* Number of local block rows */
-    if ( !(iuip = intMalloc_dist(k)) )
-	ABORT("Malloc fails for iuip[].");
-    if ( !(ruip = intMalloc_dist(k)) )
-	ABORT("Malloc fails for ruip[].");
-
-#if ( VAMPIR>=1 )
-    VT_symdef(1, "Send-L", "Comm");
-    VT_symdef(2, "Recv-L", "Comm");
-    VT_symdef(3, "Send-U", "Comm");
-    VT_symdef(4, "Recv-U", "Comm");
-    VT_symdef(5, "TRF2", "Factor");
-    VT_symdef(100, "Factor", "Factor");
-    VT_begin(100);
-    VT_traceon();
-#endif
-
-    /* ---------------------------------------------------------------
-       Handle the first block column separately to start the pipeline.
-       --------------------------------------------------------------- */
-    if ( mycol == 0 ) {
-#if ( VAMPIR>=1 )
-	VT_begin(5);
-#endif
-	pdgstrf2(options, 0, thresh, Glu_persist, grid, Llu, stat, info);
-#if ( VAMPIR>=1 )
-	VT_end(5);
-#endif
-
-	scp = &grid->rscp; /* The scope of process row. */
-
-	/* Process column *kcol* multicasts numeric values of L(:,k) 
-	   to process rows. */
-	lsub = Lrowind_bc_ptr[0];
-	lusup = Lnzval_bc_ptr[0];
-	if ( lsub ) {
-	    msgcnt[0] = lsub[1] + BC_HEADER + lsub[0]*LB_DESCRIPTOR;
-	    msgcnt[1] = lsub[1] * SuperSize( 0 );
-	} else {
-	    msgcnt[0] = msgcnt[1] = 0;
-	}
-	
-	for (pj = 0; pj < Pc; ++pj) {
-	    if ( ToSendR[0][pj] != EMPTY ) {
-#if ( PROFlevel>=1 )
-		TIC(t1);
-#endif
-#if ( VAMPIR>=1 )
-		VT_begin(1);
-#endif
-		MPI_Isend( lsub, msgcnt[0], mpi_int_t, pj, 0, scp->comm,
-			  &send_req[pj] );
-		MPI_Isend( lusup, msgcnt[1], MPI_DOUBLE, pj, 1, scp->comm,
-			  &send_req[pj+Pc] );
-#if ( DEBUGlevel>=2 )
-		printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
-		       iam, 0, msgcnt[0], msgcnt[1], pj);
-#endif
-#if ( VAMPIR>=1 )
-		VT_end(1);
-#endif
-#if ( PROFlevel>=1 )
-		TOC(t2, t1);
-		stat->utime[COMM] += t2;
-		msg_cnt += 2;
-		msg_vol += msgcnt[0]*iword + msgcnt[1]*dword;
-#endif
-	    }
-	} /* for pj ... */
-    } else { /* Post immediate receives. */
-	if ( ToRecv[0] >= 1 ) { /* Recv block column L(:,0). */
-	    scp = &grid->rscp; /* The scope of process row. */
-	    MPI_Irecv( Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, 0,
-		      0, scp->comm, &recv_req[0] );
-	    MPI_Irecv( Lval_buf_2[0], Llu->bufmax[1], MPI_DOUBLE, 0,
-		      1, scp->comm, &recv_req[1] );
-#if ( DEBUGlevel>=2 )
-	    printf("(%d) Post Irecv L(:,%4d)\n", iam, 0);
-#endif
-	}
-    } /* if mycol == 0 */
-
-    /* ------------------------------------------
-       MAIN LOOP: Loop through all block columns.
-       ------------------------------------------ */
-    for (k = 0; k < nsupers; ++k) {
-
-	knsupc = SuperSize( k );
-	krow = PROW( k, grid );
-	kcol = PCOL( k, grid );
-
-	if ( mycol == kcol ) {
-	    lk = LBj( k, grid ); /* Local block number. */
-
-	    for (pj = 0; pj < Pc; ++pj) {
-                /* Wait for Isend to complete before using lsub/lusup. */
-		if ( ToSendR[lk][pj] != EMPTY ) {
-		    MPI_Wait( &send_req[pj], &status );
-		    MPI_Wait( &send_req[pj+Pc], &status );
-		}
-	    }
-	    lsub = Lrowind_bc_ptr[lk];
-	    lusup = Lnzval_bc_ptr[lk];
-	} else {
-	    if ( ToRecv[k] >= 1 ) { /* Recv block column L(:,k). */
-		scp = &grid->rscp; /* The scope of process row. */
-#if ( PROFlevel>=1 )
-		TIC(t1);
-#endif
-#if ( VAMPIR>=1 )
-		VT_begin(2);
-#endif
-		/*probe_recv(iam, kcol, (4*k)%NTAGS, mpi_int_t, scp->comm, 
-		  Llu->bufmax[0]);*/
-		/*MPI_Recv( Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol, 
-			 (4*k)%NTAGS, scp->comm, &status );*/
-		MPI_Wait( &recv_req[0], &status );
-		MPI_Get_count( &status, mpi_int_t, &msgcnt[0] );
-		/*probe_recv(iam, kcol, (4*k+1)%NTAGS, MPI_DOUBLE, scp->comm, 
-		  Llu->bufmax[1]);*/
-		/*MPI_Recv( Lval_buf, Llu->bufmax[1], MPI_DOUBLE, kcol, 
-			 (4*k+1)%NTAGS, scp->comm, &status );*/
-		MPI_Wait( &recv_req[1], &status );
-		MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[1] );
-#if ( VAMPIR>=1 )
-		VT_end(2);
-#endif
-#if ( PROFlevel>=1 )
-		TOC(t2, t1);
-		stat->utime[COMM] += t2;
-#endif
-#if ( DEBUGlevel>=2 )
-		printf("(%d) Recv L(:,%4d): lsub %4d, lusup %4d from Pc %2d\n",
-		       iam, k, msgcnt[0], msgcnt[1], kcol);
-		fflush(stdout);
-#endif
-		lsub = Lsub_buf_2[k%2];
-		lusup = Lval_buf_2[k%2];
-#if ( PRNTlevel==3 )
-		++total_msg;
-		if ( !msgcnt[0] ) ++zero_msg;
-#endif
-	    } else msgcnt[0] = 0;
-	} /* if mycol = Pc(k) */
-
-	scp = &grid->cscp; /* The scope of process column. */
-
-	if ( myrow == krow ) {
-	    /* Parallel triangular solve across process row *krow* --
-	       U(k,j) = L(k,k) \ A(k,j).  */
-#ifdef _CRAY
-	    pdgstrs2(n, k, Glu_persist, grid, Llu, stat, ftcs1, ftcs2, ftcs3);
-#else
-	    pdgstrs2(n, k, Glu_persist, grid, Llu, stat);
-#endif
-
-	    /* Multicasts U(k,:) to process columns. */
-	    lk = LBi( k, grid );
-	    usub = Ufstnz_br_ptr[lk];
-	    uval = Unzval_br_ptr[lk];
-	    if ( usub )	{
-		msgcnt[2] = usub[2];
-		msgcnt[3] = usub[1];
-	    } else {
-		msgcnt[2] = msgcnt[3] = 0;
-	    }
-
-	    if ( ToSendD[lk] == YES ) {
-		for (pi = 0; pi < Pr; ++pi) {
-		    if ( pi != myrow ) {
-#if ( PROFlevel>=1 )
-			TIC(t1);
-#endif
-#if ( VAMPIR>=1 )
-			VT_begin(3);
-#endif
-			MPI_Send( usub, msgcnt[2], mpi_int_t, pi,
-				 (4*k+2)%NTAGS, scp->comm);
-			MPI_Send( uval, msgcnt[3], MPI_DOUBLE, pi,
-				 (4*k+3)%NTAGS, scp->comm);
-#if ( VAMPIR>=1 )
-			VT_end(3);
-#endif
-#if ( PROFlevel>=1 )
-			TOC(t2, t1);
-			stat->utime[COMM] += t2;
-			msg_cnt += 2;
-			msg_vol += msgcnt[2]*iword + msgcnt[3]*dword;
-#endif
-#if ( DEBUGlevel>=2 )
-			printf("(%d) Send U(%4d,:) to Pr %2d\n", iam, k, pi);
-#endif
-		    } /* if pi ... */
-		} /* for pi ... */
-	    } /* if ToSendD ... */
-	} else { /* myrow != krow */
-	    if ( ToRecv[k] == 2 ) { /* Recv block row U(k,:). */
-#if ( PROFlevel>=1 )
-		TIC(t1);
-#endif
-#if ( VAMPIR>=1 )
-		VT_begin(4);
-#endif
-		/*probe_recv(iam, krow, (4*k+2)%NTAGS, mpi_int_t, scp->comm, 
-		  Llu->bufmax[2]);*/
-		MPI_Recv( Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
-			 (4*k+2)%NTAGS, scp->comm, &status );
-		MPI_Get_count( &status, mpi_int_t, &msgcnt[2] );
-		/*probe_recv(iam, krow, (4*k+3)%NTAGS, MPI_DOUBLE, scp->comm, 
-		  Llu->bufmax[3]);*/
-		MPI_Recv( Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow, 
-			 (4*k+3)%NTAGS, scp->comm, &status );
-		MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[3] );
-#if ( VAMPIR>=1 )
-		VT_end(4);
-#endif
-#if ( PROFlevel>=1 )
-		TOC(t2, t1);
-		stat->utime[COMM] += t2;
-#endif
-		usub = Usub_buf;
-		uval = Uval_buf;
-#if ( DEBUGlevel>=2 )
-		printf("(%d) Recv U(%4d,:) from Pr %2d\n", iam, k, krow);
-#endif
-#if ( PRNTlevel==3 )
-		++total_msg;
-		if ( !msgcnt[2] ) ++zero_msg;
-#endif
-	    } else msgcnt[2] = 0;
-	} /* if myrow == Pr(k) */
-	  
-	/* 
-	 * Parallel rank-k update; pair up blocks L(i,k) and U(k,j).
-	 *  for (j = k+1; k < N; ++k) {
-	 *     for (i = k+1; i < N; ++i) 
-	 *         if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
-	 *              && L(i,k) != 0 && U(k,j) != 0 )
-	 *             A(i,j) = A(i,j) - L(i,k) * U(k,j);
-	 */
-	msg0 = msgcnt[0];
-	msg2 = msgcnt[2];
-	if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
-	    nsupr = lsub[1]; /* LDA of lusup. */
-	    if ( myrow == krow ) { /* Skip diagonal block L(k,k). */
-		lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER+1];
-		luptr0 = knsupc;
-		nlb = lsub[0] - 1;
-	    } else {
-		lptr0 = BC_HEADER;
-		luptr0 = 0;
-		nlb = lsub[0];
-	    }
-	    lptr = lptr0;
-	    for (lb = 0; lb < nlb; ++lb) { /* Initialize block row pointers. */
-		ib = lsub[lptr];
-		lib = LBi( ib, grid );
-		iuip[lib] = BR_HEADER;
-		ruip[lib] = 0;
-		lptr += LB_DESCRIPTOR + lsub[lptr+1];
-	    }
-	    nub = usub[0];    /* Number of blocks in the block row U(k,:) */
-	    iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */
-	    rukp = 0;         /* Pointer to nzval[] of U(k,:) */
-	    klst = FstBlockC( k+1 );
-	    
-	    /* ---------------------------------------------------
-	       Update the first block column A(:,k+1).
-	       --------------------------------------------------- */
-	    jb = usub[iukp];   /* Global block number of block U(k,j). */
-	    if ( jb == k+1 ) { /* First update (k+1)-th block. */
-		--nub;
-		lptr = lptr0;
-		luptr = luptr0;
-		ljb = LBj( jb, grid ); /* Local block number of U(k,j). */
-		nsupc = SuperSize( jb );
-		iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
-
-		/* Prepare to call DGEMM. */
-		jj = iukp;
-		while ( usub[jj] == klst ) ++jj;
-		ldu = klst - usub[jj++];
-		ncols = 1;
-		full = 1;
-		for (; jj < iukp+nsupc; ++jj) {
-		    segsize = klst - usub[jj];
-		    if ( segsize ) {
-		        ++ncols;
-			if ( segsize != ldu ) full = 0;
-		        if ( segsize > ldu ) ldu = segsize;
-		    }
-		}
-#if ( DEBUGlevel>=3 )
-		++num_update;
-#endif
-		if ( full ) {
-		    tempu = &uval[rukp];
-		} else { /* Copy block U(k,j) into tempU2d. */
-#if ( DEBUGlevel>=3 )
-		  printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
-			 iam, full, k, jb, ldu, ncols, nsupc);
-		  ++num_copy;
-#endif
-		    tempu = tempU2d;
-		    for (jj = iukp; jj < iukp+nsupc; ++jj) {
-		        segsize = klst - usub[jj];
-			if ( segsize ) {
-			    lead_zero = ldu - segsize;
-			    for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0;
-			    tempu += lead_zero;
-			    for (i = 0; i < segsize; ++i)
-				tempu[i] = uval[rukp+i];
-			    rukp += segsize;
-			    tempu += segsize;
-			}
-		    }
-		    tempu = tempU2d;
-		    rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
-		} /* if full ... */
-
-		for (lb = 0; lb < nlb; ++lb) { 
-		    ib = lsub[lptr]; /* Row block L(i,k). */
-		    nbrow = lsub[lptr+1];  /* Number of full rows. */
-		    lptr += LB_DESCRIPTOR; /* Skip descriptor. */
-		    tempv = tempv2d;
-#ifdef _CRAY
-		    SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, 
-			  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
-			  tempu, &ldu, &beta, tempv, &ldt);
-#elif defined (USE_VENDOR_BLAS)
-		    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, 
-			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
-			   tempu, &ldu, &beta, tempv, &ldt);
-#else
-		    hypre_F90_NAME_BLAS(dgemm,DGEMM)("N", "N", &nbrow, 
-                           &ncols, &ldu, &alpha, 
-			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
-			   tempu, &ldu, &beta, tempv, &ldt, 1, 1);
-#endif
-		    stat->ops[FACT] += 2 * nbrow * ldu * ncols;
-
-		    /* Now gather the result into the destination block. */
-		    if ( ib < jb ) { /* A(i,j) is in U. */
-			ilst = FstBlockC( ib+1 );
-			lib = LBi( ib, grid );
-			index = Ufstnz_br_ptr[lib];
-			ijb = index[iuip[lib]];
-			while ( ijb < jb ) { /* Search for dest block. */
-			    ruip[lib] += index[iuip[lib]+1];
-			    iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb );
-			    ijb = index[iuip[lib]];
-			}
-			iuip[lib] += UB_DESCRIPTOR; /* Skip descriptor. */
-
-			tempv = tempv2d;
-			for (jj = 0; jj < nsupc; ++jj) {
-			    segsize = klst - usub[iukp + jj];
-			    fnz = index[iuip[lib]++];
-			    if ( segsize ) { /* Nonzero segment in U(k.j). */
-				ucol = &Unzval_br_ptr[lib][ruip[lib]];
-				for (i = 0, it = 0; i < nbrow; ++i) {
-				    rel = lsub[lptr + i] - fnz;
-				    ucol[rel] -= tempv[it++];
-				}
-				tempv += ldt;
-			    }
-			    ruip[lib] += ilst - fnz;
-			}
-		    } else { /* A(i,j) is in L. */
-			index = Lrowind_bc_ptr[ljb];
-			ldv = index[1];   /* LDA of the dest lusup. */
-			lptrj = BC_HEADER;
-			luptrj = 0;
-			ijb = index[lptrj];
-			while ( ijb != ib ) { /* Search for dest block -- 
-						 blocks are not ordered! */
-			    luptrj += index[lptrj+1];
-			    lptrj += LB_DESCRIPTOR + index[lptrj+1];
-			    ijb = index[lptrj];
-			}
-			/*
-			 * Build indirect table. This is needed because the
-			 * indices are not sorted.
-			 */
-			fnz = FstBlockC( ib );
-			lptrj += LB_DESCRIPTOR;
-			for (i = 0; i < index[lptrj-1]; ++i) {
-			    rel = index[lptrj + i] - fnz;
-			    indirect[rel] = i;
-			}
-			nzval = Lnzval_bc_ptr[ljb] + luptrj;
-			tempv = tempv2d;
-			for (jj = 0; jj < nsupc; ++jj) {
-			    segsize = klst - usub[iukp + jj];
-			    if ( segsize ) {
-/*#pragma _CRI cache_bypass nzval,tempv*/
-				for (it = 0, i = 0; i < nbrow; ++i) {
-				    rel = lsub[lptr + i] - fnz;
-				    nzval[indirect[rel]] -= tempv[it++];
-				}
-				tempv += ldt;
-			    }
-			    nzval += ldv;
-			}
-		    } /* if ib < jb ... */
-		    lptr += nbrow;
-		    luptr += nbrow;
-		} /* for lb ... */
-		rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */
-		iukp += nsupc;
-	    }  /* if jb == k+1 */
-	} /* if L(:,k) and U(k,:) not empty */
-
-
-	if ( k+1 < nsupers ) {
-	  kcol = PCOL( k+1, grid );
-	  if ( mycol == kcol ) {
-#if ( VAMPIR>=1 )
-	    VT_begin(5);
-#endif
-	    /* Factor diagonal and subdiagonal blocks and test for exact
-	       singularity.  */
-	    pdgstrf2(options, k+1, thresh, Glu_persist, grid, Llu, stat, info);
-#if ( VAMPIR>=1 )
-	    VT_end(5);
-#endif
-
-	    /* Process column *kcol+1* multicasts numeric values of L(:,k+1) 
-	       to process rows. */
-	    lk = LBj( k+1, grid ); /* Local block number. */
-	    lsub1 = Lrowind_bc_ptr[lk];
- 	    if ( lsub1 ) {
-		msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0]*LB_DESCRIPTOR;
-		msgcnt[1] = lsub1[1] * SuperSize( k+1 );
-	    } else {
-		msgcnt[0] = 0;
-		msgcnt[1] = 0;
-	    }
-	    scp = &grid->rscp; /* The scope of process row. */
-	    for (pj = 0; pj < Pc; ++pj) {
-		if ( ToSendR[lk][pj] != EMPTY ) {
-		    lusup1 = Lnzval_bc_ptr[lk];
-#if ( PROFlevel>=1 )
-		    TIC(t1);
-#endif
-#if ( VAMPIR>=1 )
-		    VT_begin(1);
-#endif
-		    MPI_Isend( lsub1, msgcnt[0], mpi_int_t, pj,
-			      (4*(k+1))%NTAGS, scp->comm, &send_req[pj] );
-		    MPI_Isend( lusup1, msgcnt[1], MPI_DOUBLE, pj,
-			     (4*(k+1)+1)%NTAGS, scp->comm, &send_req[pj+Pc] );
-#if ( VAMPIR>=1 )
-		    VT_end(1);
-#endif
-#if ( PROFlevel>=1 )
-		    TOC(t2, t1);
-		    stat->utime[COMM] += t2;
-		    msg_cnt += 2;
-		    msg_vol += msgcnt[0]*iword + msgcnt[1]*dword;
-#endif
-#if ( DEBUGlevel>=2 )
-		    printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
-			   iam, k+1, msgcnt[0], msgcnt[1], pj);
-#endif
-		}
-	    } /* for pj ... */
-	  } else { /* Post Recv of block column L(:,k+1). */
-	    if ( ToRecv[k+1] >= 1 ) {
-		scp = &grid->rscp; /* The scope of process row. */
-		MPI_Irecv(Lsub_buf_2[(k+1)%2], Llu->bufmax[0], mpi_int_t, kcol,
-			  (4*(k+1))%NTAGS, scp->comm, &recv_req[0]);
-		MPI_Irecv(Lval_buf_2[(k+1)%2], Llu->bufmax[1], MPI_DOUBLE, kcol, 
-			  (4*(k+1)+1)%NTAGS, scp->comm, &recv_req[1]);
-#if ( DEBUGlevel>=2 )
-		printf("(%d) Post Irecv L(:,%4d)\n", iam, k+1);
-#endif
-	    }
-	  } /* if mycol == Pc(k+1) */
-        } /* if k+1 < nsupers */
-
-	if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
-	    /* ---------------------------------------------------
-	       Update all other blocks using block row U(k,:)
-	       --------------------------------------------------- */
-	    for (j = 0; j < nub; ++j) { 
-		lptr = lptr0;
-		luptr = luptr0;
-		jb = usub[iukp];  /* Global block number of block U(k,j). */
-		ljb = LBj( jb, grid ); /* Local block number of U(k,j). */
-		nsupc = SuperSize( jb );
-		iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
-
-		/* Prepare to call DGEMM. */
-		jj = iukp;
-		while ( usub[jj] == klst ) ++jj;
-		ldu = klst - usub[jj++];
-		ncols = 1;
-		full = 1;
-		for (; jj < iukp+nsupc; ++jj) {
-		    segsize = klst - usub[jj];
-		    if ( segsize ) {
-		        ++ncols;
-			if ( segsize != ldu ) full = 0;
-		        if ( segsize > ldu ) ldu = segsize;
-		    }
-		}
-#if ( DEBUGlevel>=3 )
-		printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
-		       iam, full, k, jb, ldu, ncols, nsupc);
-		++num_update;
-#endif
-		if ( full ) {
-		    tempu = &uval[rukp];
-		} else { /* Copy block U(k,j) into tempU2d. */
-#if ( DEBUGlevel>=3 )
-		    ++num_copy;
-#endif
-		    tempu = tempU2d;
-		    for (jj = iukp; jj < iukp+nsupc; ++jj) {
-		        segsize = klst - usub[jj];
-			if ( segsize ) {
-			    lead_zero = ldu - segsize;
-			    for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0;
-			    tempu += lead_zero;
-			    for (i = 0; i < segsize; ++i)
-			        tempu[i] = uval[rukp+i];
-			    rukp += segsize;
-			    tempu += segsize;
-			}
-		    }
-		    tempu = tempU2d;
-		    rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
-		} /* if full ... */
-
-		for (lb = 0; lb < nlb; ++lb) { 
-		    ib = lsub[lptr];       /* Row block L(i,k). */
-		    nbrow = lsub[lptr+1];  /* Number of full rows. */
-		    lptr += LB_DESCRIPTOR; /* Skip descriptor. */
-		    tempv = tempv2d;
-#ifdef _CRAY
-		    SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, 
-			  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
-			  tempu, &ldu, &beta, tempv, &ldt);
-#elif defined (USE_VENDOR_BLAS)
-		    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, 
-			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
-			   tempu, &ldu, &beta, tempv, &ldt);
-#else
-		    hypre_F90_NAME_BLAS(dgemm,DGEMM)("N", "N", &nbrow, 
-                           &ncols, &ldu, &alpha, 
-			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
-			   tempu, &ldu, &beta, tempv, &ldt, 1, 1);
-#endif
-		    stat->ops[FACT] += 2 * nbrow * ldu * ncols;
-
-		    /* Now gather the result into the destination block. */
-		    if ( ib < jb ) { /* A(i,j) is in U. */
-			ilst = FstBlockC( ib+1 );
-			lib = LBi( ib, grid );
-			index = Ufstnz_br_ptr[lib];
-			ijb = index[iuip[lib]];
-			while ( ijb < jb ) { /* Search for dest block. */
-			    ruip[lib] += index[iuip[lib]+1];
-			    iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb );
-			    ijb = index[iuip[lib]];
-			}
-			/* Skip descriptor.  Now point to fstnz index of 
-			   block U(i,j). */
-			iuip[lib] += UB_DESCRIPTOR;
-
-			tempv = tempv2d;
-			for (jj = 0; jj < nsupc; ++jj) {
-			    segsize = klst - usub[iukp + jj];
-			    fnz = index[iuip[lib]++];
-			    if ( segsize ) { /* Nonzero segment in U(k.j). */
-				ucol = &Unzval_br_ptr[lib][ruip[lib]];
-				for (i = 0 ; i < nbrow; ++i) {
-				    rel = lsub[lptr + i] - fnz;
-				    ucol[rel] -= tempv[i];
-				}
-				tempv += ldt;
-			    }
-			    ruip[lib] += ilst - fnz;
-			}
-		    } else { /* A(i,j) is in L. */
-			index = Lrowind_bc_ptr[ljb];
-			ldv = index[1];   /* LDA of the dest lusup. */
-			lptrj = BC_HEADER;
-			luptrj = 0;
-			ijb = index[lptrj];
-			while ( ijb != ib ) { /* Search for dest block -- 
-						 blocks are not ordered! */
-			    luptrj += index[lptrj+1];
-			    lptrj += LB_DESCRIPTOR + index[lptrj+1];
-			    ijb = index[lptrj];
-			}
-			/*
-			 * Build indirect table. This is needed because the
-			 * indices are not sorted for the L blocks.
-			 */
-			fnz = FstBlockC( ib );
-			lptrj += LB_DESCRIPTOR;
-			for (i = 0; i < index[lptrj-1]; ++i) {
-			    rel = index[lptrj + i] - fnz;
-			    indirect[rel] = i;
-			}
-			nzval = Lnzval_bc_ptr[ljb] + luptrj;
-			tempv = tempv2d;
-			for (jj = 0; jj < nsupc; ++jj) {
-			    segsize = klst - usub[iukp + jj];
-			    if ( segsize ) {
-/*#pragma _CRI cache_bypass nzval,tempv*/
-				for (i = 0; i < nbrow; ++i) {
-				    rel = lsub[lptr + i] - fnz;
-				    nzval[indirect[rel]] -= tempv[i];
-				}
-				tempv += ldt;
-			    }
-			    nzval += ldv;
-			}
-		    } /* if ib < jb ... */
-		    lptr += nbrow;
-		    luptr += nbrow;
-		} /* for lb ... */
-		rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */
-		iukp += nsupc;
-	    } /* for j ... */
-	} /* if  k L(:,k) and U(k,:) are not empty */
-
-    } 
-    /* ------------------------------------------
-       END MAIN LOOP: for k = ...
-       ------------------------------------------ */
-
-#if ( VAMPIR>=1 )
-    VT_end(100);
-    VT_traceoff();
-#endif
-
-    if ( Pr*Pc > 1 ) {
-	SUPERLU_FREE(Lsub_buf_2[0]); /* also free Lsub_buf_2[1] */
-	SUPERLU_FREE(Lval_buf_2[0]); /* also free Lval_buf_2[1] */
-	if ( Llu->bufmax[2] != 0 ) SUPERLU_FREE(Usub_buf);
-	if ( Llu->bufmax[3] != 0 ) SUPERLU_FREE(Uval_buf);
-	SUPERLU_FREE(send_req);
-    }
-
-    SUPERLU_FREE(Llu->ujrow);
-    SUPERLU_FREE(tempv2d);
-    SUPERLU_FREE(indirect);
-    SUPERLU_FREE(iuip);
-    SUPERLU_FREE(ruip);
-
-    /* Prepare error message. */
-    if ( *info == 0 ) *info = n + 1;
-#if ( PROFlevel>=1 )
-    TIC(t1);
-#endif
-    MPI_Allreduce( info, &iinfo, 1, mpi_int_t, MPI_MIN, grid->comm );
-#if ( PROFlevel>=1 )
-    TOC(t2, t1);
-    stat->utime[COMM] += t2;
-    {
-	float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum;
-	
-	MPI_Reduce( &msg_cnt, &msg_cnt_sum,
-		   1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-	MPI_Reduce( &msg_cnt, &msg_cnt_max,
-		   1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
-	MPI_Reduce( &msg_vol, &msg_vol_sum,
-		   1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-	MPI_Reduce( &msg_vol, &msg_vol_max,
-		   1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
-	if ( !iam ) {
-	    printf("\tPDGSTRF comm stat:"
-		   "\tAvg\tMax\t\tAvg\tMax\n"
-		   "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n",
-		   msg_cnt_sum/Pr/Pc, msg_cnt_max,
-		   msg_vol_sum/Pr/Pc*1e-6, msg_vol_max*1e-6);
-	}
-    }
-#endif
-    if ( iinfo == n + 1 ) *info = 0;
-    else *info = iinfo;
-
-
-#if ( PRNTlevel==3 )
-    MPI_Allreduce( &zero_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm );
-    if ( !iam ) printf(".. # msg of zero size\t%d\n", iinfo);
-    MPI_Allreduce( &total_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm );
-    if ( !iam ) printf(".. # total msg\t%d\n", iinfo);
-#endif
-
-#if ( DEBUGlevel>=2 )
-    for (i = 0; i < Pr * Pc; ++i) {
-	if ( iam == i ) {
-	    dPrintLblocks(iam, nsupers, grid, Glu_persist, Llu);
-	    dPrintUblocks(iam, nsupers, grid, Glu_persist, Llu);
-	    printf("(%d)\n", iam);
-	    PrintInt10("Recv", nsupers, Llu->ToRecv);
-	}
-	MPI_Barrier( grid->comm );
-    }
-#endif
-
-#if ( DEBUGlevel>=3 )
-    printf("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update);
-#endif
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pdgstrf()");
-#endif
-} /* PDGSTRF */
-
-
-/************************************************************************/
-static void pdgstrf2
-/************************************************************************/
-(
- superlu_options_t *options,
- int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid,
- LocalLU_t *Llu, SuperLUStat_t *stat, int* info
- )
-/* 
- * Purpose
- * =======
- *   Factor diagonal and subdiagonal blocks and test for exact singularity.
- *   Only the process column that owns block column *k* participates
- *   in the work.
- * 
- * Arguments
- * =========
- *
- * k      (input) int (global)
- *        The column number of the block column to be factorized.
- *
- * thresh (input) double (global)
- *        The threshold value = s_eps * anorm.
- *
- * Glu_persist (input) Glu_persist_t*
- *        Global data structures (xsup, supno) replicated on all processes.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh.
- *
- * Llu    (input/output) LocalLU_t*
- *        Local data structures to store distributed L and U matrices.
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics about the factorization.
- *        See SuperLUStat_t structure defined in util.h.
- *
- * info   (output) int*
- *        = 0: successful exit
- *        < 0: if info = -i, the i-th argument had an illegal value
- *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
- *             been completed, but the factor U is exactly singular,
- *             and division by zero will occur if it is used to solve a
- *             system of equations.
- *
- */
-{
-    int    c, iam, l, pkk;
-    int    incx = 1, incy = 1;
-    int    nsupr; /* number of rows in the block (LDA) */
-    int    luptr;
-    int_t  i, krow, j, jfst, jlst;
-    int_t  nsupc; /* number of columns in the block */
-    int_t  *xsup = Glu_persist->xsup;
-    double *lusup, temp;
-    double *ujrow;
-    double alpha = -1;
-    *info = 0;
-
-    /* Quick return. */
-
-    /* Initialization. */
-    iam   = grid->iam;
-    krow  = PROW( k, grid );
-    pkk   = PNUM( PROW(k, grid), PCOL(k, grid), grid );
-    j     = LBj( k, grid ); /* Local block number */
-    jfst  = FstBlockC( k );
-    jlst  = FstBlockC( k+1 );
-    lusup = Llu->Lnzval_bc_ptr[j];
-    nsupc = SuperSize( k );
-    if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1];
-    ujrow = Llu->ujrow;
-
-    luptr = 0; /* Point to the diagonal entries. */
-    c = nsupc;
-    for (j = 0; j < jlst - jfst; ++j) {
-	/* Broadcast the j-th row (nsupc - j) elements to
-	   the process column. */
-	if ( iam == pkk ) { /* Diagonal process. */
-	    i = luptr;
-	    if ( options->ReplaceTinyPivot == YES || lusup[i] == 0.0 ) {
-		if ( fabs(lusup[i]) < thresh ) { /* Diagonal */
-#if ( PRNTlevel>=2 )
-		    printf("(%d) .. col %d, tiny pivot %e  ",
-			   iam, jfst+j, lusup[i]);
-#endif
-		    /* Keep the replaced diagonal with the same sign. */
-		    if ( lusup[i] < 0 ) lusup[i] = -thresh;
-		    else lusup[i] = thresh;
-#if ( PRNTlevel>=2 )
-		    printf("replaced by %e\n", lusup[i]);
-#endif
-		    ++(stat->TinyPivots);
-		}
-	    }
-	    for (l = 0; l < c; ++l, i += nsupr)	ujrow[l] = lusup[i];
-	}
-#if 0
-	dbcast_col(ujrow, c, pkk, UjROW, grid, &c);
-#else
-	MPI_Bcast(ujrow, c, MPI_DOUBLE, krow, (grid->cscp).comm);
-	/*bcast_tree(ujrow, c, MPI_DOUBLE, krow, (24*k+j)%NTAGS,
-		   grid, COMM_COLUMN, &c);*/
-#endif
-
-#if ( DEBUGlevel>=2 )
-if ( k == 3329 && j == 2 ) {
-	if ( iam == pkk ) {
-	    printf("..(%d) k %d, j %d: Send ujrow[0] %e\n",iam,k,j,ujrow[0]);
-	} else {
-	    printf("..(%d) k %d, j %d: Recv ujrow[0] %e\n",iam,k,j,ujrow[0]);
-	}
-}
-#endif
-
-	if ( !lusup ) { /* Empty block column. */
-	    --c;
-	    if ( ujrow[0] == 0.0 ) *info = j+jfst+1;
-	    continue;
-	}
-
-	/* Test for singularity. */
-	if ( ujrow[0] == 0.0 ) {
-	    *info = j+jfst+1;
-	} else {
-	    /* Scale the j-th column of the matrix. */
-	    temp = 1.0 / ujrow[0];
-	    if ( iam == pkk ) {
-		for (i = luptr+1; i < luptr-j+nsupr; ++i) lusup[i] *= temp;
-		stat->ops[FACT] += nsupr-j-1;
-	    } else {
-		for (i = luptr; i < luptr+nsupr; ++i) lusup[i] *= temp;
-		stat->ops[FACT] += nsupr;
-	    }
-	}
-	    
-	/* Rank-1 update of the trailing submatrix. */
-	if ( --c ) {
-	    if ( iam == pkk ) {
-		l = nsupr - j - 1;
-#ifdef _CRAY
-		SGER(&l, &c, &alpha, &lusup[luptr+1], &incx,
-		     &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
-#elif defined (USE_VENDOR_BLAS)
-		dger_(&l, &c, &alpha, &lusup[luptr+1], &incx,
-		      &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
-#else
-		hypre_F90_NAME_BLAS(dger,DGER)(&l, &c, &alpha, 
-                      &lusup[luptr+1], &incx,
-		      &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
-#endif
-		stat->ops[FACT] += 2 * l * c;
-	    } else {
-#ifdef _CRAY
-		SGER(&nsupr, &c, &alpha, &lusup[luptr], &incx, 
-		     &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
-#elif defined (USE_VENDOR_BLAS)
-		dger_(&nsupr, &c, &alpha, &lusup[luptr], &incx, 
-		      &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
-#else
-		hypre_F90_NAME_BLAS(dger,DGER)(&nsupr, &c, &alpha, 
-                      &lusup[luptr], &incx, 
-		      &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
-#endif
-		stat->ops[FACT] += 2 * nsupr * c;
-	    }
-	}
-	
-	/* Move to the next column. */
-	if ( iam == pkk ) luptr += nsupr + 1;
-	else luptr += nsupr;
-
-    } /* for j ... */
-
-} /* PDGSTRF2 */
-
-
-/************************************************************************/
-static void pdgstrs2
-/************************************************************************/
-#ifdef _CRAY
-(
- int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid,
- LocalLU_t *Llu, SuperLUStat_t *stat, _fcd ftcs1, _fcd ftcs2, _fcd ftcs3
- )
-#else
-(
- int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid,
- LocalLU_t *Llu, SuperLUStat_t *stat
- )
-#endif
-/* 
- * Purpose
- * =======
- *   Perform parallel triangular solves
- *           U(k,:) := A(k,:) \ L(k,k). 
- *   Only the process row that owns block row *k* participates
- *   in the work.
- * 
- * Arguments
- * =========
- *
- * m      (input) int (global)
- *        Number of rows in the matrix.
- *
- * k      (input) int (global)
- *        The row number of the block row to be factorized.
- *
- * Glu_persist (input) Glu_persist_t*
- *        Global data structures (xsup, supno) replicated on all processes.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh.
- *
- * Llu    (input/output) LocalLU_t*
- *        Local data structures to store distributed L and U matrices.
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics about the factorization; 
- *        See SuperLUStat_t structure defined in util.h.
- *
- */
-{
-    int    iam, pkk;
-    int    incx = 1;
-    int    nsupr; /* number of rows in the block L(:,k) (LDA) */
-    int    segsize;
-    int_t  nsupc; /* number of columns in the block */
-    int_t  luptr, iukp, rukp;
-    int_t  b, gb, j, klst, knsupc, lk, nb;
-    int_t  *xsup = Glu_persist->xsup;
-    int_t  *usub;
-    double *lusup, *uval;
-
-    /* Quick return. */
-    lk = LBi( k, grid ); /* Local block number */
-    if ( !Llu->Unzval_br_ptr[lk] ) return;
-
-    /* Initialization. */
-    iam  = grid->iam;
-    pkk  = PNUM( PROW(k, grid), PCOL(k, grid), grid );
-    klst = FstBlockC( k+1 );
-    knsupc = SuperSize( k );
-    usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */
-    uval = Llu->Unzval_br_ptr[lk];
-    nb = usub[0];
-    iukp = BR_HEADER;
-    rukp = 0;
-    if ( iam == pkk ) {
-	lk = LBj( k, grid );
-	nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */
-	lusup = Llu->Lnzval_bc_ptr[lk];
-    } else {
-	nsupr = Llu->Lsub_buf_2[k%2][1]; /* LDA of lusup[] */
-	lusup = Llu->Lval_buf_2[k%2];
-    }
-
-    /* Loop through all the row blocks. */
-    for (b = 0; b < nb; ++b) {
-	gb = usub[iukp];
-	nsupc = SuperSize( gb );
-	iukp += UB_DESCRIPTOR;
-
-	/* Loop through all the segments in the block. */
-	for (j = 0; j < nsupc; ++j) {
-	    segsize = klst - usub[iukp++]; 
-	    if ( segsize ) { /* Nonzero segment. */
-		luptr = (knsupc - segsize) * (nsupr + 1);
-#ifdef _CRAY
-		STRSV(ftcs1, ftcs2, ftcs3, &segsize, &lusup[luptr], &nsupr, 
-		      &uval[rukp], &incx);
-#elif defined (USE_VENDOR_BLAS)
-		dtrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, 
-		       &uval[rukp], &incx);
-#else
-		hypre_F90_NAME_BLAS(dtrsv,DTRSV)("L", "N", "U", &segsize, 
-                       &lusup[luptr], &nsupr, 
-		       &uval[rukp], &incx, 1, 1, 1);
-#endif
-		stat->ops[FACT] += segsize * (segsize + 1);
-		rukp += segsize;
-	    }
-	}
-    } /* for b ... */
-
-} /* PDGSTRS2 */
-
-static int
-probe_recv(int iam, int source, int tag, MPI_Datatype datatype, MPI_Comm comm,
-	   int buf_size)
-{
-    MPI_Status status;
-    int count; 
-
-    MPI_Probe( source, tag, comm, &status );
-    MPI_Get_count( &status, datatype, &count );
-    if ( count > buf_size ) {
-        printf("(%d) Recv'ed count %d > buffer size $d\n",
-	       iam, count, buf_size);
-	exit(-1);
-    }
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrf_X1.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrf_X1.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrf_X1.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrf_X1.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,1324 +0,0 @@
-
-
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- * Modified:
- *     Feburary 7, 2001    use MPI_Isend/MPI_Irecv
- */
-
-#include <math.h>
-#include "superlu_ddefs.h"
-#define CRAY_X1
-#if ( VAMPIR>=1 )
-#include <VT.h>
-#endif
-
-/*
- * Internal prototypes
- */
-static void pdgstrf2(superlu_options_t *, int_t, double, Glu_persist_t *,
-		     gridinfo_t *, LocalLU_t *, SuperLUStat_t *, int *);
-#ifdef _CRAY
-static void pdgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *,
-		     LocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd);
-#else
-static void pdgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *,
-		     LocalLU_t *, SuperLUStat_t *);
-#endif
-
-/* 
- * Sketch of the algorithm
- * =======================
- *
- * The following relations hold:
- *     * A_kk = L_kk * U_kk
- *     * L_ik = Aik * U_kk^(-1)
- *     * U_kj = L_kk^(-1) * A_kj
- *
- *              ----------------------------------
- *              |   |                            |
- *              ----|-----------------------------
- *              |   | \ U_kk|                    |
- *              |   |   \   |        U_kj        |
- *              |   |L_kk \ |         ||         |
- *              ----|-------|---------||----------
- *              |   |       |         \/         |
- *              |   |       |                    |
- *              |   |       |                    |
- *              |   |       |                    |
- *              |   | L_ik ==>       A_ij        |
- *              |   |       |                    |
- *              |   |       |                    |
- *              |   |       |                    |
- *              ----------------------------------
- *
- * Handle the first block of columns separately.
- *     * Factor diagonal and subdiagonal blocks and test for exact
- *       singularity. ( pdgstrf2(0), one column at a time )
- *     * Compute block row of U
- *     * Update trailing matrix
- * 
- * Loop over the remaining blocks of columns.
- *   mycol = MYCOL( iam, grid );
- *   myrow = MYROW( iam, grid );
- *   N = nsupers;
- *   For (k = 1; k < N; ++k) {
- *       krow = PROW( k, grid );
- *       kcol = PCOL( k, grid );
- *       Pkk = PNUM( krow, kcol, grid );
- *
- *     * Factor diagonal and subdiagonal blocks and test for exact
- *       singularity.
- *       if ( mycol == kcol ) {
- *           pdgstrf2(k), one column at a time 
- *       }
- *
- *     * Parallel triangular solve
- *       if ( iam == Pkk ) multicast L_k,k to this process row;
- *       if ( myrow == krow && mycol != kcol ) {
- *          Recv L_k,k from process Pkk;
- *          for (j = k+1; j < N; ++j) 
- *              if ( PCOL( j, grid ) == mycol && A_k,j != 0 )
- *                 U_k,j = L_k,k \ A_k,j;
- *       }
- *
- *     * Parallel rank-k update
- *       if ( myrow == krow ) multicast U_k,k+1:N to this process column;
- *       if ( mycol == kcol ) multicast L_k+1:N,k to this process row;
- *       if ( myrow != krow ) {
- *          Pkj = PNUM( krow, mycol, grid );
- *          Recv U_k,k+1:N from process Pkj;
- *       }
- *       if ( mycol != kcol ) {
- *          Pik = PNUM( myrow, kcol, grid );
- *          Recv L_k+1:N,k from process Pik;
- *       }
- *       for (j = k+1; k < N; ++k) {
- *          for (i = k+1; i < N; ++i) 
- *              if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
- *                   && L_i,k != 0 && U_k,j != 0 )
- *                 A_i,j = A_i,j - L_i,k * U_k,j;
- *       }
- *  }
- *
- *
- * Remaining issues
- *   (1) Use local indices for L subscripts and SPA.  [DONE]
- *
- */
-/************************************************************************/
-void pdgstrf
-/************************************************************************/
-(
- superlu_options_t *options, int m, int n, double anorm,
- LUstruct_t *LUstruct, gridinfo_t *grid, SuperLUStat_t *stat, int *info
- )
-/* 
- * Purpose
- * =======
- *
- *  PDGSTRF performs the LU factorization in parallel.
- *
- * Arguments
- * =========
- * 
- * options (input) superlu_options_t*
- *         The structure defines the input parameters to control
- *         how the LU decomposition will be performed.
- *         The following field should be defined:
- *         o ReplaceTinyPivot (yes_no_t)
- *           Specifies whether to replace the tiny diagonals by
- *           sqrt(epsilon)*norm(A) during LU factorization.
- *
- * m      (input) int
- *        Number of rows in the matrix.
- *
- * n      (input) int
- *        Number of columns in the matrix.
- *
- * anorm  (input) double
- *        The norm of the original matrix A, or the scaled A if
- *        equilibration was done.
- *
- * LUstruct (input/output) LUstruct_t*
- *         The data structures to store the distributed L and U factors.
- *         The following fields should be defined:
- *
- *         o Glu_persist (input) Glu_persist_t*
- *           Global data structure (xsup, supno) replicated on all processes,
- *           describing the supernode partition in the factored matrices
- *           L and U:
- *	       xsup[s] is the leading column of the s-th supernode,
- *             supno[i] is the supernode number to which column i belongs.
- *
- *         o Llu (input/output) LocalLU_t*
- *           The distributed data structures to store L and U factors.
- *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh. It contains the MPI communicator, the number
- *        of process rows (NPROW), the number of process columns (NPCOL),
- *        and my process rank. It is an input argument to all the
- *        parallel routines.
- *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
- *        See superlu_ddefs.h for the definition of 'gridinfo_t'.
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics on runtime and floating-point operation count.
- *        See util.h for the definition of 'SuperLUStat_t'.
- *
- * info   (output) int*
- *        = 0: successful exit
- *        < 0: if info = -i, the i-th argument had an illegal value
- *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
- *             been completed, but the factor U is exactly singular,
- *             and division by zero will occur if it is used to solve a
- *             system of equations.
- *
- */
-{
-#ifdef _CRAY
-    _fcd ftcs = _cptofcd("N", strlen("N"));
-    _fcd ftcs1 = _cptofcd("L", strlen("L"));
-    _fcd ftcs2 = _cptofcd("N", strlen("N"));
-    _fcd ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-    double alpha = 1.0, beta = 0.0;
-    int_t *xsup;
-    int_t *lsub, *lsub1, *usub, *Usub_buf,
-          *Lsub_buf_2[2];  /* Need 2 buffers to implement Irecv. */
-    double *lusup, *lusup1, *uval, *Uval_buf,
-           *Lval_buf_2[2]; /* Need 2 buffers to implement Irecv. */
-    int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc,
-          lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj,
-          nlb, nub, nsupc, rel, rukp;
-    int_t Pc, Pr;
-    int   iam, kcol, krow, mycol, myrow, pi, pj;
-    int   j, k, lk, nsupers;
-    int   nsupr, nbrow, segsize;
-    int   msgcnt[4]; /* Count the size of the message xfer'd in each buffer:
-		      *     0 : transferred in Lsub_buf[]
-		      *     1 : transferred in Lval_buf[]
-		      *     2 : transferred in Usub_buf[] 
-		      *     3 : transferred in Uval_buf[]
-		      */
-    int_t  msg0, msg2;
-    int_t  **Ufstnz_br_ptr, **Lrowind_bc_ptr;
-    double **Unzval_br_ptr, **Lnzval_bc_ptr;
-    int_t  *index;
-    double *nzval;
-    int_t  *iuip, *ruip;/* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */
-    double *ucol;
-    int_t  *indirect;
-    double *tempv, *tempv2d;
-    int_t iinfo;
-    int_t *ToRecv, *ToSendD, **ToSendR;
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-    superlu_scope_t *scp;
-    double s_eps, thresh;
-    double *tempU2d, *tempu;
-    int    full, ldt, ldu, lead_zero, ncols;
-    MPI_Request recv_req[4], *send_req;
-    MPI_Status status;
-#ifdef CRAY_X1
-    int nonzero_segs;
-#endif
-#if ( DEBUGlevel>=2 ) 
-    int_t num_copy=0, num_update=0;
-#endif
-#if ( PRNTlevel==3 )
-    int_t  zero_msg = 0, total_msg = 0;
-#endif
-#if ( PROFlevel>=1 )
-    double t1, t2;
-    float msg_vol = 0, msg_cnt = 0;
-    int_t iword = sizeof(int_t), dword = sizeof(double);
-#endif
-
-    /* Test the input parameters. */
-    *info = 0;
-    if ( m < 0 ) *info = -2;
-    else if ( n < 0 ) *info = -3;
-    if ( *info ) {
-	pxerbla("pdgstrf", grid, -*info);
-	return;
-    }
-
-    /* Quick return if possible. */
-    if ( m == 0 || n == 0 ) return;
-
-    /*
-     * Initialization.
-     */
-    iam = grid->iam;
-    Pc = grid->npcol;
-    Pr = grid->nprow;
-    myrow = MYROW( iam, grid );
-    mycol = MYCOL( iam, grid );
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    s_eps = slamch_("Epsilon");
-    thresh = s_eps * anorm;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pdgstrf()");
-#endif
-
-    stat->ops[FACT] = 0.0;
-
-    if ( Pr*Pc > 1 ) {
-	i = Llu->bufmax[0];
-	if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist(2 * ((size_t)i))) )
-	    ABORT("Malloc fails for Lsub_buf.");
-	Llu->Lsub_buf_2[1] = Llu->Lsub_buf_2[0] + i;
-	i = Llu->bufmax[1];
-	if ( !(Llu->Lval_buf_2[0] = doubleMalloc_dist(2 * ((size_t)i))) )
-	    ABORT("Malloc fails for Lval_buf[].");
-	Llu->Lval_buf_2[1] = Llu->Lval_buf_2[0] + i;
-	if ( Llu->bufmax[2] != 0 ) 
-	    if ( !(Llu->Usub_buf = intMalloc_dist(Llu->bufmax[2])) )
-		ABORT("Malloc fails for Usub_buf[].");
-	if ( Llu->bufmax[3] != 0 ) 
-	    if ( !(Llu->Uval_buf = doubleMalloc_dist(Llu->bufmax[3])) )
-		ABORT("Malloc fails for Uval_buf[].");
-	if ( !(send_req =
-	       (MPI_Request *) SUPERLU_MALLOC(2*Pc*sizeof(MPI_Request))))
-	    ABORT("Malloc fails for send_req[].");
-    }
-    if ( !(Llu->ujrow = doubleMalloc_dist(sp_ienv_dist(3))) )
-	ABORT("Malloc fails for ujrow[].");
-
-#if ( PRNTlevel>=1 )
-    if ( !iam ) {
-	printf(".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, thresh);
-	printf(".. Buffer size: Lsub %d\tLval %d\tUsub %d\tUval %d\tLDA %d\n",
-	       Llu->bufmax[0], Llu->bufmax[1], 
-	       Llu->bufmax[2], Llu->bufmax[3], Llu->bufmax[4]);
-    }
-#endif
-
-    Lsub_buf_2[0] = Llu->Lsub_buf_2[0];
-    Lsub_buf_2[1] = Llu->Lsub_buf_2[1];
-    Lval_buf_2[0] = Llu->Lval_buf_2[0];
-    Lval_buf_2[1] = Llu->Lval_buf_2[1];
-    Usub_buf = Llu->Usub_buf;
-    Uval_buf = Llu->Uval_buf;
-    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
-    Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
-    Unzval_br_ptr = Llu->Unzval_br_ptr;
-    ToRecv = Llu->ToRecv;
-    ToSendD = Llu->ToSendD;
-    ToSendR = Llu->ToSendR;
-
-    ldt = sp_ienv_dist(3); /* Size of maximum supernode */
-    if ( !(tempv2d = doubleCalloc_dist(2*((size_t)ldt)*ldt)) )
-	ABORT("Calloc fails for tempv2d[].");
-    tempU2d = tempv2d + ldt*ldt;
-#ifdef CRAY_X1
-    if ( !(indirect = intMalloc_dist(2*ldt)) )
-	ABORT("Malloc fails for indirect[].");
-#else
-    if ( !(indirect = intMalloc_dist(ldt)) )
-	ABORT("Malloc fails for indirect[].");
-#endif
-    k = CEILING( nsupers, Pr ); /* Number of local block rows */
-    if ( !(iuip = intMalloc_dist(k)) )
-	ABORT("Malloc fails for iuip[].");
-    if ( !(ruip = intMalloc_dist(k)) )
-	ABORT("Malloc fails for ruip[].");
-
-#if ( VAMPIR>=1 )
-    VT_symdef(1, "Send-L", "Comm");
-    VT_symdef(2, "Recv-L", "Comm");
-    VT_symdef(3, "Send-U", "Comm");
-    VT_symdef(4, "Recv-U", "Comm");
-    VT_symdef(5, "TRF2", "Factor");
-    VT_symdef(100, "Factor", "Factor");
-    VT_begin(100);
-    VT_traceon();
-#endif
-
-    /* ---------------------------------------------------------------
-       Handle the first block column separately to start the pipeline.
-       --------------------------------------------------------------- */
-    if ( mycol == 0 ) {
-#if ( VAMPIR>=1 )
-	VT_begin(5);
-#endif
-	pdgstrf2(options, 0, thresh, Glu_persist, grid, Llu, stat, info);
-#if ( VAMPIR>=1 )
-	VT_end(5);
-#endif
-
-	scp = &grid->rscp; /* The scope of process row. */
-
-	/* Process column *kcol* multicasts numeric values of L(:,k) 
-	   to process rows. */
-	lsub = Lrowind_bc_ptr[0];
-	lusup = Lnzval_bc_ptr[0];
-	if ( lsub ) {
-	    msgcnt[0] = lsub[1] + BC_HEADER + lsub[0]*LB_DESCRIPTOR;
-	    msgcnt[1] = lsub[1] * SuperSize( 0 );
-	} else {
-	    msgcnt[0] = msgcnt[1] = 0;
-	}
-	
-	for (pj = 0; pj < Pc; ++pj) {
-	    if ( ToSendR[0][pj] != EMPTY ) {
-#if ( PROFlevel>=1 )
-		TIC(t1);
-#endif
-#if ( VAMPIR>=1 )
-		VT_begin(1);
-#endif
-		MPI_Isend( lsub, msgcnt[0], mpi_int_t, pj, 0, scp->comm,
-			  &send_req[pj] );
-		MPI_Isend( lusup, msgcnt[1], MPI_DOUBLE, pj, 1, scp->comm,
-			  &send_req[pj+Pc] );
-#if ( DEBUGlevel>=2 )
-		printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
-		       iam, 0, msgcnt[0], msgcnt[1], pj);
-#endif
-#if ( VAMPIR>=1 )
-		VT_end(1);
-#endif
-#if ( PROFlevel>=1 )
-		TOC(t2, t1);
-		stat->utime[COMM] += t2;
-		msg_cnt += 2;
-		msg_vol += msgcnt[0]*iword + msgcnt[1]*dword;
-#endif
-	    }
-	} /* for pj ... */
-    } else { /* Post immediate receives. */
-	if ( ToRecv[0] >= 1 ) { /* Recv block column L(:,0). */
-	    scp = &grid->rscp; /* The scope of process row. */
-	    MPI_Irecv( Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, 0,
-		      0, scp->comm, &recv_req[0] );
-	    MPI_Irecv( Lval_buf_2[0], Llu->bufmax[1], MPI_DOUBLE, 0,
-		      1, scp->comm, &recv_req[1] );
-#if ( DEBUGlevel>=2 )
-	    printf("(%d) Post Irecv L(:,%4d)\n", iam, 0);
-#endif
-	}
-    } /* if mycol == 0 */
-
-    /* ------------------------------------------
-       MAIN LOOP: Loop through all block columns.
-       ------------------------------------------ */
-    for (k = 0; k < nsupers; ++k) {
-
-	knsupc = SuperSize( k );
-	krow = PROW( k, grid );
-	kcol = PCOL( k, grid );
-
-	if ( mycol == kcol ) {
-	    lk = LBj( k, grid ); /* Local block number. */
-
-	    for (pj = 0; pj < Pc; ++pj) {
-                /* Wait for Isend to complete before using lsub/lusup. */
-		if ( ToSendR[lk][pj] != EMPTY ) {
-		    MPI_Wait( &send_req[pj], &status );
-		    MPI_Wait( &send_req[pj+Pc], &status );
-		}
-	    }
-	    lsub = Lrowind_bc_ptr[lk];
-	    lusup = Lnzval_bc_ptr[lk];
-	} else {
-	    if ( ToRecv[k] >= 1 ) { /* Recv block column L(:,k). */
-		scp = &grid->rscp; /* The scope of process row. */
-#if ( PROFlevel>=1 )
-		TIC(t1);
-#endif
-#if ( VAMPIR>=1 )
-		VT_begin(2);
-#endif
-		/*probe_recv(iam, kcol, (4*k)%NTAGS, mpi_int_t, scp->comm, 
-		  Llu->bufmax[0]);*/
-		/*MPI_Recv( Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol, 
-			 (4*k)%NTAGS, scp->comm, &status );*/
-		MPI_Wait( &recv_req[0], &status );
-		MPI_Get_count( &status, mpi_int_t, &msgcnt[0] );
-		/*probe_recv(iam, kcol, (4*k+1)%NTAGS, MPI_DOUBLE, scp->comm, 
-		  Llu->bufmax[1]);*/
-		/*MPI_Recv( Lval_buf, Llu->bufmax[1], MPI_DOUBLE, kcol, 
-			 (4*k+1)%NTAGS, scp->comm, &status );*/
-		MPI_Wait( &recv_req[1], &status );
-		MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[1] );
-#if ( VAMPIR>=1 )
-		VT_end(2);
-#endif
-#if ( PROFlevel>=1 )
-		TOC(t2, t1);
-		stat->utime[COMM] += t2;
-#endif
-#if ( DEBUGlevel>=2 )
-		printf("(%d) Recv L(:,%4d): lsub %4d, lusup %4d from Pc %2d\n",
-		       iam, k, msgcnt[0], msgcnt[1], kcol);
-		fflush(stdout);
-#endif
-		lsub = Lsub_buf_2[k%2];
-		lusup = Lval_buf_2[k%2];
-#if ( PRNTlevel==3 )
-		++total_msg;
-		if ( !msgcnt[0] ) ++zero_msg;
-#endif
-	    } else msgcnt[0] = 0;
-	} /* if mycol = Pc(k) */
-
-	scp = &grid->cscp; /* The scope of process column. */
-
-	if ( myrow == krow ) {
-	    /* Parallel triangular solve across process row *krow* --
-	       U(k,j) = L(k,k) \ A(k,j).  */
-#ifdef _CRAY
-	    pdgstrs2(n, k, Glu_persist, grid, Llu, stat, ftcs1, ftcs2, ftcs3);
-#else
-	    pdgstrs2(n, k, Glu_persist, grid, Llu, stat);
-#endif
-
-	    /* Multicasts U(k,:) to process columns. */
-	    lk = LBi( k, grid );
-	    usub = Ufstnz_br_ptr[lk];
-	    uval = Unzval_br_ptr[lk];
-	    if ( usub )	{
-		msgcnt[2] = usub[2];
-		msgcnt[3] = usub[1];
-	    } else {
-		msgcnt[2] = msgcnt[3] = 0;
-	    }
-
-	    if ( ToSendD[lk] == YES ) {
-		for (pi = 0; pi < Pr; ++pi) {
-		    if ( pi != myrow ) {
-#if ( PROFlevel>=1 )
-			TIC(t1);
-#endif
-#if ( VAMPIR>=1 )
-			VT_begin(3);
-#endif
-			MPI_Send( usub, msgcnt[2], mpi_int_t, pi,
-				 (4*k+2)%NTAGS, scp->comm);
-			MPI_Send( uval, msgcnt[3], MPI_DOUBLE, pi,
-				 (4*k+3)%NTAGS, scp->comm);
-#if ( VAMPIR>=1 )
-			VT_end(3);
-#endif
-#if ( PROFlevel>=1 )
-			TOC(t2, t1);
-			stat->utime[COMM] += t2;
-			msg_cnt += 2;
-			msg_vol += msgcnt[2]*iword + msgcnt[3]*dword;
-#endif
-#if ( DEBUGlevel>=2 )
-			printf("(%d) Send U(%4d,:) to Pr %2d\n", iam, k, pi);
-#endif
-		    } /* if pi ... */
-		} /* for pi ... */
-	    } /* if ToSendD ... */
-	} else { /* myrow != krow */
-	    if ( ToRecv[k] == 2 ) { /* Recv block row U(k,:). */
-#if ( PROFlevel>=1 )
-		TIC(t1);
-#endif
-#if ( VAMPIR>=1 )
-		VT_begin(4);
-#endif
-		/*probe_recv(iam, krow, (4*k+2)%NTAGS, mpi_int_t, scp->comm, 
-		  Llu->bufmax[2]);*/
-		MPI_Recv( Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
-			 (4*k+2)%NTAGS, scp->comm, &status );
-		MPI_Get_count( &status, mpi_int_t, &msgcnt[2] );
-		/*probe_recv(iam, krow, (4*k+3)%NTAGS, MPI_DOUBLE, scp->comm, 
-		  Llu->bufmax[3]);*/
-		MPI_Recv( Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow, 
-			 (4*k+3)%NTAGS, scp->comm, &status );
-		MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[3] );
-#if ( VAMPIR>=1 )
-		VT_end(4);
-#endif
-#if ( PROFlevel>=1 )
-		TOC(t2, t1);
-		stat->utime[COMM] += t2;
-#endif
-		usub = Usub_buf;
-		uval = Uval_buf;
-#if ( DEBUGlevel>=2 )
-		printf("(%d) Recv U(%4d,:) from Pr %2d\n", iam, k, krow);
-#endif
-#if ( PRNTlevel==3 )
-		++total_msg;
-		if ( !msgcnt[2] ) ++zero_msg;
-#endif
-	    } else msgcnt[2] = 0;
-	} /* if myrow == Pr(k) */
-	  
-	/* 
-	 * Parallel rank-k update; pair up blocks L(i,k) and U(k,j).
-	 *  for (j = k+1; k < N; ++k) {
-	 *     for (i = k+1; i < N; ++i) 
-	 *         if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
-	 *              && L(i,k) != 0 && U(k,j) != 0 )
-	 *             A(i,j) = A(i,j) - L(i,k) * U(k,j);
-	 */
-	msg0 = msgcnt[0];
-	msg2 = msgcnt[2];
-	if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
-	    nsupr = lsub[1]; /* LDA of lusup. */
-	    if ( myrow == krow ) { /* Skip diagonal block L(k,k). */
-		lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER+1];
-		luptr0 = knsupc;
-		nlb = lsub[0] - 1;
-	    } else {
-		lptr0 = BC_HEADER;
-		luptr0 = 0;
-		nlb = lsub[0];
-	    }
-	    lptr = lptr0;
-	    for (lb = 0; lb < nlb; ++lb) { /* Initialize block row pointers. */
-		ib = lsub[lptr];
-		lib = LBi( ib, grid );
-		iuip[lib] = BR_HEADER;
-		ruip[lib] = 0;
-		lptr += LB_DESCRIPTOR + lsub[lptr+1];
-	    }
-	    nub = usub[0];    /* Number of blocks in the block row U(k,:) */
-	    iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */
-	    rukp = 0;         /* Pointer to nzval[] of U(k,:) */
-	    klst = FstBlockC( k+1 );
-	    
-	    /* ---------------------------------------------------
-	       Update the first block column A(:,k+1).
-	       --------------------------------------------------- */
-	    jb = usub[iukp];   /* Global block number of block U(k,j). */
-	    if ( jb == k+1 ) { /* First update (k+1)-th block. */
-		--nub;
-		lptr = lptr0;
-		luptr = luptr0;
-		ljb = LBj( jb, grid ); /* Local block number of U(k,j). */
-		nsupc = SuperSize( jb );
-		iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
-
-		/* Prepare to call DGEMM. */
-		jj = iukp;
-		while ( usub[jj] == klst ) ++jj;
-		ldu = klst - usub[jj++];
-		ncols = 1;
-		full = 1;
-		for (; jj < iukp+nsupc; ++jj) {
-		    segsize = klst - usub[jj];
-		    if ( segsize ) {
-		        ++ncols;
-			if ( segsize != ldu ) full = 0;
-		        if ( segsize > ldu ) ldu = segsize;
-		    }
-		}
-#if ( DEBUGlevel>=3 )
-		++num_update;
-#endif
-		if ( full ) {
-		    tempu = &uval[rukp];
-		} else { /* Copy block U(k,j) into tempU2d. */
-#if ( DEBUGlevel>=3 )
-		  printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
-			 iam, full, k, jb, ldu, ncols, nsupc);
-		  ++num_copy;
-#endif
-		    tempu = tempU2d;
-		    for (jj = iukp; jj < iukp+nsupc; ++jj) {
-		        segsize = klst - usub[jj];
-			if ( segsize ) {
-			    lead_zero = ldu - segsize;
-			    for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0;
-			    tempu += lead_zero;
-			    for (i = 0; i < segsize; ++i)
-				tempu[i] = uval[rukp+i];
-			    rukp += segsize;
-			    tempu += segsize;
-			}
-		    }
-		    tempu = tempU2d;
-		    rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
-		} /* if full ... */
-
-		for (lb = 0; lb < nlb; ++lb) { 
-		    ib = lsub[lptr]; /* Row block L(i,k). */
-		    nbrow = lsub[lptr+1];  /* Number of full rows. */
-		    lptr += LB_DESCRIPTOR; /* Skip descriptor. */
-		    tempv = tempv2d;
-#ifdef _CRAY
-		    SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, 
-			  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
-			  tempu, &ldu, &beta, tempv, &ldt);
-#else
-		    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, 
-			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
-			   tempu, &ldu, &beta, tempv, &ldt);
-#endif
-		    stat->ops[FACT] += 2 * nbrow * ldu * ncols;
-
-		    /* Now gather the result into the destination block. */
-		    if ( ib < jb ) { /* A(i,j) is in U. */
-			ilst = FstBlockC( ib+1 );
-			lib = LBi( ib, grid );
-			index = Ufstnz_br_ptr[lib];
-			ijb = index[iuip[lib]];
-			while ( ijb < jb ) { /* Search for dest block. */
-			    ruip[lib] += index[iuip[lib]+1];
-			    iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb );
-			    ijb = index[iuip[lib]];
-			}
-			iuip[lib] += UB_DESCRIPTOR; /* Skip descriptor. */
-
-			tempv = tempv2d;
-			for (jj = 0; jj < nsupc; ++jj) {
-			    segsize = klst - usub[iukp + jj];
-			    fnz = index[iuip[lib]++];
-			    if ( segsize ) { /* Nonzero segment in U(k.j). */
-				ucol = &Unzval_br_ptr[lib][ruip[lib]];
-				for (i = 0, it = 0; i < nbrow; ++i) {
-				    rel = lsub[lptr + i] - fnz;
-				    ucol[rel] -= tempv[it++];
-				}
-				tempv += ldt;
-			    }
-			    ruip[lib] += ilst - fnz;
-			}
-		    } else { /* A(i,j) is in L. */
-			index = Lrowind_bc_ptr[ljb];
-			ldv = index[1];   /* LDA of the dest lusup. */
-			lptrj = BC_HEADER;
-			luptrj = 0;
-			ijb = index[lptrj];
-			while ( ijb != ib ) { /* Search for dest block -- 
-						 blocks are not ordered! */
-			    luptrj += index[lptrj+1];
-			    lptrj += LB_DESCRIPTOR + index[lptrj+1];
-			    ijb = index[lptrj];
-			}
-			/*
-			 * Build indirect table. This is needed because the
-			 * indices are not sorted.
-			 */
-			fnz = FstBlockC( ib );
-			lptrj += LB_DESCRIPTOR;
-			for (i = 0; i < index[lptrj-1]; ++i) {
-			    rel = index[lptrj + i] - fnz;
-			    indirect[rel] = i;
-			}
-			nzval = Lnzval_bc_ptr[ljb] + luptrj;
-			tempv = tempv2d;
-			for (jj = 0; jj < nsupc; ++jj) {
-			    segsize = klst - usub[iukp + jj];
-			    if ( segsize ) {
-/*#pragma _CRI cache_bypass nzval,tempv*/
-				for (it = 0, i = 0; i < nbrow; ++i) {
-				    rel = lsub[lptr + i] - fnz;
-				    nzval[indirect[rel]] -= tempv[it++];
-				}
-				tempv += ldt;
-			    }
-			    nzval += ldv;
-			}
-		    } /* if ib < jb ... */
-		    lptr += nbrow;
-		    luptr += nbrow;
-		} /* for lb ... */
-		rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */
-		iukp += nsupc;
-	    }  /* if jb == k+1 */
-	} /* if L(:,k) and U(k,:) not empty */
-
-
-	if ( k+1 < nsupers ) {
-	  kcol = PCOL( k+1, grid );
-	  if ( mycol == kcol ) {
-#if ( VAMPIR>=1 )
-	    VT_begin(5);
-#endif
-	    /* Factor diagonal and subdiagonal blocks and test for exact
-	       singularity.  */
-	    pdgstrf2(options, k+1, thresh, Glu_persist, grid, Llu, stat, info);
-#if ( VAMPIR>=1 )
-	    VT_end(5);
-#endif
-
-	    /* Process column *kcol+1* multicasts numeric values of L(:,k+1) 
-	       to process rows. */
-	    lk = LBj( k+1, grid ); /* Local block number. */
-	    lsub1 = Lrowind_bc_ptr[lk];
- 	    if ( lsub1 ) {
-		msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0]*LB_DESCRIPTOR;
-		msgcnt[1] = lsub1[1] * SuperSize( k+1 );
-	    } else {
-		msgcnt[0] = 0;
-		msgcnt[1] = 0;
-	    }
-	    scp = &grid->rscp; /* The scope of process row. */
-	    for (pj = 0; pj < Pc; ++pj) {
-		if ( ToSendR[lk][pj] != EMPTY ) {
-		    lusup1 = Lnzval_bc_ptr[lk];
-#if ( PROFlevel>=1 )
-		    TIC(t1);
-#endif
-#if ( VAMPIR>=1 )
-		    VT_begin(1);
-#endif
-		    MPI_Isend( lsub1, msgcnt[0], mpi_int_t, pj,
-			      (4*(k+1))%NTAGS, scp->comm, &send_req[pj] );
-		    MPI_Isend( lusup1, msgcnt[1], MPI_DOUBLE, pj,
-			     (4*(k+1)+1)%NTAGS, scp->comm, &send_req[pj+Pc] );
-#if ( VAMPIR>=1 )
-		    VT_end(1);
-#endif
-#if ( PROFlevel>=1 )
-		    TOC(t2, t1);
-		    stat->utime[COMM] += t2;
-		    msg_cnt += 2;
-		    msg_vol += msgcnt[0]*iword + msgcnt[1]*dword;
-#endif
-#if ( DEBUGlevel>=2 )
-		    printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
-			   iam, k+1, msgcnt[0], msgcnt[1], pj);
-#endif
-		}
-	    } /* for pj ... */
-	  } else { /* Post Recv of block column L(:,k+1). */
-	    if ( ToRecv[k+1] >= 1 ) {
-		scp = &grid->rscp; /* The scope of process row. */
-		MPI_Irecv(Lsub_buf_2[(k+1)%2], Llu->bufmax[0], mpi_int_t, kcol,
-			  (4*(k+1))%NTAGS, scp->comm, &recv_req[0]);
-		MPI_Irecv(Lval_buf_2[(k+1)%2], Llu->bufmax[1], MPI_DOUBLE, kcol, 
-			  (4*(k+1)+1)%NTAGS, scp->comm, &recv_req[1]);
-#if ( DEBUGlevel>=2 )
-		printf("(%d) Post Irecv L(:,%4d)\n", iam, k+1);
-#endif
-	    }
-	  } /* if mycol == Pc(k+1) */
-        } /* if k+1 < nsupers */
-
-	if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
-	    /* ---------------------------------------------------
-	       Update all other blocks using block row U(k,:)
-	       --------------------------------------------------- */
-	    for (j = 0; j < nub; ++j) { 
-		lptr = lptr0;
-		luptr = luptr0;
-		jb = usub[iukp];  /* Global block number of block U(k,j). */
-		ljb = LBj( jb, grid ); /* Local block number of U(k,j). */
-		nsupc = SuperSize( jb );
-		iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
-
-		/* Prepare to call DGEMM. */
-		jj = iukp;
-		while ( usub[jj] == klst ) ++jj;
-		ldu = klst - usub[jj++];
-		ncols = 1;
-		full = 1;
-		for (; jj < iukp+nsupc; ++jj) {
-		    segsize = klst - usub[jj];
-		    if ( segsize ) {
-		        ++ncols;
-			if ( segsize != ldu ) full = 0;
-		        if ( segsize > ldu ) ldu = segsize;
-		    }
-		}
-#if ( DEBUGlevel>=3 )
-		printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
-		       iam, full, k, jb, ldu, ncols, nsupc);
-		++num_update;
-#endif
-		if ( full ) {
-		    tempu = &uval[rukp];
-		} else { /* Copy block U(k,j) into tempU2d. */
-#if ( DEBUGlevel>=3 )
-		    ++num_copy;
-#endif
-		    tempu = tempU2d;
-		    for (jj = iukp; jj < iukp+nsupc; ++jj) {
-		        segsize = klst - usub[jj];
-			if ( segsize ) {
-			    lead_zero = ldu - segsize;
-			    for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0;
-			    tempu += lead_zero;
-			    for (i = 0; i < segsize; ++i)
-			        tempu[i] = uval[rukp+i];
-			    rukp += segsize;
-			    tempu += segsize;
-			}
-		    }
-		    tempu = tempU2d;
-		    rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
-		} /* if full ... */
-
-		for (lb = 0; lb < nlb; ++lb) { 
-		    ib = lsub[lptr];       /* Row block L(i,k). */
-		    nbrow = lsub[lptr+1];  /* Number of full rows. */
-		    lptr += LB_DESCRIPTOR; /* Skip descriptor. */
-		    tempv = tempv2d;
-#ifdef _CRAY
-		    SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, 
-			  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
-			  tempu, &ldu, &beta, tempv, &ldt);
-#else
-		    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, 
-			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
-			   tempu, &ldu, &beta, tempv, &ldt);
-#endif
-		    stat->ops[FACT] += 2 * nbrow * ldu * ncols;
-
-		    /* Now gather the result into the destination block. */
-		    if ( ib < jb ) { /* A(i,j) is in U. */
-			ilst = FstBlockC( ib+1 );
-			lib = LBi( ib, grid );
-			index = Ufstnz_br_ptr[lib];
-			ijb = index[iuip[lib]];
-			while ( ijb < jb ) { /* Search for dest block. */
-			    ruip[lib] += index[iuip[lib]+1];
-			    iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb );
-			    ijb = index[iuip[lib]];
-			}
-			/* Skip descriptor.  Now point to fstnz index of 
-			   block U(i,j). */
-			iuip[lib] += UB_DESCRIPTOR;
-
-			tempv = tempv2d;
-			for (jj = 0; jj < nsupc; ++jj) {
-			    segsize = klst - usub[iukp + jj];
-			    fnz = index[iuip[lib]++];
-			    if ( segsize ) { /* Nonzero segment in U(k.j). */
-				ucol = &Unzval_br_ptr[lib][ruip[lib]];
-				for (i = 0 ; i < nbrow; ++i) {
-				    rel = lsub[lptr + i] - fnz;
-				    ucol[rel] -= tempv[i];
-				}
-				tempv += ldt;
-			    }
-			    ruip[lib] += ilst - fnz;
-			}
-		    } else { /* A(i,j) is in L. */
-			index = Lrowind_bc_ptr[ljb];
-			ldv = index[1];   /* LDA of the dest lusup. */
-			lptrj = BC_HEADER;
-			luptrj = 0;
-			ijb = index[lptrj];
-			while ( ijb != ib ) { /* Search for dest block -- 
-						 blocks are not ordered! */
-			    luptrj += index[lptrj+1];
-			    lptrj += LB_DESCRIPTOR + index[lptrj+1];
-			    ijb = index[lptrj];
-			}
-			/*
-			 * Build indirect table. This is needed because the
-			 * indices are not sorted for the L blocks.
-			 */
-			fnz = FstBlockC( ib );
-			lptrj += LB_DESCRIPTOR;
-			for (i = 0; i < index[lptrj-1]; ++i) {
-			    rel = index[lptrj + i] - fnz;
-			    indirect[rel] = i;
-			}
-			nzval = Lnzval_bc_ptr[ljb] + luptrj;
-			tempv = tempv2d;
-			for (jj = 0; jj < nsupc; ++jj) {
-			    segsize = klst - usub[iukp + jj];
-			    if ( segsize ) {
-/*#pragma _CRI cache_bypass nzval,tempv*/
-				for (i = 0; i < nbrow; ++i) {
-				    rel = lsub[lptr + i] - fnz;
-				    nzval[indirect[rel]] -= tempv[i];
-				}
-				tempv += ldt;
-			    }
-			    nzval += ldv;
-			}
-		    } /* if ib < jb ... */
-		    lptr += nbrow;
-		    luptr += nbrow;
-		} /* for lb ... */
-		rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */
-		iukp += nsupc;
-	    } /* for j ... */
-	} /* if  k L(:,k) and U(k,:) are not empty */
-
-    } 
-    /* ------------------------------------------
-       END MAIN LOOP: for k = ...
-       ------------------------------------------ */
-
-#if ( VAMPIR>=1 )
-    VT_end(100);
-    VT_traceoff();
-#endif
-
-    if ( Pr*Pc > 1 ) {
-	SUPERLU_FREE(Lsub_buf_2[0]); /* also free Lsub_buf_2[1] */
-	SUPERLU_FREE(Lval_buf_2[0]); /* also free Lval_buf_2[1] */
-	if ( Llu->bufmax[2] != 0 ) SUPERLU_FREE(Usub_buf);
-	if ( Llu->bufmax[3] != 0 ) SUPERLU_FREE(Uval_buf);
-	SUPERLU_FREE(send_req);
-    }
-
-    SUPERLU_FREE(Llu->ujrow);
-    SUPERLU_FREE(tempv2d);
-    SUPERLU_FREE(indirect);
-    SUPERLU_FREE(iuip);
-    SUPERLU_FREE(ruip);
-
-    /* Prepare error message. */
-    if ( *info == 0 ) *info = n + 1;
-#if ( PROFlevel>=1 )
-    TIC(t1);
-#endif
-    MPI_Allreduce( info, &iinfo, 1, mpi_int_t, MPI_MIN, grid->comm );
-#if ( PROFlevel>=1 )
-    TOC(t2, t1);
-    stat->utime[COMM] += t2;
-    {
-	float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum;
-	
-	MPI_Reduce( &msg_cnt, &msg_cnt_sum,
-		   1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-	MPI_Reduce( &msg_cnt, &msg_cnt_max,
-		   1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
-	MPI_Reduce( &msg_vol, &msg_vol_sum,
-		   1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-	MPI_Reduce( &msg_vol, &msg_vol_max,
-		   1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
-	if ( !iam ) {
-	    printf("\tPDGSTRF comm stat:"
-		   "\tAvg\tMax\t\tAvg\tMax\n"
-		   "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n",
-		   msg_cnt_sum/Pr/Pc, msg_cnt_max,
-		   msg_vol_sum/Pr/Pc*1e-6, msg_vol_max*1e-6);
-	}
-    }
-#endif
-    if ( iinfo == n + 1 ) *info = 0;
-    else *info = iinfo;
-
-
-#if ( PRNTlevel==3 )
-    MPI_Allreduce( &zero_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm );
-    if ( !iam ) printf(".. # msg of zero size\t%d\n", iinfo);
-    MPI_Allreduce( &total_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm );
-    if ( !iam ) printf(".. # total msg\t%d\n", iinfo);
-#endif
-
-#if ( PRNTlevel==2 )
-    for (i = 0; i < Pr * Pc; ++i) {
-	if ( iam == i ) {
-	    dPrintLblocks(iam, nsupers, grid, Glu_persist, Llu);
-	    dPrintUblocks(iam, nsupers, grid, Glu_persist, Llu);
-	    printf("(%d)\n", iam);
-	    PrintInt10("Recv", nsupers, Llu->ToRecv);
-	}
-	MPI_Barrier( grid->comm );
-    }
-#endif
-
-#if ( DEBUGlevel>=3 )
-    printf("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update);
-#endif
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pdgstrf()");
-#endif
-} /* PDGSTRF */
-
-
-/************************************************************************/
-static void pdgstrf2
-/************************************************************************/
-(
- superlu_options_t *options,
- int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid,
- LocalLU_t *Llu, SuperLUStat_t *stat, int* info
- )
-/* 
- * Purpose
- * =======
- *   Factor diagonal and subdiagonal blocks and test for exact singularity.
- *   Only the process column that owns block column *k* participates
- *   in the work.
- * 
- * Arguments
- * =========
- *
- * k      (input) int (global)
- *        The column number of the block column to be factorized.
- *
- * thresh (input) double (global)
- *        The threshold value = s_eps * anorm.
- *
- * Glu_persist (input) Glu_persist_t*
- *        Global data structures (xsup, supno) replicated on all processes.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh.
- *
- * Llu    (input/output) LocalLU_t*
- *        Local data structures to store distributed L and U matrices.
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics about the factorization.
- *        See SuperLUStat_t structure defined in util.h.
- *
- * info   (output) int*
- *        = 0: successful exit
- *        < 0: if info = -i, the i-th argument had an illegal value
- *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
- *             been completed, but the factor U is exactly singular,
- *             and division by zero will occur if it is used to solve a
- *             system of equations.
- *
- */
-{
-    int    c, iam, l, pkk;
-    int    incx = 1, incy = 1;
-    int    nsupr; /* number of rows in the block (LDA) */
-    int    luptr;
-    int_t  i, krow, j, jfst, jlst;
-    int_t  nsupc; /* number of columns in the block */
-    int_t  *xsup = Glu_persist->xsup;
-    double *lusup, temp;
-    double *ujrow;
-    double alpha = -1;
-    *info = 0;
-
-    /* Quick return. */
-
-    /* Initialization. */
-    iam   = grid->iam;
-    krow  = PROW( k, grid );
-    pkk   = PNUM( PROW(k, grid), PCOL(k, grid), grid );
-    j     = LBj( k, grid ); /* Local block number */
-    jfst  = FstBlockC( k );
-    jlst  = FstBlockC( k+1 );
-    lusup = Llu->Lnzval_bc_ptr[j];
-    nsupc = SuperSize( k );
-    if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1];
-    ujrow = Llu->ujrow;
-
-    luptr = 0; /* Point to the diagonal entries. */
-    c = nsupc;
-    for (j = 0; j < jlst - jfst; ++j) {
-	/* Broadcast the j-th row (nsupc - j) elements to
-	   the process column. */
-	if ( iam == pkk ) { /* Diagonal process. */
-	    i = luptr;
-	    if ( options->ReplaceTinyPivot == YES || lusup[i] == 0.0 ) {
-		if ( fabs(lusup[i]) < thresh ) { /* Diagonal */
-#if ( PRNTlevel>=2 )
-		    printf("(%d) .. col %d, tiny pivot %e  ",
-			   iam, jfst+j, lusup[i]);
-#endif
-		    /* Keep the replaced diagonal with the same sign. */
-		    if ( lusup[i] < 0 ) lusup[i] = -thresh;
-		    else lusup[i] = thresh;
-#if ( PRNTlevel>=2 )
-		    printf("replaced by %e\n", lusup[i]);
-#endif
-		    ++(stat->TinyPivots);
-		}
-	    }
-	    for (l = 0; l < c; ++l, i += nsupr)	ujrow[l] = lusup[i];
-	}
-#if 0
-	dbcast_col(ujrow, c, pkk, UjROW, grid, &c);
-#else
-	MPI_Bcast(ujrow, c, MPI_DOUBLE, krow, (grid->cscp).comm);
-	/*bcast_tree(ujrow, c, MPI_DOUBLE, krow, (24*k+j)%NTAGS,
-		   grid, COMM_COLUMN, &c);*/
-#endif
-
-#if ( DEBUGlevel>=2 )
-if ( k == 3329 && j == 2 ) {
-	if ( iam == pkk ) {
-	    printf("..(%d) k %d, j %d: Send ujrow[0] %e\n",iam,k,j,ujrow[0]);
-	} else {
-	    printf("..(%d) k %d, j %d: Recv ujrow[0] %e\n",iam,k,j,ujrow[0]);
-	}
-}
-#endif
-
-	if ( !lusup ) { /* Empty block column. */
-	    --c;
-	    if ( ujrow[0] == 0.0 ) *info = j+jfst+1;
-	    continue;
-	}
-
-	/* Test for singularity. */
-	if ( ujrow[0] == 0.0 ) {
-	    *info = j+jfst+1;
-	} else {
-	    /* Scale the j-th column of the matrix. */
-	    temp = 1.0 / ujrow[0];
-	    if ( iam == pkk ) {
-		for (i = luptr+1; i < luptr-j+nsupr; ++i) lusup[i] *= temp;
-		stat->ops[FACT] += nsupr-j-1;
-	    } else {
-		for (i = luptr; i < luptr+nsupr; ++i) lusup[i] *= temp;
-		stat->ops[FACT] += nsupr;
-	    }
-	}
-	    
-	/* Rank-1 update of the trailing submatrix. */
-	if ( --c ) {
-	    if ( iam == pkk ) {
-		l = nsupr - j - 1;
-#ifdef _CRAY
-		SGER(&l, &c, &alpha, &lusup[luptr+1], &incx,
-		     &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
-#else
-		dger_(&l, &c, &alpha, &lusup[luptr+1], &incx,
-		      &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
-#endif
-		stat->ops[FACT] += 2 * l * c;
-	    } else {
-#ifdef _CRAY
-		SGER(&nsupr, &c, &alpha, &lusup[luptr], &incx, 
-		     &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
-#else
-		dger_(&nsupr, &c, &alpha, &lusup[luptr], &incx, 
-		      &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
-#endif
-		stat->ops[FACT] += 2 * nsupr * c;
-	    }
-	}
-	
-	/* Move to the next column. */
-	if ( iam == pkk ) luptr += nsupr + 1;
-	else luptr += nsupr;
-
-    } /* for j ... */
-
-} /* PDGSTRF2 */
-
-
-/************************************************************************/
-static void pdgstrs2
-/************************************************************************/
-#ifdef _CRAY
-(
- int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid,
- LocalLU_t *Llu, SuperLUStat_t *stat, _fcd ftcs1, _fcd ftcs2, _fcd ftcs3
- )
-#else
-(
- int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid,
- LocalLU_t *Llu, SuperLUStat_t *stat
- )
-#endif
-/* 
- * Purpose
- * =======
- *   Perform parallel triangular solves
- *           U(k,:) := A(k,:) \ L(k,k). 
- *   Only the process column that owns block column *k* participates
- *   in the work.
- * 
- * Arguments
- * =========
- *
- * m      (input) int (global)
- *        Number of rows in the matrix.
- *
- * k      (input) int (global)
- *        The row number of the block row to be factorized.
- *
- * Glu_persist (input) Glu_persist_t*
- *        Global data structures (xsup, supno) replicated on all processes.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh.
- *
- * Llu    (input/output) LocalLU_t*
- *        Local data structures to store distributed L and U matrices.
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics about the factorization; 
- *        See SuperLUStat_t structure defined in util.h.
- *
- */
-{
-    int    iam, pkk;
-    int    incx = 1;
-    int    nsupr; /* number of rows in the block L(:,k) (LDA) */
-    int    segsize;
-    int_t  nsupc; /* number of columns in the block */
-    int_t  luptr, iukp, rukp;
-    int_t  b, gb, j, klst, knsupc, lk, nb;
-    int_t  *xsup = Glu_persist->xsup;
-    int_t  *usub;
-    double *lusup, *uval;
-
-    /* Quick return. */
-    lk = LBi( k, grid ); /* Local block number */
-    if ( !Llu->Unzval_br_ptr[lk] ) return;
-
-    /* Initialization. */
-    iam  = grid->iam;
-    pkk  = PNUM( PROW(k, grid), PCOL(k, grid), grid );
-    klst = FstBlockC( k+1 );
-    knsupc = SuperSize( k );
-    usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */
-    uval = Llu->Unzval_br_ptr[lk];
-    nb = usub[0];
-    iukp = BR_HEADER;
-    rukp = 0;
-    if ( iam == pkk ) {
-	lk = LBj( k, grid );
-	nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */
-	lusup = Llu->Lnzval_bc_ptr[lk];
-    } else {
-	nsupr = Llu->Lsub_buf_2[k%2][1]; /* LDA of lusup[] */
-	lusup = Llu->Lval_buf_2[k%2];
-    }
-
-    /* Loop through all the row blocks. */
-    for (b = 0; b < nb; ++b) {
-	gb = usub[iukp];
-	nsupc = SuperSize( gb );
-	iukp += UB_DESCRIPTOR;
-
-	/* Loop through all the segments in the block. */
-	for (j = 0; j < nsupc; ++j) {
-	    segsize = klst - usub[iukp++]; 
-	    if ( segsize ) { /* Nonzero segment. */
-		luptr = (knsupc - segsize) * (nsupr + 1);
-#ifdef _CRAY
-		STRSV(ftcs1, ftcs2, ftcs3, &segsize, &lusup[luptr], &nsupr, 
-		      &uval[rukp], &incx);
-#else
-		dtrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, 
-		       &uval[rukp], &incx);
-#endif
-		stat->ops[FACT] += segsize * (segsize + 1);
-		rukp += segsize;
-	    }
-	}
-    } /* for b ... */
-
-} /* PDGSTRS2 */
-
-static int
-probe_recv(int iam, int source, int tag, MPI_Datatype datatype, MPI_Comm comm,
-	   int buf_size)
-{
-    MPI_Status status;
-    int count; 
-
-    MPI_Probe( source, tag, comm, &status );
-    MPI_Get_count( &status, datatype, &count );
-    if ( count > buf_size ) {
-        printf("(%d) Recv'ed count %d > buffer size $d\n",
-	       iam, count, buf_size);
-	exit(-1);
-    }
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrs1.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrs1.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrs1.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrs1.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,819 +0,0 @@
-
-
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- * Modified:
- *     Feburary 7, 2001    use MPI_Isend/MPI_Irecv
- *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
- */
-
-#include "superlu_ddefs.h"
-
-#define ISEND_IRECV
-
-/*
- * Function prototypes
- */
-#ifdef _CRAY
-fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*,
-		   double*, int*, double*, int*);
-fortran void SGEMM(_fcd, _fcd, int*, int*, int*, double*, double*, 
-		   int*, double*, int*, double*, double*, int*);
-_fcd ftcs1;
-_fcd ftcs2;
-_fcd ftcs3;
-#endif
-
-
-void pdgstrs1(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid,
-	      double *x, int nrhs, SuperLUStat_t *stat, int *info)
-{
-/*
- * Purpose
- * =======
- *
- * PDGSTRS1 solves a system of distributed linear equations
- *
- *                   op( sub(A) ) * X = sub( B )
- *
- * with a general N-by-N distributed matrix sub( A ) using the LU
- * factorization computed by PDGSTRF.
- * 
- * Arguments
- * =========
- *
- * n      (input) int (global)
- *        The order of the system of linear equations.
- *
- * LUstruct (input) LUstruct_t*
- *        The distributed data structures to store L and U factors,
- *        and the permutation vectors.
- *        See superlu_ddefs.h for the definition of 'LUstruct_t' structure.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh.
- *
- * x      (input/output) double*
- *        On entry, the right hand side matrix.
- *        On exit, the solution matrix if info = 0;
- *
- *        NOTE: the right-hand side matrix is already distributed on
- *              the diagonal processes.
- *
- * nrhs   (input) int (global)
- *        Number of right-hand sides.
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics about the triangular solves; 
- *        See SuperLUStat_t structure defined in util.h.
- *
- * info   (output) int*
- * 	   = 0: successful exit
- *	   < 0: if info = -i, the i-th argument had an illegal value
- *        
- */
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-    double alpha = 1.0;
-    double *lsum;  /* Local running sum of the updates to B-components */
-    double *lusup, *dest;
-    double *recvbuf, *tempv;
-    double *rtemp; /* Result of full matrix-vector multiply. */
-    int_t  **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
-    int_t  *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */
-    Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
-    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
-    int_t  iam, kcol, krow, mycol, myrow;
-    int_t  i, ii, il, j, k, lb, ljb, lk, lptr, luptr;
-    int_t  nb, nlb, nub, nsupers;
-    int_t  *xsup, *lsub, *usub;
-    int_t  *ilsum;    /* Starting position of each supernode in lsum (LOCAL)*/
-    int_t  Pc, Pr;
-    int    knsupc, nsupr;
-    int    ldalsum;   /* Number of lsum entries locally owned. */
-    int    maxrecvsz, p, pi;
-    int_t  **Lrowind_bc_ptr;
-    double **Lnzval_bc_ptr;
-    MPI_Status status;
-#ifdef ISEND_IRECV
-    MPI_Request *send_req, recv_req;
-#endif
-
-    /*-- Counts used for L-solve --*/
-    int_t  *fmod;         /* Modification count for L-solve. */
-    int_t  **fsendx_plist = Llu->fsendx_plist;
-    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
-    int_t  *frecv;        /* Count of modifications to be recv'd from
-			     processes in this row. */
-    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
-    int_t  nleaf = 0, nroot = 0;
-
-    /*-- Counts used for U-solve --*/
-    int_t  *bmod;         /* Modification count for L-solve. */
-    int_t  **bsendx_plist = Llu->bsendx_plist;
-    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
-    int_t  *brecv;        /* Count of modifications to be recv'd from
-			     processes in this row. */
-    int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
-    double t;
-#if ( DEBUGlevel>=2 )
-    int_t Ublocks = 0;
-#endif
-
-    t = SuperLU_timer_();
-
-    /* Test input parameters. */
-    *info = 0;
-    if ( n < 0 ) *info = -1;
-    else if ( nrhs < 0 ) *info = -8;
-    if ( *info ) {
-	pxerbla("PDGSTRS1", grid, -*info);
-	return;
-    }
-	
-    /*
-     * Initialization.
-     */
-    iam = grid->iam;
-    Pc = grid->npcol;
-    Pr = grid->nprow;
-    myrow = MYROW( iam, grid );
-    mycol = MYCOL( iam, grid );
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
-    nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
-    Llu->SolveMsgSent = 0;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pdgstrs1()");
-#endif
-
-    /* Save the count to be altered so it can be used by
-       subsequent call to PDGSTRS1. */
-    if ( !(fmod = intMalloc_dist(nlb)) )
-	ABORT("Calloc fails for fmod[].");
-    for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
-    if ( !(frecv = intMalloc_dist(nlb)) )
-	ABORT("Malloc fails for frecv[].");
-    Llu->frecv = frecv;
-
-#ifdef ISEND_IRECV
-    k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb;
-    if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) )
-	ABORT("Malloc fails for send_req[].");
-#endif
-
-#ifdef _CRAY
-    ftcs1 = _cptofcd("L", strlen("L"));
-    ftcs2 = _cptofcd("N", strlen("N"));
-    ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-
-
-    /* Compute ilsum[] and ldalsum for process column 0. */
-    ilsum = Llu->ilsum;
-    ldalsum = Llu->ldalsum;
-
-    /* Allocate working storage. */
-    knsupc = sp_ienv_dist(3);
-    if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum) * nrhs 
-        + nlb * LSUM_H)) )
-	ABORT("Calloc fails for lsum[].");
-    maxrecvsz = knsupc * nrhs + SUPERLU_MAX(XK_H, LSUM_H);
-    if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) )
-	ABORT("Malloc fails for recvbuf[].");
-    if ( !(rtemp = doubleCalloc_dist(maxrecvsz)) )
-	ABORT("Malloc fails for rtemp[].");
-
-    
-    /*---------------------------------------------------
-     * Forward solve Ly = b.
-     *---------------------------------------------------*/
-
-    /*
-     * Prepended the block number in the header for lsum[].
-     */
-    for (k = 0; k < nsupers; ++k) {
-	knsupc = SuperSize( k );
-	krow = PROW( k, grid );
-	if ( myrow == krow ) {
-	    lk = LBi( k, grid );   /* Local block number. */
-	    il = LSUM_BLK( lk );
-	    lsum[il - LSUM_H] = k; 
-	}
-    }
-
-    /*
-     * Compute frecv[] and nfrecvmod counts on the diagonal processes.
-     */
-    {
-	superlu_scope_t *scp = &grid->rscp;
-
-	for (k = 0; k < nsupers; ++k) {
-	    krow = PROW( k, grid );
-	    if ( myrow == krow ) {
-		lk = LBi( k, grid );    /* Local block number. */
-		kcol = PCOL( k, grid ); /* Root process in this row scope. */
-		if ( mycol != kcol && fmod[lk] )
-		    i = 1;  /* Contribution from non-diagonal process. */
-		else i = 0;
-		MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t,
-			   MPI_SUM, kcol, scp->comm );
-		if ( mycol == kcol ) { /* Diagonal process. */
-		    nfrecvmod += frecv[lk];
-		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) frecv[%4d]  %2d\n", iam, k, frecv[lk]);
-		    assert( frecv[lk] < Pc );
-#endif
-		}
-	    }
-	}
-    }
-
-    /* ---------------------------------------------------------
-       Solve the leaf nodes first by all the diagonal processes.
-       --------------------------------------------------------- */
-#if ( DEBUGlevel>=2 )
-    printf("(%2d) nleaf %4d\n", iam, nleaf);
-#endif
-    for (k = 0; k < nsupers && nleaf; ++k) {
-	krow = PROW( k, grid );
-	kcol = PCOL( k, grid );
-	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid );
-	    if ( !frecv[lk] && !fmod[lk] ) {
-		fmod[lk] = -1;  /* Do not solve X[k] in the future. */
-		ii = X_BLK( lk );
-		lk = LBj( k, grid ); /* Local block number, column-wise. */
-		lsub = Lrowind_bc_ptr[lk];
-		lusup = Lnzval_bc_ptr[lk];
-		nsupr = lsub[1];
-#ifdef _CRAY
-		STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
-		      lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#else
-		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		/*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/
-		--nleaf;
-#if ( DEBUGlevel>=2 )
-		printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		
-		/*
-		 * Send Xk to process column Pc[k].
-		 */
-		for (p = 0; p < Pr; ++p)
-		    if ( fsendx_plist[lk][p] != EMPTY ) {
-			pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                   MPI_DOUBLE, pi, Xk, grid->comm,
-                                   &send_req[Llu->SolveMsgSent++]);
-#else
-			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				 MPI_DOUBLE, 
-                                 pi, Xk, grid->comm );
-#endif
-#if ( DEBUGlevel>=2 )
-			printf("(%2d) Sent X[%2.0f] to P %2d\n",
-			       iam, x[ii-XK_H], pi);
-#endif
-		    }
-		
-		/*
-		 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
-		 */
-		nb = lsub[0] - 1;
-		lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
-		luptr = knsupc; /* Skip diagonal block L(k,k). */
-		
-		dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
-			   fmod, nb, lptr, luptr, xsup, grid, Llu,
-			   send_req, stat);
-	    }
-	} /* if diagonal process ... */
-    } /* for k ... */
-
-    /*
-     * Compute the internal nodes asynchronously by all processes.
-     */
-#if ( DEBUGlevel>=2 )
-    printf("(%2d) nfrecvx %4d,  nfrecvmod %4d,  nleaf %4d\n",
-	   iam, nfrecvx, nfrecvmod, nleaf);
-#endif
-
-    while ( nfrecvx || nfrecvmod ) { /* While not finished. */
-
-	/* Receive a message. */
-#ifdef ISEND_IRECV
-	/* -MPI- FATAL: Remote protocol queue full */
-	MPI_Irecv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE,
-		 MPI_ANY_TAG, grid->comm, &recv_req );
-	MPI_Wait( &recv_req, &status );
-#else
-	MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE,
-		 MPI_ANY_TAG, grid->comm, &status );
-#endif
-
-	k = *recvbuf;
-
-#if ( DEBUGlevel>=2 )
-	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
-#endif
-	
-	switch ( status.MPI_TAG ) {
-	  case Xk:
-	      --nfrecvx;
-	      lk = LBj( k, grid ); /* Local block number, column-wise. */
-	      lsub = Lrowind_bc_ptr[lk];
-	      lusup = Lnzval_bc_ptr[lk];
-	      if ( lsub ) {
-		  nb   = lsub[0];
-		  lptr = BC_HEADER;
-		  luptr = 0;
-		  knsupc = SuperSize( k );
-
-		  /*
-		   * Perform local block modifications: lsum[i] -= L_i,k * X[k]
-		   */
-		  dlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k,
-			     fmod, nb, lptr, luptr, xsup, grid, Llu,
-			     send_req, stat);
-	      } /* if lsub */
-
-	      break;
-
-	  case LSUM:
-	      --nfrecvmod;
-	      lk = LBi( k, grid ); /* Local block number, row-wise. */
-	      ii = X_BLK( lk );
-	      knsupc = SuperSize( k );
-	      tempv = &recvbuf[LSUM_H];
-	      RHS_ITERATE(j)
-		  for (i = 0; i < knsupc; ++i)
-		      x[i + ii + j*knsupc] += tempv[i + j*knsupc];
-
-	      if ( (--frecv[lk])==0 && fmod[lk]==0 ) {
-		  fmod[lk] = -1; /* Do not solve X[k] in the future. */
-		  lk = LBj( k, grid ); /* Local block number, column-wise. */
-		  lsub = Lrowind_bc_ptr[lk];
-		  lusup = Lnzval_bc_ptr[lk];
-		  nsupr = lsub[1];
-#ifdef _CRAY
-		  STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
-			lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-			 lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#else
-		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-			 lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		  /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/
-#if ( DEBUGlevel>=2 )
-		  printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		
-		  /*
-		   * Send Xk to process column Pc[k].
-		   */
-		  kcol = PCOL( k, grid );
-		  for (p = 0; p < Pr; ++p)
-		      if ( fsendx_plist[lk][p] != EMPTY ) {
-			  pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-			  MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                     MPI_DOUBLE, pi, Xk, grid->comm,
-				     &send_req[Llu->SolveMsgSent++] );
-#else
-			  MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				   MPI_DOUBLE, pi, Xk, grid->comm );
-#endif
-#if ( DEBUGlevel>=2 )
-			  printf("(%2d) Sent X[%2.0f] to P %2d\n",
-				 iam, x[ii-XK_H], pi);
-#endif
-		      }
-
-		  /*
-		   * Perform local block modifications.
-		   */
-		  nb = lsub[0] - 1;
-		  lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
-		  luptr = knsupc; /* Skip diagonal block L(k,k). */
-
-		  dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
-			     fmod, nb, lptr, luptr, xsup, grid, Llu,
-			     send_req, stat);
-	      } /* if */
-
-	      break;
-
-#if ( DEBUGlevel>=2 )
-	    default:
-	      printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG);
-	      break;
-#endif
-	  } /* switch */
-
-    } /* while not finished ... */
-
-
-#if ( PRNTlevel>=2 )
-    t = SuperLU_timer_() - t;
-    if ( !iam ) printf(".. L-solve time\t%8.2f\n", t);
-    t = SuperLU_timer_();
-#endif
-
-#if ( DEBUGlevel>=2 )
-    if ( !iam ) printf("\n.. After L-solve: y =\n");
-    for (i = 0, k = 0; k < nsupers; ++k) {
-	krow = PROW( k, grid );
-	kcol = PCOL( k, grid );
-	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid );
-	    ii = X_BLK( lk );
-	    for (j = 0; j < knsupc; ++j)
-		printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]);
-	}
-	MPI_Barrier( grid->comm );
-    }
-#endif
-
-    SUPERLU_FREE(fmod);
-    SUPERLU_FREE(frecv);
-    SUPERLU_FREE(rtemp);
-
-#ifdef ISEND_IRECV
-    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
-    Llu->SolveMsgSent = 0;
-#endif
-
-
-    /*---------------------------------------------------
-     * Back solve Ux = y.
-     *
-     * The Y components from the forward solve is already
-     * on the diagonal processes.
-     *---------------------------------------------------*/
-
-    /* Save the count to be altered so it can be used by
-       subsequent call to PDGSTRS1. */
-    if ( !(bmod = intMalloc_dist(nlb)) )
-	ABORT("Calloc fails for bmod[].");
-    for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
-    if ( !(brecv = intMalloc_dist(nlb)) )
-	ABORT("Malloc fails for brecv[].");
-    Llu->brecv = brecv;
-
-    /*
-     * Compute brecv[] and nbrecvmod counts on the diagonal processes.
-     */
-    {
-	superlu_scope_t *scp = &grid->rscp;
-
-	for (k = 0; k < nsupers; ++k) {
-	    krow = PROW( k, grid );
-	    if ( myrow == krow ) {
-		lk = LBi( k, grid );    /* Local block number. */
-		kcol = PCOL( k, grid ); /* Root process in this row scope. */
-		if ( mycol != kcol && bmod[lk] )
-		    i = 1;  /* Contribution from non-diagonal process. */
-		else i = 0;
-		MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t,
-			   MPI_SUM, kcol, scp->comm );
-		if ( mycol == kcol ) { /* Diagonal process. */
-		    nbrecvmod += brecv[lk];
-		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) brecv[%4d]  %2d\n", iam, k, brecv[lk]);
-		    assert( brecv[lk] < Pc );
-#endif
-		}
-	    }
-	}
-    }
-
-    /* Re-initialize lsum to zero. Each block header is already in place. */
-    for (k = 0; k < nsupers; ++k) {
-	krow = PROW( k, grid );
-	if ( myrow == krow ) {
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid );
-	    il = LSUM_BLK( lk );
-	    dest = &lsum[il];
-	    RHS_ITERATE(j)
-		for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = 0.0;
-	}
-    }
-
-    /* Set up additional pointers for the index and value arrays of U.
-       nlb is the number of local block rows. */
-    nub = CEILING( nsupers, Pc ); /* Number of local block columns. */
-    if ( !(Urbs = (int_t *) intCalloc_dist(2*((size_t)nub))) )
-	ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero
-					     blocks in a block column. */
-    Urbs1 = Urbs + nub;
-    if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) )
-        ABORT("Malloc fails for Ucb_indptr[]");
-    if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
-        ABORT("Malloc fails for Ucb_valptr[]");
-
-    /* Count number of row blocks in a block column. 
-       One pass of the skeleton graph of U. */
-    for (lk = 0; lk < nlb; ++lk) {
-	usub = Ufstnz_br_ptr[lk];
-	if ( usub ) { /* Not an empty block row. */
-	    /* usub[0] -- number of column blocks in this block row. */
-#if ( DEBUGlevel>=2 )
-	    Ublocks += usub[0];
-#endif
-	    i = BR_HEADER; /* Pointer in index array. */
-	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
-		k = usub[i];            /* Global block number */
-		++Urbs[LBj(k,grid)];
-		i += UB_DESCRIPTOR + SuperSize( k );
-	    }
-	}
-    }
-
-    /* Set up the vertical linked lists for the row blocks.
-       One pass of the skeleton graph of U. */
-    for (lb = 0; lb < nub; ++lb)
-	if ( Urbs[lb] ) { /* Not an empty block column. */
-	    if ( !(Ucb_indptr[lb]
-		   = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
-		ABORT("Malloc fails for Ucb_indptr[lb][]");
-	    if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
-		ABORT("Malloc fails for Ucb_valptr[lb][]");
-	}
-    for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
-	usub = Ufstnz_br_ptr[lk];
-	if ( usub ) { /* Not an empty block row. */
-	    i = BR_HEADER; /* Pointer in index array. */
-	    j = 0;         /* Pointer in nzval array. */
-	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
-		k = usub[i];          /* Global block number, column-wise. */
-		ljb = LBj( k, grid ); /* Local block number, column-wise. */
-		Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk;
-		Ucb_indptr[ljb][Urbs1[ljb]].indpos = i;
-		Ucb_valptr[ljb][Urbs1[ljb]] = j;
-		++Urbs1[ljb];
-		j += usub[i+1];
-		i += UB_DESCRIPTOR + SuperSize( k );
-	    }
-	}
-    }
-
-#if ( DEBUGlevel>=2 )
-    for (p = 0; p < Pr*Pc; ++p) {
-	if (iam == p) {
-	    printf("(%2d) .. Ublocks %d\n", iam, Ublocks);
-	    for (lb = 0; lb < nub; ++lb) {
-		printf("(%2d) Local col %2d: # row blocks %2d\n",
-		       iam, lb, Urbs[lb]);
-		if ( Urbs[lb] ) {
-		    for (i = 0; i < Urbs[lb]; ++i)
-			printf("(%2d) .. row blk %2d:\
-                               lbnum %d, indpos %d, valpos %d\n",
-			       iam, i, 
-			       Ucb_indptr[lb][i].lbnum,
-			       Ucb_indptr[lb][i].indpos,
-			       Ucb_valptr[lb][i]);
-		}
-	    }
-	}
-	MPI_Barrier( grid->comm );
-    }
-    for (p = 0; p < Pr*Pc; ++p) {
-	if ( iam == p ) {
-	    printf("\n(%d) bsendx_plist[][]", iam);
-	    for (lb = 0; lb < nub; ++lb) {
-		printf("\n(%d) .. local col %2d: ", iam, lb);
-		for (i = 0; i < Pr; ++i)
-		    printf("%4d", bsendx_plist[lb][i]);
-	    }
-	    printf("\n");
-	}
-	MPI_Barrier( grid->comm );
-    }
-#endif /* DEBUGlevel */
-
-
-#if ( PRNTlevel>=2 )
-    t = SuperLU_timer_() - t;
-    if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t);
-    t = SuperLU_timer_();
-#endif
-
-    /*
-     * Solve the roots first by all the diagonal processes.
-     */
-#if ( DEBUGlevel>=2 )
-    printf("(%2d) nroot %4d\n", iam, nroot);
-#endif
-    for (k = nsupers-1; k >= 0 && nroot; --k) {
-	krow = PROW( k, grid );
-	kcol = PCOL( k, grid );
-	if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid ); /* Local block number, row-wise. */
-	    if ( !brecv[lk] && !bmod[lk] ) {
-		bmod[lk] = -1;       /* Do not solve X[k] in the future. */
-		ii = X_BLK( lk );
-		lk = LBj( k, grid ); /* Local block number, column-wise */
-		lsub = Lrowind_bc_ptr[lk];
-		lusup = Lnzval_bc_ptr[lk];
-		nsupr = lsub[1];
-#ifdef _CRAY
-		STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
-		      lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#else
-		dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		/*stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;*/
-		--nroot;
-#if ( DEBUGlevel>=2 )
-		printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		/*
-		 * Send Xk to process column Pc[k].
-		 */
-		for (p = 0; p < Pr; ++p)
-		    if ( bsendx_plist[lk][p] != EMPTY ) {
-			pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                   MPI_DOUBLE, pi, Xk, grid->comm,
-				   &send_req[Llu->SolveMsgSent++] );
-#else
-			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				 MPI_DOUBLE, pi, Xk, grid->comm );
-#endif
-#if ( DEBUGlevel>=2 )
-			printf("(%2d) Sent X[%2.0f] to P %2d\n",
-			       iam, x[ii-XK_H], pi);
-#endif
-		    }
-		
-		/*
-		 * Perform local block modifications: lsum[i] -= U_i,k * X[k]
-		 */
-		if ( Urbs[lk] ) 
-		    dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
-			       Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-			       send_req, stat);
-	    } /* if root ... */
-	} /* if diagonal process ... */
-    } /* for k ... */
-
-
-    /*
-     * Compute the internal nodes asychronously by all processes.
-     */
-    while ( nbrecvx || nbrecvmod ) { /* While not finished. */
-
-	/* Receive a message. */
-	MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, 
-                 MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
-	k = *recvbuf;
-
-#if ( DEBUGlevel>=2 )
-	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
-#endif
-
-	switch ( status.MPI_TAG ) {
-	    case Xk:
-	        --nbrecvx;
-		lk = LBj( k, grid ); /* Local block number, column-wise. */
-		/*
-		 * Perform local block modifications:
-		 *         lsum[i] -= U_i,k * X[k]
-		 */
-		dlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs,
-			   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-			   send_req, stat);
-
-	        break;
-
-	    case LSUM:
-		--nbrecvmod;
-		lk = LBi( k, grid ); /* Local block number, row-wise. */
-		ii = X_BLK( lk );
-		knsupc = SuperSize( k );
-		tempv = &recvbuf[LSUM_H];
-		RHS_ITERATE(j)
-		    for (i = 0; i < knsupc; ++i)
-			x[i + ii + j*knsupc] += tempv[i + j*knsupc];
-
-		if ( !(--brecv[lk]) && !bmod[lk] ) {
-		    bmod[lk] = -1; /* Do not solve X[k] in the future. */
-		    lk = LBj( k, grid ); /* Local block number, column-wise. */
-		    lsub = Lrowind_bc_ptr[lk];
-		    lusup = Lnzval_bc_ptr[lk];
-		    nsupr = lsub[1];
-#ifdef _CRAY
-		    STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
-			  lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		    dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-			   lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#else
-		    dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-			   lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		    /*stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;*/
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		    /*
-		     * Send Xk to process column Pc[k].
-		     */
-		    kcol = PCOL( k, grid );
-		    for (p = 0; p < Pr; ++p)
-			if ( bsendx_plist[lk][p] != EMPTY ) {
-			    pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-			    MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                       MPI_DOUBLE, pi, Xk, grid->comm,
-				       &send_req[Llu->SolveMsgSent++] );
-#else
-			    MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				     MPI_DOUBLE, pi, Xk, grid->comm );
-#endif
-#if ( DEBUGlevel>=2 )
-			    printf("(%2d) Sent X[%2.0f] to P %2d\n",
-				   iam, x[ii - XK_H], pi);
-#endif
-			}
-		
-		    /*
-		     * Perform local block modifications: 
-		     *         lsum[i] -= U_i,k * X[k]
-		     */
-		    if ( Urbs[lk] )
-			dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
-				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-				   send_req, stat);
-		} /* if becomes solvable */
-		
-		break;
-
-#if ( DEBUGlevel>=2 )
-	      default:
-		printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG);
-		break;
-#endif		
-
-	} /* switch */
-
-    } /* while not finished ... */
-
-#if ( PRNTlevel>=2 )
-    t = SuperLU_timer_() - t;
-    if ( !iam ) printf(".. U-solve time\t%8.2f\n", t);
-#endif
-
-    stat->utime[SOLVE] = SuperLU_timer_() - t;
-
-    /* Deallocate storage. */
-
-    SUPERLU_FREE(lsum);
-    SUPERLU_FREE(recvbuf);
-    for (i = 0; i < nub; ++i)
-	if ( Urbs[i] ) {
-	    SUPERLU_FREE(Ucb_indptr[i]);
-	    SUPERLU_FREE(Ucb_valptr[i]);
-	}
-    SUPERLU_FREE(Ucb_indptr);
-    SUPERLU_FREE(Ucb_valptr);
-    SUPERLU_FREE(Urbs);
-    SUPERLU_FREE(bmod);
-    SUPERLU_FREE(brecv);
-#ifdef ISEND_IRECV
-    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
-    SUPERLU_FREE(send_req);
-#endif
-    
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pdgstrs1()");
-#endif
-
-} /* PDGSTRS1 */
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrs_Bglobal_Bsend.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrs_Bglobal_Bsend.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrs_Bglobal_Bsend.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrs_Bglobal_Bsend.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,998 +0,0 @@
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- * Modified:
- *     Feburary 7, 2001    use MPI_Isend/MPI_Irecv
- *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
- */
-
-#include "superlu_ddefs.h"
-
-
-/*#define ISEND_IRECV*/
-
-/* Parry's change
-   Use MPI_Bsend with a large buffer attached in the main program */
-#define BSEND 1
-
-/*
- * Function prototypes
- */
-#ifdef _CRAY
-fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*,
-		   double*, int*, double*, int*);
-fortran void SGEMM(_fcd, _fcd, int*, int*, int*, double*, double*, 
-		   int*, double*, int*, double*, double*, int*);
-_fcd ftcs1;
-_fcd ftcs2;
-_fcd ftcs3;
-#endif
-
-
-void
-pdgstrs_Bglobal(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, double *B,
-		int_t ldb, int nrhs, SuperLUStat_t *stat, int *info)
-{
-/*
- * Purpose
- * =======
- *
- * pdgstrs_Bglobal solves a system of distributed linear equations
- * A*X = B with a general N-by-N matrix A using the LU factorization
- * computed by pdgstrf.
- * 
- * Arguments
- * =========
- *
- * n      (input) int (global)
- *        The order of the system of linear equations.
- *
- * LUstruct (input) LUstruct_t*
- *        The distributed data structures storing L and U factors.
- *        The L and U factors are obtained from pdgstrf for
- *        the possibly scaled and permuted matrix A.
- *        See superlu_ddefs.h for the definition of 'LUstruct_t'.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh. It contains the MPI communicator, the number
- *        of process rows (NPROW), the number of process columns (NPCOL),
- *        and my process rank. It is an input argument to all the
- *        parallel routines.
- *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
- *        See superlu_ddefs.h for the definition of 'gridinfo_t'.
- *
- * B      (input/output) double*
- *        On entry, the right-hand side matrix of the possibly equilibrated
- *        and row permuted system.
- *        On exit, the solution matrix of the possibly equilibrated
- *        and row permuted system if info = 0;
- *
- *        NOTE: Currently, the N-by-NRHS  matrix B must reside on all 
- *              processes when calling this routine.
- *
- * ldb    (input) int (global)
- *        Leading dimension of matrix B.
- *
- * nrhs   (input) int (global)
- *        Number of right-hand sides.
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics about the triangular solves.
- *        See util.h for the definition of 'SuperLUStat_t'.
- *
- * info   (output) int*
- * 	   = 0: successful exit
- *	   < 0: if info = -i, the i-th argument had an illegal value
- *        
- */
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-    double alpha = 1.0;
-    double *lsum;  /* Local running sum of the updates to B-components */
-    double *x;     /* X component at step k. */
-    double *lusup, *dest;
-    double *recvbuf, *tempv;
-    double *rtemp; /* Result of full matrix-vector multiply. */
-    int_t  **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
-    int_t  *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */
-    Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
-    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
-    int_t  iam, kcol, krow, mycol, myrow;
-    int_t  i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr;
-    int_t  nb, nlb, nub, nsupers;
-    int_t  *xsup, *lsub, *usub;
-    int_t  *ilsum;    /* Starting position of each supernode in lsum (LOCAL)*/
-    int_t  Pc, Pr;
-    int    knsupc, nsupr;
-    int    ldalsum;   /* Number of lsum entries locally owned. */
-    int    maxrecvsz, p, pi;
-    int_t  **Lrowind_bc_ptr;
-    double **Lnzval_bc_ptr;
-    MPI_Status status;
-#if defined(ISEND_IRECV) || defined(BSEND)
-    MPI_Request *send_req, recv_req;
-    int test_flag;
-#endif
-
-    /*-- Counts used for L-solve --*/
-    int_t  *fmod;         /* Modification count for L-solve. */
-    int_t  **fsendx_plist = Llu->fsendx_plist;
-    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
-    int_t  *frecv;        /* Count of modifications to be recv'd from
-			     processes in this row. */
-    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
-    int_t  nleaf = 0, nroot = 0;
-
-    /*-- Counts used for U-solve --*/
-    int_t  *bmod;         /* Modification count for L-solve. */
-    int_t  **bsendx_plist = Llu->bsendx_plist;
-    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
-    int_t  *brecv;        /* Count of modifications to be recv'd from
-			     processes in this row. */
-    int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
-    double t;
-#if ( DEBUGlevel>=2 )
-    int_t Ublocks = 0;
-#endif
-    /*-- Function prototypes --*/
-    extern void gather_diag_to_all(int_t, int_t, double [], Glu_persist_t *,
-				   LocalLU_t *, gridinfo_t *, int_t, int_t [],
-				   int_t [], double [], int_t, double []);
-
-    t = SuperLU_timer_();
-
-    /* Test input parameters. */
-    *info = 0;
-    if ( n < 0 ) *info = -1;
-    else if ( nrhs < 0 ) *info = -9;
-    if ( *info ) {
-	pxerbla("PDGSTRS_BGLOBAL", grid, -*info);
-	return;
-    }
-	
-    /*
-     * Initialization.
-     */
-    iam = grid->iam;
-#ifdef BSEND
-    if(!iam) {
-      printf("Using MPI_Bsend in triangular solve\n");
-      fflush(stdout);
-    }
-#endif
-    Pc = grid->npcol;
-    Pr = grid->nprow;
-    myrow = MYROW( iam, grid );
-    mycol = MYCOL( iam, grid );
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
-    nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
-
-    stat->ops[SOLVE] = 0.0;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pdgstrs_Bglobal()");
-#endif
-
-    /* Save the count to be altered so it can be used by
-       subsequent call to PDGSTRS_BGLOBAL. */
-    if ( !(fmod = intMalloc_dist(nlb)) )
-	ABORT("Calloc fails for fmod[].");
-    for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
-    if ( !(frecv = intMalloc_dist(nlb)) )
-	ABORT("Malloc fails for frecv[].");
-    Llu->frecv = frecv;
-
-#if defined(ISEND_IRECV) || defined(BSEND)
-    if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(Pr*sizeof(MPI_Request))) )
-	ABORT("Malloc fails for send_req[].");
-    for (i = 0; i < Pr; ++i) send_req[i] = MPI_REQUEST_NULL;
-#endif
-
-#ifdef _CRAY
-    ftcs1 = _cptofcd("L", strlen("L"));
-    ftcs2 = _cptofcd("N", strlen("N"));
-    ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-
-
-    /* Obtain ilsum[] and ldalsum for process column 0. */
-    ilsum = Llu->ilsum;
-    ldalsum = Llu->ldalsum;
-
-    /* Allocate working storage. */
-    knsupc = sp_ienv_dist(3);
-    maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H );
-    if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum) * nrhs + nlb * LSUM_H)))
-	ABORT("Calloc fails for lsum[].");
-    if ( !(x = doubleMalloc_dist(ldalsum * nrhs + nlb * XK_H)) )
-	ABORT("Malloc fails for x[].");
-    if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) )
-	ABORT("Malloc fails for recvbuf[].");
-    if ( !(rtemp = doubleMalloc_dist(maxrecvsz)) )
-	ABORT("Malloc fails for rtemp[].");
-
-
-    
-    /*---------------------------------------------------
-     * Forward solve Ly = b.
-     *---------------------------------------------------*/
-
-    /*
-     * Copy B into X on the diagonal processes.
-     */
-    ii = 0;
-    for (k = 0; k < nsupers; ++k) {
-	knsupc = SuperSize( k );
-	krow = PROW( k, grid );
-	if ( myrow == krow ) {
-	    lk = LBi( k, grid );   /* Local block number. */
-	    il = LSUM_BLK( lk );
-	    lsum[il - LSUM_H] = k; /* Block number prepended in the header. */
-	    kcol = PCOL( k, grid );
-	    if ( mycol == kcol ) { /* Diagonal process. */
-		jj = X_BLK( lk );
-		x[jj - XK_H] = k;  /* Block number prepended in the header. */
-		RHS_ITERATE(j)
-		    for (i = 0; i < knsupc; ++i) /* X is stored in blocks. */
-			x[i + jj + j*knsupc] = B[i + ii + j*ldb];
-	    }
-	}
-	ii += knsupc;
-    }
-
-    /*
-     * Compute frecv[] and nfrecvmod counts on the diagonal processes.
-     */
-    {
-	superlu_scope_t *scp = &grid->rscp;
-
-	for (k = 0; k < nsupers; ++k) {
-	    krow = PROW( k, grid );
-	    if ( myrow == krow ) {
-		lk = LBi( k, grid );    /* Local block number. */
-		kcol = PCOL( k, grid ); /* Root process in this row scope. */
-		if ( mycol != kcol && fmod[lk] )
-		    i = 1;  /* Contribution from non-diagonal process. */
-		else i = 0;
-		MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t,
-			   MPI_SUM, kcol, scp->comm );
-		if ( mycol == kcol ) { /* Diagonal process. */
-		    nfrecvmod += frecv[lk];
-		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) frecv[%4d]  %2d\n", iam, k, frecv[lk]);
-		    assert( frecv[lk] < Pc );
-#endif
-		}
-	    }
-	}
-    }
-
-    /* ---------------------------------------------------------
-       Solve the leaf nodes first by all the diagonal processes.
-       --------------------------------------------------------- */
-#if ( DEBUGlevel>=1 )
-    printf("(%2d) nleaf %4d\n", iam, nleaf);
-#endif
-    for (k = 0; k < nsupers && nleaf; ++k) {
-	krow = PROW( k, grid );
-	kcol = PCOL( k, grid );
-	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid );
-	    if ( frecv[lk]==0 && fmod[lk]==0 ) {
-		fmod[lk] = -1;  /* Do not solve X[k] in the future. */
-		ii = X_BLK( lk );
-		lk = LBj( k, grid ); /* Local block number, column-wise. */
-		lsub = Lrowind_bc_ptr[lk];
-		lusup = Lnzval_bc_ptr[lk];
-		nsupr = lsub[1];
-#ifdef _CRAY
-		STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
-		      lusup, &nsupr, &x[ii], &knsupc);
-#else
-		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
-		--nleaf;
-#if ( DEBUGlevel>=2 )
-		printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		
-		/*
-		 * Send Xk to process column Pc[k].
-		 */
-		for (p = 0; p < Pr; ++p)
-		    if ( fsendx_plist[lk][p] != EMPTY ) {
-			pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-#if 1
-			MPI_Test( &send_req[p], &test_flag, &status );
-#else
-			if ( send_req[p] != MPI_REQUEST_NULL ) 
-			    MPI_Wait( &send_req[p], &status );
-#endif
-			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				  MPI_DOUBLE, pi, Xk, grid->comm, &send_req[p]);
-#else
-#ifdef BSEND
-			MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				  MPI_DOUBLE, pi, Xk, grid->comm );
-#else
-			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				 MPI_DOUBLE, pi, Xk, grid->comm );
-#endif
-#endif
-#if ( DEBUGlevel>=2 )
-			printf("(%2d) Sent X[%2.0f] to P %2d\n",
-			       iam, x[ii-XK_H], pi);
-#endif
-		    }
-		
-		/*
-		 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
-		 */
-		nb = lsub[0] - 1;
-		lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
-		luptr = knsupc; /* Skip diagonal block L(k,k). */
-		
-		dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
-			   fmod, nb, lptr, luptr, xsup, grid, Llu, 
-			   send_req,stat);
-#ifdef ISEND_IRECV
-		/* Wait for previous Isends to complete. */
-		for (p = 0; p < Pr; ++p) {
-		    if ( fsendx_plist[lk][p] != EMPTY )
-			/*MPI_Wait( &send_req[p], &status );*/
-			MPI_Test( &send_req[p], &test_flag, &status );
-		}
-#endif
-	    }
-	} /* if diagonal process ... */
-    } /* for k ... */
-
-    /* -----------------------------------------------------------
-       Compute the internal nodes asynchronously by all processes.
-       ----------------------------------------------------------- */
-#if ( DEBUGlevel>=1 )
-    printf("(%2d) nfrecvx %4d,  nfrecvmod %4d,  nleaf %4d\n",
-	   iam, nfrecvx, nfrecvmod, nleaf);
-#endif
-
-    while ( nfrecvx || nfrecvmod ) { /* While not finished. */
-
-	/* Receive a message. */
-#ifdef ISEND_IRECV
-	/* -MPI- FATAL: Remote protocol queue full */
-	MPI_Irecv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE,
-		 MPI_ANY_TAG, grid->comm, &recv_req );
-	MPI_Wait( &recv_req, &status );
-#else
-	MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE,
-		 MPI_ANY_TAG, grid->comm, &status );
-#endif
-
-	k = *recvbuf;
-
-#if ( DEBUGlevel>=2 )
-	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
-#endif
-	
-	switch ( status.MPI_TAG ) {
-	  case Xk:
-	      --nfrecvx;
-	      lk = LBj( k, grid ); /* Local block number, column-wise. */
-	      lsub = Lrowind_bc_ptr[lk];
-	      lusup = Lnzval_bc_ptr[lk];
-	      if ( lsub ) {
-		  nb   = lsub[0];
-		  lptr = BC_HEADER;
-		  luptr = 0;
-		  knsupc = SuperSize( k );
-
-		  /*
-		   * Perform local block modifications: lsum[i] -= L_i,k * X[k]
-		   */
-		  dlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k,
-			     fmod, nb, lptr, luptr, xsup, grid, Llu, 
-			     send_req, stat);
-	      } /* if lsub */
-
-	      break;
-
-	  case LSUM:
-	      --nfrecvmod;
-	      lk = LBi( k, grid ); /* Local block number, row-wise. */
-	      ii = X_BLK( lk );
-	      knsupc = SuperSize( k );
-	      tempv = &recvbuf[LSUM_H];
-	      RHS_ITERATE(j)
-		  for (i = 0; i < knsupc; ++i)
-		      x[i + ii + j*knsupc] += tempv[i + j*knsupc];
-
-	      if ( (--frecv[lk])==0 && fmod[lk]==0 ) {
-		  fmod[lk] = -1; /* Do not solve X[k] in the future. */
-		  lk = LBj( k, grid ); /* Local block number, column-wise. */
-		  lsub = Lrowind_bc_ptr[lk];
-		  lusup = Lnzval_bc_ptr[lk];
-		  nsupr = lsub[1];
-#ifdef _CRAY
-		  STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
-			lusup, &nsupr, &x[ii], &knsupc);
-#else
-		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-			 lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		  stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
-#if ( DEBUGlevel>=2 )
-		  printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		
-		  /*
-		   * Send Xk to process column Pc[k].
-		   */
-		  kcol = PCOL( k, grid );
-		  for (p = 0; p < Pr; ++p)
-		      if ( fsendx_plist[lk][p] != EMPTY ) {
-			  pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-#if 1
-			  MPI_Test( &send_req[p], &test_flag, &status );
-#else
-			  if ( send_req[p] != MPI_REQUEST_NULL )
-			    MPI_Wait( &send_req[p], &status );
-#endif
-			  MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H,
-				    MPI_DOUBLE, pi, Xk, grid->comm, 
-				    &send_req[p]);
-#else
-#ifdef BSEND
-			  MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				   MPI_DOUBLE, pi, Xk, grid->comm );
-#else
-			  MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				   MPI_DOUBLE, pi, Xk, grid->comm );
-#endif
-#endif
-#if ( DEBUGlevel>=2 )
-			  printf("(%2d) Sent X[%2.0f] to P %2d\n",
-				 iam, x[ii-XK_H], pi);
-#endif
-		      }
-
-		  /*
-		   * Perform local block modifications.
-		   */
-		  nb = lsub[0] - 1;
-		  lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
-		  luptr = knsupc; /* Skip diagonal block L(k,k). */
-
-		  dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
-			     fmod, nb, lptr, luptr, xsup, grid, Llu,
-			     send_req, stat);
-#ifdef ISEND_IRECV
-		  /* Wait for the previous Isends to complete. */
-		  for (p = 0; p < Pr; ++p) {
-		      if ( fsendx_plist[lk][p] != EMPTY )
-			  MPI_Test( &send_req[p], &test_flag, &status );
-		  }
-#endif
-	      } /* if */
-
-	      break;
-
-#if ( DEBUGlevel>=1 )	      
-	    default:
-	      printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG);
-	      break;
-#endif
-	  } /* switch */
-
-    } /* while not finished ... */
-
-
-#if ( PRNTlevel>=2 )
-    t = SuperLU_timer_() - t;
-    if ( !iam ) printf(".. L-solve time\t%8.2f\n", t);
-    t = SuperLU_timer_();
-#endif
-
-#if ( PRNTlevel==2 )
-    if ( !iam ) printf("\n.. After L-solve: y =\n");
-    for (i = 0, k = 0; k < nsupers; ++k) {
-	krow = PROW( k, grid );
-	kcol = PCOL( k, grid );
-	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid );
-	    ii = X_BLK( lk );
-	    for (j = 0; j < knsupc; ++j)
-		printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]);
-	}
-	MPI_Barrier( grid->comm );
-    }
-#endif
-
-    SUPERLU_FREE(fmod);
-    SUPERLU_FREE(frecv);
-    SUPERLU_FREE(rtemp);
-
-    /* MPI_Barrier( grid->comm );  Drain messages in the forward solve. */
-
-
-    /*---------------------------------------------------
-     * Back solve Ux = y.
-     *
-     * The Y components from the forward solve is already
-     * on the diagonal processes.
-     *---------------------------------------------------*/
-
-    /* Save the count to be altered so it can be used by
-       subsequent call to PDGSTRS_BGLOBAL. */
-    if ( !(bmod = intMalloc_dist(nlb)) )
-	ABORT("Calloc fails for bmod[].");
-    for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
-    if ( !(brecv = intMalloc_dist(nlb)) )
-	ABORT("Malloc fails for brecv[].");
-    Llu->brecv = brecv;
-
-    /*
-     * Compute brecv[] and nbrecvmod counts on the diagonal processes.
-     */
-    {
-	superlu_scope_t *scp = &grid->rscp;
-
-	for (k = 0; k < nsupers; ++k) {
-	    krow = PROW( k, grid );
-	    if ( myrow == krow ) {
-		lk = LBi( k, grid );    /* Local block number. */
-		kcol = PCOL( k, grid ); /* Root process in this row scope. */
-		if ( mycol != kcol && bmod[lk] )
-		    i = 1;  /* Contribution from non-diagonal process. */
-		else i = 0;
-		MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t,
-			   MPI_SUM, kcol, scp->comm );
-		if ( mycol == kcol ) { /* Diagonal process. */
-		    nbrecvmod += brecv[lk];
-		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) brecv[%4d]  %2d\n", iam, k, brecv[lk]);
-		    assert( brecv[lk] < Pc );
-#endif
-		}
-	    }
-	}
-    }
-
-    /* Re-initialize lsum to zero. Each block header is already in place. */
-    for (k = 0; k < nsupers; ++k) {
-	krow = PROW( k, grid );
-	if ( myrow == krow ) {
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid );
-	    il = LSUM_BLK( lk );
-	    dest = &lsum[il];
-	    RHS_ITERATE(j)
-		for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = 0.0;
-	}
-    }
-
-    /* Set up additional pointers for the index and value arrays of U.
-       nlb is the number of local block rows. */
-    nub = CEILING( nsupers, Pc ); /* Number of local block columns. */
-    if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) )
-	ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero
-					     blocks in a block column. */
-    Urbs1 = Urbs + nub;
-    if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) )
-        ABORT("Malloc fails for Ucb_indptr[]");
-    if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
-        ABORT("Malloc fails for Ucb_valptr[]");
-
-    /* Count number of row blocks in a block column. 
-       One pass of the skeleton graph of U. */
-    for (lk = 0; lk < nlb; ++lk) {
-	usub = Ufstnz_br_ptr[lk];
-	if ( usub ) { /* Not an empty block row. */
-	    /* usub[0] -- number of column blocks in this block row. */
-#if ( DEBUGlevel>=2 )
-	    Ublocks += usub[0];
-#endif
-	    i = BR_HEADER; /* Pointer in index array. */
-	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
-		k = usub[i];            /* Global block number */
-		++Urbs[LBj(k,grid)];
-		i += UB_DESCRIPTOR + SuperSize( k );
-	    }
-	}
-    }
-
-    /* Set up the vertical linked lists for the row blocks.
-       One pass of the skeleton graph of U. */
-    for (lb = 0; lb < nub; ++lb)
-	if ( Urbs[lb] ) { /* Not an empty block column. */
-	    if ( !(Ucb_indptr[lb]
-		   = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
-		ABORT("Malloc fails for Ucb_indptr[lb][]");
-	    if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
-		ABORT("Malloc fails for Ucb_valptr[lb][]");
-	}
-    for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
-	usub = Ufstnz_br_ptr[lk];
-	if ( usub ) { /* Not an empty block row. */
-	    i = BR_HEADER; /* Pointer in index array. */
-	    j = 0;         /* Pointer in nzval array. */
-	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
-		k = usub[i];          /* Global block number, column-wise. */
-		ljb = LBj( k, grid ); /* Local block number, column-wise. */
-		Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk;
-		Ucb_indptr[ljb][Urbs1[ljb]].indpos = i;
-		Ucb_valptr[ljb][Urbs1[ljb]] = j;
-		++Urbs1[ljb];
-		j += usub[i+1];
-		i += UB_DESCRIPTOR + SuperSize( k );
-	    }
-	}
-    }
-
-#if ( DEBUGlevel>=2 )
-    for (p = 0; p < Pr*Pc; ++p) {
-	if (iam == p) {
-	    printf("(%2d) .. Ublocks %d\n", iam, Ublocks);
-	    for (lb = 0; lb < nub; ++lb) {
-		printf("(%2d) Local col %2d: # row blocks %2d\n",
-		       iam, lb, Urbs[lb]);
-		if ( Urbs[lb] ) {
-		    for (i = 0; i < Urbs[lb]; ++i)
-			printf("(%2d) .. row blk %2d:\
-                               lbnum %d, indpos %d, valpos %d\n",
-			       iam, i, 
-			       Ucb_indptr[lb][i].lbnum,
-			       Ucb_indptr[lb][i].indpos,
-			       Ucb_valptr[lb][i]);
-		}
-	    }
-	}
-	MPI_Barrier( grid->comm );
-    }
-    for (p = 0; p < Pr*Pc; ++p) {
-	if ( iam == p ) {
-	    printf("\n(%d) bsendx_plist[][]", iam);
-	    for (lb = 0; lb < nub; ++lb) {
-		printf("\n(%d) .. local col %2d: ", iam, lb);
-		for (i = 0; i < Pr; ++i)
-		    printf("%4d", bsendx_plist[lb][i]);
-	    }
-	    printf("\n");
-	}
-	MPI_Barrier( grid->comm );
-    }
-#endif /* DEBUGlevel */
-
-
-#if ( PRNTlevel>=3 )
-    t = SuperLU_timer_() - t;
-    if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t);
-    t = SuperLU_timer_();
-#endif
-
-    /*
-     * Solve the roots first by all the diagonal processes.
-     */
-#if ( DEBUGlevel>=1 )
-    printf("(%2d) nroot %4d\n", iam, nroot);
-#endif
-    for (k = nsupers-1; k >= 0 && nroot; --k) {
-	krow = PROW( k, grid );
-	kcol = PCOL( k, grid );
-	if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid ); /* Local block number, row-wise. */
-	    if ( brecv[lk]==0 && bmod[lk]==0 ) {
-		bmod[lk] = -1;       /* Do not solve X[k] in the future. */
-		ii = X_BLK( lk );
-		lk = LBj( k, grid ); /* Local block number, column-wise */
-		lsub = Lrowind_bc_ptr[lk];
-		lusup = Lnzval_bc_ptr[lk];
-		nsupr = lsub[1];
-#ifdef _CRAY
-		STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
-		      lusup, &nsupr, &x[ii], &knsupc);
-#else
-		dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;
-		--nroot;
-#if ( DEBUGlevel>=2 )
-		printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		/*
-		 * Send Xk to process column Pc[k].
-		 */
-		for (p = 0; p < Pr; ++p)
-		    if ( bsendx_plist[lk][p] != EMPTY ) {
-			pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-#if 1
-			MPI_Test( &send_req[p], &test_flag, &status );
-#else
-			if ( send_req[p] != MPI_REQUEST_NULL )
-			  MPI_Wait( &send_req[p], &status );
-#endif
-			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				  MPI_DOUBLE, pi, Xk, grid->comm, &send_req[p]);
-#else
-#ifdef BSEND
-			MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				 MPI_DOUBLE, pi, Xk, grid->comm );
-#else
-			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				 MPI_DOUBLE, pi, Xk, grid->comm );
-#endif
-#endif
-#if ( DEBUGlevel>=2 )
-			printf("(%2d) Sent X[%2.0f] to P %2d\n",
-			       iam, x[ii-XK_H], pi);
-#endif
-		    }
-		
-		/*
-		 * Perform local block modifications: lsum[i] -= U_i,k * X[k]
-		 */
-		if ( Urbs[lk] ) 
-		    dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
-			       Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-			       send_req, stat);
-#ifdef ISEND_IRECV
-		/* Wait for the previous Isends to complete. */
-		for (p = 0; p < Pr; ++p) {
-		    if ( bsendx_plist[lk][p] != EMPTY )
-			/*MPI_Wait( &send_req[p], &status );*/
-			MPI_Test( &send_req[p], &test_flag, &status );
-		}
-#endif
-	    } /* if root ... */
-	} /* if diagonal process ... */
-    } /* for k ... */
-
-
-    /*
-     * Compute the internal nodes asychronously by all processes.
-     */
-    while ( nbrecvx || nbrecvmod ) { /* While not finished. */
-
-	/* Receive a message. */
-	MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE,
-		 MPI_ANY_TAG, grid->comm, &status );
-	
-	k = *recvbuf;
-
-#if ( DEBUGlevel>=2 )
-	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
-#endif
-
-	switch ( status.MPI_TAG ) {
-	    case Xk:
-	        --nbrecvx;
-		lk = LBj( k, grid ); /* Local block number, column-wise. */
-		/*
-		 * Perform local block modifications:
-		 *         lsum[i] -= U_i,k * X[k]
-		 */
-		dlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs,
-			   Ucb_indptr, Ucb_valptr, xsup, grid, Llu, 
-			   send_req, stat);
-
-	        break;
-
-	    case LSUM:
-		--nbrecvmod;
-		lk = LBi( k, grid ); /* Local block number, row-wise. */
-		ii = X_BLK( lk );
-		knsupc = SuperSize( k );
-		tempv = &recvbuf[LSUM_H];
-		RHS_ITERATE(j)
-		    for (i = 0; i < knsupc; ++i)
-			x[i + ii + j*knsupc] += tempv[i + j*knsupc];
-
-		if ( (--brecv[lk])==0 && bmod[lk]==0 ) {
-		    bmod[lk] = -1; /* Do not solve X[k] in the future. */
-		    lk = LBj( k, grid ); /* Local block number, column-wise. */
-		    lsub = Lrowind_bc_ptr[lk];
-		    lusup = Lnzval_bc_ptr[lk];
-		    nsupr = lsub[1];
-#ifdef _CRAY
-		    STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
-			  lusup, &nsupr, &x[ii], &knsupc);
-#else
-		    dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-			   lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		    stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		    /*
-		     * Send Xk to process column Pc[k].
-		     */
-		    kcol = PCOL( k, grid );
-		    for (p = 0; p < Pr; ++p)
-			if ( bsendx_plist[lk][p] != EMPTY ) {
-			    pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-#if 1
-			    MPI_Test( &send_req[p], &test_flag, &status );
-#else
-			    if ( send_req[p] != MPI_REQUEST_NULL )
-			        MPI_Wait( &send_req[p], &status );
-#endif
-			    MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				      MPI_DOUBLE, pi, Xk, grid->comm,
-				      &send_req[p] );
-#else
-#ifdef BSEND
-			    MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				     MPI_DOUBLE, pi, Xk, grid->comm );
-#else
-			    MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				     MPI_DOUBLE, pi, Xk, grid->comm );
-#endif
-#endif
-#if ( DEBUGlevel>=2 )
-			    printf("(%2d) Sent X[%2.0f] to P %2d\n",
-				   iam, x[ii - XK_H], pi);
-#endif
-			}
-		
-		    /*
-		     * Perform local block modifications: 
-		     *         lsum[i] -= U_i,k * X[k]
-		     */
-		    if ( Urbs[lk] )
-			dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
-				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-				   send_req, stat);
-#ifdef ISEND_IRECV
-		    /* Wait for the previous Isends to complete. */
-		    for (p = 0; p < Pr; ++p) {
-			if ( bsendx_plist[lk][p] != EMPTY )
-			    /*MPI_Wait( &send_req[p], &status );*/
-			    MPI_Test( &send_req[p], &test_flag, &status );
-		    }
-#endif
-		} /* if becomes solvable */
-		
-		break;
-
-#if ( DEBUGlevel>=1 )
-	      default:
-		printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG);
-		break;
-#endif		
-
-	} /* switch */
-
-    } /* while not finished ... */
-
-#if ( PRNTlevel>=3 )
-    t = SuperLU_timer_() - t;
-    if ( !iam ) printf(".. U-solve time\t%8.2f\n", t);
-#endif
-
-
-    /* Copy the solution X into B (on all processes). */
-    {
-	int_t num_diag_procs, *diag_procs, *diag_len;
-	double *work;
-
-	get_diag_procs(n, Glu_persist, grid, &num_diag_procs,
-		       &diag_procs, &diag_len);
-	jj = diag_len[0];
-	for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX(jj, diag_len[j]);
-	if ( !(work = doubleMalloc_dist(jj*nrhs)) )
-	    ABORT("Malloc fails for work[]");
-	gather_diag_to_all(n, nrhs, x, Glu_persist, Llu,
-			   grid, num_diag_procs, diag_procs, diag_len,
-			   B, ldb, work);
-	SUPERLU_FREE(diag_procs);
-	SUPERLU_FREE(diag_len);
-	SUPERLU_FREE(work);
-    }
-
-    /* Deallocate storage. */
-
-    SUPERLU_FREE(lsum);
-    SUPERLU_FREE(x);
-    SUPERLU_FREE(recvbuf);
-    for (i = 0; i < nub; ++i)
-	if ( Urbs[i] ) {
-	    SUPERLU_FREE(Ucb_indptr[i]);
-	    SUPERLU_FREE(Ucb_valptr[i]);
-	}
-    SUPERLU_FREE(Ucb_indptr);
-    SUPERLU_FREE(Ucb_valptr);
-    SUPERLU_FREE(Urbs);
-    SUPERLU_FREE(bmod);
-    SUPERLU_FREE(brecv);
-#ifdef ISEND_IRECV
-    for (p = 0; p < Pr; ++p) {
-        if ( send_req[p] != MPI_REQUEST_NULL )
-	    MPI_Wait( &send_req[p], &status );
-    }
-    SUPERLU_FREE(send_req);
-#endif
-
-    stat->utime[SOLVE] = SuperLU_timer_() - t;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pdgstrs_Bglobal()");
-#endif
-/* Chao debug */
-
-  MPI_Barrier( grid->comm );  /* Drain messages in the forward solve. */
-
-} /* PDGSTRS_BGLOBAL */
-
-
-/*
- * Gather the components of x vector on the diagonal processes
- * onto all processes, and combine them into the global vector y.
- */
-static void
-gather_diag_to_all(int_t n, int_t nrhs, double x[],
-		   Glu_persist_t *Glu_persist, LocalLU_t *Llu,
-		   gridinfo_t *grid, int_t num_diag_procs,
-		   int_t diag_procs[], int_t diag_len[],
-		   double y[], int_t ldy, double work[])
-{
-    int_t i, ii, j, k, lk, lwork, nsupers, p;
-    int_t *ilsum, *xsup;
-    int iam, knsupc, pkk;
-    double *x_col, *y_col;
-    
-    iam = grid->iam;
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    ilsum = Llu->ilsum;
-
-    for (p = 0; p < num_diag_procs; ++p) {
-	pkk = diag_procs[p];
-	if ( iam == pkk ) {
-	    /* Copy x vector into a buffer. */
-	    lwork = 0;
-	    for (k = p; k < nsupers; k += num_diag_procs) {
-		knsupc = SuperSize( k );
-		lk = LBi( k, grid );
-		ii = X_BLK( lk ); /*ilsum[lk] + (lk+1)*XK_H;*/
-		x_col = &x[ii];
-		for (j = 0; j < nrhs; ++j) {
-		    for (i = 0; i < knsupc; ++i) work[i+lwork] = x_col[i];
-		    lwork += knsupc;
-		    x_col += knsupc;
-		}
-	    }
-	    MPI_Bcast( work, lwork, MPI_DOUBLE, pkk, grid->comm );
-	} else {
-	    MPI_Bcast( work, diag_len[p]*nrhs, MPI_DOUBLE, pkk, grid->comm );
-	}
-	/* Scatter work[] into global y vector. */
-	lwork = 0;
-	for (k = p; k < nsupers; k += num_diag_procs) {
-	    knsupc = SuperSize( k );
-	    ii = FstBlockC( k );
-	    y_col = &y[ii];
-	    for (j = 0; j < nrhs; ++j) {
-		for (i = 0; i < knsupc; ++i) y_col[i] = work[i+lwork];
-		lwork += knsupc;
-		y_col += ldy;
-	    }
-	}
-    }
-} /* GATHER_DIAG_TO_ALL */
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrs_Bglobal.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrs_Bglobal.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrs_Bglobal.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrs_Bglobal.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,955 +0,0 @@
-
-
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- * Modified:
- *     Feburary 7, 2001    use MPI_Isend/MPI_Irecv
- *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
- */
-
-#include "superlu_ddefs.h"
-
-#define ISEND_IRECV
-
-/*
- * Function prototypes
- */
-#ifdef _CRAY
-fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*,
-		   double*, int*, double*, int*);
-fortran void SGEMM(_fcd, _fcd, int*, int*, int*, double*, double*, 
-		   int*, double*, int*, double*, double*, int*);
-_fcd ftcs1;
-_fcd ftcs2;
-_fcd ftcs3;
-#endif
-static void gather_diag_to_all(int_t, int_t, double [], Glu_persist_t *,
-                               LocalLU_t *, gridinfo_t *, int_t, int_t [],
-                               int_t [], double [], int_t, double []);
-
-
-void
-pdgstrs_Bglobal(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, 
-                double *B, int_t ldb, int nrhs, 
-                SuperLUStat_t *stat, int *info)
-{
-/*
- * Purpose
- * =======
- *
- * pdgstrs_Bglobal solves a system of distributed linear equations
- * A*X = B with a general N-by-N matrix A using the LU factorization
- * computed by pdgstrf.
- * 
- * Arguments
- * =========
- *
- * n      (input) int (global)
- *        The order of the system of linear equations.
- *
- * LUstruct (input) LUstruct_t*
- *        The distributed data structures storing L and U factors.
- *        The L and U factors are obtained from pdgstrf for
- *        the possibly scaled and permuted matrix A.
- *        See superlu_ddefs.h for the definition of 'LUstruct_t'.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh. It contains the MPI communicator, the number
- *        of process rows (NPROW), the number of process columns (NPCOL),
- *        and my process rank. It is an input argument to all the
- *        parallel routines.
- *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
- *        See superlu_ddefs.h for the definition of 'gridinfo_t'.
- *
- * B      (input/output) double*
- *        On entry, the right-hand side matrix of the possibly equilibrated
- *        and row permuted system.
- *        On exit, the solution matrix of the possibly equilibrated
- *        and row permuted system if info = 0;
- *
- *        NOTE: Currently, the N-by-NRHS  matrix B must reside on all 
- *              processes when calling this routine.
- *
- * ldb    (input) int (global)
- *        Leading dimension of matrix B.
- *
- * nrhs   (input) int (global)
- *        Number of right-hand sides.
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics about the triangular solves.
- *        See util.h for the definition of 'SuperLUStat_t'.
- *
- * info   (output) int*
- * 	   = 0: successful exit
- *	   < 0: if info = -i, the i-th argument had an illegal value
- *        
- */
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-    double alpha = 1.0;
-    double *lsum;  /* Local running sum of the updates to B-components */
-    double *x;     /* X component at step k. */
-    double *lusup, *dest;
-    double *recvbuf, *tempv;
-    double *rtemp; /* Result of full matrix-vector multiply. */
-    int_t  **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
-    int_t  *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */
-    Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
-    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
-    int_t  iam, kcol, krow, mycol, myrow;
-    int_t  i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr;
-    int_t  nb, nlb, nub, nsupers;
-    int_t  *xsup, *lsub, *usub;
-    int_t  *ilsum;    /* Starting position of each supernode in lsum (LOCAL)*/
-    int_t  Pc, Pr;
-    int    knsupc, nsupr;
-    int    ldalsum;   /* Number of lsum entries locally owned. */
-    int    maxrecvsz, p, pi;
-    int_t  **Lrowind_bc_ptr;
-    double **Lnzval_bc_ptr;
-    MPI_Status status;
-#if defined (ISEND_IRECV) || defined (BSEND)
-    MPI_Request *send_req, recv_req;
-#endif
-
-    /*-- Counts used for L-solve --*/
-    int_t  *fmod;         /* Modification count for L-solve. */
-    int_t  **fsendx_plist = Llu->fsendx_plist;
-    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
-    int_t  *frecv;        /* Count of modifications to be recv'd from
-			     processes in this row. */
-    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
-    int_t  nleaf = 0, nroot = 0;
-
-    /*-- Counts used for U-solve --*/
-    int_t  *bmod;         /* Modification count for L-solve. */
-    int_t  **bsendx_plist = Llu->bsendx_plist;
-    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
-    int_t  *brecv;        /* Count of modifications to be recv'd from
-			     processes in this row. */
-    int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
-    double t;
-#if ( DEBUGlevel>=2 )
-    int_t Ublocks = 0;
-#endif
-
-    t = SuperLU_timer_();
-
-    /* Test input parameters. */
-    *info = 0;
-    if ( n < 0 ) *info = -1;
-    else if ( nrhs < 0 ) *info = -9;
-    if ( *info ) {
-	pxerbla("PDGSTRS_BGLOBAL", grid, -*info);
-	return;
-    }
-	
-    /*
-     * Initialization.
-     */
-    iam = grid->iam;
-    Pc = grid->npcol;
-    Pr = grid->nprow;
-    myrow = MYROW( iam, grid );
-    mycol = MYCOL( iam, grid );
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
-    nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
-    stat->ops[SOLVE] = 0.0;
-    Llu->SolveMsgSent = 0;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pdgstrs_Bglobal()");
-#endif
-
-    /* Save the count to be altered so it can be used by
-       subsequent call to PDGSTRS_BGLOBAL. */
-    if ( !(fmod = intMalloc_dist(nlb)) )
-	ABORT("Calloc fails for fmod[].");
-    for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
-    if ( !(frecv = intMalloc_dist(nlb)) )
-	ABORT("Malloc fails for frecv[].");
-    Llu->frecv = frecv;
-
-#if defined (ISEND_IRECV) || defined (BSEND)
-    k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb;
-    if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) )
-	ABORT("Malloc fails for send_req[].");
-#endif
-
-#ifdef _CRAY
-    ftcs1 = _cptofcd("L", strlen("L"));
-    ftcs2 = _cptofcd("N", strlen("N"));
-    ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-
-
-    /* Obtain ilsum[] and ldalsum for process column 0. */
-    ilsum = Llu->ilsum;
-    ldalsum = Llu->ldalsum;
-
-    /* Allocate working storage. */
-    knsupc = sp_ienv_dist(3);
-    maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H );
-    if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum) * nrhs 
-        + nlb * LSUM_H)) )
-	ABORT("Calloc fails for lsum[].");
-    if ( !(x = doubleMalloc_dist(((size_t)ldalsum) * nrhs 
-        + nlb * XK_H)) )
-	ABORT("Malloc fails for x[].");
-    if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) )
-	ABORT("Malloc fails for recvbuf[].");
-    if ( !(rtemp = doubleCalloc_dist(maxrecvsz)) )
-	ABORT("Malloc fails for rtemp[].");
-
-    
-    /*---------------------------------------------------
-     * Forward solve Ly = b.
-     *---------------------------------------------------*/
-
-    /*
-     * Copy B into X on the diagonal processes.
-     */
-    ii = 0;
-    for (k = 0; k < nsupers; ++k) {
-	knsupc = SuperSize( k );
-	krow = PROW( k, grid );
-	if ( myrow == krow ) {
-	    lk = LBi( k, grid );   /* Local block number. */
-	    il = LSUM_BLK( lk );
-	    lsum[il - LSUM_H] = k; /* Block number prepended in the header. */
-	    kcol = PCOL( k, grid );
-	    if ( mycol == kcol ) { /* Diagonal process. */
-		jj = X_BLK( lk );
-		x[jj - XK_H] = k;  /* Block number prepended in the header. */
-		RHS_ITERATE(j)
-		    for (i = 0; i < knsupc; ++i) /* X is stored in blocks. */
-			x[i + jj + j*knsupc] = B[i + ii + j*ldb];
-	    }
-	}
-	ii += knsupc;
-    }
-
-    /*
-     * Compute frecv[] and nfrecvmod counts on the diagonal processes.
-     */
-    {
-	superlu_scope_t *scp = &grid->rscp;
-
-	for (k = 0; k < nsupers; ++k) {
-	    krow = PROW( k, grid );
-	    if ( myrow == krow ) {
-		lk = LBi( k, grid );    /* Local block number. */
-		kcol = PCOL( k, grid ); /* Root process in this row scope. */
-		if ( mycol != kcol && fmod[lk] )
-		    i = 1;  /* Contribution from non-diagonal process. */
-		else i = 0;
-		MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t,
-			   MPI_SUM, kcol, scp->comm );
-		if ( mycol == kcol ) { /* Diagonal process. */
-		    nfrecvmod += frecv[lk];
-		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) frecv[%4d]  %2d\n", iam, k, frecv[lk]);
-		    assert( frecv[lk] < Pc );
-#endif
-		}
-	    }
-	}
-    }
-
-    /* ---------------------------------------------------------
-       Solve the leaf nodes first by all the diagonal processes.
-       --------------------------------------------------------- */
-#if ( DEBUGlevel>=2 )
-    printf("(%2d) nleaf %4d\n", iam, nleaf);
-#endif
-    for (k = 0; k < nsupers && nleaf; ++k) {
-	krow = PROW( k, grid );
-	kcol = PCOL( k, grid );
-	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid );
-	    if ( frecv[lk]==0 && fmod[lk]==0 ) {
-		fmod[lk] = -1;  /* Do not solve X[k] in the future. */
-		ii = X_BLK( lk );
-		lk = LBj( k, grid ); /* Local block number, column-wise. */
-		lsub = Lrowind_bc_ptr[lk];
-		lusup = Lnzval_bc_ptr[lk];
-		nsupr = lsub[1];
-#ifdef _CRAY
-		STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
-		      lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#else
-		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
-		--nleaf;
-#if ( DEBUGlevel>=2 )
-		printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		
-		/*
-		 * Send Xk to process column Pc[k].
-		 */
-		for (p = 0; p < Pr; ++p) {
-		    if ( fsendx_plist[lk][p] != EMPTY ) {
-			pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				   MPI_DOUBLE, pi, Xk, grid->comm,
-                                   &send_req[Llu->SolveMsgSent++]);
-#else
-#ifdef BSEND
-			MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                   MPI_DOUBLE, pi, Xk, grid->comm );
-#else
-
-			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				  MPI_DOUBLE, 
-                                  pi, Xk, grid->comm );
-#endif
-#endif
-#if ( DEBUGlevel>=2 )
-			printf("(%2d) Sent X[%2.0f] to P %2d\n",
-			       iam, x[ii-XK_H], pi);
-#endif
-		    }
-		}
-		/*
-		 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
-		 */
-		nb = lsub[0] - 1;
-		lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
-		luptr = knsupc; /* Skip diagonal block L(k,k). */
-		
-		dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
-			   fmod, nb, lptr, luptr, xsup, grid, Llu, 
-			   send_req,stat);
-	    }
-	} /* if diagonal process ... */
-    } /* for k ... */
-
-    /* -----------------------------------------------------------
-       Compute the internal nodes asynchronously by all processes.
-       ----------------------------------------------------------- */
-#if ( DEBUGlevel>=2 )
-    printf("(%2d) nfrecvx %4d,  nfrecvmod %4d,  nleaf %4d\n",
-	   iam, nfrecvx, nfrecvmod, nleaf);
-#endif
-
-    while ( nfrecvx || nfrecvmod ) { /* While not finished. */
-
-	/* Receive a message. */
-#ifdef ISEND_IRECV
-	/* -MPI- FATAL: Remote protocol queue full */
-	MPI_Irecv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE,
-		 MPI_ANY_TAG, grid->comm, &recv_req );
-	MPI_Wait( &recv_req, &status );
-#else
-	MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE,
-		 MPI_ANY_TAG, grid->comm, &status );
-#endif
-
-	k = *recvbuf;
-
-
-
-#if ( DEBUGlevel>=2 )
-	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
-#endif
-	
-	switch ( status.MPI_TAG ) {
-	  case Xk:
-	      --nfrecvx;
-	      lk = LBj( k, grid ); /* Local block number, column-wise. */
-	      lsub = Lrowind_bc_ptr[lk];
-	      lusup = Lnzval_bc_ptr[lk];
-	      if ( lsub ) {
-		  nb   = lsub[0];
-		  lptr = BC_HEADER;
-		  luptr = 0;
-		  knsupc = SuperSize( k );
-
-		  /*
-		   * Perform local block modifications: lsum[i] -= L_i,k * X[k]
-		   */
-		  dlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k,
-			     fmod, nb, lptr, luptr, xsup, grid, Llu, 
-			     send_req, stat);
-	      } /* if lsub */
-
-	      break;
-
-	  case LSUM: /* Receiver must be a diagonal process */
-	      --nfrecvmod;
-	      lk = LBi( k, grid ); /* Local block number, row-wise. */
-	      ii = X_BLK( lk );
-	      knsupc = SuperSize( k );
-	      tempv = &recvbuf[LSUM_H];
-	      RHS_ITERATE(j)
-		  for (i = 0; i < knsupc; ++i)
-		      x[i + ii + j*knsupc] += tempv[i + j*knsupc];
-
-	      if ( (--frecv[lk])==0 && fmod[lk]==0 ) {
-		  fmod[lk] = -1; /* Do not solve X[k] in the future. */
-		  lk = LBj( k, grid ); /* Local block number, column-wise. */
-		  lsub = Lrowind_bc_ptr[lk];
-		  lusup = Lnzval_bc_ptr[lk];
-		  nsupr = lsub[1];
-#ifdef _CRAY
-		  STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
-			lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-			 lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#else
-		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-			 lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		  stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
-
-#if ( DEBUGlevel>=2 )
-		  printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		
-		  /*
-		   * Send Xk to process column Pc[k].
-		   */
-		  kcol = PCOL( k, grid );
-		  for (p = 0; p < Pr; ++p) {
-		      if ( fsendx_plist[lk][p] != EMPTY ) {
-			  pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-			  MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				     MPI_DOUBLE, pi, Xk, grid->comm, 
-				     &send_req[Llu->SolveMsgSent++]);
-#else
-#ifdef BSEND
-			  MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                     MPI_DOUBLE, pi, Xk, grid->comm );
-#else
-			  MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				   MPI_DOUBLE, pi, Xk, grid->comm );
-#endif
-#endif
-#if ( DEBUGlevel>=2 )
-			  printf("(%2d) Sent X[%2.0f] to P %2d\n",
-				 iam, x[ii-XK_H], pi);
-#endif
-		      }
-                  }
-		  /*
-		   * Perform local block modifications.
-		   */
-		  nb = lsub[0] - 1;
-		  lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
-		  luptr = knsupc; /* Skip diagonal block L(k,k). */
-
-		  dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
-			     fmod, nb, lptr, luptr, xsup, grid, Llu,
-			     send_req, stat);
-	      } /* if */
-
-	      break;
-
-#if ( DEBUGlevel>=2 )	      
-	    default:
-	      printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG);
-	      break;
-#endif
-	  } /* switch */
-
-    } /* while not finished ... */
-
-
-#if ( PRNTlevel>=2 )
-    t = SuperLU_timer_() - t;
-    if ( !iam ) printf(".. L-solve time\t%8.2f\n", t);
-    t = SuperLU_timer_();
-#endif
-
-#if ( DEBUGlevel>=2 )
-    printf("\n(%d) .. After L-solve: y =\n", iam);
-    for (i = 0, k = 0; k < nsupers; ++k) {
-	krow = PROW( k, grid );
-	kcol = PCOL( k, grid );
-	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid );
-	    ii = X_BLK( lk );
-	    for (j = 0; j < knsupc; ++j)
-		printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]);
-	}
-	MPI_Barrier( grid->comm );
-    }
-#endif
-
-    SUPERLU_FREE(fmod);
-    SUPERLU_FREE(frecv);
-    SUPERLU_FREE(rtemp);
-
-#ifdef ISEND_IRECV
-    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
-    Llu->SolveMsgSent = 0;
-#endif
-
-
-    /*---------------------------------------------------
-     * Back solve Ux = y.
-     *
-     * The Y components from the forward solve is already
-     * on the diagonal processes.
-     *---------------------------------------------------*/
-
-    /* Save the count to be altered so it can be used by
-       subsequent call to PDGSTRS_BGLOBAL. */
-    if ( !(bmod = intMalloc_dist(nlb)) )
-	ABORT("Calloc fails for bmod[].");
-    for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
-    if ( !(brecv = intMalloc_dist(nlb)) )
-	ABORT("Malloc fails for brecv[].");
-    Llu->brecv = brecv;
-
-    /*
-     * Compute brecv[] and nbrecvmod counts on the diagonal processes.
-     */
-    {
-	superlu_scope_t *scp = &grid->rscp;
-
-	for (k = 0; k < nsupers; ++k) {
-	    krow = PROW( k, grid );
-	    if ( myrow == krow ) {
-		lk = LBi( k, grid );    /* Local block number. */
-		kcol = PCOL( k, grid ); /* Root process in this row scope. */
-		if ( mycol != kcol && bmod[lk] )
-		    i = 1;  /* Contribution from non-diagonal process. */
-		else i = 0;
-		MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t,
-			   MPI_SUM, kcol, scp->comm );
-		if ( mycol == kcol ) { /* Diagonal process. */
-		    nbrecvmod += brecv[lk];
-		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) brecv[%4d]  %2d\n", iam, k, brecv[lk]);
-		    assert( brecv[lk] < Pc );
-#endif
-		}
-	    }
-	}
-    }
-
-    /* Re-initialize lsum to zero. Each block header is already in place. */
-    for (k = 0; k < nsupers; ++k) {
-	krow = PROW( k, grid );
-	if ( myrow == krow ) {
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid );
-	    il = LSUM_BLK( lk );
-	    dest = &lsum[il];
-	    RHS_ITERATE(j)
-		for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = 0.0;
-	}
-    }
-
-    /* Set up additional pointers for the index and value arrays of U.
-       nub is the number of local block columns. */
-    nub = CEILING( nsupers, Pc ); /* Number of local block columns. */
-    if ( !(Urbs = (int_t *) intCalloc_dist(2*((size_t)nub))) )
-	ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero
-					     blocks in a block column. */
-    Urbs1 = Urbs + nub;
-    if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) )
-        ABORT("Malloc fails for Ucb_indptr[]");
-    if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
-        ABORT("Malloc fails for Ucb_valptr[]");
-
-    /* Count number of row blocks in a block column. 
-       One pass of the skeleton graph of U. */
-    for (lk = 0; lk < nlb; ++lk) {
-	usub = Ufstnz_br_ptr[lk];
-	if ( usub ) { /* Not an empty block row. */
-	    /* usub[0] -- number of column blocks in this block row. */
-#if ( DEBUGlevel>=2 )
-	    Ublocks += usub[0];
-#endif
-	    i = BR_HEADER; /* Pointer in index array. */
-	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
-		k = usub[i];            /* Global block number */
-		++Urbs[LBj(k,grid)];
-		i += UB_DESCRIPTOR + SuperSize( k );
-	    }
-	}
-    }
-
-    /* Set up the vertical linked lists for the row blocks.
-       One pass of the skeleton graph of U. */
-    for (lb = 0; lb < nub; ++lb) {
-	if ( Urbs[lb] ) { /* Not an empty block column. */
-	    if ( !(Ucb_indptr[lb]
-		   = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
-		ABORT("Malloc fails for Ucb_indptr[lb][]");
-	    if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
-		ABORT("Malloc fails for Ucb_valptr[lb][]");
-	}
-    }
-    for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
-	usub = Ufstnz_br_ptr[lk];
-	if ( usub ) { /* Not an empty block row. */
-	    i = BR_HEADER; /* Pointer in index array. */
-	    j = 0;         /* Pointer in nzval array. */
-	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
-		k = usub[i];          /* Global block number, column-wise. */
-		ljb = LBj( k, grid ); /* Local block number, column-wise. */
-		Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk;
-		Ucb_indptr[ljb][Urbs1[ljb]].indpos = i;
-		Ucb_valptr[ljb][Urbs1[ljb]] = j;
-		++Urbs1[ljb];
-		j += usub[i+1];
-		i += UB_DESCRIPTOR + SuperSize( k );
-	    }
-	}
-    }
-
-#if ( DEBUGlevel>=2 )
-    for (p = 0; p < Pr*Pc; ++p) {
-	if (iam == p) {
-	    printf("(%2d) .. Ublocks %d\n", iam, Ublocks);
-	    for (lb = 0; lb < nub; ++lb) {
-		printf("(%2d) Local col %2d: # row blocks %2d\n",
-		       iam, lb, Urbs[lb]);
-		if ( Urbs[lb] ) {
-		    for (i = 0; i < Urbs[lb]; ++i)
-			printf("(%2d) .. row blk %2d:\
-                               lbnum %d, indpos %d, valpos %d\n",
-			       iam, i, 
-			       Ucb_indptr[lb][i].lbnum,
-			       Ucb_indptr[lb][i].indpos,
-			       Ucb_valptr[lb][i]);
-		}
-	    }
-	}
-	MPI_Barrier( grid->comm );
-    }
-    for (p = 0; p < Pr*Pc; ++p) {
-	if ( iam == p ) {
-	    printf("\n(%d) bsendx_plist[][]", iam);
-	    for (lb = 0; lb < nub; ++lb) {
-		printf("\n(%d) .. local col %2d: ", iam, lb);
-		for (i = 0; i < Pr; ++i)
-		    printf("%4d", bsendx_plist[lb][i]);
-	    }
-	    printf("\n");
-	}
-	MPI_Barrier( grid->comm );
-    }
-#endif /* DEBUGlevel */
-
-
-#if ( PRNTlevel>=2 )
-    t = SuperLU_timer_() - t;
-    if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t);
-    t = SuperLU_timer_();
-#endif
-
-    /*
-     * Solve the roots first by all the diagonal processes.
-     */
-#if ( DEBUGlevel>=2 )
-    printf("(%2d) nroot %4d\n", iam, nroot);
-#endif
-    for (k = nsupers-1; k >= 0 && nroot; --k) {
-	krow = PROW( k, grid );
-	kcol = PCOL( k, grid );
-	if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid ); /* Local block number, row-wise. */
-	    if ( brecv[lk]==0 && bmod[lk]==0 ) {
-		bmod[lk] = -1;       /* Do not solve X[k] in the future. */
-		ii = X_BLK( lk );
-		lk = LBj( k, grid ); /* Local block number, column-wise */
-		lsub = Lrowind_bc_ptr[lk];
-		lusup = Lnzval_bc_ptr[lk];
-		nsupr = lsub[1];
-#ifdef _CRAY
-		STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
-		      lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#else
-		dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;
-		--nroot;
-#if ( DEBUGlevel>=2 )
-		printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		/*
-		 * Send Xk to process column Pc[k].
-		 */
-		for (p = 0; p < Pr; ++p) {
-		    if ( bsendx_plist[lk][p] != EMPTY ) {
-			pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                   MPI_DOUBLE, pi, Xk, grid->comm,
-                                   &send_req[Llu->SolveMsgSent++]);
-#else
-#ifdef BSEND
-			MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                   MPI_DOUBLE, pi, Xk, grid->comm );
-#else
-			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                  MPI_DOUBLE, pi, Xk, grid->comm );
-#endif
-#endif
-#if ( DEBUGlevel>=2 )
-			printf("(%2d) Sent X[%2.0f] to P %2d\n",
-			       iam, x[ii-XK_H], pi);
-#endif
-		    }
-		}
-		/*
-		 * Perform local block modifications: lsum[i] -= U_i,k * X[k]
-		 */
-		if ( Urbs[lk] ) 
-		    dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
-			       Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-			       send_req, stat);
-	    } /* if root ... */
-	} /* if diagonal process ... */
-    } /* for k ... */
-
-
-    /*
-     * Compute the internal nodes asychronously by all processes.
-     */
-    while ( nbrecvx || nbrecvmod ) { /* While not finished. */
-
-	/* Receive a message. */
-	MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE,
-		 MPI_ANY_TAG, grid->comm, &status );
-	
-	k = *recvbuf;
-
-#if ( DEBUGlevel>=2 )
-	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
-#endif
-
-	switch ( status.MPI_TAG ) {
-	    case Xk:
-	        --nbrecvx;
-		lk = LBj( k, grid ); /* Local block number, column-wise. */
-		/*
-		 * Perform local block modifications:
-		 *         lsum[i] -= U_i,k * X[k]
-		 */
-		dlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs,
-			   Ucb_indptr, Ucb_valptr, xsup, grid, Llu, 
-			   send_req, stat);
-
-	        break;
-
-	    case LSUM: /* Receiver must be a diagonal process */
-		--nbrecvmod;
-		lk = LBi( k, grid ); /* Local block number, row-wise. */
-		ii = X_BLK( lk );
-		knsupc = SuperSize( k );
-		tempv = &recvbuf[LSUM_H];
-		RHS_ITERATE(j)
-		    for (i = 0; i < knsupc; ++i)
-			x[i + ii + j*knsupc] += tempv[i + j*knsupc];
-
-		if ( (--brecv[lk])==0 && bmod[lk]==0 ) {
-		    bmod[lk] = -1; /* Do not solve X[k] in the future. */
-		    lk = LBj( k, grid ); /* Local block number, column-wise. */
-		    lsub = Lrowind_bc_ptr[lk];
-		    lusup = Lnzval_bc_ptr[lk];
-		    nsupr = lsub[1];
-#ifdef _CRAY
-		    STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
-			  lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		    dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-			   lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#else
-		    dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-			   lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		    stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		    /*
-		     * Send Xk to process column Pc[k].
-		     */
-		    kcol = PCOL( k, grid );
-		    for (p = 0; p < Pr; ++p) {
-			if ( bsendx_plist[lk][p] != EMPTY ) {
-			    pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-			    MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                       MPI_DOUBLE, pi, Xk, grid->comm,
-				       &send_req[Llu->SolveMsgSent++] );
-#else
-#ifdef BSEND
-			    MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                       MPI_DOUBLE, pi, Xk, grid->comm );
-#else
-			    MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                      MPI_DOUBLE, pi, Xk, grid->comm );
-#endif
-#endif
-#if ( DEBUGlevel>=2 )
-			    printf("(%2d) Sent X[%2.0f] to P %2d\n",
-				   iam, x[ii - XK_H], pi);
-#endif
-			}
-		    }
-		    /*
-		     * Perform local block modifications: 
-		     *         lsum[i] -= U_i,k * X[k]
-		     */
-		    if ( Urbs[lk] )
-			dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
-				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-				   send_req, stat);
-		} /* if becomes solvable */
-		
-		break;
-
-#if ( DEBUGlevel>=2 )
-	      default:
-		printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG);
-		break;
-#endif		
-
-	} /* switch */
-
-    } /* while not finished ... */
-
-#if ( PRNTlevel>=2 )
-    t = SuperLU_timer_() - t;
-    if ( !iam ) printf(".. U-solve time\t%8.2f\n", t);
-#endif
-
-
-    /* Copy the solution X into B (on all processes). */
-    {
-	int_t num_diag_procs, *diag_procs, *diag_len;
-	double *work;
-
-	get_diag_procs(n, Glu_persist, grid, &num_diag_procs,
-		       &diag_procs, &diag_len);
-	jj = diag_len[0];
-	for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX(jj, diag_len[j]);
-	if ( !(work = doubleMalloc_dist(((size_t)jj)*nrhs)) )
-	    ABORT("Malloc fails for work[]");
-	gather_diag_to_all(n, nrhs, x, Glu_persist, Llu,
-			   grid, num_diag_procs, diag_procs, diag_len,
-			   B, ldb, work);
-	SUPERLU_FREE(diag_procs);
-	SUPERLU_FREE(diag_len);
-	SUPERLU_FREE(work);
-    }
-
-    /* Deallocate storage. */
-
-    SUPERLU_FREE(lsum);
-    SUPERLU_FREE(x);
-    SUPERLU_FREE(recvbuf);
-    for (i = 0; i < nub; ++i)
-	if ( Urbs[i] ) {
-	    SUPERLU_FREE(Ucb_indptr[i]);
-	    SUPERLU_FREE(Ucb_valptr[i]);
-	}
-    SUPERLU_FREE(Ucb_indptr);
-    SUPERLU_FREE(Ucb_valptr);
-    SUPERLU_FREE(Urbs);
-    SUPERLU_FREE(bmod);
-    SUPERLU_FREE(brecv);
-#ifdef ISEND_IRECV
-    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
-    SUPERLU_FREE(send_req);
-#endif
-#ifdef BSEND
-    SUPERLU_FREE(send_req);
-#endif
-
-    stat->utime[SOLVE] = SuperLU_timer_() - t;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pdgstrs_Bglobal()");
-#endif
-
-} /* PDGSTRS_BGLOBAL */
-
-
-/*
- * Gather the components of x vector on the diagonal processes
- * onto all processes, and combine them into the global vector y.
- */
-static void
-gather_diag_to_all(int_t n, int_t nrhs, double x[],
-		   Glu_persist_t *Glu_persist, LocalLU_t *Llu,
-		   gridinfo_t *grid, int_t num_diag_procs,
-		   int_t diag_procs[], int_t diag_len[],
-		   double y[], int_t ldy, double work[])
-{
-    int_t i, ii, j, k, lk, lwork, nsupers, p;
-    int_t *ilsum, *xsup;
-    int iam, knsupc, pkk;
-    double *x_col, *y_col;
-    
-    iam = grid->iam;
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    ilsum = Llu->ilsum;
-
-    for (p = 0; p < num_diag_procs; ++p) {
-	pkk = diag_procs[p];
-	if ( iam == pkk ) {
-	    /* Copy x vector into a buffer. */
-	    lwork = 0;
-	    for (k = p; k < nsupers; k += num_diag_procs) {
-		knsupc = SuperSize( k );
-		lk = LBi( k, grid );
-		ii = X_BLK( lk ); /*ilsum[lk] + (lk+1)*XK_H;*/
-		x_col = &x[ii];
-		for (j = 0; j < nrhs; ++j) {
-		    for (i = 0; i < knsupc; ++i) work[i+lwork] = x_col[i];
-		    lwork += knsupc;
-		    x_col += knsupc;
-		}
-	    }
-	    MPI_Bcast( work, lwork, MPI_DOUBLE, pkk, grid->comm );
-	} else {
-	    MPI_Bcast( work, diag_len[p]*nrhs, MPI_DOUBLE, pkk, grid->comm );
-	}
-	/* Scatter work[] into global y vector. */
-	lwork = 0;
-	for (k = p; k < nsupers; k += num_diag_procs) {
-	    knsupc = SuperSize( k );
-	    ii = FstBlockC( k );
-	    y_col = &y[ii];
-	    for (j = 0; j < nrhs; ++j) {
-		for (i = 0; i < knsupc; ++i) y_col[i] = work[i+lwork];
-		lwork += knsupc;
-		y_col += ldy;
-	    }
-	}
-    }
-} /* GATHER_DIAG_TO_ALL */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrs.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrs.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,1200 +0,0 @@
-
-
-/*
- * -- Distributed SuperLU routine (version 2.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * March 15, 2003
- *
- */
-
-#include "superlu_ddefs.h"
-
-#ifndef HYPRE_USING_HYPRE_BLAS
-#define USE_VENDOR_BLAS
-#endif
-
-#define ISEND_IRECV
-
-/*
- * Function prototypes
- */
-#ifdef _CRAY
-fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*,
-		   double*, int*, double*, int*);
-_fcd ftcs1;
-_fcd ftcs2;
-_fcd ftcs3;
-#endif
-
-
-int_t
-pdReDistribute_B_to_X(double *B, int_t m_loc, int nrhs, int_t ldb,
-                      int_t fst_row, int_t *ilsum, double *x,
-		      ScalePermstruct_t *ScalePermstruct,
-		      Glu_persist_t *Glu_persist,
-		      gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct)
-{
-/*
- * Purpose
- * =======
- *   Re-distribute B on the diagonal processes of the 2D process mesh.
- * 
- * Note
- * ====
- *   This routine can only be called after the routine pxgstrs_init(),
- *   in which the structures of the send and receive buffers are set up.
- *
- * Arguments
- * =========
- * 
- * B      (input) double*
- *        The distributed right-hand side matrix of the possibly
- *        equilibrated system.
- *
- * m_loc  (input) int (local)
- *        The local row dimension of matrix B.
- *
- * nrhs   (input) int (global)
- *        Number of right-hand sides.
- *
- * ldb    (input) int (local)
- *        Leading dimension of matrix B.
- *
- * fst_row (input) int (global)
- *        The row number of B's first row in the global matrix.
- *
- * ilsum  (input) int* (global)
- *        Starting position of each supernode in a full array.
- *
- * x      (output) double*
- *        The solution vector. It is valid only on the diagonal processes.
- *
- * ScalePermstruct (input) ScalePermstruct_t*
- *        The data structure to store the scaling and permutation vectors
- *        describing the transformations performed to the original matrix A.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh.
- *
- * SOLVEstruct (input) SOLVEstruct_t*
- *        Contains the information for the communication during the
- *        solution phase.
- *
- * Return value
- * ============
- *
- */
-    int  *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs;
-    int  *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs;
-    int  *ptr_to_ibuf, *ptr_to_dbuf;
-    int_t  *perm_r, *perm_c; /* row and column permutation vectors */
-    int_t  *send_ibuf, *recv_ibuf;
-    double *send_dbuf, *recv_dbuf;
-    int_t  *xsup, *supno;
-    int_t  i, ii, irow, gbi, j, jj, k, knsupc, l, lk;
-    int    p, procs;
-    pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(grid->iam, "Enter pdReDistribute_B_to_X()");
-#endif
-
-    /* ------------------------------------------------------------
-       INITIALIZATION.
-       ------------------------------------------------------------*/
-    perm_r = ScalePermstruct->perm_r;
-    perm_c = ScalePermstruct->perm_c;
-    procs = grid->nprow * grid->npcol;
-    xsup = Glu_persist->xsup;
-    supno = Glu_persist->supno;
-    SendCnt      = gstrs_comm->B_to_X_SendCnt;
-    SendCnt_nrhs = gstrs_comm->B_to_X_SendCnt +   procs;
-    RecvCnt      = gstrs_comm->B_to_X_SendCnt + 2*procs;
-    RecvCnt_nrhs = gstrs_comm->B_to_X_SendCnt + 3*procs;
-    sdispls      = gstrs_comm->B_to_X_SendCnt + 4*procs;
-    sdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 5*procs;
-    rdispls      = gstrs_comm->B_to_X_SendCnt + 6*procs;
-    rdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 7*procs;
-    ptr_to_ibuf  = gstrs_comm->ptr_to_ibuf;
-    ptr_to_dbuf  = gstrs_comm->ptr_to_dbuf;
-
-    /* ------------------------------------------------------------
-       NOW COMMUNICATE THE ACTUAL DATA.
-       ------------------------------------------------------------*/
-    k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */
-    l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */
-    if ( !(send_ibuf = intMalloc_dist(k + l)) )
-        ABORT("Malloc fails for send_ibuf[].");
-    recv_ibuf = send_ibuf + k;
-    if ( !(send_dbuf = doubleMalloc_dist((k + l)* (size_t)nrhs)) )
-        ABORT("Malloc fails for send_dbuf[].");
-    recv_dbuf = send_dbuf + k * nrhs;
-    
-    for (p = 0; p < procs; ++p) {
-        ptr_to_ibuf[p] = sdispls[p];
-        ptr_to_dbuf[p] = sdispls[p] * nrhs;
-    }
-
-    /* Copy the row indices and values to the send buffer. */
-    for (i = 0, l = fst_row; i < m_loc; ++i, ++l) {
-        irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */
-	gbi = BlockNum( irow );
-	p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */
-	k = ptr_to_ibuf[p];
-	send_ibuf[k] = irow;
-	k = ptr_to_dbuf[p];
-	RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */
-	    send_dbuf[k++] = B[i + j*ldb];
-	}
-	++ptr_to_ibuf[p];
-	ptr_to_dbuf[p] += nrhs;
-    }
-
-    /* Communicate the (permuted) row indices. */
-    MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t,
-		  recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm);
-
-    /* Communicate the numerical values. */
-    MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_DOUBLE,
-		  recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_DOUBLE,
-		  grid->comm);
-    
-    /* ------------------------------------------------------------
-       Copy buffer into X on the diagonal processes.
-       ------------------------------------------------------------*/
-    ii = 0;
-    for (p = 0; p < procs; ++p) {
-        jj = rdispls_nrhs[p];
-        for (i = 0; i < RecvCnt[p]; ++i) {
-	    /* Only the diagonal processes do this; the off-diagonal processes
-	       have 0 RecvCnt. */
-	    irow = recv_ibuf[ii]; /* The permuted row index. */
-	    k = BlockNum( irow );
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid );  /* Local block number. */
-	    l = X_BLK( lk );
-	    x[l - XK_H] = k;      /* Block number prepended in the header. */
-	    irow = irow - FstBlockC(k); /* Relative row number in X-block */
-	    RHS_ITERATE(j) {
-	        x[l + irow + j*knsupc] = recv_dbuf[jj++];
-	    }
-	    ++ii;
-	}
-    }
-
-    SUPERLU_FREE(send_ibuf);
-    SUPERLU_FREE(send_dbuf);
-    
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(grid->iam, "Exit pdReDistribute_B_to_X()");
-#endif
-    return 0;
-} /* pdReDistribute_B_to_X */
-
-int_t
-pdReDistribute_X_to_B(int_t n, double *B, int_t m_loc, int_t ldb, int_t fst_row,
-		      int_t nrhs, double *x, int_t *ilsum,
-		      ScalePermstruct_t *ScalePermstruct,
-		      Glu_persist_t *Glu_persist, gridinfo_t *grid,
-		      SOLVEstruct_t *SOLVEstruct)
-{
-/*
- * Purpose
- * =======
- *   Re-distribute X on the diagonal processes to B distributed on all
- *   the processes.
- *
- * Note
- * ====
- *   This routine can only be called after the routine pxgstrs_init(),
- *   in which the structures of the send and receive buffers are set up.
- *
- */
-    int_t  i, ii, irow, j, jj, k, knsupc, nsupers, l, lk;
-    int_t  *xsup, *supno;
-    int  *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs;
-    int  *sdispls, *rdispls, *sdispls_nrhs, *rdispls_nrhs;
-    int  *ptr_to_ibuf, *ptr_to_dbuf;
-    int_t  *send_ibuf, *recv_ibuf;
-    double *send_dbuf, *recv_dbuf;
-    int_t  *row_to_proc = SOLVEstruct->row_to_proc; /* row-process mapping */
-    pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm;
-    int  iam, p, q, pkk, procs;
-    int_t  num_diag_procs, *diag_procs;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(grid->iam, "Enter pdReDistribute_X_to_B()");
-#endif
-
-    /* ------------------------------------------------------------
-       INITIALIZATION.
-       ------------------------------------------------------------*/
-    xsup = Glu_persist->xsup;
-    supno = Glu_persist->supno;
-    nsupers = Glu_persist->supno[n-1] + 1;
-    iam = grid->iam;
-    procs = grid->nprow * grid->npcol;
- 
-    SendCnt      = gstrs_comm->X_to_B_SendCnt;
-    SendCnt_nrhs = gstrs_comm->X_to_B_SendCnt +   procs;
-    RecvCnt      = gstrs_comm->X_to_B_SendCnt + 2*procs;
-    RecvCnt_nrhs = gstrs_comm->X_to_B_SendCnt + 3*procs;
-    sdispls      = gstrs_comm->X_to_B_SendCnt + 4*procs;
-    sdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 5*procs;
-    rdispls      = gstrs_comm->X_to_B_SendCnt + 6*procs;
-    rdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 7*procs;
-    ptr_to_ibuf  = gstrs_comm->ptr_to_ibuf;
-    ptr_to_dbuf  = gstrs_comm->ptr_to_dbuf;
-
-    k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */
-    l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */
-    if ( !(send_ibuf = intMalloc_dist(k + l)) )
-        ABORT("Malloc fails for send_ibuf[].");
-    recv_ibuf = send_ibuf + k;
-    if ( !(send_dbuf = doubleMalloc_dist((k + l)*nrhs)) )
-        ABORT("Malloc fails for send_dbuf[].");
-    recv_dbuf = send_dbuf + k * nrhs;
-    for (p = 0; p < procs; ++p) {
-        ptr_to_ibuf[p] = sdispls[p];
-        ptr_to_dbuf[p] = sdispls_nrhs[p];
-    }
-    num_diag_procs = SOLVEstruct->num_diag_procs;
-    diag_procs = SOLVEstruct->diag_procs;
-
-    for (p = 0; p < num_diag_procs; ++p) {  /* For all diagonal processes. */
-	pkk = diag_procs[p];
-	if ( iam == pkk ) {
-	    for (k = p; k < nsupers; k += num_diag_procs) {
-		knsupc = SuperSize( k );
-		lk = LBi( k, grid ); /* Local block number */
-		irow = FstBlockC( k );
-		l = X_BLK( lk );
-		for (i = 0; i < knsupc; ++i) {
-#if 0
-		    ii = inv_perm_c[irow]; /* Apply X <== Pc'*Y */
-#else
-		    ii = irow;
-#endif
-		    q = row_to_proc[ii];
-		    jj = ptr_to_ibuf[q];
-		    send_ibuf[jj] = ii;
-		    jj = ptr_to_dbuf[q];
-		    RHS_ITERATE(j) { /* RHS stored in row major in buffer. */
-		        send_dbuf[jj++] = x[l + i + j*knsupc];
-		    }
-		    ++ptr_to_ibuf[q];
-		    ptr_to_dbuf[q] += nrhs;
-		    ++irow;
-		}
-	    }
-	}
-    }
-    
-    /* ------------------------------------------------------------
-        COMMUNICATE THE (PERMUTED) ROW INDICES AND NUMERICAL VALUES.
-       ------------------------------------------------------------*/
-    MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t,
-		  recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm);
-    MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_DOUBLE, 
-		  recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_DOUBLE,
-		  grid->comm);
-
-    /* ------------------------------------------------------------
-       COPY THE BUFFER INTO B.
-       ------------------------------------------------------------*/
-    for (i = 0, k = 0; i < m_loc; ++i) {
-	irow = recv_ibuf[i];
-	irow -= fst_row; /* Relative row number */
-	RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */
-	    B[irow + j*ldb] = recv_dbuf[k++];
-	}
-    }
-
-    SUPERLU_FREE(send_ibuf);
-    SUPERLU_FREE(send_dbuf);
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(grid->iam, "Exit pdReDistribute_X_to_B()");
-#endif
-    return 0;
-
-} /* pdReDistribute_X_to_B */
-
-
-void
-pdgstrs(int_t n, LUstruct_t *LUstruct, 
-	ScalePermstruct_t *ScalePermstruct,
-	gridinfo_t *grid, double *B,
-	int_t m_loc, int_t fst_row, int_t ldb, int nrhs,
-	SOLVEstruct_t *SOLVEstruct,
-	SuperLUStat_t *stat, int *info)
-{
-/*
- * Purpose
- * =======
- *
- * PDGSTRS solves a system of distributed linear equations
- * A*X = B with a general N-by-N matrix A using the LU factorization
- * computed by PDGSTRF.
- * If the equilibration, and row and column permutations were performed,
- * the LU factorization was performed for A1 where
- *     A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
- * and the linear system solved is
- *     A1 * Y = Pc*Pr*B1, where B was overwritten by B1 = diag(R)*B, and
- * the permutation to B1 by Pc*Pr is applied internally in this routine.
- * 
- * Arguments
- * =========
- *
- * n      (input) int (global)
- *        The order of the system of linear equations.
- *
- * LUstruct (input) LUstruct_t*
- *        The distributed data structures storing L and U factors.
- *        The L and U factors are obtained from PDGSTRF for
- *        the possibly scaled and permuted matrix A.
- *        See superlu_ddefs.h for the definition of 'LUstruct_t'.
- *        A may be scaled and permuted into A1, so that
- *        A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh. It contains the MPI communicator, the number
- *        of process rows (NPROW), the number of process columns (NPCOL),
- *        and my process rank. It is an input argument to all the
- *        parallel routines.
- *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
- *        See superlu_defs.h for the definition of 'gridinfo_t'.
- *
- * B      (input/output) double*
- *        On entry, the distributed right-hand side matrix of the possibly
- *        equilibrated system. That is, B may be overwritten by diag(R)*B.
- *        On exit, the distributed solution matrix Y of the possibly
- *        equilibrated system if info = 0, where Y = Pc*diag(C)^(-1)*X,
- *        and X is the solution of the original system.
- *
- * m_loc  (input) int (local)
- *        The local row dimension of matrix B.
- *
- * fst_row (input) int (global)
- *        The row number of B's first row in the global matrix.
- *
- * ldb    (input) int (local)
- *        The leading dimension of matrix B.
- *
- * nrhs   (input) int (global)
- *        Number of right-hand sides.
- * 
- * SOLVEstruct (output) SOLVEstruct_t* (global)
- *        Contains the information for the communication during the
- *        solution phase.
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics about the triangular solves.
- *        See util.h for the definition of 'SuperLUStat_t'.
- *
- * info   (output) int*
- * 	   = 0: successful exit
- *	   < 0: if info = -i, the i-th argument had an illegal value
- *        
- */
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-    double alpha = 1.0;
-    double zero = 0.0;
-    double *lsum;  /* Local running sum of the updates to B-components */
-    double *x;     /* X component at step k. */
-		    /* NOTE: x and lsum are of same size. */
-    double *lusup, *dest;
-    double *recvbuf, *tempv;
-    double *rtemp; /* Result of full matrix-vector multiply. */
-    int_t  **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
-    int_t  *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */
-    Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
-    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
-    int_t  iam, kcol, krow, mycol, myrow;
-    int_t  i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr;
-    int_t  nb, nlb, nub, nsupers;
-    int_t  *xsup, *supno, *lsub, *usub;
-    int_t  *ilsum;    /* Starting position of each supernode in lsum (LOCAL)*/
-    int_t  Pc, Pr;
-    int    knsupc, nsupr;
-    int    ldalsum;   /* Number of lsum entries locally owned. */
-    int    maxrecvsz, p, pi;
-    int_t  **Lrowind_bc_ptr;
-    double **Lnzval_bc_ptr;
-    MPI_Status status;
-#ifdef ISEND_IRECV
-    MPI_Request *send_req, recv_req;
-#endif
-    pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm;
-
-    /*-- Counts used for L-solve --*/
-    int_t  *fmod;         /* Modification count for L-solve --
-                             Count the number of local block products to
-                             be summed into lsum[lk]. */
-    int_t  **fsendx_plist = Llu->fsendx_plist;
-    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
-    int_t  *frecv;        /* Count of lsum[lk] contributions to be received
-                             from processes in this row. 
-                             It is only valid on the diagonal processes. */
-    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
-    int_t  nleaf = 0, nroot = 0;
-
-    /*-- Counts used for U-solve --*/
-    int_t  *bmod;         /* Modification count for U-solve. */
-    int_t  **bsendx_plist = Llu->bsendx_plist;
-    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
-    int_t  *brecv;        /* Count of modifications to be recv'd from
-			     processes in this row. */
-    int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
-    double t;
-#if ( DEBUGlevel>=2 )
-    int_t Ublocks = 0;
-#endif
-
-    t = SuperLU_timer_();
-
-    /* Test input parameters. */
-    *info = 0;
-    if ( n < 0 ) *info = -1;
-    else if ( nrhs < 0 ) *info = -9;
-    if ( *info ) {
-	pxerbla("PDGSTRS", grid, -*info);
-	return;
-    }
-	
-    /*
-     * Initialization.
-     */
-    iam = grid->iam;
-    Pc = grid->npcol;
-    Pr = grid->nprow;
-    myrow = MYROW( iam, grid );
-    mycol = MYCOL( iam, grid );
-    xsup = Glu_persist->xsup;
-    supno = Glu_persist->supno;
-    nsupers = supno[n-1] + 1;
-    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
-    nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pdgstrs()");
-#endif
-
-    stat->ops[SOLVE] = 0.0;
-    Llu->SolveMsgSent = 0;
-
-    /* Save the count to be altered so it can be used by
-       subsequent call to PDGSTRS. */
-    if ( !(fmod = intMalloc_dist(nlb)) )
-	ABORT("Calloc fails for fmod[].");
-    for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
-    if ( !(frecv = intMalloc_dist(nlb)) )
-	ABORT("Malloc fails for frecv[].");
-    Llu->frecv = frecv;
-
-#ifdef ISEND_IRECV
-    k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb;
-    if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) )
-	ABORT("Malloc fails for send_req[].");
-#endif
-
-#ifdef _CRAY
-    ftcs1 = _cptofcd("L", strlen("L"));
-    ftcs2 = _cptofcd("N", strlen("N"));
-    ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-
-
-    /* Obtain ilsum[] and ldalsum for process column 0. */
-    ilsum = Llu->ilsum;
-    ldalsum = Llu->ldalsum;
-
-    /* Allocate working storage. */
-    knsupc = sp_ienv_dist(3);
-    maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H );
-    if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum)*nrhs + nlb*LSUM_H)) )
-	ABORT("Calloc fails for lsum[].");
-    if ( !(x = doubleMalloc_dist(ldalsum * nrhs + nlb * XK_H)) )
-	ABORT("Malloc fails for x[].");
-    if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) )
-	ABORT("Malloc fails for recvbuf[].");
-    if ( !(rtemp = doubleCalloc_dist(maxrecvsz)) )
-	ABORT("Malloc fails for rtemp[].");
-
-    
-    /*---------------------------------------------------
-     * Forward solve Ly = b.
-     *---------------------------------------------------*/
-    /* Redistribute B into X on the diagonal processes. */
-    pdReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, 
-			  ScalePermstruct, Glu_persist, grid, SOLVEstruct);
-
-    /* Set up the headers in lsum[]. */
-    ii = 0;
-    for (k = 0; k < nsupers; ++k) {
-	knsupc = SuperSize( k );
-	krow = PROW( k, grid );
-	if ( myrow == krow ) {
-	    lk = LBi( k, grid );   /* Local block number. */
-	    il = LSUM_BLK( lk );
-	    lsum[il - LSUM_H] = k; /* Block number prepended in the header. */
-	}
-	ii += knsupc;
-    }
-
-    /*
-     * Compute frecv[] and nfrecvmod counts on the diagonal processes.
-     */
-    {
-	superlu_scope_t *scp = &grid->rscp;
-
-	for (k = 0; k < nsupers; ++k) {
-	    krow = PROW( k, grid );
-	    if ( myrow == krow ) {
-		lk = LBi( k, grid );    /* Local block number. */
-		kcol = PCOL( k, grid ); /* Root process in this row scope. */
-		if ( mycol != kcol && fmod[lk] )
-		    i = 1;  /* Contribution from non-diagonal process. */
-		else i = 0;
-		MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t,
-			   MPI_SUM, kcol, scp->comm );
-		if ( mycol == kcol ) { /* Diagonal process. */
-		    nfrecvmod += frecv[lk];
-		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) frecv[%4d]  %2d\n", iam, k, frecv[lk]);
-		    assert( frecv[lk] < Pc );
-#endif
-		}
-	    }
-	}
-    }
-
-    /* ---------------------------------------------------------
-       Solve the leaf nodes first by all the diagonal processes.
-       --------------------------------------------------------- */
-#if ( DEBUGlevel>=2 )
-    printf("(%2d) nleaf %4d\n", iam, nleaf);
-#endif
-    for (k = 0; k < nsupers && nleaf; ++k) {
-	krow = PROW( k, grid );
-	kcol = PCOL( k, grid );
-	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid );
-	    if ( frecv[lk]==0 && fmod[lk]==0 ) {
-		fmod[lk] = -1;  /* Do not solve X[k] in the future. */
-		ii = X_BLK( lk );
-		lk = LBj( k, grid ); /* Local block number, column-wise. */
-		lsub = Lrowind_bc_ptr[lk];
-		lusup = Lnzval_bc_ptr[lk];
-		nsupr = lsub[1];
-#ifdef _CRAY
-		STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
-		      lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc);
-#else
-		hypre_F90_NAME_BLAS(dtrsm,DTRSM)("L", "L", "N", "U", 
-                       &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#endif
-		stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
-		--nleaf;
-#if ( DEBUGlevel>=2 )
-		printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		
-		/*
-		 * Send Xk to process column Pc[k].
-		 */
-		for (p = 0; p < Pr; ++p) {
-		    if ( fsendx_plist[lk][p] != EMPTY ) {
-			pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				   MPI_DOUBLE, pi, Xk, grid->comm,
-                                   &send_req[Llu->SolveMsgSent++]);
-#else
-			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				 MPI_DOUBLE, pi, Xk, grid->comm );
-#endif
-#if ( DEBUGlevel>=2 )
-			printf("(%2d) Sent X[%2.0f] to P %2d\n",
-			       iam, x[ii-XK_H], pi);
-#endif
-		    }
-		}
-		/*
-		 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
-		 */
-		nb = lsub[0] - 1;
-		lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
-		luptr = knsupc; /* Skip diagonal block L(k,k). */
-		
-		dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
-			   fmod, nb, lptr, luptr, xsup, grid, Llu, 
-			   send_req, stat);
-	    }
-	} /* if diagonal process ... */
-    } /* for k ... */
-
-    /* -----------------------------------------------------------
-       Compute the internal nodes asynchronously by all processes.
-       ----------------------------------------------------------- */
-#if ( DEBUGlevel>=2 )
-    printf("(%2d) nfrecvx %4d,  nfrecvmod %4d,  nleaf %4d\n",
-	   iam, nfrecvx, nfrecvmod, nleaf);
-#endif
-
-    while ( nfrecvx || nfrecvmod ) { /* While not finished. */
-
-	/* Receive a message. */
-#ifdef ISEND_IRECV
-	/* -MPI- FATAL: Remote protocol queue full */
-	MPI_Irecv( recvbuf, maxrecvsz, MPI_DOUBLE,
-                 MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &recv_req );
-	MPI_Wait( &recv_req, &status );
-#else
-	MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE,
-                  MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
-#endif
-
-	k = *recvbuf;
-
-#if ( DEBUGlevel>=2 )
-	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
-#endif
-	
-	switch ( status.MPI_TAG ) {
-	  case Xk:
-	      --nfrecvx;
-	      lk = LBj( k, grid ); /* Local block number, column-wise. */
-	      lsub = Lrowind_bc_ptr[lk];
-	      lusup = Lnzval_bc_ptr[lk];
-	      if ( lsub ) {
-		  nb   = lsub[0];
-		  lptr = BC_HEADER;
-		  luptr = 0;
-		  knsupc = SuperSize( k );
-
-		  /*
-		   * Perform local block modifications: lsum[i] -= L_i,k * X[k]
-		   */
-		  dlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k,
-			     fmod, nb, lptr, luptr, xsup, grid, Llu, 
-			     send_req, stat);
-	      } /* if lsub */
-
-	      break;
-
-	  case LSUM: /* Receiver must be a diagonal process */
-	      --nfrecvmod;
-	      lk = LBi( k, grid ); /* Local block number, row-wise. */
-	      ii = X_BLK( lk );
-	      knsupc = SuperSize( k );
-	      tempv = &recvbuf[LSUM_H];
-	      RHS_ITERATE(j) {
-		  for (i = 0; i < knsupc; ++i)
-		      x[i + ii + j*knsupc] += tempv[i + j*knsupc];
-	      }
-
-	      if ( (--frecv[lk])==0 && fmod[lk]==0 ) {
-		  fmod[lk] = -1; /* Do not solve X[k] in the future. */
-		  lk = LBj( k, grid ); /* Local block number, column-wise. */
-		  lsub = Lrowind_bc_ptr[lk];
-		  lusup = Lnzval_bc_ptr[lk];
-		  nsupr = lsub[1];
-#ifdef _CRAY
-		  STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
-			lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-			 lusup, &nsupr, &x[ii], &knsupc);
-#else
-		  hypre_F90_NAME_BLAS(dtrsm,DTRSM)("L", "L", "N", "U", 
-                         &knsupc, &nrhs, &alpha, 
-			 lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#endif
-		  stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
-#if ( DEBUGlevel>=2 )
-		  printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		
-		  /*
-		   * Send Xk to process column Pc[k].
-		   */
-		  kcol = PCOL( k, grid );
-		  for (p = 0; p < Pr; ++p) {
-		      if ( fsendx_plist[lk][p] != EMPTY ) {
-			  pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-			  MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H,
-                                     MPI_DOUBLE, pi, Xk, grid->comm,
-                                     &send_req[Llu->SolveMsgSent++]);
-#else
-			  MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				    MPI_DOUBLE, pi, Xk, grid->comm );
-#endif
-#if ( DEBUGlevel>=2 )
-			  printf("(%2d) Sent X[%2.0f] to P %2d\n",
-				 iam, x[ii-XK_H], pi);
-#endif
-		      }
-                  }
-		  /*
-		   * Perform local block modifications.
-		   */
-		  nb = lsub[0] - 1;
-		  lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
-		  luptr = knsupc; /* Skip diagonal block L(k,k). */
-
-		  dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
-			     fmod, nb, lptr, luptr, xsup, grid, Llu,
-			     send_req, stat);
-	      } /* if */
-
-	      break;
-
-#if ( DEBUGlevel>=2 )
-	    default:
-	      printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG);
-	      break;
-#endif
-	  } /* switch */
-
-    } /* while not finished ... */
-
-
-#if ( PRNTlevel>=2 )
-    t = SuperLU_timer_() - t;
-    if ( !iam ) printf(".. L-solve time\t%8.2f\n", t);
-    t = SuperLU_timer_();
-#endif
-
-#if ( DEBUGlevel==2 )
-    {
-      printf("(%d) .. After L-solve: y =\n", iam);
-      for (i = 0, k = 0; k < nsupers; ++k) {
-	  krow = PROW( k, grid );
-	  kcol = PCOL( k, grid );
-	  if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
-	      knsupc = SuperSize( k );
-	      lk = LBi( k, grid );
-	      ii = X_BLK( lk );
-	      for (j = 0; j < knsupc; ++j)
-		printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]);
-	      fflush(stdout);
-	  }
-	  MPI_Barrier( grid->comm );
-      }
-    }
-#endif
-
-    SUPERLU_FREE(fmod);
-    SUPERLU_FREE(frecv);
-    SUPERLU_FREE(rtemp);
-
-#ifdef ISEND_IRECV
-    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
-    Llu->SolveMsgSent = 0;
-#endif
-
-
-    /*---------------------------------------------------
-     * Back solve Ux = y.
-     *
-     * The Y components from the forward solve is already
-     * on the diagonal processes.
-     *---------------------------------------------------*/
-
-    /* Save the count to be altered so it can be used by
-       subsequent call to PDGSTRS. */
-    if ( !(bmod = intMalloc_dist(nlb)) )
-	ABORT("Calloc fails for bmod[].");
-    for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
-    if ( !(brecv = intMalloc_dist(nlb)) )
-	ABORT("Malloc fails for brecv[].");
-    Llu->brecv = brecv;
-
-    /*
-     * Compute brecv[] and nbrecvmod counts on the diagonal processes.
-     */
-    {
-	superlu_scope_t *scp = &grid->rscp;
-
-	for (k = 0; k < nsupers; ++k) {
-	    krow = PROW( k, grid );
-	    if ( myrow == krow ) {
-		lk = LBi( k, grid );    /* Local block number. */
-		kcol = PCOL( k, grid ); /* Root process in this row scope. */
-		if ( mycol != kcol && bmod[lk] )
-		    i = 1;  /* Contribution from non-diagonal process. */
-		else i = 0;
-		MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t,
-			   MPI_SUM, kcol, scp->comm );
-		if ( mycol == kcol ) { /* Diagonal process. */
-		    nbrecvmod += brecv[lk];
-		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) brecv[%4d]  %2d\n", iam, k, brecv[lk]);
-		    assert( brecv[lk] < Pc );
-#endif
-		}
-	    }
-	}
-    }
-
-    /* Re-initialize lsum to zero. Each block header is already in place. */
-    for (k = 0; k < nsupers; ++k) {
-	krow = PROW( k, grid );
-	if ( myrow == krow ) {
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid );
-	    il = LSUM_BLK( lk );
-	    dest = &lsum[il];
-	    RHS_ITERATE(j) {
-		for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero;
-	    }
-	}
-    }
-
-    /* Set up additional pointers for the index and value arrays of U.
-       nub is the number of local block columns. */
-    nub = CEILING( nsupers, Pc ); /* Number of local block columns. */
-    if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) )
-	ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero
-					     blocks in a block column. */
-    Urbs1 = Urbs + nub;
-    if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) )
-        ABORT("Malloc fails for Ucb_indptr[]");
-    if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
-        ABORT("Malloc fails for Ucb_valptr[]");
-
-    /* Count number of row blocks in a block column. 
-       One pass of the skeleton graph of U. */
-    for (lk = 0; lk < nlb; ++lk) {
-	usub = Ufstnz_br_ptr[lk];
-	if ( usub ) { /* Not an empty block row. */
-	    /* usub[0] -- number of column blocks in this block row. */
-#if ( DEBUGlevel>=2 )
-	    Ublocks += usub[0];
-#endif
-	    i = BR_HEADER; /* Pointer in index array. */
-	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
-		k = usub[i];            /* Global block number */
-		++Urbs[LBj(k,grid)];
-		i += UB_DESCRIPTOR + SuperSize( k );
-	    }
-	}
-    }
-
-    /* Set up the vertical linked lists for the row blocks.
-       One pass of the skeleton graph of U. */
-    for (lb = 0; lb < nub; ++lb) {
-	if ( Urbs[lb] ) { /* Not an empty block column. */
-	    if ( !(Ucb_indptr[lb]
-		   = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
-		ABORT("Malloc fails for Ucb_indptr[lb][]");
-	    if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
-		ABORT("Malloc fails for Ucb_valptr[lb][]");
-	}
-    }
-    for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
-	usub = Ufstnz_br_ptr[lk];
-	if ( usub ) { /* Not an empty block row. */
-	    i = BR_HEADER; /* Pointer in index array. */
-	    j = 0;         /* Pointer in nzval array. */
-	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
-		k = usub[i];          /* Global block number, column-wise. */
-		ljb = LBj( k, grid ); /* Local block number, column-wise. */
-		Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk;
-		Ucb_indptr[ljb][Urbs1[ljb]].indpos = i;
-		Ucb_valptr[ljb][Urbs1[ljb]] = j;
-		++Urbs1[ljb];
-		j += usub[i+1];
-		i += UB_DESCRIPTOR + SuperSize( k );
-	    }
-	}
-    }
-
-#if ( DEBUGlevel>=2 )
-    for (p = 0; p < Pr*Pc; ++p) {
-	if (iam == p) {
-	    printf("(%2d) .. Ublocks %d\n", iam, Ublocks);
-	    for (lb = 0; lb < nub; ++lb) {
-		printf("(%2d) Local col %2d: # row blocks %2d\n",
-		       iam, lb, Urbs[lb]);
-		if ( Urbs[lb] ) {
-		    for (i = 0; i < Urbs[lb]; ++i)
-			printf("(%2d) .. row blk %2d:\
-                               lbnum %d, indpos %d, valpos %d\n",
-			       iam, i, 
-			       Ucb_indptr[lb][i].lbnum,
-			       Ucb_indptr[lb][i].indpos,
-			       Ucb_valptr[lb][i]);
-		}
-	    }
-	}
-	MPI_Barrier( grid->comm );
-    }
-    for (p = 0; p < Pr*Pc; ++p) {
-	if ( iam == p ) {
-	    printf("\n(%d) bsendx_plist[][]", iam);
-	    for (lb = 0; lb < nub; ++lb) {
-		printf("\n(%d) .. local col %2d: ", iam, lb);
-		for (i = 0; i < Pr; ++i)
-		    printf("%4d", bsendx_plist[lb][i]);
-	    }
-	    printf("\n");
-	}
-	MPI_Barrier( grid->comm );
-    }
-#endif /* DEBUGlevel */
-
-
-#if ( PRNTlevel>=3 )
-    t = SuperLU_timer_() - t;
-    if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t);
-    t = SuperLU_timer_();
-#endif
-
-    /*
-     * Solve the roots first by all the diagonal processes.
-     */
-#if ( DEBUGlevel>=2 )
-    printf("(%2d) nroot %4d\n", iam, nroot);
-#endif
-    for (k = nsupers-1; k >= 0 && nroot; --k) {
-	krow = PROW( k, grid );
-	kcol = PCOL( k, grid );
-	if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid ); /* Local block number, row-wise. */
-	    if ( brecv[lk]==0 && bmod[lk]==0 ) {
-		bmod[lk] = -1;       /* Do not solve X[k] in the future. */
-		ii = X_BLK( lk );
-		lk = LBj( k, grid ); /* Local block number, column-wise */
-		lsub = Lrowind_bc_ptr[lk];
-		lusup = Lnzval_bc_ptr[lk];
-		nsupr = lsub[1];
-#ifdef _CRAY
-		STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
-		      lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc);
-#else
-		hypre_F90_NAME_BLAS(dtrsm,DTRSM)("L", "U", "N", "N", 
-                       &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#endif
-		stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;
-		--nroot;
-#if ( DEBUGlevel>=2 )
-		printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		/*
-		 * Send Xk to process column Pc[k].
-		 */
-		for (p = 0; p < Pr; ++p) {
-		    if ( bsendx_plist[lk][p] != EMPTY ) {
-			pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                   MPI_DOUBLE, pi, Xk, grid->comm,
-                                   &send_req[Llu->SolveMsgSent++]);
-#else
-			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                  MPI_DOUBLE, pi, Xk,
-                                  grid->comm );
-#endif
-#if ( DEBUGlevel>=2 )
-			printf("(%2d) Sent X[%2.0f] to P %2d\n",
-			       iam, x[ii-XK_H], pi);
-#endif
-		    }
-		}
-		/*
-		 * Perform local block modifications: lsum[i] -= U_i,k * X[k]
-		 */
-		if ( Urbs[lk] ) 
-		    dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
-			       Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-			       send_req, stat);
-	    } /* if root ... */
-	} /* if diagonal process ... */
-    } /* for k ... */
-
-
-    /*
-     * Compute the internal nodes asychronously by all processes.
-     */
-    while ( nbrecvx || nbrecvmod ) { /* While not finished. */
-
-	/* Receive a message. */
-	MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE,
-                  MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
-	k = *recvbuf;
-
-#if ( DEBUGlevel>=2 )
-	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
-#endif
-
-	switch ( status.MPI_TAG ) {
-	    case Xk:
-	        --nbrecvx;
-		lk = LBj( k, grid ); /* Local block number, column-wise. */
-		/*
-		 * Perform local block modifications:
-		 *         lsum[i] -= U_i,k * X[k]
-		 */
-		dlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs,
-			   Ucb_indptr, Ucb_valptr, xsup, grid, Llu, 
-			   send_req, stat);
-
-	        break;
-
-	    case LSUM: /* Receiver must be a diagonal process */
-		--nbrecvmod;
-		lk = LBi( k, grid ); /* Local block number, row-wise. */
-		ii = X_BLK( lk );
-		knsupc = SuperSize( k );
-		tempv = &recvbuf[LSUM_H];
-		RHS_ITERATE(j) {
-		    for (i = 0; i < knsupc; ++i)
-			x[i + ii + j*knsupc] += tempv[i + j*knsupc];
-		}
-
-		if ( (--brecv[lk])==0 && bmod[lk]==0 ) {
-		    bmod[lk] = -1; /* Do not solve X[k] in the future. */
-		    lk = LBj( k, grid ); /* Local block number, column-wise. */
-		    lsub = Lrowind_bc_ptr[lk];
-		    lusup = Lnzval_bc_ptr[lk];
-		    nsupr = lsub[1];
-#ifdef _CRAY
-		    STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
-			  lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		    dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-			   lusup, &nsupr, &x[ii], &knsupc);
-#else
-		    hypre_F90_NAME_BLAS(dtrsm,DTRSM)("L", "U", "N", "N", 
-                           &knsupc, &nrhs, &alpha, 
-			   lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#endif
-		    stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		    /*
-		     * Send Xk to process column Pc[k].
-		     */
-		    kcol = PCOL( k, grid );
-		    for (p = 0; p < Pr; ++p) {
-			if ( bsendx_plist[lk][p] != EMPTY ) {
-			    pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-			    MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                       MPI_DOUBLE, pi, Xk, grid->comm,
-                                       &send_req[Llu->SolveMsgSent++] );
-#else
-			    MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                      MPI_DOUBLE, pi, Xk,
-                                      grid->comm );
-#endif
-#if ( DEBUGlevel>=2 )
-			    printf("(%2d) Sent X[%2.0f] to P %2d\n",
-				   iam, x[ii - XK_H], pi);
-#endif
-			}
-		    }
-		    /*
-		     * Perform local block modifications: 
-		     *         lsum[i] -= U_i,k * X[k]
-		     */
-		    if ( Urbs[lk] )
-			dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
-				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-				   send_req, stat);
-		} /* if becomes solvable */
-		
-		break;
-
-#if ( DEBUGlevel>=2 )
-	      default:
-		printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG);
-		break;
-#endif		
-
-	} /* switch */
-
-    } /* while not finished ... */
-
-#if ( PRNTlevel>=3 )
-    t = SuperLU_timer_() - t;
-    if ( !iam ) printf(".. U-solve time\t%8.2f\n", t);
-#endif
-
-#if ( DEBUGlevel>=2 )
-    {
-	double *x_col;
-	int diag;
-	printf("\n(%d) .. After U-solve: x (ON DIAG PROCS) = \n", iam);
-	ii = 0;
-	for (k = 0; k < nsupers; ++k) {
-	    knsupc = SuperSize( k );
-	    krow = PROW( k, grid );
-	    kcol = PCOL( k, grid );
-	    diag = PNUM( krow, kcol, grid);
-	    if ( iam == diag ) { /* Diagonal process. */
-		lk = LBi( k, grid );
-		jj = X_BLK( lk );
-		x_col = &x[jj];
-		RHS_ITERATE(j) {
-		    for (i = 0; i < knsupc; ++i) { /* X stored in blocks */
-			printf("\t(%d)\t%4d\t%.10f\n",
-			       iam, xsup[k]+i, x_col[i]);
-		    }
-		    x_col += knsupc;
-		}
-	    }
-	    ii += knsupc;
-	} /* for k ... */
-    }
-#endif
-
-    pdReDistribute_X_to_B(n, B, m_loc, ldb, fst_row, nrhs, x, ilsum,
-			  ScalePermstruct, Glu_persist, grid, SOLVEstruct);
-
-
-    /* Deallocate storage. */
-    SUPERLU_FREE(lsum);
-    SUPERLU_FREE(x);
-    SUPERLU_FREE(recvbuf);
-    for (i = 0; i < nub; ++i) {
-	if ( Urbs[i] ) {
-	    SUPERLU_FREE(Ucb_indptr[i]);
-	    SUPERLU_FREE(Ucb_valptr[i]);
-	}
-    }
-    SUPERLU_FREE(Ucb_indptr);
-    SUPERLU_FREE(Ucb_valptr);
-    SUPERLU_FREE(Urbs);
-    SUPERLU_FREE(bmod);
-    SUPERLU_FREE(brecv);
-#ifdef ISEND_IRECV
-    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
-    SUPERLU_FREE(send_req);
-#endif
-
-    stat->utime[SOLVE] = SuperLU_timer_() - t;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pdgstrs()");
-#endif
-
-} /* PDGSTRS */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrs_lsum.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrs_lsum.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdgstrs_lsum.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdgstrs_lsum.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,363 +0,0 @@
-
-
-/*
- * -- Distributed SuperLU routine (version 2.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * March 15, 2003
- *
- * Modified:
- *     Feburary 7, 2001    use MPI_Isend/MPI_Irecv
- *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
- */
-
-#include "superlu_ddefs.h"
-
-#ifndef HYPRE_USING_HYPRE_BLAS
-#define USE_VENDOR_BLAS
-#endif
-
-#define ISEND_IRECV
-
-/*
- * Function prototypes
- */
-#ifdef _CRAY
-fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*,
-		   double*, int*, double*, int*);
-fortran void SGEMM(_fcd, _fcd, int*, int*, int*, double*, double*, 
-		   int*, double*, int*, double*, double*, int*);
-_fcd ftcs1;
-_fcd ftcs2;
-_fcd ftcs3;
-#endif
-
-/************************************************************************/
-void dlsum_fmod
-/************************************************************************/
-(
- double *lsum,    /* Sum of local modifications.                        */
- double *x,       /* X array (local)                                    */
- double *xk,      /* X[k].                                              */
- double *rtemp,   /* Result of full matrix-vector multiply.             */
- int   nrhs,      /* Number of right-hand sides.                        */
- int   knsupc,    /* Size of supernode k.                               */
- int_t k,         /* The k-th component of X.                           */
- int_t *fmod,     /* Modification count for L-solve.                    */
- int_t nlb,       /* Number of L blocks.                                */
- int_t lptr,      /* Starting position in lsub[*].                      */
- int_t luptr,     /* Starting position in lusup[*].                     */
- int_t *xsup,
- gridinfo_t *grid,
- LocalLU_t *Llu,
- MPI_Request send_req[],
- SuperLUStat_t *stat
-)
-{
-/*
- * Purpose
- * =======
- *   Perform local block modifications: lsum[i] -= L_i,k * X[k].
- */
-    double alpha = 1.0, beta = 0.0;
-    double *lusup, *lusup1;
-    double *dest;
-    int    iam, iknsupc, myrow, nbrow, nsupr, nsupr1, p, pi;
-    int_t  i, ii, ik, il, ikcol, irow, j, lb, lk, rel;
-    int_t  *lsub, *lsub1, nlb1, lptr1, luptr1;
-    int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
-    int_t  *frecv = Llu->frecv;
-    int_t  **fsendx_plist = Llu->fsendx_plist;
-    MPI_Status status;
-    int test_flag;
-
-    iam = grid->iam;
-    myrow = MYROW( iam, grid );
-    lk = LBj( k, grid ); /* Local block number, column-wise. */
-    lsub = Llu->Lrowind_bc_ptr[lk];
-    lusup = Llu->Lnzval_bc_ptr[lk];
-    nsupr = lsub[1];
-
-    for (lb = 0; lb < nlb; ++lb) {
-	ik = lsub[lptr]; /* Global block number, row-wise. */
-	nbrow = lsub[lptr+1];
-#ifdef _CRAY
-	SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc,
-	      &alpha, &lusup[luptr], &nsupr, xk,
-	      &knsupc, &beta, rtemp, &nbrow );
-#elif defined (USE_VENDOR_BLAS)
-	dgemm_( "N", "N", &nbrow, &nrhs, &knsupc,
-	       &alpha, &lusup[luptr], &nsupr, xk,
-	       &knsupc, &beta, rtemp, &nbrow );
-#else
-	hypre_F90_NAME_BLAS(dgemm,DGEMM)( "N", "N", &nbrow, &nrhs, &knsupc,
-	       &alpha, &lusup[luptr], &nsupr, xk,
-	       &knsupc, &beta, rtemp, &nbrow, 1, 1 );
-#endif
-	stat->ops[SOLVE] += 2 * nbrow * nrhs * knsupc + nbrow * nrhs;
-   
-	lk = LBi( ik, grid ); /* Local block number, row-wise. */
-	iknsupc = SuperSize( ik );
-	il = LSUM_BLK( lk );
-	dest = &lsum[il];
-	lptr += LB_DESCRIPTOR;
-	rel = xsup[ik]; /* Global row index of block ik. */
-	for (i = 0; i < nbrow; ++i) {
-	    irow = lsub[lptr++] - rel; /* Relative row. */
-	    RHS_ITERATE(j)
-		dest[irow + j*iknsupc] -= rtemp[i + j*nbrow];
-	}
-	luptr += nbrow;
-		    
-	if ( (--fmod[lk])==0 ) { /* Local accumulation done. */
-	    ikcol = PCOL( ik, grid );
-	    p = PNUM( myrow, ikcol, grid );
-	    if ( iam != p ) {
-#ifdef ISEND_IRECV
-		MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
-			   MPI_DOUBLE, p, LSUM, grid->comm,
-                           &send_req[Llu->SolveMsgSent++] );
-#else
-#ifdef BSEND
-		MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
-			   MPI_DOUBLE, p, LSUM, grid->comm );
-#else
-		MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
-			 MPI_DOUBLE, p, LSUM, grid->comm );
-#endif
-#endif
-#if ( DEBUGlevel>=2 )
-		printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n",
-		       iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p);
-#endif
-	    } else { /* Diagonal process: X[i] += lsum[i]. */
-		ii = X_BLK( lk );
-		RHS_ITERATE(j)
-		    for (i = 0; i < iknsupc; ++i)
-			x[i + ii + j*iknsupc] += lsum[i + il + j*iknsupc];
-		if ( frecv[lk]==0 ) { /* Becomes a leaf node. */
-		    fmod[lk] = -1; /* Do not solve X[k] in the future. */
-		    lk = LBj( ik, grid );/* Local block number, column-wise. */
-		    lsub1 = Llu->Lrowind_bc_ptr[lk];
-		    lusup1 = Llu->Lnzval_bc_ptr[lk];
-		    nsupr1 = lsub1[1];
-#ifdef _CRAY
-		    STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha,
-			  lusup1, &nsupr1, &x[ii], &iknsupc);
-#elif defined (USE_VENDOR_BLAS)
-		    dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
-			   lusup1, &nsupr1, &x[ii], &iknsupc);
-#else
-		    hypre_F90_NAME_BLAS(dtrsm,DTRSM)("L", "L", "N", "U", 
-                           &iknsupc, &nrhs, &alpha, 
-			   lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1);
-#endif
-		    stat->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs;
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) Solve X[%2d]\n", iam, ik);
-#endif
-		
-		    /*
-		     * Send Xk to process column Pc[k].
-		     */
-		    for (p = 0; p < grid->nprow; ++p) {
-			if ( fsendx_plist[lk][p] != EMPTY ) {
-			    pi = PNUM( p, ikcol, grid );
-#ifdef ISEND_IRECV
-			    MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H,
-				       MPI_DOUBLE, pi, Xk, grid->comm,
-				       &send_req[Llu->SolveMsgSent++] );
-#else
-#ifdef BSEND
-			    MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H,
-				       MPI_DOUBLE, pi, Xk, grid->comm );
-#else
-			    MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H,
-				     MPI_DOUBLE, pi, Xk, grid->comm );
-#endif
-#endif
-#if ( DEBUGlevel>=2 )
-			    printf("(%2d) Sent X[%2.0f] to P %2d\n",
-				   iam, x[ii-XK_H], pi);
-#endif
-			}
-                    }
-		    /*
-		     * Perform local block modifications.
-		     */
-		    nlb1 = lsub1[0] - 1;
-		    lptr1 = BC_HEADER + LB_DESCRIPTOR + iknsupc;
-		    luptr1 = iknsupc; /* Skip diagonal block L(I,I). */
-
-		    dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik,
-			       fmod, nlb1, lptr1, luptr1, xsup,
-			       grid, Llu, send_req, stat);
-		} /* if frecv[lk] == 0 */
-	    } /* if iam == p */
-	} /* if fmod[lk] == 0 */
-
-    } /* for lb ... */
-
-} /* dLSUM_FMOD */
-
-
-/************************************************************************/
-void dlsum_bmod
-/************************************************************************/
-(
- double *lsum,        /* Sum of local modifications.                    */
- double *x,           /* X array (local).                               */
- double *xk,          /* X[k].                                          */
- int    nrhs,	      /* Number of right-hand sides.                    */
- int_t  k,            /* The k-th component of X.                       */
- int_t  *bmod,        /* Modification count for L-solve.                */
- int_t  *Urbs,        /* Number of row blocks in each block column of U.*/
- Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/
- int_t  **Ucb_valptr, /* Vertical linked list pointing to Unzval[].     */
- int_t  *xsup,
- gridinfo_t *grid,
- LocalLU_t *Llu,
- MPI_Request send_req[],
- SuperLUStat_t *stat
- )
-{
-/*
- * Purpose
- * =======
- *   Perform local block modifications: lsum[i] -= U_i,k * X[k].
- */
-    double alpha = 1.0;
-    int    iam, iknsupc, knsupc, myrow, nsupr, p, pi;
-    int_t  fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow,
-           j, jj, lk, lk1, nub, ub, uptr;
-    int_t  *usub;
-    double *uval, *dest, *y;
-    int_t  *lsub;
-    double *lusup;
-    int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
-    int_t  *brecv = Llu->brecv;
-    int_t  **bsendx_plist = Llu->bsendx_plist;
-    MPI_Status status;
-    int test_flag;
-
-    iam = grid->iam;
-    myrow = MYROW( iam, grid );
-    knsupc = SuperSize( k );
-    lk = LBj( k, grid ); /* Local block number, column-wise. */
-    nub = Urbs[lk];      /* Number of U blocks in block column lk */
-
-    for (ub = 0; ub < nub; ++ub) {
-	ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */
-	usub = Llu->Ufstnz_br_ptr[ik];
-	uval = Llu->Unzval_br_ptr[ik];
-	i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */
-	i += UB_DESCRIPTOR;
-	il = LSUM_BLK( ik );
-	gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */
-	iknsupc = SuperSize( gik );
-	ikfrow = FstBlockC( gik );
-	iklrow = FstBlockC( gik+1 );
-
-	RHS_ITERATE(j) {
-	    dest = &lsum[il + j*iknsupc];
-	    y = &xk[j*knsupc];
-	    uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */
-	    for (jj = 0; jj < knsupc; ++jj) {
-		fnz = usub[i + jj];
-		if ( fnz < iklrow ) { /* Nonzero segment. */
-		    /* AXPY */
-		    for (irow = fnz; irow < iklrow; ++irow)
-			dest[irow - ikfrow] -= uval[uptr++] * y[jj];
-		    stat->ops[SOLVE] += 2 * (iklrow - fnz);
-		}
-	    } /* for jj ... */
-	}
-
-	if ( (--bmod[ik]) == 0 ) { /* Local accumulation done. */
-	    gikcol = PCOL( gik, grid );
-	    p = PNUM( myrow, gikcol, grid );
-	    if ( iam != p ) {
-#ifdef ISEND_IRECV
-		MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
-			   MPI_DOUBLE, p, LSUM, grid->comm,
-                           &send_req[Llu->SolveMsgSent++] );
-#else
-#ifdef BSEND
-		MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
-			   MPI_DOUBLE, p, LSUM, grid->comm );
-#else
-		MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
-			  MPI_DOUBLE, p, LSUM, grid->comm );
-#endif
-#endif
-#if ( DEBUGlevel>=2 )
-		printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n",
-		       iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p);
-#endif
-	    } else { /* Diagonal process: X[i] += lsum[i]. */
-		ii = X_BLK( ik );
-		dest = &x[ii];
-		RHS_ITERATE(j)
-		    for (i = 0; i < iknsupc; ++i)
-			dest[i + j*iknsupc] += lsum[i + il + j*iknsupc];
-		if ( !brecv[ik] ) { /* Becomes a leaf node. */
-		    bmod[ik] = -1; /* Do not solve X[k] in the future. */
-		    lk1 = LBj( gik, grid ); /* Local block number. */
-		    lsub = Llu->Lrowind_bc_ptr[lk1];
-		    lusup = Llu->Lnzval_bc_ptr[lk1];
-		    nsupr = lsub[1];
-#ifdef _CRAY
-		    STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &iknsupc, &nrhs, &alpha,
-			  lusup, &nsupr, &x[ii], &iknsupc);
-#elif defined (USE_VENDOR_BLAS)
-		    dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, 
-			   lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1);
-#else
-		    hypre_F90_NAME_BLAS(dtrsm,DTRSM)("L", "U", "N", "N", 
-                           &iknsupc, &nrhs, &alpha, 
-			   lusup, &nsupr, &x[ii], &iknsupc);
-#endif
-		    stat->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs;
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) Solve X[%2d]\n", iam, gik);
-#endif
-
-		    /*
-		     * Send Xk to process column Pc[k].
-		     */
-		    for (p = 0; p < grid->nprow; ++p) {
-			if ( bsendx_plist[lk1][p] != EMPTY ) {
-			    pi = PNUM( p, gikcol, grid );
-#ifdef ISEND_IRECV
-			    MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H,
-				       MPI_DOUBLE, pi, Xk, grid->comm,
-				       &send_req[Llu->SolveMsgSent++] );
-#else
-#ifdef BSEND
-			    MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H,
-				       MPI_DOUBLE, pi, Xk, grid->comm );
-#else
-			    MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H,
-				     MPI_DOUBLE, pi, Xk, grid->comm );
-#endif
-#endif
-#if ( DEBUGlevel>=2 )
-			    printf("(%2d) Sent X[%2.0f] to P %2d\n",
-				   iam, x[ii-XK_H], pi);
-#endif
-			}
-                     }
-		    /*
-		     * Perform local block modifications.
-		     */
-		    if ( Urbs[lk1] )
-			dlsum_bmod(lsum, x, &x[ii], nrhs, gik, bmod, Urbs,
-				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-				   send_req, stat);
-		} /* if brecv[ik] == 0 */
-	    }
-	} /* if bmod[ik] == 0 */
-
-    } /* for ub ... */
-
-} /* dlSUM_BMOD */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdlangs.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdlangs.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdlangs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdlangs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,128 +0,0 @@
-
-
-/*
- * File name:	pdlangs.c
- * History:     Modified from lapack routine DLANGE
- */
-#include <math.h>
-#include "superlu_ddefs.h"
-
-double pdlangs(char *norm, SuperMatrix *A, gridinfo_t *grid)
-{
-/* 
-    Purpose   
-    =======   
-
-    PDLANGS returns the value of the one norm, or the Frobenius norm, or 
-    the infinity norm, or the element of largest absolute value of a 
-    real matrix A.   
-
-    Description   
-    ===========   
-
-    PDLANGE returns the value   
-
-       PDLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'   
-                 (   
-                 ( norm1(A),         NORM = '1', 'O' or 'o'   
-                 (   
-                 ( normI(A),         NORM = 'I' or 'i'   
-                 (   
-                 ( normF(A),         NORM = 'F', 'f', 'E' or 'e'   
-
-    where  norm1  denotes the  one norm of a matrix (maximum column sum), 
-    normI  denotes the  infinity norm  of a matrix  (maximum row sum) and 
-    normF  denotes the  Frobenius norm of a matrix (square root of sum of 
-    squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.   
-
-    Arguments   
-    =========   
-
-    NORM    (input) CHARACTER*1   
-            Specifies the value to be returned in DLANGE as described above.   
-    A       (input) SuperMatrix*
-            The M by N sparse matrix A. 
-    GRID    (input) gridinof_t*
-            The 2D process mesh.
-   ===================================================================== 
-*/
-    
-    /* Local variables */
-    NRformat_loc *Astore;
-    int_t    m_loc;
-    double   *Aval;
-    int_t    i, j, irow, jcol;
-    double   value=0., sum;
-    double   *rwork;
-    double   tempvalue;
-    double   *temprwork;
-
-    Astore = (NRformat_loc *) A->Store;
-    m_loc = Astore->m_loc;
-    Aval   = (double *) Astore->nzval;
-    
-    if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) {
-	value = 0.;
-    } else if (lsame_(norm, "M")) {
-	/* Find max(abs(A(i,j))). */
-	value = 0.;
-	for (i = 0; i < m_loc; ++i) {
-	    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j)
-		value = SUPERLU_MAX( value, fabs(Aval[j]) );
-	}
-
-	MPI_Allreduce(&value, &tempvalue, 1, MPI_DOUBLE, MPI_MAX, grid->comm);
-	value = tempvalue;
-
-    } else if (lsame_(norm, "O") || *(unsigned char *)norm == '1') {
-	/* Find norm1(A). */
-	value = 0.;
-#if 0
-	for (j = 0; j < A->ncol; ++j) {
-	    sum = 0.;
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) 
-		sum += fabs(Aval[i]);
-	    value = SUPERLU_MAX(value,sum);
-	}
-#else /* XSL ==> */
-	if ( !(rwork = (double *) doubleCalloc_dist(A->ncol)) )
-	    ABORT("doubleCalloc_dist fails for rwork.");
-	for (i = 0; i < m_loc; ++i) {
-	    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
-	        jcol = Astore->colind[j];
-		rwork[jcol] += fabs(Aval[j]);
-	    }
-	}
-
-	if ( !(temprwork = (double *) doubleCalloc_dist(A->ncol)) )
-	    ABORT("doubleCalloc_dist fails for temprwork.");
-	MPI_Allreduce(rwork, temprwork, A->ncol, MPI_DOUBLE, MPI_SUM, grid->comm);
-	value = 0.;
-	for (j = 0; j < A->ncol; ++j) {
-	    value = SUPERLU_MAX(value, temprwork[j]);
-	}
-	SUPERLU_FREE (temprwork);
-	SUPERLU_FREE (rwork);
-#endif	
-    } else if (lsame_(norm, "I")) {
-	/* Find normI(A). */
-	value = 0.;
-	sum = 0.;
-	for (i = 0; i < m_loc; ++i) {
-	    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j)
-	        sum += fabs(Aval[j]);
-	    value = SUPERLU_MAX(value, sum);
-	}
-	MPI_Allreduce(&value, &tempvalue, 1, MPI_DOUBLE, MPI_MAX, grid->comm);
-	value = tempvalue;
-
-    } else if (lsame_(norm, "F") || lsame_(norm, "E")) {
-	/* Find normF(A). */
-	ABORT("Not implemented.");
-    } else {
-	ABORT("Illegal norm specified.");
-    }
-    
-    return (value);
-
-} /* pdlangs */
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdlaqgs.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdlaqgs.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdlaqgs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdlaqgs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,136 +0,0 @@
-
-
-/*
- * File name:	pdlaqgs.c
- * History:     Modified from LAPACK routine DLAQGE
- */
-#include <math.h>
-#include "superlu_ddefs.h"
-
-void
-pdlaqgs(SuperMatrix *A, double *r, double *c, 
-       double rowcnd, double colcnd, double amax, char *equed)
-{
-/*
-    Purpose   
-    =======   
-
-    PDLAQGS equilibrates a general sparse M by N matrix A using the row
-    and column scaling factors in the vectors R and C.   
-
-    See supermatrix.h for the definition of 'SuperMatrix' structure.
-
-    Arguments   
-    =========   
-
-    A       (input/output) SuperMatrix*
-            On exit, the equilibrated matrix.  See EQUED for the form of 
-            the equilibrated matrix. The type of A can be:
-	    Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
-	    
-    R       (input) double*, dimension (A->nrow)
-            The row scale factors for A.
-	    
-    C       (input) double*, dimension (A->ncol)
-            The column scale factors for A.
-	    
-    ROWCND  (input) double
-            Ratio of the smallest R(i) to the largest R(i).
-	    
-    COLCND  (input) double
-            Ratio of the smallest C(i) to the largest C(i).
-	    
-    AMAX    (input) double
-            Absolute value of largest matrix entry.
-	    
-    EQUED   (output) char*
-            Specifies the form of equilibration that was done.   
-            = 'N':  No equilibration   
-            = 'R':  Row equilibration, i.e., A has been premultiplied by  
-                    diag(R).   
-            = 'C':  Column equilibration, i.e., A has been postmultiplied  
-                    by diag(C).   
-            = 'B':  Both row and column equilibration, i.e., A has been
-                    replaced by diag(R) * A * diag(C).   
-
-    Internal Parameters   
-    ===================   
-
-    THRESH is a threshold value used to decide if row or column scaling   
-    should be done based on the ratio of the row or column scaling   
-    factors.  If ROWCND < THRESH, row scaling is done, and if   
-    COLCND < THRESH, column scaling is done.   
-
-    LARGE and SMALL are threshold values used to decide if row scaling   
-    should be done based on the absolute size of the largest matrix   
-    element.  If AMAX > LARGE or AMAX < SMALL, row scaling is done.   
-
-    ===================================================================== 
-*/
-
-#define THRESH    (0.1)
-    
-    /* Local variables */
-    NRformat_loc *Astore;
-    double *Aval;
-    int_t i, j, irow, jcol, m_loc;
-    double large, small, cj;
-    extern double hypre_F90_NAME_LAPACK(dlamch,DLAMCH)(char *);
-    /* extern double dlamch_(char *); */
-
-    /* Quick return if possible */
-    if (A->nrow <= 0 || A->ncol <= 0) {
-	*(unsigned char *)equed = 'N';
-	return;
-    }
-
-    Astore = A->Store;
-    Aval = Astore->nzval;
-    m_loc = Astore->m_loc;
-    
-    /* Initialize LARGE and SMALL. */
-    /* small = dlamch_("Safe minimum") / dlamch_("Precision"); */
-    small = hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("Safe minimum") / hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("Precision");
-    large = 1. / small;
-
-    if (rowcnd >= THRESH && amax >= small && amax <= large) {
-	if (colcnd >= THRESH)
-	    *(unsigned char *)equed = 'N';
-	else {
-	    /* Column scaling */
-	    irow = Astore->fst_row;
-	    for (i = 0; i < m_loc; ++i) {
-	        for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
-		    jcol = Astore->colind[j];
-		    Aval[j] *= c[jcol];
-	      }
-	      ++irow;
-	    }
-	    *(unsigned char *)equed = 'C';
-	}
-    } else if (colcnd >= THRESH) {
-	/* Row scaling, no column scaling */
-	irow = Astore->fst_row;
-	for (i = 0; i < m_loc; ++i) {
-	    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j)
-	        Aval[j] *= r[irow];
-	    ++irow;
-	}
-	*(unsigned char *)equed = 'R';
-    } else {
-	/* Both row and column scaling */
-	irow = Astore->fst_row;
-	for (i = 0; i < m_loc; ++i) {
-	    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
-	        jcol = Astore->colind[j];
-	        Aval[j] = Aval[j] * r[irow] * c[jcol];
-	    }
-	    ++irow;
-	}
-	*(unsigned char *)equed = 'B';
-    }
-
-    return;
-
-} /* pdlaqgs */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdsymbfact_distdata.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdsymbfact_distdata.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdsymbfact_distdata.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdsymbfact_distdata.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,1943 +0,0 @@
-
-
-/*
- * -- Parallel symbolic factorization auxialiary routine (version 2.2) --
- * -- Distributes the data from parallel symbolic factorization 
- * -- to numeric factorization
- * INRIA France -  July 1, 2004
- * Laura Grigori
- *
- * November 1, 2007
- * Feburary 20, 2008
- */
-
-/* limits.h:  the largest positive integer (INT_MAX) */
-#include <limits.h>
-
-#include "superlu_ddefs.h"
-#include "psymbfact.h"
-
-static float
-dist_symbLU (int_t n, Pslu_freeable_t *Pslu_freeable, 
-	     Glu_persist_t *Glu_persist, 
-	     int_t **p_xlsub, int_t **p_lsub, int_t **p_xusub, int_t **p_usub,
-	     gridinfo_t *grid
-	     )
-/*
- * Purpose
- * =======
- * 
- * Redistribute the symbolic structure of L and U from the distribution
- * used in the parallel symbolic factorization step to the distdibution
- * used in the parallel numeric factorization step.  On exit, the L and U
- * structure for the 2D distribution used in the numeric factorization step is
- * stored in p_xlsub, p_lsub, p_xusub, p_usub.  The global supernodal 
- * information is also computed and it is stored in Glu_persist->supno
- * and Glu_persist->xsup.
- *
- * This routine allocates memory for storing the structure of L and U
- * and the supernodes information.  This represents the arrays:
- * p_xlsub, p_lsub, p_xusub, p_usub,
- * Glu_persist->supno,  Glu_persist->xsup.
- *
- * This routine also deallocates memory allocated during symbolic 
- * factorization routine.  That is, the folloing arrays are freed:
- * Pslu_freeable->xlsub,  Pslu_freeable->lsub, 
- * Pslu_freeable->xusub, Pslu_freeable->usub, 
- * Pslu_freeable->globToLoc, Pslu_freeable->supno_loc, 
- * Pslu_freeable->xsup_beg_loc, Pslu_freeable->xsup_end_loc.
- *
- * Arguments
- * =========
- *
- * n      (Input) int_t
- *        Order of the input matrix
- * Pslu_freeable  (Input) Pslu_freeable_t *
- *        Local L and U structure, 
- *        global to local indexing information.
- * 
- * Glu_persist (Output) Glu_persist_t *
- *        Stores on output the information on supernodes mapping.
- * 
- * p_xlsub (Output) int_t **
- *         Pointer to structure of L distributed on a 2D grid 
- *         of processors, stored by columns.
- * 
- * p_lsub  (Output) int_t **
- *         Structure of L distributed on a 2D grid of processors, 
- *         stored by columns.
- *
- * p_xusub (Output) int_t **
- *         Pointer to structure of U distributed on a 2D grid 
- *         of processors, stored by rows.
- * 
- * p_usub  (Output) int_t **
- *         Structure of U distributed on a 2D grid of processors, 
- *         stored by rows.
- * 
- * grid   (Input) gridinfo_t*
- *        The 2D process mesh.
- *
- * Return value
- * ============
- *   < 0, number of bytes allocated on return from the dist_symbLU.
- *   > 0, number of bytes allocated in this routine when out of memory.
- *        (an approximation).
- */
-{
-  int   iam, nprocs, pc, pr, p, np, p_diag;
-  int_t *nnzToSend, *nnzToRecv, *nnzToSend_l, *nnzToSend_u, 
-    *tmp_ptrToSend, *mem;
-  int_t *nnzToRecv_l, *nnzToRecv_u;
-  int_t *send_1, *send_2, nsend_1, nsend_2;
-  int_t *ptrToSend, *ptrToRecv, sendL, sendU, *snd_luind, *rcv_luind;
-  int_t nsupers, nsupers_i, nsupers_j;
-  int *nvtcs, *intBuf1, *intBuf2, *intBuf3, *intBuf4, intNvtcs_loc;
-  int_t maxszsn, maxNvtcsPProc;
-  int_t *xsup_n, *supno_n, *temp, *xsup_beg_s, *xsup_end_s, *supno_s;
-  int_t *xlsub_s, *lsub_s, *xusub_s, *usub_s;
-  int_t *xlsub_n, *lsub_n, *xusub_n, *usub_n;
-  int_t *xsub_s, *sub_s, *xsub_n, *sub_n;
-  int_t *globToLoc, nvtcs_loc;
-  int_t SendCnt_l, SendCnt_u, nnz_loc_l, nnz_loc_u, nnz_loc,
-    RecvCnt_l, RecvCnt_u, ind_loc;
-  int_t i, k, j, gb, szsn,  gb_n, gb_s, gb_l, fst_s, fst_s_l, lst_s, i_loc;
-  int_t nelts, isize;
-  float memAux;  /* Memory used during this routine and freed on return */
-  float memRet; /* Memory allocated and not freed on return */
-  int_t iword, dword;
-  
-  /* ------------------------------------------------------------
-     INITIALIZATION.
-     ------------------------------------------------------------*/
-  iam = grid->iam;
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Enter dist_symbLU()");
-#endif
-  nprocs = (int) grid->nprow * grid->npcol;
-  xlsub_s = Pslu_freeable->xlsub; lsub_s = Pslu_freeable->lsub;
-  xusub_s = Pslu_freeable->xusub; usub_s = Pslu_freeable->usub;
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  globToLoc     = Pslu_freeable->globToLoc;
-  nvtcs_loc     = Pslu_freeable->nvtcs_loc;
-  xsup_beg_s    = Pslu_freeable->xsup_beg_loc;
-  xsup_end_s    = Pslu_freeable->xsup_end_loc;
-  supno_s       = Pslu_freeable->supno_loc;
-  rcv_luind     = NULL;
-  iword = sizeof(int_t);
-  dword = sizeof(double);
-  memAux = 0.; memRet = 0.;
-  
-  mem           = intCalloc_dist(12 * nprocs);
-  if (!mem)
-    return (ERROR_RET);
-  memAux     = (float) (12 * nprocs * sizeof(int_t));
-  nnzToRecv     = mem;
-  nnzToSend     = nnzToRecv + 2*nprocs;
-  nnzToSend_l   = nnzToSend + 2 * nprocs;
-  nnzToSend_u   = nnzToSend_l + nprocs;
-  send_1        = nnzToSend_u + nprocs;
-  send_2        = send_1 + nprocs;
-  tmp_ptrToSend = send_2 + nprocs;
-  nnzToRecv_l   = tmp_ptrToSend + nprocs;
-  nnzToRecv_u   = nnzToRecv_l + nprocs;
-  
-  ptrToSend = nnzToSend;
-  ptrToRecv = nnzToSend + nprocs;
-
-  nvtcs = (int *) SUPERLU_MALLOC(5 * nprocs * sizeof(int));
-  intBuf1 = nvtcs + nprocs;
-  intBuf2 = nvtcs + 2 * nprocs;
-  intBuf3 = nvtcs + 3 * nprocs;
-  intBuf4 = nvtcs + 4 * nprocs;
-  memAux += 5 * nprocs * sizeof(int);
-
-  maxszsn   = sp_ienv_dist(3);
-  
-  /* Allocate space for storing Glu_persist_n. */
-  if ( !(supno_n = intMalloc_dist(n+1)) ) {
-    fprintf (stderr, "Malloc fails for supno_n[].");
-    return (memAux);
-  }
-  memRet += (float) ((n+1) * sizeof(int_t));
-
-  /* ------------------------------------------------------------
-     DETERMINE SUPERNODES FOR NUMERICAL FACTORIZATION
-     ------------------------------------------------------------*/
-  
-  if (nvtcs_loc > INT_MAX)
-    ABORT("ERROR in dist_symbLU nvtcs_loc > INT_MAX\n");
-  intNvtcs_loc = (int) nvtcs_loc;
-  MPI_Gather (&intNvtcs_loc, 1, MPI_INT, nvtcs, 1, MPI_INT,
-	      0, grid->comm);
-
-  if (!iam) {
-    /* set ptrToRecv to point to the beginning of the data for
-       each processor */
-    for (k = 0, p = 0; p < nprocs; p++) {
-      ptrToRecv[p] = k;
-      k += nvtcs[p];
-    }
-  }
-  
-  if (nprocs > 1) {
-    temp = NULL;
-    if (!iam ) {
-      if ( !(temp = intMalloc_dist (n+1)) ) {
-	fprintf (stderr, "Malloc fails for temp[].");
-	return (memAux + memRet);
-      }
-      memAux += (float) (n+1) * iword;
-    }
-#if defined (_LONGINT)
-    for (p=0; p<nprocs; p++) {
-      if (ptrToRecv[p] > INT_MAX)
-	ABORT("ERROR in dist_symbLU size to send > INT_MAX\n");
-      intBuf1[p] = (int) ptrToRecv[p];
-    }
-#else  /* Default */
-    intBuf1 = ptrToRecv;
-#endif
-    MPI_Gatherv (supno_s, (int) nvtcs_loc, mpi_int_t, 
-		 temp, nvtcs, intBuf1, mpi_int_t, 0, grid->comm);
-  }
-  else
-    temp = supno_s;
-
-  if (!iam) {
-    nsupers = 0;
-    p = (int) OWNER( globToLoc[0] );
-    gb = temp[ptrToRecv[p]];
-    supno_n[0] = nsupers;
-    ptrToRecv[p] ++;
-    szsn = 1;
-    for (j = 1; j < n; j ++) {
-      if (p != (int) OWNER( globToLoc[j] ) || szsn >= maxszsn || gb != temp[ptrToRecv[p]]) {
-	nsupers ++;
-	p  = (int) OWNER( globToLoc[j] );
-	gb = temp[ptrToRecv[p]];
-	szsn = 1;
-      }
-      else {
-	szsn ++;
-      }
-      ptrToRecv[p] ++;
-      supno_n[j] = nsupers;
-    }
-    nsupers++;
-    if (nprocs > 1) {
-      SUPERLU_FREE (temp);
-      memAux -= (float) (n+1) * iword;
-    }
-    supno_n[n] = nsupers;
-  }
-
-  /* reset to 0 nnzToSend */
-  for (p = 0; p < 2 *nprocs; p++)
-    nnzToSend[p] = 0;
-  
-  MPI_Bcast (supno_n, n+1, mpi_int_t, 0, grid->comm);
-  nsupers = supno_n[n];
-  /* Allocate space for storing Glu_persist_n. */
-  if ( !(xsup_n = intMalloc_dist(nsupers+1)) ) {
-    fprintf (stderr, "Malloc fails for xsup_n[].");
-    return (memAux + memRet);
-  }
-  memRet += (float) (nsupers+1) * iword;  
-
-  /* ------------------------------------------------------------
-     COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
-     THEN ALLOCATE SPACE.
-     THIS ACCOUNTS FOR THE FIRST PASS OF L and U.
-     ------------------------------------------------------------*/
-  gb = EMPTY;
-  for (i = 0; i < n; i++) {
-    if (gb != supno_n[i]) {
-      /* a new supernode starts */
-      gb = supno_n[i];
-      xsup_n[gb] = i;
-    }
-  }
-  xsup_n[nsupers] = n;
-  
-  for (p = 0; p < nprocs; p++) {
-    send_1[p] = FALSE;
-    send_2[p] = FALSE;
-  }
-  for (gb_n = 0; gb_n < nsupers; gb_n ++) {
-    i = xsup_n[gb_n];
-    if (iam == (int) OWNER( globToLoc[i] )) {
-      pc = PCOL( gb_n, grid );
-      pr = PROW( gb_n, grid );
-      p_diag = PNUM( pr, pc, grid);
-      
-      i_loc = LOCAL_IND( globToLoc[i] );
-      gb_s  = supno_s[i_loc];
-      fst_s = xsup_beg_s[gb_s];
-      lst_s = xsup_end_s[gb_s];
-      fst_s_l = LOCAL_IND( globToLoc[fst_s] );
-      for (j = xlsub_s[fst_s_l]; j < xlsub_s[fst_s_l+1]; j++) {
-	k = lsub_s[j];
-	if (k >= i) {
-	  gb = supno_n[k];
-	  p = (int) PNUM( PROW(gb, grid), pc, grid );
-	  nnzToSend[2*p] ++;
-	  send_1[p] = TRUE;
-	}
-      }
-      for (j = xusub_s[fst_s_l]; j < xusub_s[fst_s_l+1]; j++) {
-	k = usub_s[j];
-	if (k >= i + xsup_n[gb_n+1] - xsup_n[gb_n]) {
-	  gb = supno_n[k];
-	  p = PNUM( pr, PCOL(gb, grid), grid);
-	  nnzToSend[2*p+1] ++;	
-	  send_2[p] = TRUE;
-	}
-      }
-      
-      nsend_2 = 0;
-      for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) {
-	nnzToSend[2*p+1] += 2;
-	if (send_2[p])  nsend_2 ++;	  
-      }
-      for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) 
-	if (send_2[p] || p == p_diag) {
-	  if (p == p_diag && !send_2[p])
-	    nnzToSend[2*p+1] += nsend_2;
-	  else
-	    nnzToSend[2*p+1] += nsend_2-1;
-	  send_2[p] = FALSE;
-	}
-      nsend_1 = 0;
-      for (p = pc; p < nprocs; p += grid->npcol) {
-	nnzToSend[2*p] += 2;
-	if (send_1[p]) nsend_1 ++;
-      }
-      for (p = pc; p < nprocs; p += grid->npcol) 
-	if (send_1[p]) {
-	  nnzToSend[2*p] += nsend_1-1;
-	  send_1[p] = FALSE;
-	}
-	else
-	  nnzToSend[2*p] += nsend_1;
-    }
-  }
-  
-  /* All-to-all communication */
-  MPI_Alltoall( nnzToSend, 2, mpi_int_t, nnzToRecv, 2, mpi_int_t,
-		grid->comm);
-  
-  nnz_loc_l = nnz_loc_u = 0;
-  SendCnt_l = SendCnt_u = RecvCnt_l = RecvCnt_u = 0;  
-  for (p = 0; p < nprocs; p++) {
-    if ( p != iam ) {
-      SendCnt_l += nnzToSend[2*p];   nnzToSend_l[p] = nnzToSend[2*p];
-      SendCnt_u += nnzToSend[2*p+1]; nnzToSend_u[p] = nnzToSend[2*p+1]; 
-      RecvCnt_l += nnzToRecv[2*p];   nnzToRecv_l[p] = nnzToRecv[2*p];
-      RecvCnt_u += nnzToRecv[2*p+1]; nnzToRecv_u[p] = nnzToRecv[2*p+1];
-    } else {
-      nnz_loc_l += nnzToRecv[2*p];
-      nnz_loc_u += nnzToRecv[2*p+1];
-      nnzToSend_l[p] = 0; nnzToSend_u[p] = 0;
-      nnzToRecv_l[p] = nnzToRecv[2*p]; 
-      nnzToRecv_u[p] = nnzToRecv[2*p+1];
-    }
-  }
-  
-  /* Allocate space for storing the symbolic structure after redistribution. */
-  nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
-  nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
-  if ( !(xlsub_n = intCalloc_dist(nsupers_j+1)) ) {
-    fprintf (stderr, "Malloc fails for xlsub_n[].");
-    return (memAux + memRet);
-  }
-  memRet += (float) (nsupers_j+1) * iword;
-
-  if ( !(xusub_n = intCalloc_dist(nsupers_i+1)) ) {
-    fprintf (stderr, "Malloc fails for xusub_n[].");
-    return (memAux + memRet);
-  }
-  memRet += (float) (nsupers_i+1) * iword;  
-
-  /* Allocate temp storage for sending/receiving the L/U symbolic structure. */
-  if ( (RecvCnt_l + nnz_loc_l) || (RecvCnt_u + nnz_loc_u) ) {
-    if (!(rcv_luind = 
-	  intMalloc_dist(SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u))) ) {
-      fprintf (stderr, "Malloc fails for rcv_luind[].");
-      return (memAux + memRet);
-    }
-    memAux += (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) 
-      * iword;
-  }
-  if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) {
-    if (!(snd_luind = intMalloc_dist(SUPERLU_MAX(SendCnt_l, SendCnt_u))) ) {
-      fprintf (stderr, "Malloc fails for index[].");
-      return (memAux + memRet);
-    }
-    memAux += (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword;
-  } 
-  
-  /* ------------------------------------------------------------------
-     LOAD THE SYMBOLIC STRUCTURE OF L AND U INTO THE STRUCTURES TO SEND.
-     THIS ACCOUNTS FOR THE SECOND PASS OF L and U.
-     ------------------------------------------------------------------*/
-  sendL = TRUE;
-  sendU = FALSE;
-  while (sendL || sendU) {
-    if (sendL) {
-      xsub_s = xlsub_s; sub_s = lsub_s; xsub_n = xlsub_n;
-      nnzToSend = nnzToSend_l; nnzToRecv = nnzToRecv_l;
-    }
-    if (sendU) {
-      xsub_s = xusub_s; sub_s = usub_s; xsub_n = xusub_n;
-      nnzToSend = nnzToSend_u; nnzToRecv = nnzToRecv_u;
-    }
-    for (i = 0, j = 0, p = 0; p < nprocs; p++) {
-      if ( p != iam ) {
-	ptrToSend[p] = i;  i += nnzToSend[p];
-      }
-      ptrToRecv[p] = j;  j += nnzToRecv[p];
-    }
-    nnzToRecv[iam] = 0;
-    
-    ind_loc = ptrToRecv[iam];
-    for (gb_n = 0; gb_n < nsupers; gb_n++) {
-      nsend_2 = 0;    
-      i = xsup_n[gb_n];
-      if (iam == OWNER( globToLoc[i] )) {
-	pc = PCOL( gb_n, grid );
-	pr = PROW( gb_n, grid );
-	p_diag = PNUM( pr, pc, grid );
-	
-	i_loc = LOCAL_IND( globToLoc[i] );
-	gb_s  = supno_s[i_loc];
-	fst_s = xsup_beg_s[gb_s];
-	lst_s = xsup_end_s[gb_s];
-	fst_s_l = LOCAL_IND( globToLoc[fst_s] );
-
-	if (sendL) {
-	  p = pc;                np = grid->nprow;	  
-	} else {
-	  p = pr * grid->npcol;  np = grid->npcol;
-	}
-	for (j = 0; j < np; j++) {
-	  if (p == iam) {
-	    rcv_luind[ind_loc] = gb_n;
-	    rcv_luind[ind_loc+1] = 0;
-	    tmp_ptrToSend[p] = ind_loc + 1;
-	    ind_loc += 2;	 
-	  }
-	  else {
-	    snd_luind[ptrToSend[p]] = gb_n;
-	    snd_luind[ptrToSend[p]+1] = 0;
-	    tmp_ptrToSend[p] = ptrToSend[p] + 1;
-	    ptrToSend[p] += 2;	 
-	  }
-	  if (sendL) p += grid->npcol;
-	  if (sendU) p++;
-	}
-	for (j = xsub_s[fst_s_l]; j < xsub_s[fst_s_l+1]; j++) {
-	  k = sub_s[j];
-	  if ((sendL && k >= i) || (sendU && k >= i + xsup_n[gb_n+1] - xsup_n[gb_n])) {
-	    gb = supno_n[k];
-	    if (sendL)
-	      p = PNUM( PROW(gb, grid), pc, grid );
-	    else 
-	      p = PNUM( pr, PCOL(gb, grid), grid);
-	    if (send_1[p] == FALSE) {
-	      send_1[p] = TRUE;
-	      send_2[nsend_2] = k; nsend_2 ++;
-	    }
-	    if (p == iam) {
-	      rcv_luind[ind_loc] = k;  ind_loc++;
-	      if (sendL)
-		xsub_n[LBj( gb_n, grid )] ++;
-	      else
-		xsub_n[LBi( gb_n, grid )] ++;
-	    }
-	    else {
-	      snd_luind[ptrToSend[p]] = k;
-	      ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++;
-	    }
-	  }
-	}
-	if (sendL)
-	  for (p = pc; p < nprocs; p += grid->npcol) {
-	      for (k = 0; k < nsend_2; k++) {
-		gb = supno_n[send_2[k]];
-		if (PNUM(PROW(gb, grid), pc, grid) != p) {
-		  if (p == iam) {
-		    rcv_luind[ind_loc] = send_2[k];  ind_loc++;
-		    xsub_n[LBj( gb_n, grid )] ++;
-		  }
-		  else {
-		    snd_luind[ptrToSend[p]] = send_2[k];
-		    ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++;
-		  }
-		}
-	      }
-	      send_1[p] = FALSE;
-	  }  
-	if (sendU)
-	  for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) {
-	    if (send_1[p] || p == p_diag) {	      
-	      for (k = 0; k < nsend_2; k++) {
-		gb = supno_n[send_2[k]];
-		if(PNUM( pr, PCOL(gb, grid), grid) != p) {
-		  if (p == iam) {
-		    rcv_luind[ind_loc] = send_2[k];  ind_loc++;
-		    xsub_n[LBi( gb_n, grid )] ++;
-		  }
-		  else {
-		    snd_luind[ptrToSend[p]] = send_2[k];
-		    ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++;
-		  }	     
-		}
-	      } 
-	      send_1[p] = FALSE;
-	    }
-	  }
-      }
-    }
-    
-    /* reset ptrToSnd to point to the beginning of the data for
-       each processor (structure needed in MPI_Alltoallv) */
-    for (i = 0, p = 0; p < nprocs; p++) {
-      ptrToSend[p] = i;  i += nnzToSend[p];
-    }
-
-    /* ------------------------------------------------------------
-       PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION.
-       Note: it uses MPI_Alltoallv.
-       ------------------------------------------------------------*/
-    if (nprocs > 1) {
-#if defined (_LONGINT)
-      nnzToSend[iam] = 0;
-      for (p=0; p<nprocs; p++) {
-	if (nnzToSend[p] > INT_MAX || ptrToSend[p] > INT_MAX ||
-	    nnzToRecv[p] > INT_MAX || ptrToRecv[p] > INT_MAX)
-	  ABORT("ERROR in dist_symbLU size to send > INT_MAX\n");
-	intBuf1[p] = (int) nnzToSend[p];
-	intBuf2[p] = (int) ptrToSend[p];
-	intBuf3[p] = (int) nnzToRecv[p];
-	intBuf4[p] = (int) ptrToRecv[p];
-      }
-#else  /* Default */
-      intBuf1 = nnzToSend;  intBuf2 = ptrToSend;
-      intBuf3 = nnzToRecv;  intBuf4 = ptrToRecv;
-#endif
-
-      MPI_Alltoallv (snd_luind, intBuf1, intBuf2, mpi_int_t, 
-		     rcv_luind, intBuf3, intBuf4, mpi_int_t,
-		     grid->comm);
-    }
-    if (sendL)
-      nnzToRecv[iam] = nnz_loc_l;
-    else 
-      nnzToRecv[iam] = nnz_loc_u;
-    
-    /* ------------------------------------------------------------
-       DEALLOCATE TEMPORARY STORAGE.
-       -------------------------------------------------------------*/
-    if (sendU) 
-      if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) {
-	SUPERLU_FREE (snd_luind);
-	memAux -= (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword;
-      }
-    
-    /* ------------------------------------------------------------
-       CONVERT THE FORMAT.
-       ------------------------------------------------------------*/
-    /* Initialize the array of column of L/ row of U pointers */
-    k = 0;
-    for (p = 0; p < nprocs; p ++) {
-      if (p != iam) {
-	i = k;
-	while (i < k + nnzToRecv[p]) {
-	  gb = rcv_luind[i];
-	  nelts = rcv_luind[i+1];
-	  if (sendL)
-	    xsub_n[LBj( gb, grid )] = nelts;
-	  else
-	    xsub_n[LBi( gb, grid )] = nelts;
-	  i += nelts + 2;
-	}
-      }
-      k += nnzToRecv[p];
-    }
-
-    if (sendL) j = nsupers_j;
-    else j = nsupers_i;
-    k = 0; 
-    isize = xsub_n[0];
-    xsub_n[0] = 0; 
-    for (gb_l = 1; gb_l < j; gb_l++) {
-      k += isize;
-      isize = xsub_n[gb_l];
-      xsub_n[gb_l] = k;
-    }
-    xsub_n[gb_l] = k + isize;
-    nnz_loc = xsub_n[gb_l];
-    if (sendL) {
-      lsub_n = NULL;
-      if (nnz_loc) {
-	if ( !(lsub_n = intMalloc_dist(nnz_loc)) ) {
-	  fprintf (stderr, "Malloc fails for lsub_n[].");
-	  return (memAux + memRet);
-	}
-	memRet += (float) (nnz_loc * iword);
-      }
-      sub_n = lsub_n;
-    }
-    if (sendU) {
-      usub_n = NULL;
-      if (nnz_loc) {
-	if ( !(usub_n = intMalloc_dist(nnz_loc)) ) {
-	  fprintf (stderr, "Malloc fails for usub_n[].");
-	  return (memAux + memRet);
-	}
-	memRet += (float) (nnz_loc * iword);
-      }
-      sub_n = usub_n;
-    }
-    
-    /* Copy the data into the L column / U row oriented storage */
-    k = 0;
-    for (p = 0; p < nprocs; p++) {
-      i = k;
-      while (i < k + nnzToRecv[p]) {
-	gb = rcv_luind[i];
-	if (gb >= nsupers)
-	  printf ("Pe[%d] p %d gb %d nsupers %d i %d i-k %d\n",
-		  iam, p, gb, nsupers, i, i-k);
-	i += 2;
-	if (sendL) gb_l = LBj( gb, grid );
-	if (sendU) gb_l = LBi( gb, grid );
-	for (j = xsub_n[gb_l]; j < xsub_n[gb_l+1]; i++, j++) {
-	  sub_n[j] = rcv_luind[i];
-	}
-      }      
-      k += nnzToRecv[p];
-    }
-    if (sendL) {
-      sendL = FALSE;  sendU = TRUE;
-    }
-    else
-      sendU = FALSE;
-  }
-
-  /* deallocate memory allocated during symbolic factorization routine */
-  if (rcv_luind != NULL) {
-    SUPERLU_FREE (rcv_luind);
-    memAux -= (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) * iword;
-  }
-  SUPERLU_FREE (mem);  
-  memAux -= (float) (12 * nprocs * iword);
-  SUPERLU_FREE(nvtcs);
-  memAux -= (float) (5 * nprocs * sizeof(int));
-  
-  if (xlsub_s != NULL) {
-    SUPERLU_FREE (xlsub_s); SUPERLU_FREE (lsub_s);
-  }
-  if (xusub_s != NULL) {
-    SUPERLU_FREE (xusub_s); SUPERLU_FREE (usub_s);
-  }
-  SUPERLU_FREE (globToLoc); 
-  if (supno_s != NULL) {
-    SUPERLU_FREE (xsup_beg_s); SUPERLU_FREE (xsup_end_s);
-    SUPERLU_FREE (supno_s);
-  }
-  
-  Glu_persist->supno = supno_n;  Glu_persist->xsup  = xsup_n;
-  *p_xlsub = xlsub_n; *p_lsub = lsub_n;
-  *p_xusub = xusub_n; *p_usub = usub_n;
-
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Exit dist_symbLU()");
-#endif
-  
-  return (-memRet);
-}
-  
-static float
-ddist_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct,
-	Glu_persist_t *Glu_persist, gridinfo_t *grid, 
-	int_t **p_ainf_colptr, int_t **p_ainf_rowind, double **p_ainf_val,
-	int_t **p_asup_rowptr, int_t **p_asup_colind, double **p_asup_val,
-	int_t *ilsum_i, int_t *ilsum_j
-	)
-{
-/*
- *
- * Purpose
- * =======
- *   Re-distribute A on the 2D process mesh.  The lower part is
- *   stored using a column format and the upper part
- *   is stored using a row format.
- * 
- * Arguments
- * =========
- * 
- * A      (Input) SuperMatrix*
- *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
- *        The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
- *
- * ScalePermstruct (Input) ScalePermstruct_t*
- *        The data structure to store the scaling and permutation vectors
- *        describing the transformations performed to the original matrix A.
- *
- * Glu_persist  (Input) Glu_persist_t *
- *        Information on supernodes mapping.
- * 
- * grid   (Input) gridinfo_t*
- *        The 2D process mesh.
- *
- * p_ainf_colptr (Output) int_t**
- *         Pointer to the lower part of A distributed on a 2D grid 
- *         of processors, stored by columns.
- *
- * p_ainf_rowind (Output) int_t**
- *         Structure of of the lower part of A distributed on a 
- *         2D grid of processors, stored by columns.
- *
- * p_ainf_val    (Output) double**
- *         Numerical values of the lower part of A, distributed on a 
- *         2D grid of processors, stored by columns.
- *
- * p_asup_rowptr (Output) int_t**
- *         Pointer to the upper part of A distributed on a 2D grid 
- *         of processors, stored by rows.
- *
- * p_asup_colind (Output) int_t**
- *         Structure of of the upper part of A distributed on a 
- *         2D grid of processors, stored by rows.
- *
- * p_asup_val    (Output) double**
- *         Numerical values of the upper part of A, distributed on a 
- *         2D grid of processors, stored by rows.
- *
- * ilsum_i  (Input) int_t *
- *       Starting position of each supernode in 
- *       the full array (local, block row wise).
- *
- * ilsum_j  (Input) int_t *
- *       Starting position of each supernode in 
- *       the full array (local, block column wise).
- *
- * Return value
- * ============
- *   < 0, number of bytes allocated on return from the dist_symbLU
- *   > 0, number of bytes allocated when out of memory.
- *        (an approximation).
- *
- */
-  int    iam, p, procs;
-  NRformat_loc *Astore;
-  int_t  *perm_r; /* row permutation vector */
-  int_t  *perm_c; /* column permutation vector */
-  int_t  i, it, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize, isize;
-  int_t  nsupers, nsupers_i, nsupers_j;
-  int_t  nnz_loc, nnz_loc_ainf, nnz_loc_asup;    /* number of local nonzeros */
-  int_t  nnz_remote; /* number of remote nonzeros to be sent */
-  int_t  SendCnt; /* number of remote nonzeros to be sent */
-  int_t  RecvCnt; /* number of remote nonzeros to be sent */
-  int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind;
-  double *asup_val, *ainf_val;
-  int_t  *nnzToSend, *nnzToRecv, maxnnzToRecv;
-  int_t  *ia, *ja, **ia_send, *index, *itemp;
-  int_t  *ptr_to_send;
-  double *aij, **aij_send, *nzval, *dtemp;
-  double *nzval_a;
-  MPI_Request *send_req;
-  MPI_Status  status;
-  int_t *xsup = Glu_persist->xsup;    /* supernode and column mapping */
-  int_t *supno = Glu_persist->supno;   
-  float memAux;  /* Memory used during this routine and freed on return */
-  float memRet; /* Memory allocated and not freed on return */
-  int_t iword, dword, szbuf;
-
-  /* ------------------------------------------------------------
-     INITIALIZATION.
-     ------------------------------------------------------------*/
-  iam = grid->iam;
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Enter ddist_A()");
-#endif
-  iword = sizeof(int_t);
-  dword = sizeof(double);
-  
-  perm_r = ScalePermstruct->perm_r;
-  perm_c = ScalePermstruct->perm_c;
-  procs = grid->nprow * grid->npcol;
-  Astore = (NRformat_loc *) A->Store;
-  n = A->ncol;
-  m_loc = Astore->m_loc;
-  fst_row = Astore->fst_row;
-  if (!(nnzToRecv = intCalloc_dist(2*procs))) {
-    fprintf (stderr, "Malloc fails for nnzToRecv[].");
-    return (ERROR_RET);
-  }
-  memAux = (float) (2 * procs * iword);
-  memRet = 0.;
-  nnzToSend = nnzToRecv + procs;
-  nsupers  = supno[n-1] + 1;  
-
-  /* ------------------------------------------------------------
-     COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
-     THEN ALLOCATE SPACE.
-     THIS ACCOUNTS FOR THE FIRST PASS OF A.
-     ------------------------------------------------------------*/
-  for (i = 0; i < m_loc; ++i) {
-    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
-      irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
-      jcol = Astore->colind[j];
-      gbi = BlockNum( irow );
-      gbj = BlockNum( jcol );
-      p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
-      ++nnzToSend[p]; 
-    }
-  }
-  
-  /* All-to-all communication */
-  MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t,
-		grid->comm);
-  
-  maxnnzToRecv = 0;
-  nnz_loc = SendCnt = RecvCnt = 0;
-  
-  for (p = 0; p < procs; ++p) {
-    if ( p != iam ) {
-      SendCnt += nnzToSend[p];
-      RecvCnt += nnzToRecv[p];
-      maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv );
-    } else {
-      nnz_loc += nnzToRecv[p];
-      /*assert(nnzToSend[p] == nnzToRecv[p]);*/
-    }
-  }
-  k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */
-  szbuf = k;
-
-  /* Allocate space for storing the triplets after redistribution. */
-  if ( !(ia = intMalloc_dist(2*k)) ) {
-    fprintf (stderr, "Malloc fails for ia[].");
-    return (memAux);
-  }
-  memAux += (float) (2*k*iword);
-  ja = ia + k;
-  if ( !(aij = doubleMalloc_dist(k)) ) {
-    fprintf (stderr, "Malloc fails for aij[].");
-    return (memAux);
-  }
-  memAux += (float) (k*dword);
-  
-  /* Allocate temporary storage for sending/receiving the A triplets. */
-  if ( procs > 1 ) {
-    if ( !(send_req = (MPI_Request *)
-	   SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) {
-      fprintf (stderr, "Malloc fails for send_req[].");
-      return (memAux);
-    }
-    memAux += (float) (2*procs *sizeof(MPI_Request));
-    if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) {
-      fprintf(stderr, "Malloc fails for ia_send[].");
-      return (memAux);
-    }
-    memAux += (float) (procs*sizeof(int_t*));
-    if ( !(aij_send = (double **)SUPERLU_MALLOC(procs*sizeof(double*))) ) {
-      fprintf(stderr, "Malloc fails for aij_send[].");
-      return (memAux);
-    }
-    memAux += (float) (procs*sizeof(double*));    
-    if ( !(index = intMalloc_dist(2*SendCnt)) ) {
-      fprintf(stderr, "Malloc fails for index[].");
-      return (memAux);
-    }
-    memAux += (float) (2*SendCnt*iword);
-    if ( !(nzval = doubleMalloc_dist(SendCnt)) ) {
-      fprintf(stderr, "Malloc fails for nzval[].");
-      return (memAux);
-    }
-    memAux += (float) (SendCnt * dword);
-    if ( !(ptr_to_send = intCalloc_dist(procs)) ) {
-      fprintf(stderr, "Malloc fails for ptr_to_send[].");
-      return (memAux);
-    }
-    memAux += (float) (procs * iword);
-    if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) {
-      fprintf(stderr, "Malloc fails for itemp[].");
-      return (memAux);
-    }
-    memAux += (float) (2*maxnnzToRecv*iword);
-    if ( !(dtemp = doubleMalloc_dist(maxnnzToRecv)) ) {
-      fprintf(stderr, "Malloc fails for dtemp[].");
-      return (memAux);
-    }
-    memAux += (float) (maxnnzToRecv * dword);
-    
-    for (i = 0, j = 0, p = 0; p < procs; ++p) {
-      if ( p != iam ) {
-	ia_send[p] = &index[i];
-	i += 2 * nnzToSend[p]; /* ia/ja indices alternate */
-	aij_send[p] = &nzval[j];
-	j += nnzToSend[p];
-      }
-    }
-  } /* if procs > 1 */
-  
-  nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
-  nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
-  if ( !(ainf_colptr = intCalloc_dist(ilsum_j[nsupers_j] + 1)) ) {
-    fprintf (stderr, "Malloc fails for *ainf_colptr[].");
-    return (memAux);
-  }
-  memRet += (float) (ilsum_j[nsupers_j] + 1) * iword;
-  if ( !(asup_rowptr = intCalloc_dist(ilsum_i[nsupers_i] + 1)) ) {
-    fprintf (stderr, "Malloc fails for *asup_rowptr[].");
-    return (memAux+memRet);
-  }
-  memRet += (float) (ilsum_i[nsupers_i] + 1) * iword;
-  
-  /* ------------------------------------------------------------
-     LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND.
-     THIS ACCOUNTS FOR THE SECOND PASS OF A.
-     ------------------------------------------------------------*/
-  nnz_loc = 0; /* Reset the local nonzero count. */
-  nnz_loc_ainf = nnz_loc_asup = 0;
-  nzval_a = Astore->nzval;
-  for (i = 0; i < m_loc; ++i) {
-    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
-      irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
-      jcol = Astore->colind[j];
-      gbi = BlockNum( irow );
-      gbj = BlockNum( jcol );
-      p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
-      
-      if ( p != iam ) { /* remote */
-	k = ptr_to_send[p];
-	ia_send[p][k] = irow;
-	ia_send[p][k + nnzToSend[p]] = jcol;
-	aij_send[p][k] = nzval_a[j];
-	++ptr_to_send[p]; 
-      } else {          /* local */
-	ia[nnz_loc] = irow;
-	ja[nnz_loc] = jcol;
-	aij[nnz_loc] = nzval_a[j];
-	++nnz_loc;
-	/* Count nonzeros in each column of L / row of U */
-	if (gbi >= gbj) {
-	  ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++;
-	  nnz_loc_ainf ++;
-	}
-	else {
-	  asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++;
-	  nnz_loc_asup ++;
-	}
-      }
-    }
-  }
-
-  /* ------------------------------------------------------------
-     PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION.
-     NOTE: Can possibly use MPI_Alltoallv.
-     ------------------------------------------------------------*/
-  for (p = 0; p < procs; ++p) {
-    if ( p != iam ) {
-      it = 2*nnzToSend[p];
-      MPI_Isend( ia_send[p], it, mpi_int_t,
-		 p, iam, grid->comm, &send_req[p] );
-      it = nnzToSend[p];
-      MPI_Isend( aij_send[p], it, MPI_DOUBLE,
-		 p, iam+procs, grid->comm, &send_req[procs+p] ); 
-    }
-  }
-  
-  for (p = 0; p < procs; ++p) {
-    if ( p != iam ) {
-      it = 2*nnzToRecv[p];
-      MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); 
-      it = nnzToRecv[p];
-      MPI_Recv( dtemp, it, MPI_DOUBLE, p, p+procs,
-		grid->comm, &status );
-      for (i = 0; i < nnzToRecv[p]; ++i) {
-	ia[nnz_loc] = itemp[i];
-	irow = itemp[i];
-	jcol = itemp[i + nnzToRecv[p]];
-	/* assert(jcol<n); */
-	ja[nnz_loc] = jcol;
-	aij[nnz_loc] = dtemp[i];
-	++nnz_loc;
-	
-	gbi = BlockNum( irow );
-	gbj = BlockNum( jcol );
-	/* Count nonzeros in each column of L / row of U */
-	if (gbi >= gbj) {
-	  ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++;
-	  nnz_loc_ainf ++;
-	}
-	else {
-	  asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++;
-	  nnz_loc_asup ++;
-	}
-      }
-    }
-  }
-  
-  for (p = 0; p < procs; ++p) {
-    if ( p != iam ) {
-      MPI_Wait( &send_req[p], &status);
-      MPI_Wait( &send_req[procs+p], &status);
-    }
-  }
-  
-  /* ------------------------------------------------------------
-     DEALLOCATE TEMPORARY STORAGE
-     ------------------------------------------------------------*/
-  
-  SUPERLU_FREE(nnzToRecv);
-  memAux -= 2 * procs * iword;
-  if ( procs > 1 ) {
-    SUPERLU_FREE(send_req);
-    SUPERLU_FREE(ia_send);
-    SUPERLU_FREE(aij_send);
-    SUPERLU_FREE(index);
-    SUPERLU_FREE(nzval);
-    SUPERLU_FREE(ptr_to_send);
-    SUPERLU_FREE(itemp);
-    SUPERLU_FREE(dtemp);
-    memAux -= 2*procs *sizeof(MPI_Request) + procs*sizeof(int_t*) +
-      procs*sizeof(double*) + 2*SendCnt * iword +
-      SendCnt* dword + procs*iword +
-      2*maxnnzToRecv*iword + maxnnzToRecv*dword;
-  }
-  
-  /* ------------------------------------------------------------
-     CONVERT THE TRIPLET FORMAT.
-     ------------------------------------------------------------*/
-  if (nnz_loc_ainf != 0) {
-    if ( !(ainf_rowind = intMalloc_dist(nnz_loc_ainf)) ) {
-      fprintf (stderr, "Malloc fails for *ainf_rowind[].");
-      return (memAux+memRet);
-    }
-    memRet += (float) (nnz_loc_ainf * iword);
-    if ( !(ainf_val = doubleMalloc_dist(nnz_loc_ainf)) ) {
-      fprintf (stderr, "Malloc fails for *ainf_val[].");
-      return (memAux+memRet);
-    }
-    memRet += (float) (nnz_loc_ainf * dword);
-  }
-  else {
-    ainf_rowind = NULL;
-    ainf_val = NULL;
-  }
-  if (nnz_loc_asup != 0) {
-    if ( !(asup_colind = intMalloc_dist(nnz_loc_asup)) ) {
-      fprintf (stderr, "Malloc fails for *asup_colind[].");
-      return (memAux + memRet);
-    }
-    memRet += (float) (nnz_loc_asup * iword);
-    if ( !(asup_val = doubleMalloc_dist(nnz_loc_asup)) ) {
-      fprintf (stderr, "Malloc fails for *asup_val[].");
-      return (memAux  + memRet);
-    }
-    memRet += (float) (nnz_loc_asup * dword);
-  }
-  else {
-    asup_colind = NULL;
-    asup_val = NULL;
-  }
-
-  /* Initialize the array of column pointers */
-  k = 0; 
-  jsize = ainf_colptr[0];  ainf_colptr[0] = 0; 
-  for (j = 1; j < ilsum_j[nsupers_j]; j++) {
-    k += jsize;              
-    jsize = ainf_colptr[j];  
-    ainf_colptr[j] = k;
-  }
-  ainf_colptr[ilsum_j[nsupers_j]] = k + jsize;
-  i = 0;
-  isize = asup_rowptr[0];  asup_rowptr[0] = 0;
-  for (j = 1; j < ilsum_i[nsupers_i]; j++) {
-    i += isize;
-    isize = asup_rowptr[j];  
-    asup_rowptr[j] = i;
-  }
-  asup_rowptr[ilsum_i[nsupers_i]] = i + isize;
-
-  /* Copy the triplets into the column oriented storage */
-  for (i = 0; i < nnz_loc; ++i) {
-    jcol = ja[i];
-    irow = ia[i];
-    gbi = BlockNum( irow );
-    gbj = BlockNum( jcol );
-    /* Count nonzeros in each column of L / row of U */
-    if (gbi >= gbj) {
-      j = ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj );
-      k = ainf_colptr[j];
-      ainf_rowind[k] = irow;
-      ainf_val[k] = aij[i];
-      ainf_colptr[j] ++;
-    }
-    else {
-      j = ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi );
-      k = asup_rowptr[j];
-      asup_colind[k] = jcol;
-      asup_val[k] = aij[i];
-      asup_rowptr[j] ++;
-    }
-  }
-
-  /* Reset the column pointers to the beginning of each column */
-  for (j = ilsum_j[nsupers_j]; j > 0; j--) 
-    ainf_colptr[j] = ainf_colptr[j-1];
-  for (j = ilsum_i[nsupers_i]; j > 0; j--) 
-    asup_rowptr[j] = asup_rowptr[j-1];
-  ainf_colptr[0] = 0;
-  asup_rowptr[0] = 0;
-  
-  SUPERLU_FREE(ia);
-  SUPERLU_FREE(aij);
-  memAux -= 2*szbuf*iword + szbuf*dword;
-  
-  *p_ainf_colptr = ainf_colptr;
-  *p_ainf_rowind = ainf_rowind; 
-  *p_ainf_val    = ainf_val;
-  *p_asup_rowptr = asup_rowptr;
-  *p_asup_colind = asup_colind;
-  *p_asup_val    = asup_val;
-
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Exit ddist_A()");
-  fprintf (stdout, "Size of allocated memory (MB) %.3f\n", memRet*1e-6);
-#endif
-
-  return (-memRet);
-} /* dist_A */
-
-int_t
-ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A,
-		ScalePermstruct_t *ScalePermstruct,
-		Pslu_freeable_t *Pslu_freeable, 
-		LUstruct_t *LUstruct, gridinfo_t *grid)
-/*
- *
- *
- * Purpose
- * =======
- *   Distribute the input matrix onto the 2D process mesh.
- * 
- * Arguments
- * =========
- * 
- * fact (input) fact_t
- *        Specifies whether or not the L and U structures will be re-used.
- *        = SamePattern_SameRowPerm: L and U structures are input, and
- *                                   unchanged on exit.
- *          This routine should not be called for this case, an error
- *          is generated.  Instead, pddistribute routine should be called.
- *        = DOFACT or SamePattern: L and U structures are computed and output.
- *
- * n      (Input) int
- *        Dimension of the matrix.
- *
- * A      (Input) SuperMatrix*
- *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
- *        A may be overwritten by diag(R)*A*diag(C)*Pc^T.
- *        The type of A can be: Stype = NR; Dtype = SLU_D; Mtype = GE.
- *
- * ScalePermstruct (Input) ScalePermstruct_t*
- *        The data structure to store the scaling and permutation vectors
- *        describing the transformations performed to the original matrix A.
- *
- * Glu_freeable (Input) *Glu_freeable_t
- *        The global structure describing the graph of L and U.
- * 
- * LUstruct (Input) LUstruct_t*
- *        Data structures for L and U factors.
- *
- * grid   (Input) gridinfo_t*
- *        The 2D process mesh.
- *
- * Return value
- * ============
- *   < 0, number of bytes allocated on return from the dist_symbLU
- *   > 0, number of bytes allocated for performing the distribution
- *       of the data, when out of memory.
- *        (an approximation).
- *
- */
-{
-  Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-  Glu_freeable_t Glu_freeable_n;
-  LocalLU_t *Llu = LUstruct->Llu;
-  int_t bnnz, fsupc, i, irow, istart, j, jb, jj, k, 
-    len, len1, nsupc, nsupc_gb, ii, nprocs;
-  int_t ljb;  /* local block column number */
-  int_t nrbl; /* number of L blocks in current block column */
-  int_t nrbu; /* number of U blocks in current block column */
-  int_t gb;   /* global block number; 0 < gb <= nsuper */
-  int_t lb;   /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
-  int iam, jbrow, jbcol, jcol, kcol, mycol, myrow, pc, pr, ljb_i, ljb_j, p;
-  int_t mybufmax[NBUFFERS];
-  NRformat_loc *Astore;
-  double *a;
-  int_t *asub, *xa;
-  int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind;
-  double *asup_val, *ainf_val;
-  int_t *xsup, *supno;    /* supernode and column mapping */
-  int_t *lsub, *xlsub, *usub, *xusub;
-  int_t nsupers, nsupers_i, nsupers_j, nsupers_ij;
-  int_t next_ind;      /* next available position in index[*] */
-  int_t next_val;      /* next available position in nzval[*] */
-  int_t *index;         /* indices consist of headers and row subscripts */
-  double *lusup, *uval; /* nonzero values in L and U */
-  int_t *recvBuf;
-  int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend;
-  double **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
-  int_t  **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
-  double **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
-  int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
-  
-  /*-- Counts to be used in factorization. --*/
-  int_t  *ToRecv, *ToSendD, **ToSendR;
-  
-  /*-- Counts to be used in lower triangular solve. --*/
-  int_t  *fmod;          /* Modification count for L-solve.        */
-  int_t  **fsendx_plist; /* Column process list to send down Xk.   */
-  int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
-  int_t  nfsendx = 0;    /* Number of Xk I will send               */
-  int_t  kseen;
-  
-  /*-- Counts to be used in upper triangular solve. --*/
-  int_t  *bmod;          /* Modification count for U-solve.        */
-  int_t  **bsendx_plist; /* Column process list to send down Xk.   */
-  int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
-  int_t  nbsendx = 0;    /* Number of Xk I will send               */  
-  int_t  *ilsum;         /* starting position of each supernode in 
-			    the full array (local)                 */  
-  int_t  *ilsum_j, ldaspa_j; /* starting position of each supernode in 
-				the full array (local, block column wise) */  
-  /*-- Auxiliary arrays; freed on return --*/
-  int_t *Urb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
-  int_t *LUb_length; /* L,U block length; size nsupers_ij */
-  int_t *LUb_indptr; /* pointers to L,U index[]; size nsupers_ij */
-  int_t *LUb_number; /* global block number; size nsupers_ij */
-  int_t *LUb_valptr; /* pointers to U nzval[]; size ceil(NSUPERS/Pc)      */
-  int_t *Lrb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
-  double *dense, *dense_col; /* SPA */
-  double zero = 0.0;
-  int_t ldaspa;     /* LDA of SPA */
-  int_t iword, dword;
-  float memStrLU, memA,
-    memDist = 0.; /* memory used for redistributing the data, which does
-		   not include the memory for the numerical values of L and U */
-  float  memNLU = 0.; /* memory allocated for storing the numerical values of 
-		    L and U, that will be used in the numeric factorization */
-
-#if ( PRNTlevel>=1 )
-  int_t nLblocks = 0, nUblocks = 0;
-#endif
-  
-  /* Initialization. */
-  iam = grid->iam;
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Enter dist_psymbtonum()");
-#endif
-  myrow = MYROW( iam, grid );
-  mycol = MYCOL( iam, grid );
-  nprocs = grid->npcol * grid->nprow;
-  for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0;
-  Astore   = (NRformat_loc *) A->Store;
-  
-  iword = sizeof(int_t);
-  dword = sizeof(double);
-
-  if (fact == SamePattern_SameRowPerm) {
-    ABORT ("ERROR: call of dist_psymbtonum with fact equals SamePattern_SameRowPerm.");  
-  }
-
-  if ((memStrLU = 
-       dist_symbLU (n, Pslu_freeable, 
-		    Glu_persist, &xlsub, &lsub, &xusub, &usub,	grid)) > 0)
-    return (memStrLU);
-  memDist += (-memStrLU);
-  xsup  = Glu_persist->xsup;    /* supernode and column mapping */
-  supno = Glu_persist->supno;   
-  nsupers  = supno[n-1] + 1;
-  nsupers_i = CEILING( nsupers, grid->nprow );/* No of local row blocks */
-  nsupers_j = CEILING( nsupers, grid->npcol );/* No of local column blocks */
-  nsupers_ij = SUPERLU_MAX(nsupers_i, nsupers_j);
-  if ( !(ilsum = intMalloc_dist(nsupers_i+1)) ) {
-    fprintf (stderr, "Malloc fails for ilsum[].");  
-    return (memDist + memNLU);
-  }
-  memNLU += (nsupers_i+1) * iword;
-  if ( !(ilsum_j = intMalloc_dist(nsupers_j+1)) ) {
-    fprintf (stderr, "Malloc fails for ilsum_j[].");
-    return (memDist + memNLU);
-  }
-  memDist += (nsupers_j+1) * iword;
-
-  /* Compute ldaspa and ilsum[], ldaspa_j and ilsum_j[]. */
-  ilsum[0] = 0;
-  ldaspa = 0;
-  for (gb = 0; gb < nsupers; gb++) 
-    if ( myrow == PROW( gb, grid ) ) {
-      i = SuperSize( gb );
-      ldaspa += i;
-      lb = LBi( gb, grid );
-      ilsum[lb + 1] = ilsum[lb] + i;
-    }
-  ilsum[nsupers_i] = ldaspa;
-
-  ldaspa_j = 0; ilsum_j[0] = 0;  
-  for (gb = 0; gb < nsupers; gb++) 
-    if (mycol == PCOL( gb, grid )) {
-      i = SuperSize( gb );
-      ldaspa_j += i;
-      lb = LBj( gb, grid );
-      ilsum_j[lb + 1] = ilsum_j[lb] + i;
-    }
-  ilsum_j[nsupers_j] = ldaspa_j;
-  
-  if ((memA = ddist_A(A, ScalePermstruct, Glu_persist,
-		      grid, &ainf_colptr, &ainf_rowind, &ainf_val,
-		      &asup_rowptr, &asup_colind, &asup_val,
-		      ilsum, ilsum_j)) > 0)
-    return (memDist + memA + memNLU);
-  memDist += (-memA);
-
-  /* ------------------------------------------------------------
-     FIRST TIME CREATING THE L AND U DATA STRUCTURES.
-     ------------------------------------------------------------*/
-  
-  /* We first need to set up the L and U data structures and then
-   * propagate the values of A into them.
-   */
-  if ( !(ToRecv = intCalloc_dist(nsupers)) ) {
-    fprintf(stderr, "Calloc fails for ToRecv[].");
-    return (memDist + memNLU);
-  }
-  memNLU += nsupers * iword;
-  
-  k = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */
-  if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) {
-    fprintf(stderr, "Malloc fails for ToSendR[].");
-    return (memDist + memNLU);
-  }
-  memNLU += k*sizeof(int_t*);
-  j = k * grid->npcol;
-  if ( !(index = intMalloc_dist(j)) ) {
-    fprintf(stderr, "Malloc fails for index[].");
-    return (memDist + memNLU);
-  }
-  memNLU += j*iword;
-  
-  for (i = 0; i < j; ++i) index[i] = EMPTY;
-  for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j];
-  
-  /* Auxiliary arrays used to set up L and U block data structures.
-     They are freed on return. */
-  if ( !(LUb_length = intCalloc_dist(nsupers_ij)) ) {
-    fprintf(stderr, "Calloc fails for LUb_length[].");
-    return (memDist + memNLU);
-  }
-  if ( !(LUb_indptr = intMalloc_dist(nsupers_ij)) ) {
-    fprintf(stderr, "Malloc fails for LUb_indptr[].");
-    return (memDist + memNLU);
-  }
-  if ( !(LUb_number = intCalloc_dist(nsupers_ij)) ) {
-    fprintf(stderr, "Calloc fails for LUb_number[].");
-    return (memDist + memNLU);
-  }    
-  if ( !(LUb_valptr = intCalloc_dist(nsupers_ij)) ) {
-    fprintf(stderr, "Calloc fails for LUb_valptr[].");
-    return (memDist + memNLU);
-  }
-  memDist += 4 * nsupers_ij * iword;
-  
-  k = CEILING( nsupers, grid->nprow ); 
-  /* Pointers to the beginning of each block row of U. */
-  if ( !(Unzval_br_ptr = 
-	 (double**)SUPERLU_MALLOC(nsupers_i * sizeof(double*))) ) {
-    fprintf(stderr, "Malloc fails for Unzval_br_ptr[].");
-    return (memDist + memNLU);
-  }
-  if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(nsupers_i * sizeof(int_t*))) ) {
-    fprintf(stderr, "Malloc fails for Ufstnz_br_ptr[].");
-    return (memDist + memNLU);
-  }
-  memNLU += nsupers_i*sizeof(double*) + nsupers_i*sizeof(int_t*);
-  Unzval_br_ptr[nsupers_i-1] = NULL;
-  Ufstnz_br_ptr[nsupers_i-1] = NULL;
-
-  if ( !(ToSendD = intCalloc_dist(nsupers_i)) ) {
-    fprintf(stderr, "Malloc fails for ToSendD[].");
-    return (memDist + memNLU);
-  }
-  memNLU += nsupers_i*iword;  
-  if ( !(Urb_marker = intCalloc_dist(nsupers_j))) {
-    fprintf(stderr, "Calloc fails for rb_marker[].");
-    return (memDist + memNLU);
-  }
-  if ( !(Lrb_marker = intCalloc_dist( nsupers_i ))) {
-    fprintf(stderr, "Calloc fails for rb_marker[].");
-    return (memDist + memNLU);
-  }
-  memDist += (nsupers_i + nsupers_j)*iword;
-  
-  /* Auxiliary arrays used to set up L, U block data structures.
-     They are freed on return.
-     k is the number of local row blocks.   */
-  if ( !(dense = doubleCalloc_dist(SUPERLU_MAX(ldaspa, ldaspa_j) 
-				   * sp_ienv_dist(3))) ) {
-    fprintf(stderr, "Calloc fails for SPA dense[].");
-    return (memDist + memNLU);
-  }
-  /* These counts will be used for triangular solves. */
-  if ( !(fmod = intCalloc_dist(nsupers_i)) ) {
-    fprintf(stderr, "Calloc fails for fmod[].");
-    return (memDist + memNLU);
-  }
-  if ( !(bmod = intCalloc_dist(nsupers_i)) ) {
-    fprintf(stderr, "Calloc fails for bmod[].");
-    return (memDist + memNLU);
-  }
-  /* ------------------------------------------------ */
-  memNLU += 2*nsupers_i*iword + 
-    SUPERLU_MAX(ldaspa, ldaspa_j)*sp_ienv_dist(3)*dword; 
-  
-  /* Pointers to the beginning of each block column of L. */
-  if ( !(Lnzval_bc_ptr = 
-	 (double**)SUPERLU_MALLOC(nsupers_j * sizeof(double*))) ) {
-    fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[].");
-    return (memDist + memNLU);
-  }
-  if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ) {
-    fprintf(stderr, "Malloc fails for Lrowind_bc_ptr[].");
-    return (memDist + memNLU);
-  }
-  memNLU += nsupers_j * sizeof(double*) + nsupers_j * sizeof(int_t*);
-  Lnzval_bc_ptr[nsupers_j-1] = NULL;
-  Lrowind_bc_ptr[nsupers_j-1] = NULL;
-  
-  /* These lists of processes will be used for triangular solves. */
-  if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) {
-    fprintf(stderr, "Malloc fails for fsendx_plist[].");
-    return (memDist + memNLU);
-  }
-  len = nsupers_j * grid->nprow;
-  if ( !(index = intMalloc_dist(len)) ) {
-    fprintf(stderr, "Malloc fails for fsendx_plist[0]");
-    return (memDist + memNLU);
-  }
-  for (i = 0; i < len; ++i) index[i] = EMPTY;
-  for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow)
-    fsendx_plist[i] = &index[j];
-  if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) {
-    fprintf(stderr, "Malloc fails for bsendx_plist[].");
-    return (memDist + memNLU);
-  }
-  if ( !(index = intMalloc_dist(len)) ) {
-    fprintf(stderr, "Malloc fails for bsendx_plist[0]");
-    return (memDist + memNLU);
-  }
-  for (i = 0; i < len; ++i) index[i] = EMPTY;
-  for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow)
-    bsendx_plist[i] = &index[j];
-  /* -------------------------------------------------------------- */
-  memNLU += 2*nsupers_j*sizeof(int_t*) + 2*len*iword;
-  
-  /*------------------------------------------------------------
-    PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
-    THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
-    ------------------------------------------------------------*/
-  for (jb = 0; jb < nsupers; jb++) {
-    jbcol = PCOL( jb, grid );
-    jbrow = PROW( jb, grid );
-    ljb_j = LBj( jb, grid ); /* Local block number column wise */
-    ljb_i = LBi( jb, grid);  /* Local block number row wise */
-    fsupc = FstBlockC( jb );
-    nsupc = SuperSize( jb );
-    
-    if ( myrow == jbrow ) { /* Block row jb in my process row */
-      /* Scatter A into SPA. */
-      for (j = ilsum[ljb_i], dense_col = dense; j < ilsum[ljb_i]+nsupc; j++) {
-	for (i = asup_rowptr[j]; i < asup_rowptr[j+1]; i++) {
-	  if (i >= asup_rowptr[ilsum[nsupers_i]]) 
-	    printf ("ERR7\n");
-	  jcol = asup_colind[i];
-	  if (jcol >= n)
-	    printf ("Pe[%d] ERR distsn jb %d gb %d j %d jcol %d\n",
-		    iam, jb, gb, j, jcol);
-	  gb = BlockNum( jcol );
-	  lb = LBj( gb, grid );
-	  if (gb >= nsupers || lb >= nsupers_j) printf ("ERR8\n");
-	  jcol = ilsum_j[lb] + jcol - FstBlockC( gb );
-	  if (jcol >= ldaspa_j)
-	    printf ("Pe[%d] ERR1 jb %d gb %d j %d jcol %d\n",
-		    iam, jb, gb, j, jcol);
-	  dense_col[jcol] = asup_val[i];
-	}
-	dense_col += ldaspa_j;
-      }
-      
-      /*------------------------------------------------
-       * SET UP U BLOCKS.
-       *------------------------------------------------*/
-      /* Count number of blocks and length of each block. */
-      nrbu = 0;
-      len = 0; /* Number of column subscripts I own. */
-      len1 = 0; /* number of fstnz subscripts */
-      for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) {
-	if (i >= xusub[nsupers_i]) printf ("ERR10\n");
-	jcol = usub[i];
-	gb = BlockNum( jcol ); /* Global block number */
-	
-	/*if (fsupc <= 146445 && 146445 < fsupc + nsupc && jcol == 397986)
-	  printf ("Pe[%d] [%d %d] elt [%d] jbcol %d pc %d\n",
-	  iam, jb, gb, jcol, jbcol, pc); */
-	
-	lb = LBj( gb, grid );  /* Local block number */
-	pc = PCOL( gb, grid ); /* Process col owning this block */
-	if (mycol == jbcol) ToSendR[ljb_j][pc] = YES;
-	/* if (mycol == jbcol && mycol != pc) ToSendR[ljb_j][pc] = YES; */
-	pr = PROW( gb, grid );
-	if ( pr != jbrow  && mycol == pc)
-	  bsendx_plist[lb][jbrow] = YES; 
-	if (mycol == pc) {
-	  len += nsupc;
-	  LUb_length[lb] += nsupc;
-	  ToSendD[ljb_i] = YES;
-	  if (Urb_marker[lb] <= jb) { /* First see this block */
-	    if (Urb_marker[lb] == FALSE && gb != jb && myrow != pr) nbrecvx ++;
-	    Urb_marker[lb] = jb + 1;
-	    LUb_number[nrbu] = gb;
-	    /* if (gb == 391825 && jb == 145361)
-	       printf ("Pe[%d] T1 [%d %d] nrbu %d \n",
-	       iam, jb, gb, nrbu); */
-	    nrbu ++;
-	    len1 += SuperSize( gb );
-	    if ( gb != jb )/* Exclude diagonal block. */
-	      ++bmod[ljb_i];/* Mod. count for back solve */
-#if ( PRNTlevel>=1 )
-	    ++nUblocks;
-#endif
-	  }
-	}
-      } /* for i ... */
-      
-      if ( nrbu ) { 
-	/* Sort the blocks of U in increasing block column index.
-	   SuperLU_DIST assumes this is true */
-	/* simple insert sort algorithm */
-	/* to be transformed in quick sort */
-	for (j = 1; j < nrbu; j++) {
-	  k = LUb_number[j];
-	  for (i=j-1; i>=0 && LUb_number[i] > k; i--) {
-	    LUb_number[i+1] = LUb_number[i];
-	  }
-	  LUb_number[i+1] = k;
-	} 
-	
-	/* Set up the initial pointers for each block in
-	   index[] and nzval[]. */
-	/* Add room for descriptors */
-	len1 += BR_HEADER + nrbu * UB_DESCRIPTOR;
-	if ( !(index = intMalloc_dist(len1+1)) ) {
-	  fprintf (stderr, "Malloc fails for Uindex[]");
-	  return (memDist + memNLU);
-	}
-	Ufstnz_br_ptr[ljb_i] = index;
-	if (!(Unzval_br_ptr[ljb_i] =
-	      doubleMalloc_dist(len))) {
-	  fprintf (stderr, "Malloc fails for Unzval_br_ptr[*][]");
-	  return (memDist + memNLU);
-	}
-	memNLU += (len1+1)*iword + len*dword;
-	uval = Unzval_br_ptr[ljb_i];
-	mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
-	mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
-	index[0] = nrbu;  /* Number of column blocks */
-	index[1] = len;   /* Total length of nzval[] */
-	index[2] = len1;  /* Total length of index */
-	index[len1] = -1; /* End marker */
-	next_ind = BR_HEADER;
-	next_val = 0;
-	for (k = 0; k < nrbu; k++) {
-	  gb = LUb_number[k];
-	  lb = LBj( gb, grid );
-	  len = LUb_length[lb];
-	  LUb_length[lb] = 0;  /* Reset vector of block length */
-	  index[next_ind++] = gb; /* Descriptor */
-	  index[next_ind++] = len;
-	  LUb_indptr[lb] = next_ind;
-	  for (; next_ind < LUb_indptr[lb] + SuperSize( gb ); next_ind++)
-	    index[next_ind] = FstBlockC( jb + 1 );
-	  LUb_valptr[lb] = next_val;
-	  next_val += len;
-	}
-	/* Propagate the fstnz subscripts to Ufstnz_br_ptr[],
-	   and the initial values of A from SPA into Unzval_br_ptr[]. */
-	for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) {
-	  jcol = usub[i];
-	  gb = BlockNum( jcol );
-	  
-	  if ( mycol == PCOL( gb, grid ) ) {
-	    lb = LBj( gb, grid );
-	    k = LUb_indptr[lb]; /* Start fstnz in index */
-	    index[k + jcol - FstBlockC( gb )] = FstBlockC( jb );
-	  }
-	}  /* for i ... */
-	
-	for (i = 0; i < nrbu; i++) {
-	  gb = LUb_number[i];
-	  lb = LBj( gb, grid );   
-	  next_ind = LUb_indptr[lb];
-	  k = FstBlockC( jb + 1);
-	  jcol = ilsum_j[lb];
-	  for (jj = 0; jj < SuperSize( gb ); jj++, jcol++) {
-	    dense_col = dense;
-	    j = index[next_ind+jj];
-	    for (ii = j; ii < k; ii++) {
-	      uval[LUb_valptr[lb]++] = dense_col[jcol];
-	      dense_col[jcol] = zero;
-	      dense_col += ldaspa_j;	      
-	    }
-	  }
-	}
-      } else {
-	Ufstnz_br_ptr[ljb_i] = NULL;
-	Unzval_br_ptr[ljb_i] = NULL;
-      } /* if nrbu ... */	
-    } /* if myrow == jbrow */
-    
-      /*------------------------------------------------
-       * SET UP L BLOCKS.
-       *------------------------------------------------*/
-    if (mycol == jbcol) {  /* Block column jb in my process column */
-      /* Scatter A_inf into SPA. */
-      for (j = ilsum_j[ljb_j], dense_col = dense; j < ilsum_j[ljb_j] + nsupc; j++) {
-	for (i = ainf_colptr[j]; i < ainf_colptr[j+1]; i++) {
-	  irow = ainf_rowind[i];
-	  if (irow >= n) printf ("Pe[%d] ERR1\n", iam);
-	  gb = BlockNum( irow );
-	  if (gb >= nsupers) printf ("Pe[%d] ERR5\n", iam);
-	  if ( myrow == PROW( gb, grid ) ) {
-	    lb = LBi( gb, grid );
-	    irow = ilsum[lb] + irow - FstBlockC( gb );
-	    if (irow >= ldaspa) printf ("Pe[%d] ERR0\n", iam);
-	    dense_col[irow] = ainf_val[i];
-	  }
-	}
-	dense_col += ldaspa;
-      }      
-      
-      /* sort the indices of the diagonal block at the beginning of xlsub */
-      if (myrow == jbrow) {
-	k = xlsub[ljb_j];
-	for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) {
-	  irow = lsub[i];
-	  if (irow < nsupc + fsupc && i != k+irow-fsupc) {
-	    lsub[i] = lsub[k + irow - fsupc];
-	    lsub[k + irow - fsupc] = irow;
-	    i --;
-	  }
-	}
-      }
-      
-      /* Count number of blocks and length of each block. */
-      nrbl = 0;
-      len = 0; /* Number of row subscripts I own. */
-      kseen = 0;
-      for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) {
-	irow = lsub[i];
-	gb = BlockNum( irow ); /* Global block number */	  
-	pr = PROW( gb, grid ); /* Process row owning this block */
-	if ( pr != jbrow && fsendx_plist[ljb_j][pr] == EMPTY &&
-	     myrow == jbrow) {
-	  fsendx_plist[ljb_j][pr] = YES;
-	  ++nfsendx;
-	}
-	if ( myrow == pr ) {
-	  lb = LBi( gb, grid );  /* Local block number */
-	  if (Lrb_marker[lb] <= jb) { /* First see this block */
-	    Lrb_marker[lb] = jb + 1;
-	    LUb_length[lb] = 1;
-	    LUb_number[nrbl++] = gb;
-	    if ( gb != jb ) /* Exclude diagonal block. */
-	      ++fmod[lb]; /* Mod. count for forward solve */
-	    if ( kseen == 0 && myrow != jbrow ) {
-	      ++nfrecvx;
-	      kseen = 1;
-	    }
-#if ( PRNTlevel>=1 )
-	    ++nLblocks;
-#endif
-	  } else 
-	    ++LUb_length[lb];	    
-	  ++len;
-	}
-      } /* for i ... */
-      
-      if ( nrbl ) { /* Do not ensure the blocks are sorted! */
-	/* Set up the initial pointers for each block in 
-	   index[] and nzval[]. */
-	/* If I am the owner of the diagonal block, order it first in LUb_number.
-	   Necessary for SuperLU_DIST routines */
-	kseen = EMPTY;
-	for (j = 0; j < nrbl; j++) {
-	  if (LUb_number[j] == jb)
-	    kseen = j;
-	}
-	if (kseen != EMPTY && kseen != 0) {
-	  LUb_number[kseen] = LUb_number[0];
-	  LUb_number[0] = jb;
-	}
-	
-	/* Add room for descriptors */
-	len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
-	if ( !(index = intMalloc_dist(len1)) ) {
-	  fprintf (stderr, "Malloc fails for index[]");
-	  return (memDist + memNLU);
-	}
-	Lrowind_bc_ptr[ljb_j] = index;
-	if (!(Lnzval_bc_ptr[ljb_j] = 
-	      doubleMalloc_dist(len*nsupc))) {
-	  fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block %d ", jb);
-	  return (memDist + memNLU);
-	}
-	memNLU += len1*iword + len*nsupc*dword;
-	
-	lusup = Lnzval_bc_ptr[ljb_j];
-	mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
-	mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
-	mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
-	index[0] = nrbl;  /* Number of row blocks */
-	index[1] = len;   /* LDA of the nzval[] */
-	next_ind = BC_HEADER;
-	next_val = 0;
-	for (k = 0; k < nrbl; ++k) {
-	  gb = LUb_number[k];
-	  lb = LBi( gb, grid );
-	  len = LUb_length[lb];
-	  LUb_length[lb] = 0;
-	  index[next_ind++] = gb; /* Descriptor */
-	  index[next_ind++] = len; 
-	  LUb_indptr[lb] = next_ind;
-	    LUb_valptr[lb] = next_val;
-	    next_ind += len;
-	    next_val += len;
-	  }
-	  /* Propagate the compressed row subscripts to Lindex[],
-	     and the initial values of A from SPA into Lnzval[]. */
-	  len = index[1];  /* LDA of lusup[] */
-	  for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) {
-	    irow = lsub[i];
-	    gb = BlockNum( irow );
-	    if ( myrow == PROW( gb, grid ) ) {
-	      lb = LBi( gb, grid );
-	      k = LUb_indptr[lb]++; /* Random access a block */
-	      index[k] = irow;
-	      k = LUb_valptr[lb]++;
-	      irow = ilsum[lb] + irow - FstBlockC( gb );
-	      for (j = 0, dense_col = dense; j < nsupc; ++j) {
-		lusup[k] = dense_col[irow];
-		dense_col[irow] = zero;
-		k += len;
-		dense_col += ldaspa;
-	      }
-	    }
-	  } /* for i ... */
-	} else {
-	  Lrowind_bc_ptr[ljb_j] = NULL;
-	  Lnzval_bc_ptr[ljb_j] = NULL;
-	} /* if nrbl ... */		  
-      } /* if mycol == pc */
-  } /* for jb ... */
-
-  SUPERLU_FREE(ilsum_j);
-  SUPERLU_FREE(Urb_marker);
-  SUPERLU_FREE(LUb_length);
-  SUPERLU_FREE(LUb_indptr);
-  SUPERLU_FREE(LUb_number);
-  SUPERLU_FREE(LUb_valptr);
-  SUPERLU_FREE(Lrb_marker);
-  SUPERLU_FREE(dense);
-  
-  /* Free the memory used for storing L and U */
-  SUPERLU_FREE(xlsub); SUPERLU_FREE(xusub);
-  if (lsub != NULL)
-    SUPERLU_FREE(lsub);  
-  if (usub != NULL)
-    SUPERLU_FREE(usub);
-  
-  /* Free the memory used for storing A */
-  SUPERLU_FREE(ainf_colptr);
-  if (ainf_rowind != NULL) {
-    SUPERLU_FREE(ainf_rowind);
-    SUPERLU_FREE(ainf_val);
-  }
-  SUPERLU_FREE(asup_rowptr);
-  if (asup_colind != NULL) {
-    SUPERLU_FREE(asup_colind);	
-    SUPERLU_FREE(asup_val);	
-  }
-  
-  /* exchange information about bsendx_plist in between column of processors */
-  k = SUPERLU_MAX( grid->nprow, grid->npcol);
-  if ( !(recvBuf = (int_t *) SUPERLU_MALLOC(nsupers*k*iword)) ) {
-    fprintf (stderr, "Malloc fails for recvBuf[].");
-    return (memDist + memNLU);
-  }
-  if ( !(nnzToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
-    fprintf (stderr, "Malloc fails for nnzToRecv[].");
-    return (memDist + memNLU);
-  }
-  if ( !(ptrToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
-    fprintf (stderr, "Malloc fails for ptrToRecv[].");
-    return (memDist + memNLU);
-  }
-  if ( !(nnzToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
-    fprintf (stderr, "Malloc fails for nnzToRecv[].");
-    return (memDist + memNLU);
-  }
-  if ( !(ptrToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
-    fprintf (stderr, "Malloc fails for ptrToRecv[].");
-    return (memDist + memNLU);
-  }
-  
-  if (memDist < (nsupers*k*iword +4*nprocs * sizeof(int)))
-    memDist = nsupers*k*iword +4*nprocs * sizeof(int);
-  
-  for (p = 0; p < nprocs; p++)
-    nnzToRecv[p] = 0;
-  
-  for (jb = 0; jb < nsupers; jb++) {
-    jbcol = PCOL( jb, grid );
-    jbrow = PROW( jb, grid );
-    p = PNUM(jbrow, jbcol, grid);
-    nnzToRecv[p] += grid->npcol;
-  }    
-  i = 0;
-  for (p = 0; p < nprocs; p++) {
-    ptrToRecv[p] = i;
-    i += nnzToRecv[p];
-    ptrToSend[p] = 0;
-    if (p != iam)
-      nnzToSend[p] = nnzToRecv[iam];
-    else
-      nnzToSend[p] = 0;
-  }
-  nnzToRecv[iam] = 0;
-  i = ptrToRecv[iam];
-  for (jb = 0; jb < nsupers; jb++) {
-    jbcol = PCOL( jb, grid );
-    jbrow = PROW( jb, grid );
-    p = PNUM(jbrow, jbcol, grid);
-    if (p == iam) {
-      ljb_j = LBj( jb, grid ); /* Local block number column wise */	
-      for (j = 0; j < grid->npcol; j++, i++)
-	recvBuf[i] = ToSendR[ljb_j][j];
-    }
-  }   
-  
-  MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, mpi_int_t,
-		 recvBuf, nnzToRecv, ptrToRecv, mpi_int_t, grid->comm);
-  
-  for (jb = 0; jb < nsupers; jb++) {
-    jbcol = PCOL( jb, grid );
-    jbrow = PROW( jb, grid );
-    p = PNUM(jbrow, jbcol, grid);
-    ljb_j = LBj( jb, grid ); /* Local block number column wise */	
-    ljb_i = LBi( jb, grid ); /* Local block number row wise */	
-    /* (myrow == jbrow) {
-       if (ToSendD[ljb_i] == YES)
-       ToRecv[jb] = 1;
-       }
-       else {
-       if (recvBuf[ptrToRecv[p] + mycol] == YES)
-       ToRecv[jb] = 2;
-       } */
-    if (recvBuf[ptrToRecv[p] + mycol] == YES) {
-      if (myrow == jbrow)
-	ToRecv[jb] = 1;
-      else
-	ToRecv[jb] = 2;
-    }
-    if (mycol == jbcol) {
-      for (i = 0, j = ptrToRecv[p]; i < grid->npcol; i++, j++) 
-	ToSendR[ljb_j][i] = recvBuf[j];  
-      ToSendR[ljb_j][mycol] = EMPTY;
-    }
-    ptrToRecv[p] += grid->npcol;
-  }   
-  
-  /* exchange information about bsendx_plist in between column of processors */
-  MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, mpi_int_t,
-		 MPI_MAX, grid->cscp.comm);
-  
-  for (jb = 0; jb < nsupers; jb ++) {
-    jbcol = PCOL( jb, grid);
-    jbrow = PROW( jb, grid);
-    if (mycol == jbcol) {
-      ljb_j = LBj( jb, grid ); /* Local block number column wise */	
-      if (myrow == jbrow ) {
-	for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) {
-	  (*bsendx_plist)[k] = recvBuf[k];
-	  if ((*bsendx_plist)[k] != EMPTY)
-	    nbsendx ++;
-	}
-      }
-      else {
-	for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) 
-	  (*bsendx_plist)[k] = EMPTY;
-      }
-    }
-  }
-  
-  SUPERLU_FREE(nnzToRecv);
-  SUPERLU_FREE(ptrToRecv);
-  SUPERLU_FREE(nnzToSend);
-  SUPERLU_FREE(ptrToSend);
-  SUPERLU_FREE(recvBuf);
-  
-  Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
-  Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
-  Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
-  Llu->Unzval_br_ptr = Unzval_br_ptr;
-  Llu->ToRecv = ToRecv;
-  Llu->ToSendD = ToSendD;
-  Llu->ToSendR = ToSendR;
-  Llu->fmod = fmod;
-  Llu->fsendx_plist = fsendx_plist;
-  Llu->nfrecvx = nfrecvx;
-  Llu->nfsendx = nfsendx;
-  Llu->bmod = bmod;
-  Llu->bsendx_plist = bsendx_plist;
-  Llu->nbrecvx = nbrecvx;
-  Llu->nbsendx = nbsendx;
-  Llu->ilsum = ilsum;
-  Llu->ldalsum = ldaspa;
-  LUstruct->Glu_persist = Glu_persist;	
-#if ( PRNTlevel>=1 )
-  if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n",
-		     nLblocks, nUblocks);
-#endif
-  
-  /* Find the maximum buffer size. */
-  MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, 
-		MPI_MAX, grid->comm);
-  
-#if ( DEBUGlevel>=1 )
-  /* Memory allocated but not freed:
-     ilsum, fmod, fsendx_plist, bmod, bsendx_plist,
-     ToRecv, ToSendR, ToSendD
-  */
-  CHECK_MALLOC(iam, "Exit dist_psymbtonum()");
-#endif
-    
-  return (- (memDist+memNLU));
-} /* dist_psymbtonum */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdsymbfact_distdata.c.old hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdsymbfact_distdata.c.old
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdsymbfact_distdata.c.old	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdsymbfact_distdata.c.old	1970-01-01 00:00:00.000000000 +0000
@@ -1,1940 +0,0 @@
-
-
-/*
- * -- Parallel symbolic factorization auxialiary routine (version 2.1) --
- * -- Distributes the data from parallel symbolic factorization 
- * -- to numeric factorization
- * INRIA France -  July 1, 2004
- * Laura Grigori
- *
- * November 1, 2007
- */
-
-/* limits.h:  the largest positive integer (INT_MAX) */
-#include <limits.h>
-
-#include "superlu_ddefs.h"
-
-static float
-dist_symbLU (int_t n, Pslu_freeable_t *Pslu_freeable, 
-	     Glu_persist_t *Glu_persist, 
-	     int_t **p_xlsub, int_t **p_lsub, int_t **p_xusub, int_t **p_usub,
-	     gridinfo_t *grid
-	     )
-/*
- * Purpose
- * =======
- * 
- * Redistribute the symbolic structure of L and U from the distribution
- * used in the parallel symbolic factorization step to the distdibution
- * used in the parallel numeric factorization step.  On exit, the L and U
- * structure for the 2D distribution used in the numeric factorization step is
- * stored in p_xlsub, p_lsub, p_xusub, p_usub.  The global supernodal 
- * information is also computed and it is stored in Glu_persist->supno
- * and Glu_persist->xsup.
- *
- * This routine allocates memory for storing the structure of L and U
- * and the supernodes information.  This represents the arrays:
- * p_xlsub, p_lsub, p_xusub, p_usub,
- * Glu_persist->supno,  Glu_persist->xsup.
- *
- * This routine also deallocates memory allocated during symbolic 
- * factorization routine.  That is, the folloing arrays are freed:
- * Pslu_freeable->xlsub,  Pslu_freeable->lsub, 
- * Pslu_freeable->xusub, Pslu_freeable->usub, 
- * Pslu_freeable->globToLoc, Pslu_freeable->supno_loc, 
- * Pslu_freeable->xsup_beg_loc, Pslu_freeable->xsup_end_loc.
- *
- * Arguments
- * =========
- *
- * n      (Input) int_t
- *        Order of the input matrix
- * Pslu_freeable  (Input) Pslu_freeable_t *
- *        Local L and U structure, 
- *        global to local indexing information.
- * 
- * Glu_persist (Output) Glu_persist_t *
- *        Stores on output the information on supernodes mapping.
- * 
- * p_xlsub (Output) int_t **
- *         Pointer to structure of L distributed on a 2D grid 
- *         of processors, stored by columns.
- * 
- * p_lsub  (Output) int_t **
- *         Structure of L distributed on a 2D grid of processors, 
- *         stored by columns.
- *
- * p_xusub (Output) int_t **
- *         Pointer to structure of U distributed on a 2D grid 
- *         of processors, stored by rows.
- * 
- * p_usub  (Output) int_t **
- *         Structure of U distributed on a 2D grid of processors, 
- *         stored by rows.
- * 
- * grid   (Input) gridinfo_t*
- *        The 2D process mesh.
- *
- * Return value
- * ============
- *   < 0, number of bytes allocated on return from the dist_symbLU.
- *   > 0, number of bytes allocated in this routine when out of memory.
- *        (an approximation).
- */
-{
-  int   iam, nprocs, pc, pr, p, np, p_diag;
-  int_t *nnzToSend, *nnzToRecv, *nnzToSend_l, *nnzToSend_u, 
-    *tmp_ptrToSend, *mem;
-  int_t *nnzToRecv_l, *nnzToRecv_u;
-  int_t *send_1, *send_2, nsend_1, nsend_2;
-  int_t *ptrToSend, *ptrToRecv, sendL, sendU, *snd_luind, *rcv_luind;
-  int_t nsupers, nsupers_i, nsupers_j;
-  int *nvtcs, *intBuf1, *intBuf2, *intBuf3, *intBuf4, intNvtcs_loc;
-  int_t maxszsn, maxNvtcsPProc;
-  int_t *xsup_n, *supno_n, *temp, *xsup_beg_s, *xsup_end_s, *supno_s;
-  int_t *xlsub_s, *lsub_s, *xusub_s, *usub_s;
-  int_t *xlsub_n, *lsub_n, *xusub_n, *usub_n;
-  int_t *xsub_s, *sub_s, *xsub_n, *sub_n;
-  int_t *globToLoc, nvtcs_loc;
-  int_t SendCnt_l, SendCnt_u, nnz_loc_l, nnz_loc_u, nnz_loc,
-    RecvCnt_l, RecvCnt_u, ind_loc;
-  int_t i, k, j, gb, szsn,  gb_n, gb_s, gb_l, fst_s, fst_s_l, lst_s, i_loc;
-  int_t nelts, isize;
-  float memAux;  /* Memory used during this routine and freed on return */
-  float memRet; /* Memory allocated and not freed on return */
-  int_t iword, dword;
-  
-  /* ------------------------------------------------------------
-     INITIALIZATION.
-     ------------------------------------------------------------*/
-  iam = grid->iam;
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Enter dist_symbLU()");
-#endif
-  nprocs = (int) grid->nprow * grid->npcol;
-  xlsub_s = Pslu_freeable->xlsub; lsub_s = Pslu_freeable->lsub;
-  xusub_s = Pslu_freeable->xusub; usub_s = Pslu_freeable->usub;
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  globToLoc     = Pslu_freeable->globToLoc;
-  nvtcs_loc     = Pslu_freeable->nvtcs_loc;
-  xsup_beg_s    = Pslu_freeable->xsup_beg_loc;
-  xsup_end_s    = Pslu_freeable->xsup_end_loc;
-  supno_s       = Pslu_freeable->supno_loc;
-  rcv_luind     = NULL;
-  iword = sizeof(int_t);
-  dword = sizeof(double);
-  memAux = 0.; memRet = 0.;
-  
-  mem           = intCalloc_dist(12 * nprocs);
-  if (!mem)
-    return (ERROR_RET);
-  memAux     = (float) (12 * nprocs * sizeof(int_t));
-  nnzToRecv     = mem;
-  nnzToSend     = nnzToRecv + 2*nprocs;
-  nnzToSend_l   = nnzToSend + 2 * nprocs;
-  nnzToSend_u   = nnzToSend_l + nprocs;
-  send_1        = nnzToSend_u + nprocs;
-  send_2        = send_1 + nprocs;
-  tmp_ptrToSend = send_2 + nprocs;
-  nnzToRecv_l   = tmp_ptrToSend + nprocs;
-  nnzToRecv_u   = nnzToRecv_l + nprocs;
-  
-  ptrToSend = nnzToSend;
-  ptrToRecv = nnzToSend + nprocs;
-
-  nvtcs = (int *) SUPERLU_MALLOC(5 * nprocs * sizeof(int));
-  intBuf1 = nvtcs + nprocs;
-  intBuf2 = nvtcs + 2 * nprocs;
-  intBuf3 = nvtcs + 3 * nprocs;
-  intBuf4 = nvtcs + 4 * nprocs;
-  memAux += 5 * nprocs * sizeof(int);
-
-  maxszsn   = sp_ienv_dist(3);
-  
-  /* Allocate space for storing Glu_persist_n. */
-  if ( !(supno_n = intMalloc_dist(n+1)) ) {
-    fprintf (stderr, "Malloc fails for supno_n[].");
-    return (memAux);
-  }
-  memRet += (float) ((n+1) * sizeof(int_t));
-
-  /* ------------------------------------------------------------
-     DETERMINE SUPERNODES FOR NUMERICAL FACTORIZATION
-     ------------------------------------------------------------*/
-  
-  if (nvtcs_loc > INT_MAX)
-    ABORT("ERROR in dist_symbLU nvtcs_loc > INT_MAX\n");
-  intNvtcs_loc = (int) nvtcs_loc;
-  MPI_Gather (&intNvtcs_loc, 1, MPI_INT, nvtcs, 1, MPI_INT,
-	      0, grid->comm);
-
-  if (!iam) {
-    /* set ptrToRecv to point to the beginning of the data for
-       each processor */
-    for (k = 0, p = 0; p < nprocs; p++) {
-      ptrToRecv[p] = k;
-      k += nvtcs[p];
-    }
-  }
-  
-  if (nprocs > 1) {
-    temp = NULL;
-    if (!iam ) {
-      if ( !(temp = intMalloc_dist (n+1)) ) {
-	fprintf (stderr, "Malloc fails for temp[].");
-	return (memAux + memRet);
-      }
-      memAux += (float) (n+1) * iword;
-    }
-#if defined (_LONGINT)
-    for (p=0; p<nprocs; p++) {
-      if (ptrToRecv[p] > INT_MAX)
-	ABORT("ERROR in dist_symbLU size to send > INT_MAX\n");
-      intBuf1[p] = (int) ptrToRecv[p];
-    }
-#else  /* Default */
-    intBuf1 = ptrToRecv;
-#endif
-    MPI_Gatherv (supno_s, (int) nvtcs_loc, mpi_int_t, 
-		 temp, nvtcs, intBuf1, mpi_int_t, 0, grid->comm);
-  }
-  else
-    temp = supno_s;
-
-  if (!iam) {
-    nsupers = 0;
-    p = (int) OWNER( globToLoc[0] );
-    gb = temp[ptrToRecv[p]];
-    supno_n[0] = nsupers;
-    ptrToRecv[p] ++;
-    szsn = 1;
-    for (j = 1; j < n; j ++) {
-      if (p != (int) OWNER( globToLoc[j] ) || szsn >= maxszsn || gb != temp[ptrToRecv[p]]) {
-	nsupers ++;
-	p  = (int) OWNER( globToLoc[j] );
-	gb = temp[ptrToRecv[p]];
-	szsn = 1;
-      }
-      else {
-	szsn ++;
-      }
-      ptrToRecv[p] ++;
-      supno_n[j] = nsupers;
-    }
-    nsupers++;
-    if (nprocs > 1) {
-      SUPERLU_FREE (temp);
-      memAux -= (float) (n+1) * iword;
-    }
-    supno_n[n] = nsupers;
-  }
-
-  /* reset to 0 nnzToSend */
-  for (p = 0; p < 2 *nprocs; p++)
-    nnzToSend[p] = 0;
-  
-  MPI_Bcast (supno_n, n+1, mpi_int_t, 0, grid->comm);
-  nsupers = supno_n[n];
-  /* Allocate space for storing Glu_persist_n. */
-  if ( !(xsup_n = intMalloc_dist(nsupers+1)) ) {
-    fprintf (stderr, "Malloc fails for xsup_n[].");
-    return (memAux + memRet);
-  }
-  memRet += (float) (nsupers+1) * iword;  
-
-  /* ------------------------------------------------------------
-     COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
-     THEN ALLOCATE SPACE.
-     THIS ACCOUNTS FOR THE FIRST PASS OF L and U.
-     ------------------------------------------------------------*/
-  gb = EMPTY;
-  for (i = 0; i < n; i++) {
-    if (gb != supno_n[i]) {
-      /* a new supernode starts */
-      gb = supno_n[i];
-      xsup_n[gb] = i;
-    }
-  }
-  xsup_n[nsupers] = n;
-  
-  for (p = 0; p < nprocs; p++) {
-    send_1[p] = FALSE;
-    send_2[p] = FALSE;
-  }
-  for (gb_n = 0; gb_n < nsupers; gb_n ++) {
-    i = xsup_n[gb_n];
-    if (iam == (int) OWNER( globToLoc[i] )) {
-      pc = PCOL( gb_n, grid );
-      pr = PROW( gb_n, grid );
-      p_diag = PNUM( pr, pc, grid);
-      
-      i_loc = LOCAL_IND( globToLoc[i] );
-      gb_s  = supno_s[i_loc];
-      fst_s = xsup_beg_s[gb_s];
-      lst_s = xsup_end_s[gb_s];
-      fst_s_l = LOCAL_IND( globToLoc[fst_s] );
-      for (j = xlsub_s[fst_s_l]; j < xlsub_s[fst_s_l+1]; j++) {
-	k = lsub_s[j];
-	if (k >= i) {
-	  gb = supno_n[k];
-	  p = (int) PNUM( PROW(gb, grid), pc, grid );
-	  nnzToSend[2*p] ++;
-	  send_1[p] = TRUE;
-	}
-      }
-      for (j = xusub_s[fst_s_l]; j < xusub_s[fst_s_l+1]; j++) {
-	k = usub_s[j];
-	if (k >= i + xsup_n[gb_n+1] - xsup_n[gb_n]) {
-	  gb = supno_n[k];
-	  p = PNUM( pr, PCOL(gb, grid), grid);
-	  nnzToSend[2*p+1] ++;	
-	  send_2[p] = TRUE;
-	}
-      }
-      
-      nsend_2 = 0;
-      for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) {
-	nnzToSend[2*p+1] += 2;
-	if (send_2[p])  nsend_2 ++;	  
-      }
-      for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) 
-	if (send_2[p] || p == p_diag) {
-	  if (p == p_diag && !send_2[p])
-	    nnzToSend[2*p+1] += nsend_2;
-	  else
-	    nnzToSend[2*p+1] += nsend_2-1;
-	  send_2[p] = FALSE;
-	}
-      nsend_1 = 0;
-      for (p = pc; p < nprocs; p += grid->npcol) {
-	nnzToSend[2*p] += 2;
-	if (send_1[p]) nsend_1 ++;
-      }
-      for (p = pc; p < nprocs; p += grid->npcol) 
-	if (send_1[p]) {
-	  nnzToSend[2*p] += nsend_1-1;
-	  send_1[p] = FALSE;
-	}
-	else
-	  nnzToSend[2*p] += nsend_1;
-    }
-  }
-  
-  /* All-to-all communication */
-  MPI_Alltoall( nnzToSend, 2, mpi_int_t, nnzToRecv, 2, mpi_int_t,
-		grid->comm);
-  
-  nnz_loc_l = nnz_loc_u = 0;
-  SendCnt_l = SendCnt_u = RecvCnt_l = RecvCnt_u = 0;  
-  for (p = 0; p < nprocs; p++) {
-    if ( p != iam ) {
-      SendCnt_l += nnzToSend[2*p];   nnzToSend_l[p] = nnzToSend[2*p];
-      SendCnt_u += nnzToSend[2*p+1]; nnzToSend_u[p] = nnzToSend[2*p+1]; 
-      RecvCnt_l += nnzToRecv[2*p];   nnzToRecv_l[p] = nnzToRecv[2*p];
-      RecvCnt_u += nnzToRecv[2*p+1]; nnzToRecv_u[p] = nnzToRecv[2*p+1];
-    } else {
-      nnz_loc_l += nnzToRecv[2*p];
-      nnz_loc_u += nnzToRecv[2*p+1];
-      nnzToSend_l[p] = 0; nnzToSend_u[p] = 0;
-      nnzToRecv_l[p] = nnzToRecv[2*p]; 
-      nnzToRecv_u[p] = nnzToRecv[2*p+1];
-    }
-  }
-  
-  /* Allocate space for storing the symbolic structure after redistribution. */
-  nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
-  nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
-  if ( !(xlsub_n = intCalloc_dist(nsupers_j+1)) ) {
-    fprintf (stderr, "Malloc fails for xlsub_n[].");
-    return (memAux + memRet);
-  }
-  memRet += (float) (nsupers_j+1) * iword;
-
-  if ( !(xusub_n = intCalloc_dist(nsupers_i+1)) ) {
-    fprintf (stderr, "Malloc fails for xusub_n[].");
-    return (memAux + memRet);
-  }
-  memRet += (float) (nsupers_i+1) * iword;  
-
-  /* Allocate temp storage for sending/receiving the L/U symbolic structure. */
-  if ( (RecvCnt_l + nnz_loc_l) || (RecvCnt_u + nnz_loc_u) ) {
-    if (!(rcv_luind = 
-	  intMalloc_dist(SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u))) ) {
-      fprintf (stderr, "Malloc fails for rcv_luind[].");
-      return (memAux + memRet);
-    }
-    memAux += (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) 
-      * iword;
-  }
-  if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) {
-    if (!(snd_luind = intMalloc_dist(SUPERLU_MAX(SendCnt_l, SendCnt_u))) ) {
-      fprintf (stderr, "Malloc fails for index[].");
-      return (memAux + memRet);
-    }
-    memAux += (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword;
-  } 
-  
-  /* ------------------------------------------------------------------
-     LOAD THE SYMBOLIC STRUCTURE OF L AND U INTO THE STRUCTURES TO SEND.
-     THIS ACCOUNTS FOR THE SECOND PASS OF L and U.
-     ------------------------------------------------------------------*/
-  sendL = TRUE;
-  sendU = FALSE;
-  while (sendL || sendU) {
-    if (sendL) {
-      xsub_s = xlsub_s; sub_s = lsub_s; xsub_n = xlsub_n;
-      nnzToSend = nnzToSend_l; nnzToRecv = nnzToRecv_l;
-    }
-    if (sendU) {
-      xsub_s = xusub_s; sub_s = usub_s; xsub_n = xusub_n;
-      nnzToSend = nnzToSend_u; nnzToRecv = nnzToRecv_u;
-    }
-    for (i = 0, j = 0, p = 0; p < nprocs; p++) {
-      if ( p != iam ) {
-	ptrToSend[p] = i;  i += nnzToSend[p];
-      }
-      ptrToRecv[p] = j;  j += nnzToRecv[p];
-    }
-    nnzToRecv[iam] = 0;
-    
-    ind_loc = ptrToRecv[iam];
-    for (gb_n = 0; gb_n < nsupers; gb_n++) {
-      nsend_2 = 0;    
-      i = xsup_n[gb_n];
-      if (iam == OWNER( globToLoc[i] )) {
-	pc = PCOL( gb_n, grid );
-	pr = PROW( gb_n, grid );
-	p_diag = PNUM( pr, pc, grid );
-	
-	i_loc = LOCAL_IND( globToLoc[i] );
-	gb_s  = supno_s[i_loc];
-	fst_s = xsup_beg_s[gb_s];
-	lst_s = xsup_end_s[gb_s];
-	fst_s_l = LOCAL_IND( globToLoc[fst_s] );
-
-	if (sendL) {
-	  p = pc;                np = grid->nprow;	  
-	} else {
-	  p = pr * grid->npcol;  np = grid->npcol;
-	}
-	for (j = 0; j < np; j++) {
-	  if (p == iam) {
-	    rcv_luind[ind_loc] = gb_n;
-	    rcv_luind[ind_loc+1] = 0;
-	    tmp_ptrToSend[p] = ind_loc + 1;
-	    ind_loc += 2;	 
-	  }
-	  else {
-	    snd_luind[ptrToSend[p]] = gb_n;
-	    snd_luind[ptrToSend[p]+1] = 0;
-	    tmp_ptrToSend[p] = ptrToSend[p] + 1;
-	    ptrToSend[p] += 2;	 
-	  }
-	  if (sendL) p += grid->npcol;
-	  if (sendU) p++;
-	}
-	for (j = xsub_s[fst_s_l]; j < xsub_s[fst_s_l+1]; j++) {
-	  k = sub_s[j];
-	  if ((sendL && k >= i) || (sendU && k >= i + xsup_n[gb_n+1] - xsup_n[gb_n])) {
-	    gb = supno_n[k];
-	    if (sendL)
-	      p = PNUM( PROW(gb, grid), pc, grid );
-	    else 
-	      p = PNUM( pr, PCOL(gb, grid), grid);
-	    if (send_1[p] == FALSE) {
-	      send_1[p] = TRUE;
-	      send_2[nsend_2] = k; nsend_2 ++;
-	    }
-	    if (p == iam) {
-	      rcv_luind[ind_loc] = k;  ind_loc++;
-	      if (sendL)
-		xsub_n[LBj( gb_n, grid )] ++;
-	      else
-		xsub_n[LBi( gb_n, grid )] ++;
-	    }
-	    else {
-	      snd_luind[ptrToSend[p]] = k;
-	      ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++;
-	    }
-	  }
-	}
-	if (sendL)
-	  for (p = pc; p < nprocs; p += grid->npcol) {
-	      for (k = 0; k < nsend_2; k++) {
-		gb = supno_n[send_2[k]];
-		if (PNUM(PROW(gb, grid), pc, grid) != p) {
-		  if (p == iam) {
-		    rcv_luind[ind_loc] = send_2[k];  ind_loc++;
-		    xsub_n[LBj( gb_n, grid )] ++;
-		  }
-		  else {
-		    snd_luind[ptrToSend[p]] = send_2[k];
-		    ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++;
-		  }
-		}
-	      }
-	      send_1[p] = FALSE;
-	  }  
-	if (sendU)
-	  for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) {
-	    if (send_1[p] || p == p_diag) {	      
-	      for (k = 0; k < nsend_2; k++) {
-		gb = supno_n[send_2[k]];
-		if(PNUM( pr, PCOL(gb, grid), grid) != p) {
-		  if (p == iam) {
-		    rcv_luind[ind_loc] = send_2[k];  ind_loc++;
-		    xsub_n[LBi( gb_n, grid )] ++;
-		  }
-		  else {
-		    snd_luind[ptrToSend[p]] = send_2[k];
-		    ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++;
-		  }	     
-		}
-	      } 
-	      send_1[p] = FALSE;
-	    }
-	  }
-      }
-    }
-    
-    /* reset ptrToSnd to point to the beginning of the data for
-       each processor (structure needed in MPI_Alltoallv) */
-    for (i = 0, p = 0; p < nprocs; p++) {
-      ptrToSend[p] = i;  i += nnzToSend[p];
-    }
-
-    /* ------------------------------------------------------------
-       PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION.
-       Note: it uses MPI_Alltoallv.
-       ------------------------------------------------------------*/
-    if (nprocs > 1) {
-#if defined (_LONGINT)
-      for (p=0; p<nprocs; p++) {
-	if (nnzToSend[p] > INT_MAX || ptrToSend[p] > INT_MAX ||
-	    nnzToRecv[p] > INT_MAX || ptrToRecv[p] > INT_MAX)
-	  ABORT("ERROR in dist_symbLU size to send > INT_MAX\n");
-	intBuf1[p] = (int) nnzToSend[p];
-	intBuf2[p] = (int) ptrToSend[p];
-	intBuf3[p] = (int) nnzToRecv[p];
-	intBuf4[p] = (int) ptrToRecv[p];
-      }
-#else  /* Default */
-      intBuf1 = nnzToSend;  intBuf2 = ptrToSend;
-      intBuf3 = nnzToRecv;  intBuf4 = ptrToRecv;
-#endif
-
-      MPI_Alltoallv (snd_luind, intBuf1, intBuf2, mpi_int_t, 
-		     rcv_luind, intBuf3, intBuf4, mpi_int_t,
-		     grid->comm);
-    }
-    if (sendL)
-      nnzToRecv[iam] = nnz_loc_l;
-    else 
-      nnzToRecv[iam] = nnz_loc_u;
-    
-    /* ------------------------------------------------------------
-       DEALLOCATE TEMPORARY STORAGE.
-       -------------------------------------------------------------*/
-    if (sendU) 
-      if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) {
-	SUPERLU_FREE (snd_luind);
-	memAux -= (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword;
-      }
-    
-    /* ------------------------------------------------------------
-       CONVERT THE FORMAT.
-       ------------------------------------------------------------*/
-    /* Initialize the array of column of L/ row of U pointers */
-    k = 0;
-    for (p = 0; p < nprocs; p ++) {
-      if (p != iam) {
-	i = k;
-	while (i < k + nnzToRecv[p]) {
-	  gb = rcv_luind[i];
-	  nelts = rcv_luind[i+1];
-	  if (sendL)
-	    xsub_n[LBj( gb, grid )] = nelts;
-	  else
-	    xsub_n[LBi( gb, grid )] = nelts;
-	  i += nelts + 2;
-	}
-      }
-      k += nnzToRecv[p];
-    }
-
-    if (sendL) j = nsupers_j;
-    else j = nsupers_i;
-    k = 0; 
-    isize = xsub_n[0];
-    xsub_n[0] = 0; 
-    for (gb_l = 1; gb_l < j; gb_l++) {
-      k += isize;
-      isize = xsub_n[gb_l];
-      xsub_n[gb_l] = k;
-    }
-    xsub_n[gb_l] = k + isize;
-    nnz_loc = xsub_n[gb_l];
-    if (sendL) {
-      lsub_n = NULL;
-      if (nnz_loc) {
-	if ( !(lsub_n = intMalloc_dist(nnz_loc)) ) {
-	  fprintf (stderr, "Malloc fails for lsub_n[].");
-	  return (memAux + memRet);
-	}
-	memRet += (float) (nnz_loc * iword);
-      }
-      sub_n = lsub_n;
-    }
-    if (sendU) {
-      usub_n = NULL;
-      if (nnz_loc) {
-	if ( !(usub_n = intMalloc_dist(nnz_loc)) ) {
-	  fprintf (stderr, "Malloc fails for usub_n[].");
-	  return (memAux + memRet);
-	}
-	memRet += (float) (nnz_loc * iword);
-      }
-      sub_n = usub_n;
-    }
-    
-    /* Copy the data into the L column / U row oriented storage */
-    k = 0;
-    for (p = 0; p < nprocs; p++) {
-      i = k;
-      while (i < k + nnzToRecv[p]) {
-	gb = rcv_luind[i];
-	if (gb >= nsupers)
-	  printf ("Pe[%d] p %d gb %d nsupers %d i %d i-k %d\n",
-		  iam, p, gb, nsupers, i, i-k);
-	i += 2;
-	if (sendL) gb_l = LBj( gb, grid );
-	if (sendU) gb_l = LBi( gb, grid );
-	for (j = xsub_n[gb_l]; j < xsub_n[gb_l+1]; i++, j++) {
-	  sub_n[j] = rcv_luind[i];
-	}
-      }      
-      k += nnzToRecv[p];
-    }
-    if (sendL) {
-      sendL = FALSE;  sendU = TRUE;
-    }
-    else
-      sendU = FALSE;
-  }
-
-  /* deallocate memory allocated during symbolic factorization routine */
-  if (rcv_luind != NULL) {
-    SUPERLU_FREE (rcv_luind);
-    memAux -= (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) * iword;
-  }
-  SUPERLU_FREE (mem);  
-  memAux -= (float) (12 * nprocs * iword);
-  SUPERLU_FREE(nvtcs);
-  memAux -= (float) (5 * nprocs * sizeof(int));
-  
-  if (xlsub_s != NULL) {
-    SUPERLU_FREE (xlsub_s); SUPERLU_FREE (lsub_s);
-  }
-  if (xusub_s != NULL) {
-    SUPERLU_FREE (xusub_s); SUPERLU_FREE (usub_s);
-  }
-  SUPERLU_FREE (globToLoc); 
-  if (supno_s != NULL) {
-    SUPERLU_FREE (xsup_beg_s); SUPERLU_FREE (xsup_end_s);
-    SUPERLU_FREE (supno_s);
-  }
-  
-  Glu_persist->supno = supno_n;  Glu_persist->xsup  = xsup_n;
-  *p_xlsub = xlsub_n; *p_lsub = lsub_n;
-  *p_xusub = xusub_n; *p_usub = usub_n;
-
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Exit dist_symbLU()");
-#endif
-  
-  return (-memRet);
-}
-  
-static float
-ddist_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct,
-	Glu_persist_t *Glu_persist, gridinfo_t *grid, 
-	int_t **p_ainf_colptr, int_t **p_ainf_rowind, double **p_ainf_val,
-	int_t **p_asup_rowptr, int_t **p_asup_colind, double **p_asup_val,
-	int_t *ilsum_i, int_t *ilsum_j
-	)
-{
-/*
- *
- * Purpose
- * =======
- *   Re-distribute A on the 2D process mesh.  The lower part is
- *   stored using a column format and the upper part
- *   is stored using a row format.
- * 
- * Arguments
- * =========
- * 
- * A      (Input) SuperMatrix*
- *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
- *        The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
- *
- * ScalePermstruct (Input) ScalePermstruct_t*
- *        The data structure to store the scaling and permutation vectors
- *        describing the transformations performed to the original matrix A.
- *
- * Glu_persist  (Input) Glu_persist_t *
- *        Information on supernodes mapping.
- * 
- * grid   (Input) gridinfo_t*
- *        The 2D process mesh.
- *
- * p_ainf_colptr (Output) int_t**
- *         Pointer to the lower part of A distributed on a 2D grid 
- *         of processors, stored by columns.
- *
- * p_ainf_rowind (Output) int_t**
- *         Structure of of the lower part of A distributed on a 
- *         2D grid of processors, stored by columns.
- *
- * p_ainf_val    (Output) double**
- *         Numerical values of the lower part of A, distributed on a 
- *         2D grid of processors, stored by columns.
- *
- * p_asup_rowptr (Output) int_t**
- *         Pointer to the upper part of A distributed on a 2D grid 
- *         of processors, stored by rows.
- *
- * p_asup_colind (Output) int_t**
- *         Structure of of the upper part of A distributed on a 
- *         2D grid of processors, stored by rows.
- *
- * p_asup_val    (Output) double**
- *         Numerical values of the upper part of A, distributed on a 
- *         2D grid of processors, stored by rows.
- *
- * ilsum_i  (Input) int_t *
- *       Starting position of each supernode in 
- *       the full array (local, block row wise).
- *
- * ilsum_j  (Input) int_t *
- *       Starting position of each supernode in 
- *       the full array (local, block column wise).
- *
- * Return value
- * ============
- *   < 0, number of bytes allocated on return from the dist_symbLU
- *   > 0, number of bytes allocated when out of memory.
- *        (an approximation).
- *
- */
-  int    iam, p, procs;
-  NRformat_loc *Astore;
-  int_t  *perm_r; /* row permutation vector */
-  int_t  *perm_c; /* column permutation vector */
-  int_t  i, it, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize, isize;
-  int_t  nsupers, nsupers_i, nsupers_j;
-  int_t  nnz_loc, nnz_loc_ainf, nnz_loc_asup;    /* number of local nonzeros */
-  int_t  nnz_remote; /* number of remote nonzeros to be sent */
-  int_t  SendCnt; /* number of remote nonzeros to be sent */
-  int_t  RecvCnt; /* number of remote nonzeros to be sent */
-  int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind;
-  double *asup_val, *ainf_val;
-  int_t  *nnzToSend, *nnzToRecv, maxnnzToRecv;
-  int_t  *ia, *ja, **ia_send, *index, *itemp;
-  int_t  *ptr_to_send;
-  double *aij, **aij_send, *nzval, *dtemp;
-  double *nzval_a;
-  MPI_Request *send_req;
-  MPI_Status  status;
-  int_t *xsup = Glu_persist->xsup;    /* supernode and column mapping */
-  int_t *supno = Glu_persist->supno;   
-  float memAux;  /* Memory used during this routine and freed on return */
-  float memRet; /* Memory allocated and not freed on return */
-  int_t iword, dword, szbuf;
-
-  /* ------------------------------------------------------------
-     INITIALIZATION.
-     ------------------------------------------------------------*/
-  iam = grid->iam;
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Enter ddist_A()");
-#endif
-  iword = sizeof(int_t);
-  dword = sizeof(double);
-  
-  perm_r = ScalePermstruct->perm_r;
-  perm_c = ScalePermstruct->perm_c;
-  procs = grid->nprow * grid->npcol;
-  Astore = (NRformat_loc *) A->Store;
-  n = A->ncol;
-  m_loc = Astore->m_loc;
-  fst_row = Astore->fst_row;
-  if (!(nnzToRecv = intCalloc_dist(2*procs))) {
-    fprintf (stderr, "Malloc fails for nnzToRecv[].");
-    return (ERROR_RET);
-  }
-  memAux = (float) (2 * procs * iword);
-  memRet = 0.;
-  nnzToSend = nnzToRecv + procs;
-  nsupers  = supno[n-1] + 1;  
-
-  /* ------------------------------------------------------------
-     COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
-     THEN ALLOCATE SPACE.
-     THIS ACCOUNTS FOR THE FIRST PASS OF A.
-     ------------------------------------------------------------*/
-  for (i = 0; i < m_loc; ++i) {
-    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
-      irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
-      jcol = Astore->colind[j];
-      gbi = BlockNum( irow );
-      gbj = BlockNum( jcol );
-      p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
-      ++nnzToSend[p]; 
-    }
-  }
-  
-  /* All-to-all communication */
-  MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t,
-		grid->comm);
-  
-  maxnnzToRecv = 0;
-  nnz_loc = SendCnt = RecvCnt = 0;
-  
-  for (p = 0; p < procs; ++p) {
-    if ( p != iam ) {
-      SendCnt += nnzToSend[p];
-      RecvCnt += nnzToRecv[p];
-      maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv );
-    } else {
-      nnz_loc += nnzToRecv[p];
-      /*assert(nnzToSend[p] == nnzToRecv[p]);*/
-    }
-  }
-  k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */
-  szbuf = k;
-
-  /* Allocate space for storing the triplets after redistribution. */
-  if ( !(ia = intMalloc_dist(2*k)) ) {
-    fprintf (stderr, "Malloc fails for ia[].");
-    return (memAux);
-  }
-  memAux += (float) (2*k*iword);
-  ja = ia + k;
-  if ( !(aij = doubleMalloc_dist(k)) ) {
-    fprintf (stderr, "Malloc fails for aij[].");
-    return (memAux);
-  }
-  memAux += (float) (k*dword);
-  
-  /* Allocate temporary storage for sending/receiving the A triplets. */
-  if ( procs > 1 ) {
-    if ( !(send_req = (MPI_Request *)
-	   SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) {
-      fprintf (stderr, "Malloc fails for send_req[].");
-      return (memAux);
-    }
-    memAux += (float) (2*procs *sizeof(MPI_Request));
-    if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) {
-      fprintf(stderr, "Malloc fails for ia_send[].");
-      return (memAux);
-    }
-    memAux += (float) (procs*sizeof(int_t*));
-    if ( !(aij_send = (double **)SUPERLU_MALLOC(procs*sizeof(double*))) ) {
-      fprintf(stderr, "Malloc fails for aij_send[].");
-      return (memAux);
-    }
-    memAux += (float) (procs*sizeof(double*));    
-    if ( !(index = intMalloc_dist(2*SendCnt)) ) {
-      fprintf(stderr, "Malloc fails for index[].");
-      return (memAux);
-    }
-    memAux += (float) (2*SendCnt*iword);
-    if ( !(nzval = doubleMalloc_dist(SendCnt)) ) {
-      fprintf(stderr, "Malloc fails for nzval[].");
-      return (memAux);
-    }
-    memAux += (float) (SendCnt * dword);
-    if ( !(ptr_to_send = intCalloc_dist(procs)) ) {
-      fprintf(stderr, "Malloc fails for ptr_to_send[].");
-      return (memAux);
-    }
-    memAux += (float) (procs * iword);
-    if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) {
-      fprintf(stderr, "Malloc fails for itemp[].");
-      return (memAux);
-    }
-    memAux += (float) (2*maxnnzToRecv*iword);
-    if ( !(dtemp = doubleMalloc_dist(maxnnzToRecv)) ) {
-      fprintf(stderr, "Malloc fails for dtemp[].");
-      return (memAux);
-    }
-    memAux += (float) (maxnnzToRecv * dword);
-    
-    for (i = 0, j = 0, p = 0; p < procs; ++p) {
-      if ( p != iam ) {
-	ia_send[p] = &index[i];
-	i += 2 * nnzToSend[p]; /* ia/ja indices alternate */
-	aij_send[p] = &nzval[j];
-	j += nnzToSend[p];
-      }
-    }
-  } /* if procs > 1 */
-  
-  nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
-  nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
-  if ( !(ainf_colptr = intCalloc_dist(ilsum_j[nsupers_j] + 1)) ) {
-    fprintf (stderr, "Malloc fails for *ainf_colptr[].");
-    return (memAux);
-  }
-  memRet += (float) (ilsum_j[nsupers_j] + 1) * iword;
-  if ( !(asup_rowptr = intCalloc_dist(ilsum_i[nsupers_i] + 1)) ) {
-    fprintf (stderr, "Malloc fails for *asup_rowptr[].");
-    return (memAux+memRet);
-  }
-  memRet += (float) (ilsum_i[nsupers_i] + 1) * iword;
-  
-  /* ------------------------------------------------------------
-     LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND.
-     THIS ACCOUNTS FOR THE SECOND PASS OF A.
-     ------------------------------------------------------------*/
-  nnz_loc = 0; /* Reset the local nonzero count. */
-  nnz_loc_ainf = nnz_loc_asup = 0;
-  nzval_a = Astore->nzval;
-  for (i = 0; i < m_loc; ++i) {
-    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
-      irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
-      jcol = Astore->colind[j];
-      gbi = BlockNum( irow );
-      gbj = BlockNum( jcol );
-      p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
-      
-      if ( p != iam ) { /* remote */
-	k = ptr_to_send[p];
-	ia_send[p][k] = irow;
-	ia_send[p][k + nnzToSend[p]] = jcol;
-	aij_send[p][k] = nzval_a[j];
-	++ptr_to_send[p]; 
-      } else {          /* local */
-	ia[nnz_loc] = irow;
-	ja[nnz_loc] = jcol;
-	aij[nnz_loc] = nzval_a[j];
-	++nnz_loc;
-	/* Count nonzeros in each column of L / row of U */
-	if (gbi >= gbj) {
-	  ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++;
-	  nnz_loc_ainf ++;
-	}
-	else {
-	  asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++;
-	  nnz_loc_asup ++;
-	}
-      }
-    }
-  }
-
-  /* ------------------------------------------------------------
-     PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION.
-     NOTE: Can possibly use MPI_Alltoallv.
-     ------------------------------------------------------------*/
-  for (p = 0; p < procs; ++p) {
-    if ( p != iam ) {
-      it = 2*nnzToSend[p];
-      MPI_Isend( ia_send[p], it, mpi_int_t,
-		 p, iam, grid->comm, &send_req[p] );
-      it = nnzToSend[p];
-      MPI_Isend( aij_send[p], it, MPI_DOUBLE,
-		 p, iam+procs, grid->comm, &send_req[procs+p] ); 
-    }
-  }
-  
-  for (p = 0; p < procs; ++p) {
-    if ( p != iam ) {
-      it = 2*nnzToRecv[p];
-      MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); 
-      it = nnzToRecv[p];
-      MPI_Recv( dtemp, it, MPI_DOUBLE, p, p+procs,
-		grid->comm, &status );
-      for (i = 0; i < nnzToRecv[p]; ++i) {
-	ia[nnz_loc] = itemp[i];
-	irow = itemp[i];
-	jcol = itemp[i + nnzToRecv[p]];
-	/* assert(jcol<n); */
-	ja[nnz_loc] = jcol;
-	aij[nnz_loc] = dtemp[i];
-	++nnz_loc;
-	
-	gbi = BlockNum( irow );
-	gbj = BlockNum( jcol );
-	/* Count nonzeros in each column of L / row of U */
-	if (gbi >= gbj) {
-	  ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++;
-	  nnz_loc_ainf ++;
-	}
-	else {
-	  asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++;
-	  nnz_loc_asup ++;
-	}
-      }
-    }
-  }
-  
-  for (p = 0; p < procs; ++p) {
-    if ( p != iam ) {
-      MPI_Wait( &send_req[p], &status);
-      MPI_Wait( &send_req[procs+p], &status);
-    }
-  }
-  
-  /* ------------------------------------------------------------
-     DEALLOCATE TEMPORARY STORAGE
-     ------------------------------------------------------------*/
-  
-  SUPERLU_FREE(nnzToRecv);
-  memAux -= 2 * procs * iword;
-  if ( procs > 1 ) {
-    SUPERLU_FREE(send_req);
-    SUPERLU_FREE(ia_send);
-    SUPERLU_FREE(aij_send);
-    SUPERLU_FREE(index);
-    SUPERLU_FREE(nzval);
-    SUPERLU_FREE(ptr_to_send);
-    SUPERLU_FREE(itemp);
-    SUPERLU_FREE(dtemp);
-    memAux -= 2*procs *sizeof(MPI_Request) + procs*sizeof(int_t*) +
-      procs*sizeof(double*) + 2*SendCnt * iword +
-      SendCnt* dword + procs*iword +
-      2*maxnnzToRecv*iword + maxnnzToRecv*dword;
-  }
-  
-  /* ------------------------------------------------------------
-     CONVERT THE TRIPLET FORMAT.
-     ------------------------------------------------------------*/
-  if (nnz_loc_ainf != 0) {
-    if ( !(ainf_rowind = intMalloc_dist(nnz_loc_ainf)) ) {
-      fprintf (stderr, "Malloc fails for *ainf_rowind[].");
-      return (memAux+memRet);
-    }
-    memRet += (float) (nnz_loc_ainf * iword);
-    if ( !(ainf_val = doubleMalloc_dist(nnz_loc_ainf)) ) {
-      fprintf (stderr, "Malloc fails for *ainf_val[].");
-      return (memAux+memRet);
-    }
-    memRet += (float) (nnz_loc_ainf * dword);
-  }
-  else {
-    ainf_rowind = NULL;
-    ainf_val = NULL;
-  }
-  if (nnz_loc_asup != 0) {
-    if ( !(asup_colind = intMalloc_dist(nnz_loc_asup)) ) {
-      fprintf (stderr, "Malloc fails for *asup_colind[].");
-      return (memAux + memRet);
-    }
-    memRet += (float) (nnz_loc_asup * iword);
-    if ( !(asup_val = doubleMalloc_dist(nnz_loc_asup)) ) {
-      fprintf (stderr, "Malloc fails for *asup_val[].");
-      return (memAux  + memRet);
-    }
-    memRet += (float) (nnz_loc_asup * dword);
-  }
-  else {
-    asup_colind = NULL;
-    asup_val = NULL;
-  }
-
-  /* Initialize the array of column pointers */
-  k = 0; 
-  jsize = ainf_colptr[0];  ainf_colptr[0] = 0; 
-  for (j = 1; j < ilsum_j[nsupers_j]; j++) {
-    k += jsize;              
-    jsize = ainf_colptr[j];  
-    ainf_colptr[j] = k;
-  }
-  ainf_colptr[ilsum_j[nsupers_j]] = k + jsize;
-  i = 0;
-  isize = asup_rowptr[0];  asup_rowptr[0] = 0;
-  for (j = 1; j < ilsum_i[nsupers_i]; j++) {
-    i += isize;
-    isize = asup_rowptr[j];  
-    asup_rowptr[j] = i;
-  }
-  asup_rowptr[ilsum_i[nsupers_i]] = i + isize;
-
-  /* Copy the triplets into the column oriented storage */
-  for (i = 0; i < nnz_loc; ++i) {
-    jcol = ja[i];
-    irow = ia[i];
-    gbi = BlockNum( irow );
-    gbj = BlockNum( jcol );
-    /* Count nonzeros in each column of L / row of U */
-    if (gbi >= gbj) {
-      j = ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj );
-      k = ainf_colptr[j];
-      ainf_rowind[k] = irow;
-      ainf_val[k] = aij[i];
-      ainf_colptr[j] ++;
-    }
-    else {
-      j = ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi );
-      k = asup_rowptr[j];
-      asup_colind[k] = jcol;
-      asup_val[k] = aij[i];
-      asup_rowptr[j] ++;
-    }
-  }
-
-  /* Reset the column pointers to the beginning of each column */
-  for (j = ilsum_j[nsupers_j]; j > 0; j--) 
-    ainf_colptr[j] = ainf_colptr[j-1];
-  for (j = ilsum_i[nsupers_i]; j > 0; j--) 
-    asup_rowptr[j] = asup_rowptr[j-1];
-  ainf_colptr[0] = 0;
-  asup_rowptr[0] = 0;
-  
-  SUPERLU_FREE(ia);
-  SUPERLU_FREE(aij);
-  memAux -= 2*szbuf*iword + szbuf*dword;
-  
-  *p_ainf_colptr = ainf_colptr;
-  *p_ainf_rowind = ainf_rowind; 
-  *p_ainf_val    = ainf_val;
-  *p_asup_rowptr = asup_rowptr;
-  *p_asup_colind = asup_colind;
-  *p_asup_val    = asup_val;
-
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Exit ddist_A()");
-  fprintf (stdout, "Size of allocated memory (MB) %.3f\n", memRet*1e-6);
-#endif
-
-  return (-memRet);
-} /* dist_A */
-
-int_t
-ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A,
-		ScalePermstruct_t *ScalePermstruct,
-		Pslu_freeable_t *Pslu_freeable, 
-		LUstruct_t *LUstruct, gridinfo_t *grid)
-/*
- *
- *
- * Purpose
- * =======
- *   Distribute the input matrix onto the 2D process mesh.
- * 
- * Arguments
- * =========
- * 
- * fact (input) fact_t
- *        Specifies whether or not the L and U structures will be re-used.
- *        = SamePattern_SameRowPerm: L and U structures are input, and
- *                                   unchanged on exit.
- *          This routine should not be called for this case, an error
- *          is generated.  Instead, pddistribute routine should be called.
- *        = DOFACT or SamePattern: L and U structures are computed and output.
- *
- * n      (Input) int
- *        Dimension of the matrix.
- *
- * A      (Input) SuperMatrix*
- *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
- *        A may be overwritten by diag(R)*A*diag(C)*Pc^T.
- *        The type of A can be: Stype = NR; Dtype = SLU_D; Mtype = GE.
- *
- * ScalePermstruct (Input) ScalePermstruct_t*
- *        The data structure to store the scaling and permutation vectors
- *        describing the transformations performed to the original matrix A.
- *
- * Glu_freeable (Input) *Glu_freeable_t
- *        The global structure describing the graph of L and U.
- * 
- * LUstruct (Input) LUstruct_t*
- *        Data structures for L and U factors.
- *
- * grid   (Input) gridinfo_t*
- *        The 2D process mesh.
- *
- * Return value
- * ============
- *   < 0, number of bytes allocated on return from the dist_symbLU
- *   > 0, number of bytes allocated for performing the distribution
- *       of the data, when out of memory.
- *        (an approximation).
- *
- */
-{
-  Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-  Glu_freeable_t Glu_freeable_n;
-  LocalLU_t *Llu = LUstruct->Llu;
-  int_t bnnz, fsupc, i, irow, istart, j, jb, jj, k, 
-    len, len1, nsupc, nsupc_gb, ii, nprocs;
-  int_t ljb;  /* local block column number */
-  int_t nrbl; /* number of L blocks in current block column */
-  int_t nrbu; /* number of U blocks in current block column */
-  int_t gb;   /* global block number; 0 < gb <= nsuper */
-  int_t lb;   /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
-  int iam, jbrow, jbcol, jcol, kcol, mycol, myrow, pc, pr, ljb_i, ljb_j, p;
-  int_t mybufmax[NBUFFERS];
-  NRformat_loc *Astore;
-  double *a;
-  int_t *asub, *xa;
-  int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind;
-  double *asup_val, *ainf_val;
-  int_t *xsup, *supno;    /* supernode and column mapping */
-  int_t *lsub, *xlsub, *usub, *xusub;
-  int_t nsupers, nsupers_i, nsupers_j, nsupers_ij;
-  int_t next_ind;      /* next available position in index[*] */
-  int_t next_val;      /* next available position in nzval[*] */
-  int_t *index;         /* indices consist of headers and row subscripts */
-  double *lusup, *uval; /* nonzero values in L and U */
-  int_t *recvBuf;
-  int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend;
-  double **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
-  int_t  **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
-  double **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
-  int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
-  
-  /*-- Counts to be used in factorization. --*/
-  int_t  *ToRecv, *ToSendD, **ToSendR;
-  
-  /*-- Counts to be used in lower triangular solve. --*/
-  int_t  *fmod;          /* Modification count for L-solve.        */
-  int_t  **fsendx_plist; /* Column process list to send down Xk.   */
-  int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
-  int_t  nfsendx = 0;    /* Number of Xk I will send               */
-  int_t  kseen;
-  
-  /*-- Counts to be used in upper triangular solve. --*/
-  int_t  *bmod;          /* Modification count for U-solve.        */
-  int_t  **bsendx_plist; /* Column process list to send down Xk.   */
-  int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
-  int_t  nbsendx = 0;    /* Number of Xk I will send               */  
-  int_t  *ilsum;         /* starting position of each supernode in 
-			    the full array (local)                 */  
-  int_t  *ilsum_j, ldaspa_j; /* starting position of each supernode in 
-				the full array (local, block column wise) */  
-  /*-- Auxiliary arrays; freed on return --*/
-  int_t *Urb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
-  int_t *LUb_length; /* L,U block length; size nsupers_ij */
-  int_t *LUb_indptr; /* pointers to L,U index[]; size nsupers_ij */
-  int_t *LUb_number; /* global block number; size nsupers_ij */
-  int_t *LUb_valptr; /* pointers to U nzval[]; size ceil(NSUPERS/Pc)      */
-  int_t *Lrb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
-  double *dense, *dense_col; /* SPA */
-  double zero = 0.0;
-  int_t ldaspa;     /* LDA of SPA */
-  int_t iword, dword;
-  float memStrLU, memA,
-    memDist = 0.; /* memory used for redistributing the data, which does
-		   not include the memory for the numerical values of L and U */
-  float  memNLU = 0.; /* memory allocated for storing the numerical values of 
-		    L and U, that will be used in the numeric factorization */
-
-#if ( PRNTlevel>=1 )
-  int_t nLblocks = 0, nUblocks = 0;
-#endif
-  
-  /* Initialization. */
-  iam = grid->iam;
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Enter dist_psymbtonum()");
-#endif
-  myrow = MYROW( iam, grid );
-  mycol = MYCOL( iam, grid );
-  nprocs = grid->npcol * grid->nprow;
-  for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0;
-  Astore   = (NRformat_loc *) A->Store;
-  
-  iword = sizeof(int_t);
-  dword = sizeof(double);
-
-  if (fact == SamePattern_SameRowPerm) {
-    ABORT ("ERROR: call of dist_psymbtonum with fact equals SamePattern_SameRowPerm.");  
-  }
-
-  if ((memStrLU = 
-       dist_symbLU (n, Pslu_freeable, 
-		    Glu_persist, &xlsub, &lsub, &xusub, &usub,	grid)) > 0)
-    return (memStrLU);
-  memDist += (-memStrLU);
-  xsup  = Glu_persist->xsup;    /* supernode and column mapping */
-  supno = Glu_persist->supno;   
-  nsupers  = supno[n-1] + 1;
-  nsupers_i = CEILING( nsupers, grid->nprow );/* No of local row blocks */
-  nsupers_j = CEILING( nsupers, grid->npcol );/* No of local column blocks */
-  nsupers_ij = SUPERLU_MAX(nsupers_i, nsupers_j);
-  if ( !(ilsum = intMalloc_dist(nsupers_i+1)) ) {
-    fprintf (stderr, "Malloc fails for ilsum[].");  
-    return (memDist + memNLU);
-  }
-  memNLU += (nsupers_i+1) * iword;
-  if ( !(ilsum_j = intMalloc_dist(nsupers_j+1)) ) {
-    fprintf (stderr, "Malloc fails for ilsum_j[].");
-    return (memDist + memNLU);
-  }
-  memDist += (nsupers_j+1) * iword;
-
-  /* Compute ldaspa and ilsum[], ldaspa_j and ilsum_j[]. */
-  ilsum[0] = 0;
-  ldaspa = 0;
-  for (gb = 0; gb < nsupers; gb++) 
-    if ( myrow == PROW( gb, grid ) ) {
-      i = SuperSize( gb );
-      ldaspa += i;
-      lb = LBi( gb, grid );
-      ilsum[lb + 1] = ilsum[lb] + i;
-    }
-  ilsum[nsupers_i] = ldaspa;
-
-  ldaspa_j = 0; ilsum_j[0] = 0;  
-  for (gb = 0; gb < nsupers; gb++) 
-    if (mycol == PCOL( gb, grid )) {
-      i = SuperSize( gb );
-      ldaspa_j += i;
-      lb = LBj( gb, grid );
-      ilsum_j[lb + 1] = ilsum_j[lb] + i;
-    }
-  ilsum_j[nsupers_j] = ldaspa_j;
-  
-  if ((memA = ddist_A(A, ScalePermstruct, Glu_persist,
-		      grid, &ainf_colptr, &ainf_rowind, &ainf_val,
-		      &asup_rowptr, &asup_colind, &asup_val,
-		      ilsum, ilsum_j)) > 0)
-    return (memDist + memA + memNLU);
-  memDist += (-memA);
-
-  /* ------------------------------------------------------------
-     FIRST TIME CREATING THE L AND U DATA STRUCTURES.
-     ------------------------------------------------------------*/
-  
-  /* We first need to set up the L and U data structures and then
-   * propagate the values of A into them.
-   */
-  if ( !(ToRecv = intCalloc_dist(nsupers)) ) {
-    fprintf(stderr, "Calloc fails for ToRecv[].");
-    return (memDist + memNLU);
-  }
-  memNLU += nsupers * iword;
-  
-  k = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */
-  if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) {
-    fprintf(stderr, "Malloc fails for ToSendR[].");
-    return (memDist + memNLU);
-  }
-  memNLU += k*sizeof(int_t*);
-  j = k * grid->npcol;
-  if ( !(index = intMalloc_dist(j)) ) {
-    fprintf(stderr, "Malloc fails for index[].");
-    return (memDist + memNLU);
-  }
-  memNLU += j*iword;
-  
-  for (i = 0; i < j; ++i) index[i] = EMPTY;
-  for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j];
-  
-  /* Auxiliary arrays used to set up L and U block data structures.
-     They are freed on return. */
-  if ( !(LUb_length = intCalloc_dist(nsupers_ij)) ) {
-    fprintf(stderr, "Calloc fails for LUb_length[].");
-    return (memDist + memNLU);
-  }
-  if ( !(LUb_indptr = intMalloc_dist(nsupers_ij)) ) {
-    fprintf(stderr, "Malloc fails for LUb_indptr[].");
-    return (memDist + memNLU);
-  }
-  if ( !(LUb_number = intCalloc_dist(nsupers_ij)) ) {
-    fprintf(stderr, "Calloc fails for LUb_number[].");
-    return (memDist + memNLU);
-  }    
-  if ( !(LUb_valptr = intCalloc_dist(nsupers_ij)) ) {
-    fprintf(stderr, "Calloc fails for LUb_valptr[].");
-    return (memDist + memNLU);
-  }
-  memDist += 4 * nsupers_ij * iword;
-  
-  k = CEILING( nsupers, grid->nprow ); 
-  /* Pointers to the beginning of each block row of U. */
-  if ( !(Unzval_br_ptr = 
-	 (double**)SUPERLU_MALLOC(nsupers_i * sizeof(double*))) ) {
-    fprintf(stderr, "Malloc fails for Unzval_br_ptr[].");
-    return (memDist + memNLU);
-  }
-  if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(nsupers_i * sizeof(int_t*))) ) {
-    fprintf(stderr, "Malloc fails for Ufstnz_br_ptr[].");
-    return (memDist + memNLU);
-  }
-  memNLU += nsupers_i*sizeof(double*) + nsupers_i*sizeof(int_t*);
-  Unzval_br_ptr[nsupers_i-1] = NULL;
-  Ufstnz_br_ptr[nsupers_i-1] = NULL;
-
-  if ( !(ToSendD = intCalloc_dist(nsupers_i)) ) {
-    fprintf(stderr, "Malloc fails for ToSendD[].");
-    return (memDist + memNLU);
-  }
-  memNLU += nsupers_i*iword;  
-  if ( !(Urb_marker = intCalloc_dist(nsupers_j))) {
-    fprintf(stderr, "Calloc fails for rb_marker[].");
-    return (memDist + memNLU);
-  }
-  if ( !(Lrb_marker = intCalloc_dist( nsupers_i ))) {
-    fprintf(stderr, "Calloc fails for rb_marker[].");
-    return (memDist + memNLU);
-  }
-  memDist += (nsupers_i + nsupers_j)*iword;
-  
-  /* Auxiliary arrays used to set up L, U block data structures.
-     They are freed on return.
-     k is the number of local row blocks.   */
-  if ( !(dense = doubleCalloc_dist(SUPERLU_MAX(ldaspa, ldaspa_j) 
-				   * sp_ienv_dist(3))) ) {
-    fprintf(stderr, "Calloc fails for SPA dense[].");
-    return (memDist + memNLU);
-  }
-  /* These counts will be used for triangular solves. */
-  if ( !(fmod = intCalloc_dist(nsupers_i)) ) {
-    fprintf(stderr, "Calloc fails for fmod[].");
-    return (memDist + memNLU);
-  }
-  if ( !(bmod = intCalloc_dist(nsupers_i)) ) {
-    fprintf(stderr, "Calloc fails for bmod[].");
-    return (memDist + memNLU);
-  }
-  /* ------------------------------------------------ */
-  memNLU += 2*nsupers_i*iword + 
-    SUPERLU_MAX(ldaspa, ldaspa_j)*sp_ienv_dist(3)*dword; 
-  
-  /* Pointers to the beginning of each block column of L. */
-  if ( !(Lnzval_bc_ptr = 
-	 (double**)SUPERLU_MALLOC(nsupers_j * sizeof(double*))) ) {
-    fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[].");
-    return (memDist + memNLU);
-  }
-  if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ) {
-    fprintf(stderr, "Malloc fails for Lrowind_bc_ptr[].");
-    return (memDist + memNLU);
-  }
-  memNLU += nsupers_j * sizeof(double*) + nsupers_j * sizeof(int_t*);
-  Lnzval_bc_ptr[nsupers_j-1] = NULL;
-  Lrowind_bc_ptr[nsupers_j-1] = NULL;
-  
-  /* These lists of processes will be used for triangular solves. */
-  if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) {
-    fprintf(stderr, "Malloc fails for fsendx_plist[].");
-    return (memDist + memNLU);
-  }
-  len = nsupers_j * grid->nprow;
-  if ( !(index = intMalloc_dist(len)) ) {
-    fprintf(stderr, "Malloc fails for fsendx_plist[0]");
-    return (memDist + memNLU);
-  }
-  for (i = 0; i < len; ++i) index[i] = EMPTY;
-  for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow)
-    fsendx_plist[i] = &index[j];
-  if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) {
-    fprintf(stderr, "Malloc fails for bsendx_plist[].");
-    return (memDist + memNLU);
-  }
-  if ( !(index = intMalloc_dist(len)) ) {
-    fprintf(stderr, "Malloc fails for bsendx_plist[0]");
-    return (memDist + memNLU);
-  }
-  for (i = 0; i < len; ++i) index[i] = EMPTY;
-  for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow)
-    bsendx_plist[i] = &index[j];
-  /* -------------------------------------------------------------- */
-  memNLU += 2*nsupers_j*sizeof(int_t*) + 2*len*iword;
-  
-  /*------------------------------------------------------------
-    PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
-    THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
-    ------------------------------------------------------------*/
-  for (jb = 0; jb < nsupers; jb++) {
-    jbcol = PCOL( jb, grid );
-    jbrow = PROW( jb, grid );
-    ljb_j = LBj( jb, grid ); /* Local block number column wise */
-    ljb_i = LBi( jb, grid);  /* Local block number row wise */
-    fsupc = FstBlockC( jb );
-    nsupc = SuperSize( jb );
-    
-    if ( myrow == jbrow ) { /* Block row jb in my process row */
-      /* Scatter A into SPA. */
-      for (j = ilsum[ljb_i], dense_col = dense; j < ilsum[ljb_i]+nsupc; j++) {
-	for (i = asup_rowptr[j]; i < asup_rowptr[j+1]; i++) {
-	  if (i >= asup_rowptr[ilsum[nsupers_i]]) 
-	    printf ("ERR7\n");
-	  jcol = asup_colind[i];
-	  if (jcol >= n)
-	    printf ("Pe[%d] ERR distsn jb %d gb %d j %d jcol %d\n",
-		    iam, jb, gb, j, jcol);
-	  gb = BlockNum( jcol );
-	  lb = LBj( gb, grid );
-	  if (gb >= nsupers || lb >= nsupers_j) printf ("ERR8\n");
-	  jcol = ilsum_j[lb] + jcol - FstBlockC( gb );
-	  if (jcol >= ldaspa_j)
-	    printf ("Pe[%d] ERR1 jb %d gb %d j %d jcol %d\n",
-		    iam, jb, gb, j, jcol);
-	  dense_col[jcol] = asup_val[i];
-	}
-	dense_col += ldaspa_j;
-      }
-      
-      /*------------------------------------------------
-       * SET UP U BLOCKS.
-       *------------------------------------------------*/
-      /* Count number of blocks and length of each block. */
-      nrbu = 0;
-      len = 0; /* Number of column subscripts I own. */
-      len1 = 0; /* number of fstnz subscripts */
-      for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) {
-	if (i >= xusub[nsupers_i]) printf ("ERR10\n");
-	jcol = usub[i];
-	gb = BlockNum( jcol ); /* Global block number */
-	
-	/*if (fsupc <= 146445 && 146445 < fsupc + nsupc && jcol == 397986)
-	  printf ("Pe[%d] [%d %d] elt [%d] jbcol %d pc %d\n",
-	  iam, jb, gb, jcol, jbcol, pc); */
-	
-	lb = LBj( gb, grid );  /* Local block number */
-	pc = PCOL( gb, grid ); /* Process col owning this block */
-	if (mycol == jbcol) ToSendR[ljb_j][pc] = YES;
-	/* if (mycol == jbcol && mycol != pc) ToSendR[ljb_j][pc] = YES; */
-	pr = PROW( gb, grid );
-	if ( pr != jbrow  && mycol == pc)
-	  bsendx_plist[lb][jbrow] = YES; 
-	if (mycol == pc) {
-	  len += nsupc;
-	  LUb_length[lb] += nsupc;
-	  ToSendD[ljb_i] = YES;
-	  if (Urb_marker[lb] <= jb) { /* First see this block */
-	    if (Urb_marker[lb] == FALSE && gb != jb && myrow != pr) nbrecvx ++;
-	    Urb_marker[lb] = jb + 1;
-	    LUb_number[nrbu] = gb;
-	    /* if (gb == 391825 && jb == 145361)
-	       printf ("Pe[%d] T1 [%d %d] nrbu %d \n",
-	       iam, jb, gb, nrbu); */
-	    nrbu ++;
-	    len1 += SuperSize( gb );
-	    if ( gb != jb )/* Exclude diagonal block. */
-	      ++bmod[ljb_i];/* Mod. count for back solve */
-#if ( PRNTlevel>=1 )
-	    ++nUblocks;
-#endif
-	  }
-	}
-      } /* for i ... */
-      
-      if ( nrbu ) { 
-	/* Sort the blocks of U in increasing block column index.
-	   SuperLU_DIST assumes this is true */
-	/* simple insert sort algorithm */
-	/* to be transformed in quick sort */
-	for (j = 1; j < nrbu; j++) {
-	  k = LUb_number[j];
-	  for (i=j-1; i>=0 && LUb_number[i] > k; i--) {
-	    LUb_number[i+1] = LUb_number[i];
-	  }
-	  LUb_number[i+1] = k;
-	} 
-	
-	/* Set up the initial pointers for each block in
-	   index[] and nzval[]. */
-	/* Add room for descriptors */
-	len1 += BR_HEADER + nrbu * UB_DESCRIPTOR;
-	if ( !(index = intMalloc_dist(len1+1)) ) {
-	  fprintf (stderr, "Malloc fails for Uindex[]");
-	  return (memDist + memNLU);
-	}
-	Ufstnz_br_ptr[ljb_i] = index;
-	if (!(Unzval_br_ptr[ljb_i] =
-	      doubleMalloc_dist(len))) {
-	  fprintf (stderr, "Malloc fails for Unzval_br_ptr[*][]");
-	  return (memDist + memNLU);
-	}
-	memNLU += (len1+1)*iword + len*dword;
-	uval = Unzval_br_ptr[ljb_i];
-	mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
-	mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
-	index[0] = nrbu;  /* Number of column blocks */
-	index[1] = len;   /* Total length of nzval[] */
-	index[2] = len1;  /* Total length of index */
-	index[len1] = -1; /* End marker */
-	next_ind = BR_HEADER;
-	next_val = 0;
-	for (k = 0; k < nrbu; k++) {
-	  gb = LUb_number[k];
-	  lb = LBj( gb, grid );
-	  len = LUb_length[lb];
-	  LUb_length[lb] = 0;  /* Reset vector of block length */
-	  index[next_ind++] = gb; /* Descriptor */
-	  index[next_ind++] = len;
-	  LUb_indptr[lb] = next_ind;
-	  for (; next_ind < LUb_indptr[lb] + SuperSize( gb ); next_ind++)
-	    index[next_ind] = FstBlockC( jb + 1 );
-	  LUb_valptr[lb] = next_val;
-	  next_val += len;
-	}
-	/* Propagate the fstnz subscripts to Ufstnz_br_ptr[],
-	   and the initial values of A from SPA into Unzval_br_ptr[]. */
-	for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) {
-	  jcol = usub[i];
-	  gb = BlockNum( jcol );
-	  
-	  if ( mycol == PCOL( gb, grid ) ) {
-	    lb = LBj( gb, grid );
-	    k = LUb_indptr[lb]; /* Start fstnz in index */
-	    index[k + jcol - FstBlockC( gb )] = FstBlockC( jb );
-	  }
-	}  /* for i ... */
-	
-	for (i = 0; i < nrbu; i++) {
-	  gb = LUb_number[i];
-	  lb = LBj( gb, grid );   
-	  next_ind = LUb_indptr[lb];
-	  k = FstBlockC( jb + 1);
-	  jcol = ilsum_j[lb];
-	  for (jj = 0; jj < SuperSize( gb ); jj++, jcol++) {
-	    dense_col = dense;
-	    j = index[next_ind+jj];
-	    for (ii = j; ii < k; ii++) {
-	      uval[LUb_valptr[lb]++] = dense_col[jcol];
-	      dense_col[jcol] = zero;
-	      dense_col += ldaspa_j;	      
-	    }
-	  }
-	}
-      } else {
-	Ufstnz_br_ptr[ljb_i] = NULL;
-	Unzval_br_ptr[ljb_i] = NULL;
-      } /* if nrbu ... */	
-    } /* if myrow == jbrow */
-    
-      /*------------------------------------------------
-       * SET UP L BLOCKS.
-       *------------------------------------------------*/
-    if (mycol == jbcol) {  /* Block column jb in my process column */
-      /* Scatter A_inf into SPA. */
-      for (j = ilsum_j[ljb_j], dense_col = dense; j < ilsum_j[ljb_j] + nsupc; j++) {
-	for (i = ainf_colptr[j]; i < ainf_colptr[j+1]; i++) {
-	  irow = ainf_rowind[i];
-	  if (irow >= n) printf ("Pe[%d] ERR1\n", iam);
-	  gb = BlockNum( irow );
-	  if (gb >= nsupers) printf ("Pe[%d] ERR5\n", iam);
-	  if ( myrow == PROW( gb, grid ) ) {
-	    lb = LBi( gb, grid );
-	    irow = ilsum[lb] + irow - FstBlockC( gb );
-	    if (irow >= ldaspa) printf ("Pe[%d] ERR0\n", iam);
-	    dense_col[irow] = ainf_val[i];
-	  }
-	}
-	dense_col += ldaspa;
-      }      
-      
-      /* sort the indices of the diagonal block at the beginning of xlsub */
-      if (myrow == jbrow) {
-	k = xlsub[ljb_j];
-	for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) {
-	  irow = lsub[i];
-	  if (irow < nsupc + fsupc && i != k+irow-fsupc) {
-	    lsub[i] = lsub[k + irow - fsupc];
-	    lsub[k + irow - fsupc] = irow;
-	    i --;
-	  }
-	}
-      }
-      
-      /* Count number of blocks and length of each block. */
-      nrbl = 0;
-      len = 0; /* Number of row subscripts I own. */
-      kseen = 0;
-      for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) {
-	irow = lsub[i];
-	gb = BlockNum( irow ); /* Global block number */	  
-	pr = PROW( gb, grid ); /* Process row owning this block */
-	if ( pr != jbrow && fsendx_plist[ljb_j][pr] == EMPTY &&
-	     myrow == jbrow) {
-	  fsendx_plist[ljb_j][pr] = YES;
-	  ++nfsendx;
-	}
-	if ( myrow == pr ) {
-	  lb = LBi( gb, grid );  /* Local block number */
-	  if (Lrb_marker[lb] <= jb) { /* First see this block */
-	    Lrb_marker[lb] = jb + 1;
-	    LUb_length[lb] = 1;
-	    LUb_number[nrbl++] = gb;
-	    if ( gb != jb ) /* Exclude diagonal block. */
-	      ++fmod[lb]; /* Mod. count for forward solve */
-	    if ( kseen == 0 && myrow != jbrow ) {
-	      ++nfrecvx;
-	      kseen = 1;
-	    }
-#if ( PRNTlevel>=1 )
-	    ++nLblocks;
-#endif
-	  } else 
-	    ++LUb_length[lb];	    
-	  ++len;
-	}
-      } /* for i ... */
-      
-      if ( nrbl ) { /* Do not ensure the blocks are sorted! */
-	/* Set up the initial pointers for each block in 
-	   index[] and nzval[]. */
-	/* If I am the owner of the diagonal block, order it first in LUb_number.
-	   Necessary for SuperLU_DIST routines */
-	kseen = EMPTY;
-	for (j = 0; j < nrbl; j++) {
-	  if (LUb_number[j] == jb)
-	    kseen = j;
-	}
-	if (kseen != EMPTY && kseen != 0) {
-	  LUb_number[kseen] = LUb_number[0];
-	  LUb_number[0] = jb;
-	}
-	
-	/* Add room for descriptors */
-	len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
-	if ( !(index = intMalloc_dist(len1)) ) {
-	  fprintf (stderr, "Malloc fails for index[]");
-	  return (memDist + memNLU);
-	}
-	Lrowind_bc_ptr[ljb_j] = index;
-	if (!(Lnzval_bc_ptr[ljb_j] = 
-	      doubleMalloc_dist(len*nsupc))) {
-	  fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block %d ", jb);
-	  return (memDist + memNLU);
-	}
-	memNLU += len1*iword + len*nsupc*dword;
-	
-	lusup = Lnzval_bc_ptr[ljb_j];
-	mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
-	mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
-	mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
-	index[0] = nrbl;  /* Number of row blocks */
-	index[1] = len;   /* LDA of the nzval[] */
-	next_ind = BC_HEADER;
-	next_val = 0;
-	for (k = 0; k < nrbl; ++k) {
-	  gb = LUb_number[k];
-	  lb = LBi( gb, grid );
-	  len = LUb_length[lb];
-	  LUb_length[lb] = 0;
-	  index[next_ind++] = gb; /* Descriptor */
-	  index[next_ind++] = len; 
-	  LUb_indptr[lb] = next_ind;
-	    LUb_valptr[lb] = next_val;
-	    next_ind += len;
-	    next_val += len;
-	  }
-	  /* Propagate the compressed row subscripts to Lindex[],
-	     and the initial values of A from SPA into Lnzval[]. */
-	  len = index[1];  /* LDA of lusup[] */
-	  for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) {
-	    irow = lsub[i];
-	    gb = BlockNum( irow );
-	    if ( myrow == PROW( gb, grid ) ) {
-	      lb = LBi( gb, grid );
-	      k = LUb_indptr[lb]++; /* Random access a block */
-	      index[k] = irow;
-	      k = LUb_valptr[lb]++;
-	      irow = ilsum[lb] + irow - FstBlockC( gb );
-	      for (j = 0, dense_col = dense; j < nsupc; ++j) {
-		lusup[k] = dense_col[irow];
-		dense_col[irow] = zero;
-		k += len;
-		dense_col += ldaspa;
-	      }
-	    }
-	  } /* for i ... */
-	} else {
-	  Lrowind_bc_ptr[ljb_j] = NULL;
-	  Lnzval_bc_ptr[ljb_j] = NULL;
-	} /* if nrbl ... */		  
-      } /* if mycol == pc */
-  } /* for jb ... */
-
-  SUPERLU_FREE(ilsum_j);
-  SUPERLU_FREE(Urb_marker);
-  SUPERLU_FREE(LUb_length);
-  SUPERLU_FREE(LUb_indptr);
-  SUPERLU_FREE(LUb_number);
-  SUPERLU_FREE(LUb_valptr);
-  SUPERLU_FREE(Lrb_marker);
-  SUPERLU_FREE(dense);
-  
-  /* Free the memory used for storing L and U */
-  SUPERLU_FREE(xlsub); SUPERLU_FREE(xusub);
-  if (lsub != NULL)
-    SUPERLU_FREE(lsub);  
-  if (usub != NULL)
-    SUPERLU_FREE(usub);
-  
-  /* Free the memory used for storing A */
-  SUPERLU_FREE(ainf_colptr);
-  if (ainf_rowind != NULL) {
-    SUPERLU_FREE(ainf_rowind);
-    SUPERLU_FREE(ainf_val);
-  }
-  SUPERLU_FREE(asup_rowptr);
-  if (asup_colind != NULL) {
-    SUPERLU_FREE(asup_colind);	
-    SUPERLU_FREE(asup_val);	
-  }
-  
-  /* exchange information about bsendx_plist in between column of processors */
-  k = SUPERLU_MAX( grid->nprow, grid->npcol);
-  if ( !(recvBuf = (int_t *) SUPERLU_MALLOC(nsupers*k*iword)) ) {
-    fprintf (stderr, "Malloc fails for recvBuf[].");
-    return (memDist + memNLU);
-  }
-  if ( !(nnzToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
-    fprintf (stderr, "Malloc fails for nnzToRecv[].");
-    return (memDist + memNLU);
-  }
-  if ( !(ptrToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
-    fprintf (stderr, "Malloc fails for ptrToRecv[].");
-    return (memDist + memNLU);
-  }
-  if ( !(nnzToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
-    fprintf (stderr, "Malloc fails for nnzToRecv[].");
-    return (memDist + memNLU);
-  }
-  if ( !(ptrToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
-    fprintf (stderr, "Malloc fails for ptrToRecv[].");
-    return (memDist + memNLU);
-  }
-  
-  if (memDist < (nsupers*k*iword +4*nprocs * sizeof(int)))
-    memDist = nsupers*k*iword +4*nprocs * sizeof(int);
-  
-  for (p = 0; p < nprocs; p++)
-    nnzToRecv[p] = 0;
-  
-  for (jb = 0; jb < nsupers; jb++) {
-    jbcol = PCOL( jb, grid );
-    jbrow = PROW( jb, grid );
-    p = PNUM(jbrow, jbcol, grid);
-    nnzToRecv[p] += grid->npcol;
-  }    
-  i = 0;
-  for (p = 0; p < nprocs; p++) {
-    ptrToRecv[p] = i;
-    i += nnzToRecv[p];
-    ptrToSend[p] = 0;
-    if (p != iam)
-      nnzToSend[p] = nnzToRecv[iam];
-    else
-      nnzToSend[p] = 0;
-  }
-  nnzToRecv[iam] = 0;
-  i = ptrToRecv[iam];
-  for (jb = 0; jb < nsupers; jb++) {
-    jbcol = PCOL( jb, grid );
-    jbrow = PROW( jb, grid );
-    p = PNUM(jbrow, jbcol, grid);
-    if (p == iam) {
-      ljb_j = LBj( jb, grid ); /* Local block number column wise */	
-      for (j = 0; j < grid->npcol; j++, i++)
-	recvBuf[i] = ToSendR[ljb_j][j];
-    }
-  }   
-  
-  MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, mpi_int_t,
-		 recvBuf, nnzToRecv, ptrToRecv, mpi_int_t, grid->comm);
-  
-  for (jb = 0; jb < nsupers; jb++) {
-    jbcol = PCOL( jb, grid );
-    jbrow = PROW( jb, grid );
-    p = PNUM(jbrow, jbcol, grid);
-    ljb_j = LBj( jb, grid ); /* Local block number column wise */	
-    ljb_i = LBi( jb, grid ); /* Local block number row wise */	
-    /* (myrow == jbrow) {
-       if (ToSendD[ljb_i] == YES)
-       ToRecv[jb] = 1;
-       }
-       else {
-       if (recvBuf[ptrToRecv[p] + mycol] == YES)
-       ToRecv[jb] = 2;
-       } */
-    if (recvBuf[ptrToRecv[p] + mycol] == YES) {
-      if (myrow == jbrow)
-	ToRecv[jb] = 1;
-      else
-	ToRecv[jb] = 2;
-    }
-    if (mycol == jbcol) {
-      for (i = 0, j = ptrToRecv[p]; i < grid->npcol; i++, j++) 
-	ToSendR[ljb_j][i] = recvBuf[j];  
-      ToSendR[ljb_j][mycol] = EMPTY;
-    }
-    ptrToRecv[p] += grid->npcol;
-  }   
-  
-  /* exchange information about bsendx_plist in between column of processors */
-  MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, mpi_int_t,
-		 MPI_MAX, grid->cscp.comm);
-  
-  for (jb = 0; jb < nsupers; jb ++) {
-    jbcol = PCOL( jb, grid);
-    jbrow = PROW( jb, grid);
-    if (mycol == jbcol) {
-      ljb_j = LBj( jb, grid ); /* Local block number column wise */	
-      if (myrow == jbrow ) {
-	for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) {
-	  (*bsendx_plist)[k] = recvBuf[k];
-	  if ((*bsendx_plist)[k] != EMPTY)
-	    nbsendx ++;
-	}
-      }
-      else {
-	for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) 
-	  (*bsendx_plist)[k] = EMPTY;
-      }
-    }
-  }
-  
-  SUPERLU_FREE(nnzToRecv);
-  SUPERLU_FREE(ptrToRecv);
-  SUPERLU_FREE(nnzToSend);
-  SUPERLU_FREE(ptrToSend);
-  SUPERLU_FREE(recvBuf);
-  
-  Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
-  Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
-  Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
-  Llu->Unzval_br_ptr = Unzval_br_ptr;
-  Llu->ToRecv = ToRecv;
-  Llu->ToSendD = ToSendD;
-  Llu->ToSendR = ToSendR;
-  Llu->fmod = fmod;
-  Llu->fsendx_plist = fsendx_plist;
-  Llu->nfrecvx = nfrecvx;
-  Llu->nfsendx = nfsendx;
-  Llu->bmod = bmod;
-  Llu->bsendx_plist = bsendx_plist;
-  Llu->nbrecvx = nbrecvx;
-  Llu->nbsendx = nbsendx;
-  Llu->ilsum = ilsum;
-  Llu->ldalsum = ldaspa;
-  LUstruct->Glu_persist = Glu_persist;	
-#if ( PRNTlevel>=1 )
-  if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n",
-		     nLblocks, nUblocks);
-#endif
-  
-  /* Find the maximum buffer size. */
-  MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, 
-		MPI_MAX, grid->comm);
-  
-#if ( DEBUGlevel>=1 )
-  /* Memory allocated but not freed:
-     ilsum, fmod, fsendx_plist, bmod, bsendx_plist,
-     ToRecv, ToSendR, ToSendD
-  */
-  CHECK_MALLOC(iam, "Exit dist_psymbtonum()");
-#endif
-    
-  return (- (memDist+memNLU));
-} /* dist_psymbtonum */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdutil.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdutil.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pdutil.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pdutil.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,529 +0,0 @@
-
-
-/*
- * -- Distributed SuperLU routine (version 2.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * March 15, 2003
- *
- */
-
-#include <math.h>
-#include "superlu_ddefs.h"
-
-/*
- * Gather A from the distributed compressed row format to
- * global A in compressed column format.
- */
-int pdCompRow_loc_to_CompCol_global
-(
- int_t need_value, /* Input. Whether need to gather numerical values */
- SuperMatrix *A,   /* Input. Distributed matrix in NRformat_loc format. */
- gridinfo_t *grid, /* Input */
- SuperMatrix *GA   /* Output */
-)
-{
-    NRformat_loc *Astore;
-    NCformat *GAstore;
-    double *a, *a_loc;
-    int_t *colind, *rowptr;
-    int_t *colptr_loc, *rowind_loc;
-    int_t m_loc, n, i, j, k, l;
-    int_t colnnz, fst_row, m_loc_max, nnz_loc, nnz_max, nnz;
-    double *a_recv;  /* Buffer to receive the blocks of values. */
-    double *a_buf;   /* Buffer to merge blocks into block columns. */
-    int_t *colcnt, *itemp;
-    int_t *colptr_send; /* Buffer to redistribute the column pointers of the 
-			   local block rows.
-			   Use n_loc+1 pointers for each block. */
-    int_t *colptr_blk;  /* The column pointers for each block, after
-			   redistribution to the local block columns. 
-			   Use n_loc+1 pointers for each block. */
-    int_t *rowind_recv; /* Buffer to receive the blocks of row indices. */
-    int_t *rowind_buf;  /* Buffer to merge blocks into block columns. */
-    int_t *fst_rows, *n_locs;
-    int   *sendcnts, *sdispls, *recvcnts, *rdispls, *itemp_32;
-    int   it, n_loc, procs;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(grid->iam, "Enter pdCompRow_loc_to_CompCol_global");
-#endif
-
-    /* Initialization. */
-    n = A->ncol;
-    Astore = (NRformat_loc *) A->Store;
-    nnz_loc = Astore->nnz_loc;
-    m_loc = Astore->m_loc;
-    fst_row = Astore->fst_row;
-    a = Astore->nzval;
-    rowptr = Astore->rowptr;
-    colind = Astore->colind;
-    n_loc = m_loc; /* NOTE: CURRENTLY ONLY WORK FOR SQUARE MATRIX */
-
-    /* ------------------------------------------------------------
-       FIRST PHASE: TRANSFORM A INTO DISTRIBUTED COMPRESSED COLUMN.
-       ------------------------------------------------------------*/
-    dCompRow_to_CompCol_dist(m_loc, n, nnz_loc, a, colind, rowptr, &a_loc,
-                             &rowind_loc, &colptr_loc);
-    /* Change local row index numbers to global numbers. */
-    for (i = 0; i < nnz_loc; ++i) rowind_loc[i] += fst_row;
-
-#if ( DEBUGlevel>=2 )
-    printf("Proc %d\n", grid->iam);
-    PrintInt10("rowind_loc", nnz_loc, rowind_loc);
-    PrintInt10("colptr_loc", n+1, colptr_loc);
-#endif
-
-    procs = grid->nprow * grid->npcol;
-    if ( !(fst_rows = (int_t *) intMalloc_dist(2*procs)) )
-	  ABORT("Malloc fails for fst_rows[]");
-    n_locs = fst_rows + procs;
-    MPI_Allgather(&fst_row, 1, mpi_int_t, fst_rows, 1, mpi_int_t,
-		  grid->comm);
-    for (i = 0; i < procs-1; ++i) n_locs[i] = fst_rows[i+1] - fst_rows[i];
-    n_locs[procs-1] = n - fst_rows[procs-1];
-    if ( !(recvcnts = SUPERLU_MALLOC(5*procs * sizeof(int))) )
-	  ABORT("Malloc fails for recvcnts[]");
-    sendcnts = recvcnts + procs;
-    rdispls = sendcnts + procs;
-    sdispls = rdispls + procs;
-    itemp_32 = sdispls + procs;
-
-    /* All-to-all transfer column pointers of each block.
-       Now the matrix view is P-by-P block-partition. */
-    /* n column starts for each column, and procs column ends for each block */
-    if ( !(colptr_send = intMalloc_dist(n + procs)) )
-	   ABORT("Malloc fails for colptr_send[]");
-    if ( !(colptr_blk = intMalloc_dist( (((size_t) n_loc)+1)*procs)) )
-	   ABORT("Malloc fails for colptr_blk[]");
-    for (i = 0, j = 0; i < procs; ++i) {
-        for (k = j; k < j + n_locs[i]; ++k) colptr_send[i+k] = colptr_loc[k];
-	colptr_send[i+k] = colptr_loc[k]; /* Add an END marker */
-	sendcnts[i] = n_locs[i] + 1;
-#if ( DEBUGlevel>=1 )
-	assert(j == fst_rows[i]);
-#endif
-	sdispls[i] = j + i;
-	recvcnts[i] = n_loc + 1;
-	rdispls[i] = i * (n_loc + 1);
-	j += n_locs[i]; /* First column of next block in colptr_loc[] */
-    }
-    MPI_Alltoallv(colptr_send, sendcnts, sdispls, mpi_int_t,
-		  colptr_blk, recvcnts, rdispls, mpi_int_t, grid->comm);
-
-    /* Adjust colptr_blk[] so that they contain the local indices of the
-       column pointers in the receive buffer. */
-    nnz = 0; /* The running sum of the nonzeros counted by far */
-    k = 0;
-    for (i = 0; i < procs; ++i) {
-	for (j = rdispls[i]; j < rdispls[i] + n_loc; ++j) {
-	    colnnz = colptr_blk[j+1] - colptr_blk[j];
-	    /*assert(k<=j);*/
-	    colptr_blk[k] = nnz;
-	    nnz += colnnz; /* Start of the next column */
-	    ++k;
-	}
-	colptr_blk[k++] = nnz; /* Add an END marker for each block */
-    }
-    /*assert(k == (n_loc+1)*procs);*/
-
-    /* Now prepare to transfer row indices and values. */
-    sdispls[0] = 0;
-    for (i = 0; i < procs-1; ++i) {
-        sendcnts[i] = colptr_loc[fst_rows[i+1]] - colptr_loc[fst_rows[i]];
-	sdispls[i+1] = sdispls[i] + sendcnts[i];
-    }
-    sendcnts[procs-1] = colptr_loc[n] - colptr_loc[fst_rows[procs-1]];
-    for (i = 0; i < procs; ++i) {
-        j = rdispls[i]; /* Point to this block in colptr_blk[]. */
-	recvcnts[i] = colptr_blk[j+n_loc] - colptr_blk[j];
-    }
-    rdispls[0] = 0; /* Recompute rdispls[] for row indices. */
-    for (i = 0; i < procs-1; ++i) rdispls[i+1] = rdispls[i] + recvcnts[i];
-
-    k = rdispls[procs-1] + recvcnts[procs-1]; /* Total received */
-    if ( !(rowind_recv = (int_t *) intMalloc_dist(2*k)) )
-        ABORT("Malloc fails for rowind_recv[]");
-    rowind_buf = rowind_recv + k;
-    MPI_Alltoallv(rowind_loc, sendcnts, sdispls, mpi_int_t,
-		  rowind_recv, recvcnts, rdispls, mpi_int_t, grid->comm);
-    if ( need_value ) {
-        if ( !(a_recv = (double *) doubleMalloc_dist(2*k)) )
-	    ABORT("Malloc fails for rowind_recv[]");
-	a_buf = a_recv + k;
-	MPI_Alltoallv(a_loc, sendcnts, sdispls, MPI_DOUBLE,
-                      a_recv, recvcnts, rdispls, MPI_DOUBLE,
-                      grid->comm);
-    }
-      
-    /* Reset colptr_loc[] to point to the n_loc global columns. */
-    colptr_loc[0] = 0;
-    itemp = colptr_send;
-    for (j = 0; j < n_loc; ++j) {
-        colnnz = 0;
-	for (i = 0; i < procs; ++i) {
-	    k = i * (n_loc + 1) + j; /* j-th column in i-th block */
-	    colnnz += colptr_blk[k+1] - colptr_blk[k];
-	}
-	colptr_loc[j+1] = colptr_loc[j] + colnnz;
-	itemp[j] = colptr_loc[j]; /* Save a copy of the column starts */
-    }
-    itemp[n_loc] = colptr_loc[n_loc];
-      
-    /* Merge blocks of row indices into columns of row indices. */
-    for (i = 0; i < procs; ++i) {
-        k = i * (n_loc + 1);
-	for (j = 0; j < n_loc; ++j) { /* i-th block */
-	    for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) {
-	        rowind_buf[itemp[j]] = rowind_recv[l];
-		++itemp[j];
-	    }
-	}
-    }
-
-    if ( need_value ) {
-        for (j = 0; j < n_loc+1; ++j) itemp[j] = colptr_loc[j];
-        for (i = 0; i < procs; ++i) {
-	    k = i * (n_loc + 1);
-	    for (j = 0; j < n_loc; ++j) { /* i-th block */
-	        for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) {
-		    a_buf[itemp[j]] = a_recv[l];
-		    ++itemp[j];
-		}
-	    }
-	}
-    }
-
-    /* ------------------------------------------------------------
-       SECOND PHASE: GATHER TO GLOBAL A IN COMPRESSED COLUMN FORMAT.
-       ------------------------------------------------------------*/
-    GA->nrow  = A->nrow;
-    GA->ncol  = A->ncol;
-    GA->Stype = SLU_NC;
-    GA->Dtype = A->Dtype;
-    GA->Mtype = A->Mtype;
-    GAstore = GA->Store = (NCformat *) SUPERLU_MALLOC ( sizeof(NCformat) );
-    if ( !GAstore ) ABORT ("SUPERLU_MALLOC fails for GAstore");
-
-    /* First gather the size of each piece. */
-    nnz_loc = colptr_loc[n_loc];
-    MPI_Allgather(&nnz_loc, 1, mpi_int_t, itemp, 1, mpi_int_t, grid->comm);
-    for (i = 0, nnz = 0; i < procs; ++i) nnz += itemp[i];
-    GAstore->nnz = nnz;
-    
-    if ( !(GAstore->rowind = (int_t *) intMalloc_dist (nnz)) )
-        ABORT ("SUPERLU_MALLOC fails for GAstore->rowind[]");
-    if ( !(GAstore->colptr = (int_t *) intMalloc_dist (n+1)) )
-        ABORT ("SUPERLU_MALLOC fails for GAstore->colptr[]");
-      
-    /* Allgatherv for row indices. */
-    rdispls[0] = 0;
-    for (i = 0; i < procs-1; ++i) {
-        rdispls[i+1] = rdispls[i] + itemp[i];
-        itemp_32[i] = itemp[i];
-    }
-    itemp_32[procs-1] = itemp[procs-1];
-    it = nnz_loc;
-    MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind, 
-		   itemp_32, rdispls, mpi_int_t, grid->comm);
-    if ( need_value ) {
-      if ( !(GAstore->nzval = (double *) doubleMalloc_dist (nnz)) )
-          ABORT ("SUPERLU_MALLOC fails for GAstore->rnzval[]");
-      MPI_Allgatherv(a_buf, it, MPI_DOUBLE, GAstore->nzval, 
-		     itemp_32, rdispls, MPI_DOUBLE, grid->comm);
-    } else GAstore->nzval = NULL;
-
-    /* Now gather the column pointers. */
-    rdispls[0] = 0;
-    for (i = 0; i < procs-1; ++i) {
-        rdispls[i+1] = rdispls[i] + n_locs[i];
-        itemp_32[i] = n_locs[i];
-    }
-    itemp_32[procs-1] = n_locs[procs-1];
-    MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr, 
-		   itemp_32, rdispls, mpi_int_t, grid->comm);
-
-    /* Recompute column pointers. */
-    for (i = 1; i < procs; ++i) {
-        k = rdispls[i];
-	for (j = 0; j < n_locs[i]; ++j) GAstore->colptr[k++] += itemp[i-1];
-	itemp[i] += itemp[i-1]; /* prefix sum */
-    }
-    GAstore->colptr[n] = nnz;
-
-#if ( DEBUGlevel>=2 )
-    if ( !grid->iam ) {
-        printf("After pdCompRow_loc_to_CompCol_global()\n");
-	dPrint_CompCol_Matrix_dist(GA);
-    }
-#endif
-
-    SUPERLU_FREE(a_loc);
-    SUPERLU_FREE(rowind_loc);
-    SUPERLU_FREE(colptr_loc);
-    SUPERLU_FREE(fst_rows);
-    SUPERLU_FREE(recvcnts);
-    SUPERLU_FREE(colptr_send);
-    SUPERLU_FREE(colptr_blk);
-    SUPERLU_FREE(rowind_recv);
-    if ( need_value) SUPERLU_FREE(a_recv);
-#if ( DEBUGlevel>=1 )
-    if ( !grid->iam ) printf("sizeof(NCformat) %d\n", sizeof(NCformat));
-    CHECK_MALLOC(grid->iam, "Exit pdCompRow_loc_to_CompCol_global");
-#endif
-    return 0;
-} /* pdCompRow_loc_to_CompCol_global */
-
-
-/*
- * Permute the distributed dense matrix: B <= perm(X).
- * perm[i] = j means the i-th row of X is in the j-th row of B.
- */
-int pdPermute_Dense_Matrix
-(
- int_t fst_row,
- int_t m_loc,
- int_t row_to_proc[],
- int_t perm[],
- double X[], int ldx,
- double B[], int ldb,
- int nrhs,
- gridinfo_t *grid
-)
-{
-    int_t i, j, k, l;
-    int p, procs;
-    int *sendcnts, *sendcnts_nrhs, *recvcnts, *recvcnts_nrhs;
-    int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs;
-    int *ptr_to_ibuf, *ptr_to_dbuf;
-    int_t *send_ibuf, *recv_ibuf;
-    double *send_dbuf, *recv_dbuf;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(grid->iam, "Enter pdPermute_Dense_Matrix()");
-#endif
-
-    procs = grid->nprow * grid->npcol;
-    if ( !(sendcnts = SUPERLU_MALLOC(10*procs * sizeof(int))) )
-        ABORT("Malloc fails for sendcnts[].");
-    sendcnts_nrhs = sendcnts + procs;
-    recvcnts = sendcnts_nrhs + procs;
-    recvcnts_nrhs = recvcnts + procs;
-    sdispls = recvcnts_nrhs + procs;
-    sdispls_nrhs = sdispls + procs;
-    rdispls = sdispls_nrhs + procs;
-    rdispls_nrhs = rdispls + procs;
-    ptr_to_ibuf = rdispls_nrhs + procs;
-    ptr_to_dbuf = ptr_to_ibuf + procs;
-
-    for (i = 0; i < procs; ++i) sendcnts[i] = 0;
-
-    /* Count the number of X entries to be sent to each process.*/
-    for (i = fst_row; i < fst_row + m_loc; ++i) {
-        p = row_to_proc[perm[i]];
-	++sendcnts[p];
-    }
-    MPI_Alltoall(sendcnts, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm);
-    sdispls[0] = rdispls[0] = 0;
-    sdispls_nrhs[0] = rdispls_nrhs[0] = 0;
-    sendcnts_nrhs[0] = sendcnts[0] * nrhs;
-    recvcnts_nrhs[0] = recvcnts[0] * nrhs;
-    for (i = 1; i < procs; ++i) {
-        sdispls[i] = sdispls[i-1] + sendcnts[i-1];
-	sdispls_nrhs[i] = sdispls[i] * nrhs;
-	rdispls[i] = rdispls[i-1] + recvcnts[i-1];
-	rdispls_nrhs[i] = rdispls[i] * nrhs;
-	sendcnts_nrhs[i] = sendcnts[i] * nrhs;
-	recvcnts_nrhs[i] = recvcnts[i] * nrhs;
-    }
-    k = sdispls[procs-1] + sendcnts[procs-1];/* Total number of sends */
-    l = rdispls[procs-1] + recvcnts[procs-1];/* Total number of recvs */
-    /*assert(k == m_loc);*/
-    /*assert(l == m_loc);*/
-    if ( !(send_ibuf = intMalloc_dist(k + l)) )
-        ABORT("Malloc fails for send_ibuf[].");
-    recv_ibuf = send_ibuf + k;
-    if ( !(send_dbuf = doubleMalloc_dist((k + l)*nrhs)) )
-        ABORT("Malloc fails for send_dbuf[].");
-    recv_dbuf = send_dbuf + k * nrhs;
-
-    for (i = 0; i < procs; ++i) {
-        ptr_to_ibuf[i] = sdispls[i];
-	ptr_to_dbuf[i] = sdispls_nrhs[i];
-    }
-
-    /* Fill in the send buffers: send_ibuf[] and send_dbuf[]. */
-    for (i = fst_row; i < fst_row + m_loc; ++i) {
-        j = perm[i];
-	p = row_to_proc[j];
-	send_ibuf[ptr_to_ibuf[p]] = j;
-	j = ptr_to_dbuf[p];
-	RHS_ITERATE(k) { /* RHS stored in row major in the buffer */
-	    send_dbuf[j++] = X[i-fst_row + k*ldx];
-	}
-	++ptr_to_ibuf[p];
-	ptr_to_dbuf[p] += nrhs;
-    }
-	  
-    /* Transfer the (permuted) row indices and numerical values. */
-    MPI_Alltoallv(send_ibuf, sendcnts, sdispls, mpi_int_t,
-		  recv_ibuf, recvcnts, rdispls, mpi_int_t, grid->comm);
-    MPI_Alltoallv(send_dbuf, sendcnts_nrhs, sdispls_nrhs, MPI_DOUBLE,
-		  recv_dbuf, recvcnts_nrhs, rdispls_nrhs, MPI_DOUBLE,
-		  grid->comm);
-
-    /* Copy the buffer into b. */
-    for (i = 0, l = 0; i < m_loc; ++i) {
-        j = recv_ibuf[i] - fst_row; /* Relative row number */
-	RHS_ITERATE(k) { /* RHS stored in row major in the buffer */
-	    B[j + k*ldb] = recv_dbuf[l++];
-	}
-    }
-
-    SUPERLU_FREE(sendcnts);
-    SUPERLU_FREE(send_ibuf);
-    SUPERLU_FREE(send_dbuf);
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(grid->iam, "Exit pdPermute_Dense_Matrix()");
-#endif
-    return 0;
-} /* pdPermute_Dense_Matrix */
-
-
-/*
- * Initialize the data structure for the solution phase.
- */
-int dSolveInit(superlu_options_t *options, SuperMatrix *A, 
-	       int_t perm_r[], int_t perm_c[], int_t nrhs,
-	       LUstruct_t *LUstruct, gridinfo_t *grid,
-	       SOLVEstruct_t *SOLVEstruct)
-{
-    int_t *row_to_proc, *inv_perm_c, *itemp;
-    NRformat_loc *Astore;
-    int_t        i, fst_row, m_loc, p;
-    int          procs;
-
-    Astore = (NRformat_loc *) A->Store;
-    fst_row = Astore->fst_row;
-    m_loc = Astore->m_loc;
-    procs = grid->nprow * grid->npcol;
-    
-    if ( !(row_to_proc = intMalloc_dist(A->nrow)) )
-	ABORT("Malloc fails for row_to_proc[]");
-    SOLVEstruct->row_to_proc = row_to_proc;
-    if ( !(inv_perm_c = intMalloc_dist(A->ncol)) )
-        ABORT("Malloc fails for inv_perm_c[].");
-    for (i = 0; i < A->ncol; ++i) inv_perm_c[perm_c[i]] = i;
-    SOLVEstruct->inv_perm_c = inv_perm_c;
-
-    /* ------------------------------------------------------------
-       EVERY PROCESS NEEDS TO KNOW GLOBAL PARTITION.
-       SET UP THE MAPPING BETWEEN ROWS AND PROCESSES.
-       
-       NOTE: For those processes that do not own any row, it must
-             must be set so that fst_row == A->nrow. 
-       ------------------------------------------------------------*/
-    if ( !(itemp = intMalloc_dist(procs+1)) )
-        ABORT("Malloc fails for itemp[]");
-    MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t,
-		  grid->comm);
-    itemp[procs] = A->nrow;
-    for (p = 0; p < procs; ++p) {
-        for (i = itemp[p] ; i < itemp[p+1]; ++i) row_to_proc[i] = p;
-    }
-#if ( DEBUGlevel>=2 )
-    if ( !grid->iam ) {
-      printf("fst_row = %d\n", fst_row);
-      PrintInt10("row_to_proc", A->nrow, row_to_proc);
-      PrintInt10("inv_perm_c", A->ncol, inv_perm_c);
-    }
-#endif
-    SUPERLU_FREE(itemp);
-
-#if 0
-    /* Compute the mapping between rows and processes. */
-    /* XSL NOTE: What happens if # of mapped processes is smaller
-       than total Procs?  For the processes without any row, let
-       fst_row be EMPTY (-1). Make sure this case works! */
-    MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t,
-		  grid->comm);
-    itemp[procs] = n;
-    for (p = 0; p < procs; ++p) {
-        j = itemp[p];
-	if ( j != EMPTY ) {
-	    k = itemp[p+1];
-	    if ( k == EMPTY ) k = n;
-	    for (i = j ; i < k; ++i) row_to_proc[i] = p;
-	}
-    }
-#endif    
-
-    get_diag_procs(A->ncol, LUstruct->Glu_persist, grid,
-		   &SOLVEstruct->num_diag_procs,
-		   &SOLVEstruct->diag_procs,
-		   &SOLVEstruct->diag_len);
-
-    if ( !(SOLVEstruct->gstrs_comm = (pxgstrs_comm_t *)
-	   SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) )
-        ABORT("Malloc fails for gstrs_comm[]");
-    pxgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, 
-		 LUstruct->Glu_persist, SOLVEstruct);
-
-    if ( !(SOLVEstruct->gsmv_comm = (pdgsmv_comm_t *)
-           SUPERLU_MALLOC(sizeof(pdgsmv_comm_t))) )
-        ABORT("Malloc fails for gsmv_comm[]");
-    SOLVEstruct->A_colind_gsmv = NULL;
-    
-    options->SolveInitialized = YES;
-    return 0;
-} /* dSolveInit */
-
-/*
- * Release the resources used for the solution phase.
- */
-void dSolveFinalize(superlu_options_t *options, SOLVEstruct_t *SOLVEstruct)
-{
-    int_t *it;
-    pxgstrs_finalize(SOLVEstruct->gstrs_comm);
-    if ( options->RefineInitialized ) {
-        pdgsmv_finalize(SOLVEstruct->gsmv_comm);
-	options->RefineInitialized = NO;
-    }
-    SUPERLU_FREE(SOLVEstruct->gsmv_comm);
-    SUPERLU_FREE(SOLVEstruct->row_to_proc);
-    SUPERLU_FREE(SOLVEstruct->inv_perm_c);
-    SUPERLU_FREE(SOLVEstruct->diag_procs);
-    SUPERLU_FREE(SOLVEstruct->diag_len);
-    if ( it = SOLVEstruct->A_colind_gsmv ) SUPERLU_FREE(it);
-    options->SolveInitialized = NO;
-} /* dSolveFinalize */
-
-/* 
- * Check the inf-norm of the error vector 
- */
-void pdinf_norm_error(int iam, int_t n, int_t nrhs, double x[], int_t ldx,
-		      double xtrue[], int_t ldxtrue, gridinfo_t *grid) 
-{
-    double err, xnorm, temperr, tempxnorm;
-    double *x_work, *xtrue_work;
-    int i, j;
-
-    for (j = 0; j < nrhs; j++) {
-      x_work = &x[j*ldx];
-      xtrue_work = &xtrue[j*ldxtrue];
-      err = xnorm = 0.0;
-      for (i = 0; i < n; i++) {
-	err = SUPERLU_MAX(err, fabs(x_work[i] - xtrue_work[i]));
-	xnorm = SUPERLU_MAX(xnorm, fabs(x_work[i]));
-      }
-
-      /* get the golbal max err & xnrom */
-      temperr = err;
-      tempxnorm = xnorm;
-      MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, grid->comm);
-      MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, grid->comm);
-
-      err = err / xnorm;
-      if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err);
-    }
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/psymbfact.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/psymbfact.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/psymbfact.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/psymbfact.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,5174 +0,0 @@
-
-/*
- * -- Parallel symbolic factorization routine  (version 2.2) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley - July 2003
- * INRIA France - January 2004
- * Laura Grigori
- *
- * November 1, 2007
- * Feburary 20, 2008
- */
-
-
-/* 
- * The function symbfact_dist implements the parallel symbolic factorization
- * algorithm described in the paper:
- *
- * Parallel Symbolic Factorization for Sparse LU with Static Pivoting,
- * Laura Grigori, James W. Demmel and Xiaoye S. Li,
- * Pages 1289-1314, SIAM Journal on Scientific Computing, Volume 29, Issue 3.
- *
- */
-
-/* limits.h:  the largest positive integer (INT_MAX) */
-#include <limits.h>
-#include <math.h>
-#include "superlu_ddefs.h"
-#include "psymbfact.h"
-
-/*
- * Internal protypes
- */
-
-static int_t *
-intMalloc_symbfact(int_t );
-
-static int_t *
-intCalloc_symbfact(int_t );
-
-static int_t
-initParmsAndStats
-(psymbfact_stat_t *PS);
-
-static void
-estimate_memUsage
-(int_t, int, mem_usage_t *, float *, float *,
- Pslu_freeable_t *, Llu_symbfact_t *,
- vtcsInfo_symbfact_t *, comm_symbfact_t *, psymbfact_stat_t *);
-
-static void
-symbfact_free 
-(int, int, Llu_symbfact_t *, vtcsInfo_symbfact_t *, comm_symbfact_t *);
-
-static int_t
-denseSep_symbfact 
-(int , int_t, int, int, int, int_t *, int_t *, int, 
- int,  int, int_t, int_t, int_t *, int_t *, int_t *,
- int_t *, int_t *, MPI_Comm, MPI_Comm *, Llu_symbfact_t *,
- Pslu_freeable_t *_freeable, vtcsInfo_symbfact_t *, 
- comm_symbfact_t *, psymbfact_stat_t * );
-
-static int_t
-dnsUpSeps_symbfact
-(int_t, int, int, int, int, int_t *, int_t *, int_t,
- Llu_symbfact_t *, Pslu_freeable_t *, vtcsInfo_symbfact_t *, 
- comm_symbfact_t *, psymbfact_stat_t *, int_t *, int_t *, int_t *);
-
-static void
-intraLvl_symbfact 
-(SuperMatrix *, int, int, int, int, int, int_t *, int_t *, int, 
- int, int_t, int_t,  Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, 
- comm_symbfact_t *, psymbfact_stat_t *, int_t *, int_t *, int_t *, int_t *, 
- int_t *, int_t *, int_t *, MPI_Comm, MPI_Comm *);
-
-static void
-initLvl_symbfact
-(int_t, int, int_t, int_t, Pslu_freeable_t *, 
- Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *, MPI_Comm, 
- int_t *, int_t, int_t);
-
-static void
-createComm (int, int, MPI_Comm *, MPI_Comm *);
-
-static void
-freeComm (int, int, MPI_Comm *, MPI_Comm *);
-
-static void
-domain_symbfact
-(SuperMatrix *, int, int, int,  int, int, int_t *, int_t *,
- int_t, int_t, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *,
- comm_symbfact_t *, psymbfact_stat_t *, int_t *, int_t *, int_t *, int_t *, 
- int_t *, int_t *, int_t *);
-
-static float
-allocPrune_domain
-(int_t, int_t, Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *);
-
-static float
-allocPrune_lvl
-(Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *);
-
-static int
-symbfact_alloc
-(int_t, int, Pslu_freeable_t *, Llu_symbfact_t *, 
- vtcsInfo_symbfact_t *, comm_symbfact_t *, psymbfact_stat_t *);
-
-static float 
-symbfact_mapVtcs
-(int, int, int, SuperMatrix *, int_t *, int_t *, 
- Pslu_freeable_t *, vtcsInfo_symbfact_t *, int_t *, int_t, psymbfact_stat_t *);
-
-static void 
-symbfact_distributeMatrix 
-(int, int, int, SuperMatrix *, int_t *, int_t *, matrix_symbfact_t *, 
- Pslu_freeable_t *, vtcsInfo_symbfact_t *, int_t *, MPI_Comm *);
-
-static int_t
-interLvl_symbfact
-(SuperMatrix *, int, int, int, int, int, int, int, 
- int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t *,
- Llu_symbfact_t *, Pslu_freeable_t*, comm_symbfact_t *, vtcsInfo_symbfact_t *,
- psymbfact_stat_t *, MPI_Comm, MPI_Comm *);
-
-static float
-cntsVtcs 
-(int_t, int, int, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, 
- int_t *, int_t *, int_t *, psymbfact_stat_t *, MPI_Comm *);
-
-/************************************************************************/
-float symbfact_dist
-/************************************************************************/
-(
- int         nprocs_num,  /* Input - no of processors */
- int         nprocs_symb, /* Input - no of processors for the symbolic
-			     factorization */
- SuperMatrix *A,          /* Input - distributed input matrix */
- int_t       *perm_c,     /* Input - column permutation */
- int_t       *perm_r,     /* Input - row permutation */
- int_t       *sizes,      /* Input - sizes of each node in the separator tree */
- int_t       *fstVtxSep,  /* Input - first vertex of each node in the tree */
- Pslu_freeable_t *Pslu_freeable, /* Output - local L and U structure, 
-				    global to local indexing information */
- MPI_Comm    *num_comm,   /* Input - communicator for numerical factorization */
- MPI_Comm    *symb_comm,  /* Input - communicator for symbolic factorization */
- mem_usage_t *symb_mem_usage
- )
-{
-/* 
- * Purpose
- * =======
- *   symbfact_dist() performs symbolic factorization of matrix A suitable
- *   for performing the supernodal Gaussian elimination with no pivoting (GEPP). 
- *   This routine computes the structure of one column of L and one row of U 
- *   at a time.  It uses:
- *        o distributed input matrix
- *        o supernodes
- *        o symmetric structure pruning
- *
- *
- * Arguments
- * =========
- *
- * nprocs_num (input) int
- *         Number of processors SuperLU_DIST is executed on, and the input 
- *         matrix is distributed on.
- *
- * nprocs_symb (input) int
- *         Number of processors on which the symbolic factorization is
- *         performed.  It is equal to the number of independent domains
- *         idenfied in the graph partitioning algorithm executed
- *         previously and has to be a power of 2.  It corresponds to
- *         number of leaves in the separator tree.
- *
- * A       (input) SuperMatrix*
- *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The
- *         number of the linear equations is A->nrow.  Matrix A is
- *         distributed in NRformat_loc format.
- *         Matrix A is not yet permuted by perm_c.
- *
- * perm_c  (input) int_t*
- *	   Column permutation vector of size A->ncol, which defines the 
- *         permutation matrix Pc; perm_c[i] = j means column i of A is 
- *         in position j in A*Pc.
- *
- * perm_r  (input) int_t*
- *	   Row permutation vector of size A->nrow, which defines the 
- *         permutation matrix Pr; perm_r[i] = j means column i of A is 
- *         in position j in Pr*A.
- *
- * sizes   (input) int_t*
- *         Contains the number of vertices in each separator.
- *
- * fstVtxSep (input) int_t*
- *         Contains first vertex for each separator.
- *
- * Pslu_freeable (output) Pslu_freeable_t*
- *         Returns the local L and U structure, and global to local
- *         information on the indexing of the vertices.  Contains all
- *         the information necessary for performing the data
- *         distribution towards the numeric factorization.
- *				    
- * num_comm (input) MPI_Comm*
- *         Communicator for numerical factorization 
- *
- * symb_comm (input) MPI_Comm*
- *         Communicator for symbolic factorization 
- *
- * symb_mem_usage (input) mem_usage_t *
- *         Statistics on memory usage.
- *
- * Return value
- * ============
- *   < 0, number of bytes allocated on return from the symbolic factorization.
- *   > 0, number of bytes allocated when out of memory.
- *
- * Sketch of the algorithm
- * =======================
- *
- *  Distrbute the vertices on the processors using a subtree to
- *  subcube algorithm.
- *
- *  Redistribute the structure of the input matrix A according to the
- *  subtree to subcube computed previously for the symbolic
- *  factorization routine.  This implies in particular a distribution
- *  from nprocs_num processors to nprocs_symb processors.
- *
- *  Perform symbolic factorization guided by the separator tree provided by
- *  a graph partitioning algorithm.  The symbolic factorization uses a 
- *  combined left-looking, right-looking approach. 
- *
- */
-  NRformat_loc *Astore;
-  int iam, szSep, fstP, lstP, npNode, nlvls, lvl, p, iSep, jSep;
-  int iinfo; /* return code */
-  int_t m, n;
-  int_t nextl, nextu, neltsZr, neltsTotal, nsuper_loc, szLGr, szUGr;
-  int_t ind_blk, nsuper, vtx, min_mn, nnzL, nnzU, szsn;
-  float stat_loc[23], stat_glob[23], mem_glob[15];
-  
-  Llu_symbfact_t Llu_symbfact; /* local L and U and pruned L and U data structures */
-  vtcsInfo_symbfact_t VInfo; /* local information on number of blocks,
-				number of vertices in a block etc */
-  matrix_symbfact_t   AS; /* temporary storage for the input matrix after redistribution */
-  comm_symbfact_t CS;  /* information on communication */
-  /* relaxation parameters (for future release) and 
-     statistics collected during the symbolic factorization */
-  psymbfact_stat_t PS; 
-  /* temp array of size n, used as a marker by the subroutines */
-  int_t *tempArray; 
-  int_t i, j, k;
-  int_t fstVtx, lstVtx, mark, fstVtx_lid, vtx_lid, maxNvtcsPProc;
-  int_t nnz_asup_loc, nnz_ainf_loc, fill_rcmd;
-  float totalMemLU, overestimMem;
-  MPI_Comm *commLvls;  
-
-  /* maximum block size */
-  int_t  maxSzBlk;
-  float flinfo;
-#if ( PRNTlevel >= 1)
-  float stat_msgs_l[10], stat_msgs_g[10]; 
-#endif  
-#if ( PROFlevel>=1 )
-  double t, t_symbFact[3], t_symbFact_loc[3];
-  double *time_lvlsT, *time_lvls, t1, t2, time_lvlsg[9];
-#endif
-  
-  /* Initialization */
-  MPI_Comm_rank ((*num_comm), &iam);
-  commLvls = NULL;
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Enter psymbfact()");
-#endif
-  initParmsAndStats (&PS);
-  if (nprocs_symb != 1) {
-    if (!(commLvls = (MPI_Comm *) SUPERLU_MALLOC(2*nprocs_symb*sizeof(MPI_Comm)))) {
-      fprintf (stderr, "Malloc fails for commLvls[].");  
-      return (PS.allocMem);
-    }
-    PS.allocMem += 2 * nprocs_symb * sizeof(MPI_Comm);
-  }
-  
-  nlvls = (int) LOG2( nprocs_num ) + 1;
-#if ( PROFlevel>=1 )
-  time_lvlsT = (double *) SUPERLU_MALLOC(3*nprocs_symb*(nlvls+1) 
-					 * sizeof(double));
-  time_lvls  = (double *) SUPERLU_MALLOC(3*(nlvls+1) * sizeof(double));
-  if (!time_lvls || !time_lvlsT) {
-    fprintf (stderr, "Malloc fails for time_lvls[].");  
-    return (PS.allocMem);
-  }
-  PS.allocMem += (3*nprocs_symb*(nlvls+1) + 3*(nlvls+1)) * sizeof(double);
-#endif
-  
-  VInfo.xlsub_nextLvl  = 0;
-  VInfo.xusub_nextLvl  = 0;
-  VInfo.maxSzBlk = sp_ienv_dist(3);
-  maxSzBlk = VInfo.maxSzBlk;
-  
-  mark = EMPTY;
-  nsuper_loc = 0;
-  nextl   = 0; nextu      = 0;
-  neltsZr = 0; neltsTotal = 0;
-  
-  m = A->nrow;
-  n = A->ncol;
-  min_mn = SUPERLU_MIN( m, n );
-  
-  if (!(tempArray = intMalloc_symbfact(n))) {
-    fprintf (stderr, "Malloc fails for tempArray[].\n");  
-    return (PS.allocMem);
-  }
-  PS.allocMem += n * sizeof(int_t);
-  
-#if ( PROFlevel>=1 )  
-  t = SuperLU_timer_();
-#endif
-  
-  /* Distribute vertices on processors */
-  if ((flinfo = 
-       symbfact_mapVtcs (iam, nprocs_num, nprocs_symb, A, fstVtxSep, sizes, 
-			 Pslu_freeable, &VInfo, tempArray, maxSzBlk, &PS)) > 0) 
-    return (flinfo);
-
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  
-  /* Redistribute matrix A on processors following the distribution found
-     in symbfact_mapVtcs.  Store the redistributed A temporarily into AS */
-  symbfact_distributeMatrix (iam, nprocs_num, nprocs_symb,  A, 
-			     perm_c, perm_r, &AS, 
-			     Pslu_freeable, &VInfo, tempArray, num_comm);
-  
-  /* THE REST OF THE SYMBOLIC FACTORIZATION IS EXECUTED ONLY BY NPROCS_SYMB
-     PROCESSORS */
-  if ( iam < nprocs_symb ) {
-    
-#if ( PROFlevel>=1 )
-    t_symbFact_loc[0] = SuperLU_timer_() - t;
-    t = SuperLU_timer_();
-    t_symbFact_loc[1] = t;
-#endif
-
-    /* Allocate storage common to the symbolic factor routines */
-    if (iinfo = symbfact_alloc (n, nprocs_symb, Pslu_freeable, 
-				&Llu_symbfact, &VInfo, &CS, &PS)) 
-      return (PS.allocMem);
-  
-    /* Copy the redistributed input matrix AS at the end of the memory buffer
-       allocated to store L and U.  That is, copy (AS.x_ainf, AS.ind_ainf) in
-       (xlsub, lsub), (AS.x_asup, AS.ind_asup) in (xusub, usub).  Free the
-       memory used to store the input matrix */
-    nnz_ainf_loc = VInfo.nnz_ainf_loc;
-    nnz_asup_loc = VInfo.nnz_asup_loc;
-    j = Llu_symbfact.szUsub - VInfo.nnz_asup_loc;
-    k = Llu_symbfact.szLsub - VInfo.nnz_ainf_loc;
-    for (i = 0; i <= VInfo.nvtcs_loc; i++) {
-      Llu_symbfact.xusub[i] = AS.x_asup[i] + j;
-      Llu_symbfact.xlsub[i] = AS.x_ainf[i] + k;
-    }
-    
-    for (i = 0; i < VInfo.nnz_asup_loc; i++, j++) 
-      Llu_symbfact.usub[j] = AS.ind_asup[i];
-    for (i = 0; i < VInfo.nnz_ainf_loc; i++, k++)   
-      Llu_symbfact.lsub[k] = AS.ind_ainf[i];
-    SUPERLU_FREE( AS.x_ainf );
-    SUPERLU_FREE( AS.x_asup );  
-    SUPERLU_FREE( AS.ind_ainf );  
-    SUPERLU_FREE( AS.ind_asup );  
-
-    if (nprocs_symb != 1) {
-      createComm (iam, nprocs_symb, commLvls, symb_comm);    
-
-#if ( PROFlevel>=1 )
-      t_symbFact_loc[2] = SuperLU_timer_();
-#endif
-      if ((flinfo = cntsVtcs (n, iam, nprocs_symb, Pslu_freeable, &Llu_symbfact, 
-			      &VInfo, tempArray, fstVtxSep, sizes, &PS, commLvls)) > 0) 
-	return (flinfo);
-			     
-#if ( PROFlevel>=1 )
-      t_symbFact_loc[2] = SuperLU_timer_() - t_symbFact_loc[2];
-#endif
-    }
-    
-    /* set to EMPTY marker[] array */
-    for (i = 0; i < n; i++)
-      tempArray[i] = EMPTY;
-    
-    szSep = nprocs_symb;
-    iSep = 0;
-    lvl = 0;
-    while (szSep >= 1) {
-      /* for each level in the separator tree */
-      npNode = nprocs_symb / szSep; 
-      fstP = 0; 
-      /* for each node in the level */
-      for (jSep = iSep; jSep < iSep + szSep; jSep++) {
-	fstVtx = fstVtxSep[jSep];
-	lstVtx  = fstVtx + sizes[jSep];
-	/* if this is the first level */
-	if (szSep == nprocs_symb) {
-	  /* compute symbolic factorization for my domain */
-	  if (fstP == iam) {
-	    /* allocate storage for the pruned structures */
-#if ( PROFlevel>=1 )
-	    t1 = SuperLU_timer_();
-#endif
-	    if ((flinfo = allocPrune_domain (fstVtx, lstVtx, 
-					     &Llu_symbfact, &VInfo, &PS)) > 0)
-	      return (flinfo);
-	    if (fstVtx < lstVtx)
-	      VInfo.fstVtx_nextLvl = VInfo.begEndBlks_loc[2];
-	    
-	    domain_symbfact 
-	      (A, iam, lvl, szSep, iSep, jSep, sizes, fstVtxSep, fstVtx, lstVtx, 
-	       Pslu_freeable, &Llu_symbfact, &VInfo, &CS, &PS, tempArray, 
-	       &mark, &nextl, &nextu, &neltsZr, &neltsTotal, &nsuper_loc);
-	    
-	    PS.estimLSz = nextl;
-	    PS.estimUSz = nextu;
-	    if (nprocs_symb != 1) 
-	      if((flinfo = allocPrune_lvl (&Llu_symbfact, &VInfo, &PS)) > 0)
-		return (flinfo);
-#if ( PROFlevel>=1 )
-	    t2 = SuperLU_timer_();
-	    time_lvls[lvl] = 0.; time_lvls[lvl+1] = 0.;
-	    time_lvls[lvl + 2] = t2 - t1;
-#endif
-	  }
-	}
-	else {
-	  lstP = fstP + npNode;
-	  if (fstP <= iam && iam < lstP) {
-#if ( PROFlevel>=1 )
-	    t1 = SuperLU_timer_();	  
-#endif
-	    if (VInfo.filledSep != FILLED_SEPS)
-	      initLvl_symbfact(n, iam, fstVtx, lstVtx,
-			       Pslu_freeable, &Llu_symbfact, &VInfo, &PS, commLvls[jSep], 
-			       tempArray, nextl, nextu);
-#if ( PROFlevel>=1 )
-	    t2 = SuperLU_timer_();
-	    time_lvls[3*lvl] = t2 - t1;
-#endif
-	    interLvl_symbfact (A, iam, lvl, szSep, fstP, lstP,
-			       iSep, jSep, sizes, fstVtxSep, 
-			       &nextl, &nextu, &nsuper_loc, &mark, tempArray,
-			       &Llu_symbfact, Pslu_freeable, &CS, &VInfo, &PS,
-			       commLvls[jSep], symb_comm);
-#if ( PROFlevel>=1 )
-	    t1 = SuperLU_timer_();
-	    time_lvls[3*lvl+1] = t1 - t2;
-#endif
-	    if (VInfo.filledSep != FILLED_SEPS)
-	      intraLvl_symbfact 
-		(A, iam, lvl, szSep, iSep, jSep, sizes, fstVtxSep, fstP, lstP, 
-		 fstVtx, lstVtx, Pslu_freeable, &Llu_symbfact, &VInfo, &CS, &PS,
-		 tempArray, &mark, &nextl, &nextu, &neltsZr, &neltsTotal, 
-		 &nsuper_loc, commLvls[jSep], symb_comm);
-#if ( PROFlevel>=1 )
-	    t2 = SuperLU_timer_();
-	    time_lvls[3*lvl+2] = t2 - t1;		 
-#endif
-	  }
-	}
-	fstP += npNode;
-      }
-      iSep += szSep;
-      szSep = szSep / 2;
-      lvl ++;
-    }
-  
-    SUPERLU_FREE( tempArray );
-    
-    /* Set up global information and collect statistics */
-    if (PS.maxSzLPr < Llu_symbfact.indLsubPr)
-      PS.maxSzLPr = Llu_symbfact.indLsubPr;
-    if (PS.maxSzUPr < Llu_symbfact.indUsubPr)
-      PS.maxSzUPr = Llu_symbfact.indUsubPr;
-    
-    Llu_symbfact.xlsub[VInfo.nvtcs_loc] = nextl;
-    Llu_symbfact.xusub[VInfo.nvtcs_loc] = nextu;
-    fill_rcmd = SUPERLU_MAX( nextl / nnz_ainf_loc, nextu / nnz_asup_loc) + 1;
-    Pslu_freeable->xsup_beg_loc = intMalloc_dist (nsuper_loc+1);
-    Pslu_freeable->xsup_end_loc = intMalloc_dist (nsuper_loc+1);
-    if (!Pslu_freeable->xsup_beg_loc || !Pslu_freeable->xsup_end_loc) {
-      fprintf (stderr, "Malloc fails for xsup_beg_loc, xsup_end_loc.");
-      return (PS.allocMem);
-    }
-    PS.allocMem += 2 * (nsuper_loc+1) * sizeof(int_t);
-    maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-    nnzL = 0; nnzU = 0;
-    
-    i = 0;
-    nsuper = 0;
-    ind_blk = 0;
-    for (ind_blk = 0; ind_blk < VInfo.nblks_loc; ind_blk ++) {
-      fstVtx = VInfo.begEndBlks_loc[2 * ind_blk];
-      lstVtx = VInfo.begEndBlks_loc[2 * ind_blk + 1];
-      fstVtx_lid = LOCAL_IND( Pslu_freeable->globToLoc[fstVtx] );
-      nsuper = Pslu_freeable->supno_loc[fstVtx_lid];
-      Pslu_freeable->xsup_beg_loc[nsuper] = fstVtx;
-      szsn = 1;
-      if (INT_MAX - nnzL <= Llu_symbfact.xlsub[fstVtx_lid + 1] - 
-	  Llu_symbfact.xlsub[fstVtx_lid])
-	printf ("PE[%d] ERR nnzL %d\n", iam, nnzL); 
-      if (INT_MAX - nnzU <= Llu_symbfact.xusub[fstVtx_lid + 1] - 
-	  Llu_symbfact.xusub[fstVtx_lid])
-	printf ("PE[%d] ERR nnzU %d\n", iam, nnzU);
-      
-      j = Llu_symbfact.xlsub[fstVtx_lid + 1] - Llu_symbfact.xlsub[fstVtx_lid];
-      k = Llu_symbfact.xusub[fstVtx_lid + 1] - Llu_symbfact.xusub[fstVtx_lid];
-      nnzL += j;
-      nnzU += k;
-
-      for (vtx = fstVtx + 1, vtx_lid = fstVtx_lid + 1; 
-	   vtx < lstVtx; vtx++, vtx_lid ++) {
-	if (Pslu_freeable->supno_loc[vtx_lid] != nsuper) {
-	  nsuper = Pslu_freeable->supno_loc[vtx_lid];
-	  Pslu_freeable->xsup_end_loc[nsuper-1] = vtx;
-	  Pslu_freeable->xsup_beg_loc[nsuper] = vtx;
-	  szsn = 1;
-	  j = Llu_symbfact.xlsub[vtx_lid + 1] - Llu_symbfact.xlsub[vtx_lid];
-	  k = Llu_symbfact.xusub[vtx_lid + 1] - Llu_symbfact.xusub[vtx_lid];
-	}
-	else {
-	  szsn ++;
-	}
-	nnzL += j - szsn + 1;
-	nnzU += k - szsn + 1;
-      }
-      Pslu_freeable->xsup_end_loc[nsuper] = lstVtx;
-    }
-    Pslu_freeable->supno_loc[VInfo.nvtcs_loc] = nsuper_loc;
-    Pslu_freeable->nvtcs_loc = VInfo.nvtcs_loc; 
-
-    /* set up xsup data */
-    Pslu_freeable->lsub = Llu_symbfact.lsub;
-    Pslu_freeable->xlsub = Llu_symbfact.xlsub;
-    Pslu_freeable->usub = Llu_symbfact.usub;
-    Pslu_freeable->xusub = Llu_symbfact.xusub;
-    Pslu_freeable->szLsub = Llu_symbfact.szLsub;
-    Pslu_freeable->szUsub = Llu_symbfact.szUsub;
-    
-#if ( PROFlevel>=1 )
-    t_symbFact_loc[1] = SuperLU_timer_() - t_symbFact_loc[1];
-#endif  
-
-#if ( PRNTlevel>=1 )
-    estimate_memUsage (n, iam,  symb_mem_usage, 
-		       &totalMemLU, &overestimMem, 
-		       Pslu_freeable, &Llu_symbfact, &VInfo, &CS, &PS);
-    stat_loc[0] = (float) nnzL;
-    stat_loc[1] = (float) nnzU;  
-    stat_loc[2] = (float) nsuper_loc;
-    stat_loc[3] = (float) Pslu_freeable->xlsub[VInfo.nvtcs_loc];
-    stat_loc[4] = (float) Pslu_freeable->xusub[VInfo.nvtcs_loc];
-    stat_loc[5] = totalMemLU;
-    stat_loc[6] = overestimMem;
-    stat_loc[7] = totalMemLU - overestimMem;
-    stat_loc[8] = (float) PS.maxSzBuf;
-    stat_loc[9] = (float) PS.nDnsUpSeps;
-    stat_loc[10] = (float) PS.nDnsCurSep;
-    stat_loc[11] = (float) (Llu_symbfact.no_expand + Llu_symbfact.no_expcp +
-			    Llu_symbfact.no_expand_pr);
-    stat_loc[12] = (float) Llu_symbfact.no_expand;
-    stat_loc[13] = (float) Llu_symbfact.no_expcp;
-    stat_loc[14] = (float) Llu_symbfact.no_expand_pr;
-    stat_loc[15] = (float) fill_rcmd;
-    stat_loc[16] = PS.nops;
-    stat_loc[17] = PS.fill_pelt[1];
-    stat_loc[18] = PS.fill_pelt[4];
-    stat_loc[19] = PS.fill_pelt[0];
-    stat_loc[20] = PS.fill_pelt[2];
-    stat_loc[21] = PS.fill_pelt[3];
-    stat_loc[22] = PS.fill_pelt[5];
-    
-    MPI_Reduce (stat_loc, stat_glob, 23, MPI_FLOAT, 
-		MPI_SUM, 0, (*symb_comm));
-    MPI_Reduce (&(stat_loc[5]), mem_glob, 14, MPI_FLOAT, 
-		MPI_MAX, 0, (*symb_comm));
-    fill_rcmd = (int_t) mem_glob[10];
-    PS.fill_pelt[0] = stat_glob[19];
-    PS.fill_pelt[1] = mem_glob[12];
-    PS.fill_pelt[2] = stat_glob[20];
-    PS.fill_pelt[3] = stat_glob[21];
-    PS.fill_pelt[4] = mem_glob[13];
-    PS.fill_pelt[5] = stat_glob[22];
-    if (PS.fill_pelt[2] == 0.) PS.fill_pelt[2] = 1.;
-    if (PS.fill_pelt[5] == 0.) PS.fill_pelt[5] = 1.;
-    
-#if ( PROFlevel>=1 )
-    MPI_Reduce (t_symbFact_loc, t_symbFact, 3, MPI_DOUBLE,
-		MPI_MAX, 0, (*symb_comm));
-    MPI_Gather (time_lvls, 3 * nlvls, MPI_DOUBLE,
-		time_lvlsT, 3 * nlvls , MPI_DOUBLE,
-		0, (*symb_comm));
-#endif
-    
-    stat_msgs_l[0] = (float) PS.maxsz_msgSnd;
-    stat_msgs_l[1] = (float) PS.maxsz_msgSnd;
-    if (PS.maxsz_msgSnd < PS.maxsz_msgCol)
-      stat_msgs_l[1] = PS.maxsz_msgCol;
-    stat_msgs_l[2] = PS.no_shmSnd + PS.no_msgsSnd + 
-      PS.no_shmRcvd + PS.no_msgsRcvd;
-    stat_msgs_l[3] = stat_msgs_l[2] + PS.no_msgsCol;
-    stat_msgs_l[4] = stat_msgs_l[2];
-    stat_msgs_l[5] = stat_msgs_l[3]; 
-    stat_msgs_l[6] = PS.no_msgsSnd;
-    stat_msgs_l[7] = PS.no_msgsSnd + PS.no_msgsCol;  
-    stat_msgs_l[8] = PS.sz_msgsSnd;
-    stat_msgs_l[9] = PS.sz_msgsSnd + PS.sz_msgsCol;
-    MPI_Reduce (stat_msgs_l, stat_msgs_g, 4, MPI_FLOAT,
-		MPI_MAX, 0, (*symb_comm));
-    MPI_Reduce (&(stat_msgs_l[4]), &(stat_msgs_g[4]), 6, MPI_FLOAT,
-		MPI_SUM, 0, (*symb_comm));
-    if (stat_msgs_g[6] == 0) stat_msgs_g[6] = 1;
-    if (stat_msgs_g[7] == 0) stat_msgs_g[7] = 1;
-    
-    if (!iam) {
-      nnzL   = (int_t) stat_glob[0]; nnzU  = (int_t) stat_glob[1];
-      nsuper = (int_t) stat_glob[2];
-      szLGr  = (int_t) stat_glob[3]; szUGr = (int_t) stat_glob[4];
-      printf("\tMax szBlk          %ld\n", VInfo.maxSzBlk);
-#if ( PRNTlevel>=2 )
-      printf("\t relax_gen %.2f, relax_curSep %.2f, relax_seps %.2f\n",
-	     PS.relax_gen, PS.relax_curSep, PS.relax_seps);
-#endif
-      printf("\tParameters: fill mem %ld fill pelt %ld\n",
-	     sp_ienv_dist(6), PS.fill_par);
-      printf("\tNonzeros in L       %ld\n", nnzL);
-      printf("\tNonzeros in U       %ld\n", nnzU);
-      printf("\tnonzeros in L+U-I   %ld\n", 
-	     nnzL + nnzU);
-      printf("\tNo of supers   %ld\n", nsuper);
-      printf("\tSize of G(L)   %ld\n", szLGr);
-      printf("\tSize of G(U)   %ld\n", szUGr);
-      printf("\tSize of G(L+U) %ld\n", szLGr+szUGr);
-
-      printf("\tParSYMBfact (MB)      :\tL\\U MAX %.2f\tAVG %.2f\n",
-	     mem_glob[0]*1e-6, 
-	     stat_glob[5]/nprocs_symb*1e-6);
-#if ( PRNTlevel>=2 )
-      printf("\tRL overestim (MB):\tL\\U MAX %.2f\tAVG %.2f\n",
-	     mem_glob[1]*1e-6, 
-	     stat_glob[6]/nprocs_symb*1e-6);
-      printf("\tsnd/rcv buffers (MB):\tL\\U MAX %.2f\tAVG %.2f\n",
-	     mem_glob[3]*1e-6, 
-	     stat_glob[8]/nprocs_symb*1e-6);
-      printf("\tSYMBfact 2*n+4*nvtcs_loc+2*maxNvtcsNds_loc:\tL\\U %.2f\n",
-	     (float) (2 * n * sizeof(int_t)) *1e-6);
-      printf("\tint_t %d, int %d, long int %d, short %d, float %d, double %d\n", 
-	     sizeof(int_t), sizeof(int),  sizeof(long int), sizeof(short), sizeof(float),
-	     sizeof(double));
-      printf("\tDNS ALLSEPS:\t MAX %d\tAVG %.2f\n",
-	     (int_t) mem_glob[4], stat_glob[9]/nprocs_symb);
-      printf("\tDNS CURSEP:\t MAX %d\tAVG %.2f\n\n",
-	     (int_t) mem_glob[5], stat_glob[10]/nprocs_symb);
-
-      printf("\t MAX FILL Mem(L+U) / Mem(A) per processor %ld\n", fill_rcmd);    
-      printf("\t      Per elt MAX %ld AVG %ld\n", 
-	     (int_t) PS.fill_pelt[4], (int_t)(PS.fill_pelt[3]/PS.fill_pelt[5]));
-      printf("\t      Per elt RL MAX %ld AVG %ld\n",
-	     (int_t) PS.fill_pelt[1], (int_t)(PS.fill_pelt[0]/PS.fill_pelt[2]));
-      printf("\tM Nops:\t MAX %.2f\tAVG %.2f\n",
-	     mem_glob[11]*1e-6, (stat_glob[16]/nprocs_symb)*1e-6);
-      
-      
-      printf("\tEXPANSIONS: MAX/AVG\n");
-      printf("\tTOTAL: %d / %.2f\n",
-	     (int_t) mem_glob[6], stat_glob[11]/nprocs_symb);
-      printf("\tREALLOC: %.f / %.2f RL_CP %.f / %.2f PR_CP %.f / %.2f\n",
-	     mem_glob[7], stat_glob[12]/nprocs_symb,
-	     mem_glob[8], stat_glob[13]/nprocs_symb,
-	     mem_glob[9], stat_glob[14]/nprocs_symb);
-      
-      printf ("\n\tDATA MSGS  noMsgs*10^3 %.3f/%.3f size (MB) %.3f/%.3f \n",
-	      stat_msgs_g[2]*1e-3, stat_msgs_g[4]/nprocs_symb*1e-3,
-	      stat_msgs_g[0]*1e-6, stat_msgs_g[8] / stat_msgs_g[6]*1e-6);
-      printf ("\tTOTAL MSGS noMsgs*10^3 %.3f/%.3f size (MB) %.3f/%.3f \n",
-	      stat_msgs_g[3]*1e-3, stat_msgs_g[5]/nprocs_symb*1e-3,
-	      stat_msgs_g[1]*1e-6, stat_msgs_g[9]/stat_msgs_g[7]*1e-6);
-#endif      
-
-#if ( PROFlevel>=1 )
-      printf("Distribute matrix time = %8.3f\n", t_symbFact[0]);
-      printf("Count vertices time    = %8.3f\n", t_symbFact[2]);
-      printf("Symbfact DIST time     = %8.3f\n", t_symbFact[1]);
-      
-      printf("\nLvl\t    Time\t    Init\t   Inter\t    Intra\n");
-      time_lvlsg[0] = 0.;
-      for (i = 0; i < nlvls; i++) {
-	for (j = 1; j < 9; j++)
-	  time_lvlsg[j] = 0.;
-	for (p = 0; p < nprocs_symb; p++) {
-	  k = p * 3 * nlvls;
-	  t = time_lvlsT[i*3+k] + time_lvlsT[i*3+k+1] + time_lvlsT[i*3+k+2];
-	  if (t > time_lvlsg[1]) {
-	    time_lvlsg[1] = t; j = p;
-	  }
-	  time_lvlsg[2] += t;
-	  if (time_lvlsT[i*3+k] > time_lvlsg[3])
-	    time_lvlsg[3] = time_lvlsT[i*3+k];
-	  time_lvlsg[4] += time_lvlsT[i*3+k];
-	  if (time_lvlsT[i*3+k+1] > time_lvlsg[5])
-	    time_lvlsg[5] = time_lvlsT[i*3+k+1];
-	  time_lvlsg[6] += time_lvlsT[i*3+k+1];
-	  if (time_lvlsT[i*3+k+2] > time_lvlsg[7])
-	    time_lvlsg[7] = time_lvlsT[i*3+k+2];
-	  time_lvlsg[8] += time_lvlsT[i*3+k+2];
-	}
-	time_lvlsg[0] += time_lvlsg[1];
-	printf ("%d \t%.3f/%.3f\t%.3f/%.3f\t%.3f/%.3f\t%.3f/%.3f\n", i,
-		time_lvlsg[1], time_lvlsg[2] / nprocs_symb,
-		time_lvlsg[3], time_lvlsg[4] / nprocs_symb,
-		time_lvlsg[5], time_lvlsg[6] /nprocs_symb,
-		time_lvlsg[7], time_lvlsg[8] / nprocs_symb); 
-      }
-      printf("\t   %8.3f \n", time_lvlsg[0]);    
-#endif
-    }
-#endif
-#if ( PROFlevel>=1 )
-    SUPERLU_FREE (time_lvls);
-    SUPERLU_FREE (time_lvlsT);
-#endif
-    symbfact_free (iam, nprocs_symb, &Llu_symbfact, &VInfo, &CS);
-  } /* if (iam < nprocs_symb) */  
-  else {
-    /* update Pslu_freeable before returning */
-    Pslu_freeable->nvtcs_loc = 0; 
-    Pslu_freeable->xlsub = NULL; Pslu_freeable->lsub = NULL; 
-    Pslu_freeable->xusub = NULL; Pslu_freeable->usub = NULL; 
-    Pslu_freeable->supno_loc = NULL;
-    Pslu_freeable->xsup_beg_loc = NULL;     
-    Pslu_freeable->xsup_end_loc = NULL;
-    
-    SUPERLU_FREE( tempArray );
-    PS.allocMem -= n * sizeof(int_t);
-  }
-
-  if (iam < nprocs_symb && nprocs_symb != 1) 
-    freeComm (iam, nprocs_symb, commLvls, symb_comm);     
-  if (commLvls != NULL)
-    SUPERLU_FREE( commLvls );
-  
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Exit psymbfact()");
-#endif
-
-  return (- PS.allocMem);
-} /* SYMBFACT_DIST */
-
-
-static int_t
-initParmsAndStats
-(
- psymbfact_stat_t *PS /* Output -statistics*/
-)
-/* 
- * Purpose
- * =======
- * Initialize relaxation parameters and statistics variables
- */
-{
-  int  i;
-
-  PS->nDnsCurSep = 0;
-  PS->nDnsUpSeps = 0;
-  
-  PS->relax_gen = 1.0;
-  PS->relax_curSep = 1.0;
-  PS->relax_seps = 1.0;
-  PS->fill_par = sp_ienv_dist(6);
-  PS->nops = 0.;
-  PS->no_shmSnd = 0.;
-  PS->no_msgsSnd = 0.;
-  PS->maxsz_msgSnd = 0;
-  PS->sz_msgsSnd = 0.;
-  PS->no_shmRcvd = 0.;
-  PS->no_msgsRcvd = 0.;
-  PS->maxsz_msgRcvd = 0;
-  PS->sz_msgsRcvd = 0.;
-  PS->no_msgsCol = 0.;
-  PS->maxsz_msgCol = 0;
-  PS->sz_msgsCol = 0.;
-
-  for (i = 0; i < 6; i++)
-    PS->fill_pelt[i] = 0.;
-
-  PS->estimUSz = 0;
-  PS->estimLSz = 0;
-  PS->maxSzLPr = 0;
-  PS->maxSzUPr = 0;
-  PS->maxSzBuf = 0;
-  PS->szDnsSep = 0;  
-  PS->allocMem = 0;
-}
-
-static float
-cntsVtcs 
-(
- int_t  n,           /* Input - order of the input matrix */
- int    iam,         /* Input - my processor number */
- int    nprocs_symb, /* Input - no of processors for symbolic factorization */
- Pslu_freeable_t *Pslu_freeable, /* Input -globToLoc and maxNvtcsPProc */
- Llu_symbfact_t  *Llu_symbfact, /* Input/Output -local L, U data structures */
- vtcsInfo_symbfact_t *VInfo,  /* Input - local info on vertices distribution */
- int_t            *tempArray, /* Input - temporary storage */
- int_t            *fstVtxSep, /* Input - first vertex of each node in the tree */
- int_t            *sizes,     /* Input - sizes of each node in the tree */
- psymbfact_stat_t *PS,  /* Input/Output -statistics */
- MPI_Comm         *commLvls
- )
-/* 
- * Purpose
- * =======
- * 
- * Computes an estimation of the number of elements in columns of L
- * and rows of U.  Stores this information in cntelt_vtcs, and it will
- * be used in the right-looking symbolic factorization.
- */
-{
-  int   fstP, lstP, szSep, npNode, i, j;
-  int_t nvtcs_loc, ind_blk, vtx, vtx_lid, ii, jj, lv, vtx_elt, cur_blk;
-  int_t fstVtx, lstVtx, fstVtx_blk, lstVtx_blk;
-  int_t nelts, nelts_new_blk;
-  int_t *xlsub, *lsub, *xusub, *usub, *globToLoc, maxNvtcsPProc;
-  int_t *minElt_vtx, *cntelt_vtcs;
-  
-  /* Initialization */
-  xlsub = Llu_symbfact->xlsub; lsub = Llu_symbfact->lsub;
-  xusub = Llu_symbfact->xusub; usub = Llu_symbfact->usub;
-  cntelt_vtcs = Llu_symbfact->cntelt_vtcs;
-  globToLoc = Pslu_freeable->globToLoc;
-  nvtcs_loc = VInfo->nvtcs_loc;
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  if (Llu_symbfact->szLsub - VInfo->nnz_ainf_loc > n)
-    minElt_vtx = lsub;
-  else { 
-    /* allocate memory for minElt_vtx */
-    if (!(minElt_vtx = intMalloc_dist(n))) {
-      fprintf(stderr, "Malloc fails for minElt_vtx[].");
-      return (PS->allocMem);
-    }
-    PS->allocMem += n * sizeof (int_t);
-  } 
-  
-  for (ii = 0; ii < n; ii++) 
-    tempArray[ii] = n;
-  for (ii = 0; ii < nvtcs_loc; ii++)
-    cntelt_vtcs[ii] = 0;
-
-  szSep = nprocs_symb;
-  i = 0;
-  cur_blk = 0;
-  vtx_lid = 0;
-  while (szSep >= 1) {
-    /* for each level in the separator tree */
-    npNode = nprocs_symb / szSep; 
-    fstP = 0; 
-    /* for each node in the level */
-    for (j = i; j < i + szSep; j++) {
-      fstVtx = fstVtxSep[j];
-      lstVtx  = fstVtx + sizes[j];
-      lstP = fstP + npNode;
-
-      if (fstP <= iam && iam < lstP) {      
-	ind_blk = cur_blk;
-	ii = vtx_lid;
-	while (VInfo->begEndBlks_loc[ind_blk] < lstVtx && 
-	       ind_blk < 2 * VInfo->nblks_loc) {	  
-	  fstVtx_blk = VInfo->begEndBlks_loc[ind_blk];
-	  lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1];
-	  ind_blk += 2;
-	  for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, ii++) {
-	    for (jj = xlsub[ii]; jj < xlsub[ii+1]; jj++) {
-	      vtx_elt = lsub[jj];
-	      if (tempArray[vtx_elt] == n) {
-		tempArray[vtx_elt] = vtx;
-	      }
-	    }
-	    for (jj = xusub[ii]; jj < xusub[ii+1]; jj++) {
-	      vtx_elt = usub[jj];
-	      if (tempArray[vtx_elt] == n) {
-		tempArray[vtx_elt] = vtx;
-	      }
-	    }
-	  }	  
-	} 
-	if (szSep == nprocs_symb) 
-	  vtx_lid = ii;
-	else {
-	  MPI_Allreduce (&(tempArray[fstVtx]), &(minElt_vtx[fstVtx]), 
-			 (int) (n - fstVtx), mpi_int_t, MPI_MIN, commLvls[j]);
-#if ( PRNTlevel>=1 )
-	  PS->no_msgsCol += (float) (2 * (int) LOG2( npNode ));
-	  PS->sz_msgsCol += (float) (n - fstVtx);
-	  if (PS->maxsz_msgCol < n - fstVtx) 
-	    PS->maxsz_msgCol = n - fstVtx;      
-#endif
-
-	  nelts = 0;
-	  for (ii = fstVtx; ii < lstVtx; ii++)
-	    tempArray[ii] = 0;
-	  for (ii = fstVtx; ii < n; ii++) {
-	    if (minElt_vtx[ii] != n) {
-	      if (minElt_vtx[ii] < fstVtx)
-		nelts ++;
-	      else
-		tempArray[minElt_vtx[ii]] ++;
-	      if (ii > lstVtx)
-		tempArray[ii] = minElt_vtx[ii];
-	    }
-	  }
-	
-	  ind_blk = cur_blk;
-	  lv = fstVtx;
-	  while (VInfo->begEndBlks_loc[ind_blk] < lstVtx && 
-		 ind_blk < 2 * VInfo->nblks_loc) {	  
-	    fstVtx_blk = VInfo->begEndBlks_loc[ind_blk];
-	    lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1];
-	    ind_blk += 2;
-	    
-	    for (ii = lv; ii < fstVtx_blk; ii++)
-	      nelts += tempArray[ii];
-	    lv = lstVtx_blk;
-
-	    nelts_new_blk = 0;
-	    for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid++) {
-	      nelts_new_blk += tempArray[vtx];
-	      cntelt_vtcs[vtx_lid] = nelts;
-	    }
-	    nelts += nelts_new_blk;
-	  }
-	} /* if (szSep != nprocs_symb) */
-	cur_blk = ind_blk;
-      }
-      fstP += npNode;
-    }
-    i += szSep;
-    szSep = szSep / 2;
-  }
-  /* free memory */
-  if (minElt_vtx != lsub) {
-    SUPERLU_FREE (minElt_vtx);
-    PS->allocMem -= n * sizeof(int_t);
-  }
-  return (SUCCES_RET);
-}
-
-static float
-symbfact_mapVtcs
-(
- int iam,             /* Input -process number */
- int nprocs_num,      /* Input -number of processors */
- int nprocs_symb,     /* Input -number of procs for symbolic factorization */
- SuperMatrix *A,      /* Input -input distributed matrix A */
- int_t *fstVtxSep,    /* Input -first vertex in each separator */
- int_t *sizes,        /* Input -size of each separator in the separator tree */
- Pslu_freeable_t *Pslu_freeable, /* Output -globToLoc and maxNvtcsPProc 
-				    computed */
- vtcsInfo_symbfact_t *VInfo, /* Output -local info on vertices distribution */
- int_t *tempArray,    /* Input -temp array of size n = order of the matrix */
- int_t  maxSzBlk,     /* Input -maximum number of vertices in a block */
- psymbfact_stat_t *PS /* Input/Output -statistics */
- ) 
-{
-/* 
- * Purpose
- * =======
- *
- *  symbfact_mapVtcs maps the vertices of the graph of the input
- *  matrix A on nprocs_symb processors, using the separator tree
- *  returned by a graph partitioning algorithm from the previous step
- *  of the symbolic factorization.  The number of processors
- *  nprocs_symb must be a power of 2.
- *
- * Description of the algorithm
- * ============================
- *
- *  A subtree to subcube algorithm is used first to map the processors
- *  on the nodes of the separator tree.
- *
- *  For each node of the separator tree, its corresponding vertices
- *  are distributed on the processors affected to this node, using a
- *  block cyclic distribution.
- *
- *  After the distribution, fields of the VInfo structure are
- *  computed.  The array globToLoc and maxNvtcsPProc of Pslu_freeable
- *  are also computed.
- *
- */
-  int szSep, npNode, firstP, p, iSep, jSep, ind_ap_s, ind_ap_d;
-  int_t k, n, kk;
-  int_t fstVtx, lstVtx;
-  int_t fstVtxBlk, ind_blk;
-  int_t noVtcsProc, noBlk;
-  int_t nvtcs_loc; /* number of vertices owned by process iam */
-  int_t nblks_loc; /* no of blocks owned by process iam */
-  int_t *globToLoc;    /* global indexing to local indexing */
-  int_t maxNvtcsPProc, maxNvtcsNds_loc, nvtcsNds_loc, maxNeltsVtx;
-  int_t *begEndBlks_loc; /* begin and end vertex of each local block */
-  int_t *vtcs_pe;  /* contains the number of vertices on each processor */
-  int   *avail_pes; /* contains the processors to be used at each level */
-  
-  n = A->ncol;
-  /* allocate memory */
-  if (!(globToLoc = intMalloc_dist(n + 1))) {
-    fprintf (stderr, "Malloc fails for globToLoc[].");
-    return (PS->allocMem);
-  }
-  PS->allocMem += (n+1) * sizeof(int_t);
-  if (!(avail_pes = (int *) SUPERLU_MALLOC(nprocs_symb*sizeof(int)))) {
-    fprintf (stderr, "Malloc fails for avail_pes[].");
-    return (PS->allocMem);
-  }
-  PS->allocMem += nprocs_symb*sizeof(int);
-  if (!(vtcs_pe = (int_t *) SUPERLU_MALLOC(nprocs_symb*sizeof(int_t)))) {
-    fprintf (stderr, "Malloc fails for vtcs_pe[].");
-    return (PS->allocMem);
-  }
-  PS->allocMem += nprocs_symb*sizeof(int_t);
-  
-  /* Initialization */
-  globToLoc[n] = n;  
-  for (p = 0; p < nprocs_symb; p++) {
-    vtcs_pe[p] = 0;
-    avail_pes[p] = EMPTY;
-  }
-  nvtcs_loc = 0;
-  nblks_loc = 0;
-  maxNvtcsNds_loc = 0;
-  maxNeltsVtx     = 0;
-  
-  /* distribute data among processors */
-  szSep = nprocs_symb;
-  iSep = 0;
-  while (szSep >= 1) {
-    /* for each level in the separator tree */
-    npNode = nprocs_symb / szSep; 
-    firstP = 0; 
-    nvtcsNds_loc = 0;
-    
-    for (jSep = iSep; jSep < iSep + szSep; jSep++) {
-      /* for each node in the level */
-      fstVtx = fstVtxSep[jSep];
-      lstVtx = fstVtx + sizes[jSep];
-      if (firstP <= iam && iam < firstP + npNode)
-	maxNeltsVtx += lstVtx - fstVtx;
-
-      if (szSep == nprocs_symb) {
-	/* leaves of the separator tree */
-	for (k = fstVtx; k < lstVtx; k++) {
-	  globToLoc[k] = (int_t) firstP;
-	  vtcs_pe[firstP] ++;
-	}
-	if (firstP == iam) {	  
-	  nvtcs_loc += lstVtx - fstVtx;
-	  nblks_loc ++;
-	}
-      }
-      else {
-	/* superior levels of the separator tree */
-	k = fstVtx;
-	noVtcsProc = maxSzBlk;
-	fstVtxBlk = fstVtx;
-	if ((jSep - iSep) % 2 == 0) ind_ap_d = (jSep - iSep) * npNode;
-	/* first allocate processors from previous levels */	
-	for (ind_ap_s = (jSep-iSep) * npNode; ind_ap_s < (jSep-iSep+1) * npNode; ind_ap_s ++) {
-	  p = avail_pes[ind_ap_s];
-	  if (p != EMPTY && k < lstVtx) {
-	    /* for each column in the separator */	  
-	    avail_pes[ind_ap_s] = EMPTY;
-	    kk = 0;
-	    while (kk < noVtcsProc && k < lstVtx) {
-	      globToLoc[k] = p;
-	      vtcs_pe[p] ++;
-	      k ++;
-	      kk ++;
-	    }
-	    if (p == iam) {
-	      nvtcs_loc += kk;
-	      nblks_loc ++;
-	      nvtcsNds_loc += kk;
-	    }
-	  }
-	  else {
-	    if (p != EMPTY && k == lstVtx) {
-	      avail_pes[ind_ap_s] = EMPTY;
-	      avail_pes[ind_ap_d] = p; ind_ap_d ++;
-	    }
-	  }
-	} 
-	noBlk = 0;
-	p = firstP + npNode;
-	while (k < lstVtx) {
-	  /* for each column in the separator */
-	  kk = 0;
-	  p = (int) (noBlk % (int_t) npNode) + firstP;
-	  while (kk < noVtcsProc && k < lstVtx) {
-	    globToLoc[k] = p;
-	    vtcs_pe[p] ++;
-	    k ++;
-	    kk ++;
-	  }
-	  if (p == iam) {
-	    nvtcs_loc += kk;
-	    nblks_loc ++;
-	    nvtcsNds_loc += kk;
-	  }
-	  noBlk ++;
-	} /* while (k < lstVtx) */
-	/* Add the unused processors to the avail_pes list of pes */
-	for (p = p + 1; p < firstP + npNode; p ++) {
-	  avail_pes[ind_ap_d] = p; ind_ap_d ++;
-	}
-      }
-      firstP += npNode;
-    }
-    if (maxNvtcsNds_loc < nvtcsNds_loc && szSep != nprocs_symb)
-      maxNvtcsNds_loc = nvtcsNds_loc;
-    iSep += szSep;
-    szSep = szSep / 2;
-  }
-  
-#if ( PRNTlevel>=2 )
-  if (!iam)
-    PrintInt10 (" novtcs_pe", nprocs_symb, vtcs_pe);
-#endif
-  /* determine maximum number of vertices among processors */
-  maxNvtcsPProc = vtcs_pe[0];
-  vtcs_pe[0] = 0;
-  for (p = 1; p < nprocs_symb; p++) {
-    if (maxNvtcsPProc < vtcs_pe[p])
-      maxNvtcsPProc = vtcs_pe[p];
-    vtcs_pe[p] = 0;
-  }
-#if ( PRNTlevel>=2 )
-  if (!iam)
-    printf ("  MaxNvtcsPerProc %d MaxNvtcs/Avg %e\n\n", 
-	    maxNvtcsPProc, ((float) maxNvtcsPProc * nprocs_symb)/(float)n);
-#endif
-
-  if (iam < nprocs_symb)
-    if (!(begEndBlks_loc = intMalloc_symbfact(2 * nblks_loc + 1)))
-      ABORT("Malloc fails for begEndBlks_loc[].");
-  
-  ind_blk = 0;
-  k = 0;
-  while (k < n) {
-    p = globToLoc[k];
-    if (p == iam) 
-      begEndBlks_loc[ind_blk] = k;
-    while (globToLoc[k] == p && k < n) {
-      globToLoc[k] = globToLoc[k] * maxNvtcsPProc + vtcs_pe[p];
-      vtcs_pe[p] ++;
-      k ++;
-    }
-    if (p == iam) {
-      begEndBlks_loc[ind_blk + 1] = k;
-      ind_blk += 2;
-    }
-  }
-  if (iam < nprocs_symb)
-    begEndBlks_loc[2 * nblks_loc] = n;
- 
-  SUPERLU_FREE (avail_pes);
-  SUPERLU_FREE (vtcs_pe);
-  
-  Pslu_freeable->maxNvtcsPProc   = maxNvtcsPProc;
-  Pslu_freeable->globToLoc       = globToLoc;
-  if (iam < nprocs_symb) {
-    VInfo->maxNvtcsNds_loc = maxNvtcsNds_loc;
-    VInfo->nblks_loc       = nblks_loc;
-    VInfo->nvtcs_loc       = nvtcs_loc;
-    VInfo->curblk_loc      = 0;
-    VInfo->maxNeltsVtx     = maxNeltsVtx;
-    VInfo->filledSep       = FALSE;
-    VInfo->xlsub_nextLvl   = 0;
-    VInfo->xusub_nextLvl   = 0;
-    VInfo->begEndBlks_loc  = begEndBlks_loc;
-    VInfo->fstVtx_nextLvl  = begEndBlks_loc[0];
-  }
-  return SUCCES_RET;
-}
-
-static void 
-symbfact_distributeMatrix
-(
- int   iam,             /* Input - my processor number */  
- int   nprocs_num,      /* Input - number of processors */
- int   nprocs_symb,     /* Input - number of processors for the
-			   symbolic factorization */
- SuperMatrix *A,        /* Input - input matrix A */
- int_t *perm_c,         /* Input - column permutation */
- int_t *perm_r,         /* Input - row permutation */
- matrix_symbfact_t *AS, /* Output - temporary storage for the
-			   redistributed matrix */
- Pslu_freeable_t *Pslu_freeable, /* Input - global to local information */
- vtcsInfo_symbfact_t *VInfo,  /* Input - local info on vertices
-				 distribution */
- int_t  *tempArray,     /* Input/Output - temporary array of size n
-			   (order of the matrix) */
- MPI_Comm    *num_comm  /* Input - communicator for nprocs_num procs */
- )
-{
-/*
- * Purpose 
- * =======
- *
- * Distribute input matrix A for the symbolic factorization routine.
- * Only structural information is distributed.  The redistributed
- * matrix has its rows and columns permuted according to perm_r and
- * perm_c. A is not modified during this routine.
- *
- */
-/* Notations:
- * Ainf : inferior part of A, including diagonal.
- * Asup : superior part of A.
- */
-  int p, p_irow, code_err, ainf_data;
-  int_t n, m_loc, fst_row;
-  int_t i, j, k, irow, jcol;
-  NRformat_loc *Astore;
-  int_t  nnz_loc, nnz_iam;    /* number of local nonzeros */
-  int_t  nnz_remote; /* number of remote nonzeros to be sent */
-  int_t  SendCnt; /* number of remote nonzeros to be sent */
-  int_t  RecvCnt; /* number of remote nonzeros to be received */
-  /* number of nonzeros to send/receive per processor */
-  int_t  *nnzToSend, *nnzToRecv; 
-  int_t *nnzAinf_toSnd; /* nnz in Ainf to send */
-  /* VInfo data structures */
-  int_t *globToLoc, *begEndBlks_loc, nblks_loc, nvtcs_loc, maxNvtcsPProc;
-  
-  int_t neltsRow, vtx, vtx_lid, nelts, ind;
-  int_t *snd_aind, *rcv_aind;
-  int_t *ptr_toSnd, *buf, *ptr_toRcv;
-  /* matrix_symbfact_t *As data */
-  int_t *x_ainf, *x_asup, *ind_ainf, *ind_asup;
-  int  *intBuf1, *intBuf2, *intBuf3, *intBuf4;
-
-  /* ------------------------------------------------------------
-     INITIALIZATION.
-     ------------------------------------------------------------*/
-  Astore = (NRformat_loc *) A->Store;
-  n = A->ncol;
-  m_loc = Astore->m_loc;
-  fst_row = Astore->fst_row;
-  globToLoc = Pslu_freeable->globToLoc;
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  nnzToRecv = intCalloc_symbfact(3 * (int_t)nprocs_num);
-  nnzToSend = nnzToRecv + nprocs_num;
-  nnzAinf_toSnd = nnzToRecv + 2 * nprocs_num;
-
-  /* --------------------------------------------------------------------- 
-    COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, THEN ALLOCATE
-    SPACE.  THIS ACCOUNTS FOR THE FIRST PASS OF A.
-    ----------------------------------------------------------------------*/
-  /* tempArray stores the number of nonzeros in each column of ainf */
-  for (i = 0; i < n; i++)
-    tempArray[i] = 0;
-  for (i = 0; i < m_loc; i++) {
-    irow   = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
-    p_irow = OWNER(globToLoc[irow]);
-    neltsRow = 0;
-
-    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; j++) {
-      jcol = perm_c[Astore->colind[j]];
-      if (jcol <= irow) {
-	p = OWNER(globToLoc[jcol]);
-	if (tempArray[jcol] == 0) {
-	  nnzToSend[p] += 2;
-	  nnzAinf_toSnd[p] += 2;
-	}
-	tempArray[jcol] ++;
-	nnzAinf_toSnd[p] ++;
-      }
-      else {
-	p = p_irow;
-	neltsRow ++;
-      }
-      nnzToSend[p] ++; 
-    }
-    if (neltsRow != 0) {
-      nnzToSend[p_irow] += 2;
-    }
-  }
-  
-  /* add one entry which will separate columns of Ainf from rows
-     of Asup */
-  for (p = 0; p < nprocs_num; p++)
-    if (nnzToSend[p] != 0)
-      nnzToSend[p] ++;
-  
-  /* All-to-all communication */
-  MPI_Alltoall (nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t,
-		(*num_comm));
-
-  nnz_loc = SendCnt = RecvCnt = 0;
-  for (p = 0; p < nprocs_num; p++) {
-    if ( p != iam ) {
-      SendCnt += nnzToSend[p];
-      RecvCnt += nnzToRecv[p];
-    } else {
-      nnz_loc += nnzToRecv[p];
-      nnzToSend[p] = 0;
-    }
-  }
-  nnz_iam = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */
-  
-  /* Allocate temporary storage for sending/receiving the A triplets. */
-  if (!(snd_aind = intMalloc_symbfact(SendCnt)) && SendCnt != 0)
-    ABORT("Malloc fails for snd_aind[].");
-  if ( !(rcv_aind = intMalloc_symbfact(nnz_iam + 1)))
-    ABORT("Malloc fails for rcv_aind[].");
-  if ( !(ptr_toSnd = intCalloc_symbfact((int_t) nprocs_num)) )
-    ABORT("Malloc fails for ptr_toSnd[].");
-  if ( !(ptr_toRcv = intCalloc_symbfact((int_t) nprocs_num)) )
-    ABORT("Malloc fails for ptr_toRcv[].");
-
-  /* setup ptr_toSnd[p] to point to data in snd_aind to be send to 
-   processor p */
-  for (i = 0, j = 0, p = 0; p < nprocs_num; p++) {
-    if ( p != iam ) 
-      ptr_toSnd[p] = i;
-    else
-      ptr_toSnd[p] = j;
-    i += nnzToSend[p]; 
-    j += nnzToRecv[p];
-  }
-
-  for (i = 0; i < n; i++) {
-    if (tempArray[i] != 0) {
-      /* column i of Ainf will be send to a processor  */
-      p = OWNER( globToLoc[i] );
-      if (p == iam) {
-	buf = &(rcv_aind[ptr_toSnd[p]]);
-      }
-      else {
-	buf = &(snd_aind[ptr_toSnd[p]]);
-      }
-      buf[0] = tempArray[i];
-      buf[1] = i;
-      tempArray[i] = ptr_toSnd[p] + 2;
-      ptr_toSnd[p] += 2 + buf[0];
-    }
-  }
-
-  /* set ptr_toSnd to point to Asup data (stored by rows) */
-  for (i = 0, j = 0, p = 0; p < nprocs_num; p++) {
-    if ( p != iam ) {
-      if (nnzToSend[p] != 0) { 
-	snd_aind[i + nnzAinf_toSnd[p]] = EMPTY;
-	ptr_toSnd[p] = i + nnzAinf_toSnd[p] + 1;
-      }
-    }
-    else {
-      if (nnzToRecv[p] != 0) {
-	rcv_aind[j + nnzAinf_toSnd[p]] = EMPTY;
-	ptr_toSnd[p] = j + nnzAinf_toSnd[p] + 1;
-      }
-    }
-    i += nnzToSend[p]; 
-    j += nnzToRecv[p];
-  }
-
-  /* ------------------------------------------------------------
-     LOAD THE ENTRIES OF A INTO THE snd_aind STRUCTURE TO SEND.
-     THIS ACCOUNTS FOR THE SECOND PASS OF A.
-     For each processor, we store first the columns to be sent,
-     and then the rows to be sent. For each row/column sent:
-     entry 0            : x = number of elements in that row/column
-     entry 1            : row/column number
-     entries 2 .. x + 2 : row/column indices.
-     ------------------------------------------------------------*/
-  for (i = 0; i < m_loc; i++) {
-    irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*A */
-    p_irow = OWNER( globToLoc[irow] );
-    ptr_toSnd[p_irow] +=2;
-    neltsRow = 0;
-    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; j++) {
-      jcol = perm_c[Astore->colind[j]];
-      if (jcol <= irow) {
-	p = OWNER( globToLoc[jcol] );
-	k = tempArray[jcol];
-	tempArray[jcol] ++;
-	if (p == iam) { /* local */
-	  rcv_aind[k] = irow; 
-	}
-	else {
-	  snd_aind[k] = irow;
-	}
-      }
-      else {
-	p = p_irow;
-	neltsRow ++;
-	k = ptr_toSnd[p];
-	ptr_toSnd[p] ++;
-	if (p == iam) { /* local */
-	  rcv_aind[k] = jcol;
-	}
-	else {
-	  snd_aind[k] = jcol;
-	}
-      }
-    }
-
-    if (neltsRow == 0)
-      ptr_toSnd[p_irow] -= 2;
-    else {
-      /* store entry 0 and entry 1 */
-      if (p_irow == iam) { /* local */
-	rcv_aind[ptr_toSnd[p_irow] - neltsRow - 2] = neltsRow;
-	rcv_aind[ptr_toSnd[p_irow] - neltsRow - 1] = irow;
-      }
-      else { /* remote */
-	snd_aind[ptr_toSnd[p_irow] - neltsRow - 2] = neltsRow;
-	snd_aind[ptr_toSnd[p_irow] - neltsRow - 1] = irow;
-      }
-    }
-  }
-  
-  /* reset ptr_toSnd to point to the beginning of the data for
-     each processor (structure needed in MPI_Alltoallv */
-  for (i = 0, j = 0, p = 0; p < nprocs_num; p++) {
-    ptr_toSnd[p] = i;
-    i += nnzToSend[p];
-    ptr_toRcv[p] = j;
-    j += nnzToRecv[p];
-  }
-  
-  /* ------------------------------------------------------------
-     PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION.
-     Note: it uses MPI_Alltoallv.
-     ------------------------------------------------------------*/
-  if (nprocs_num > 1) {
-#if defined (_LONGINT)
-    intBuf1 = (int *) SUPERLU_MALLOC(4 * nprocs_num * sizeof(int));
-    intBuf2 = intBuf1 + nprocs_num;
-    intBuf3 = intBuf1 + 2 * nprocs_num;
-    intBuf4 = intBuf1 + 3 * nprocs_num;
-    
-    for (p=0; p<nprocs_num; p++) {
-      if (nnzToSend[p] > INT_MAX || ptr_toSnd[p] > INT_MAX ||
-	  nnzToRecv[p] > INT_MAX || ptr_toRcv[p] > INT_MAX)
-	ABORT("ERROR in symbfact_distributeMatrix size to send > INT_MAX\n");
-      intBuf1[p] = (int) nnzToSend[p];
-      intBuf2[p] = (int) ptr_toSnd[p];
-      intBuf3[p] = (int) nnzToRecv[p];
-      intBuf4[p] = (int) ptr_toRcv[p];
-    }
-#else  /* Default */
-    intBuf1 = nnzToSend;  intBuf2 = ptr_toSnd;
-    intBuf3 = nnzToRecv;  intBuf4 = ptr_toRcv;
-#endif
-
-    i = nnzToRecv[iam]; /* This corresponds to nnzToRecv[iam] */
-    intBuf3[iam] = 0;
-    intBuf1[iam] = 0;
-
-    MPI_Alltoallv (snd_aind, intBuf1, intBuf2, mpi_int_t, 
-		   rcv_aind, intBuf3, intBuf4, mpi_int_t,
-		   (*num_comm));
-#if defined (_LONGINT)
-    SUPERLU_FREE (intBuf1);
-#endif
-    nnzToRecv[iam] = i;
-  }
-  
-  /* ------------------------------------------------------------
-     DEALLOCATE SEND STORAGE
-     ------------------------------------------------------------*/
-  if (snd_aind) SUPERLU_FREE( snd_aind );
-  SUPERLU_FREE( ptr_toSnd );
-
-  /* ------------------------------------------------------------
-     CONVERT THE RECEIVED FORMAT INTO THE SYMBOLIC FORMAT.
-     THIS IS PERFORMED ONLY BY NPROCS_SYMB PROCESSORS
-     ------------------------------------------------------------*/
-  if (iam < nprocs_symb) {
-    nblks_loc = VInfo->nblks_loc;
-    begEndBlks_loc = VInfo->begEndBlks_loc;
-    nvtcs_loc = VInfo->nvtcs_loc;
-    /* ------------------------------------------------------------
-       Allocate space for storing indices of A after redistribution.
-       ------------------------------------------------------------*/
-    if (!(x_ainf = intCalloc_symbfact (nvtcs_loc + 1)))
-      ABORT("Malloc fails for x_ainf[].");
-    if (!(x_asup = intCalloc_symbfact (nvtcs_loc + 1)))
-      ABORT("Malloc fails for x_asup[].");
-    
-    /* Initialize the array of columns/rows pointers */
-    for (i = 0, p = 0; p < nprocs_num; p++) {
-      ainf_data = TRUE;
-      k = 0;
-      while (k < nnzToRecv[p]) {
-	j = rcv_aind[i + k];
-	if (j == EMPTY) {
-	  ainf_data = FALSE;
-	  k ++;
-	}
-	else {
-	  nelts = rcv_aind[i + k];
-	  vtx = rcv_aind[i + k + 1];
-	  vtx_lid = LOCAL_IND( globToLoc[vtx] );
-	  k += nelts + 2;
-	  if (ainf_data) 
-	    x_ainf[vtx_lid] += nelts; 
-	  else 
-	    x_asup[vtx_lid] = nelts;
-	}
-      }
-      i += nnzToRecv[p];
-    }
-    
-    /* copy received information */
-    vtx_lid = 0;
-    for (i = 0, k = 0, j = 0; i < nblks_loc; i++) {
-      for (vtx = begEndBlks_loc[2*i]; vtx < begEndBlks_loc[2*i+1]; vtx++, vtx_lid ++) {
-	nelts = x_ainf[vtx_lid];
-	x_ainf[vtx_lid] = k;
-	k += nelts;
-	nelts = x_asup[vtx_lid];
-	x_asup[vtx_lid] = j;
-	j += nelts;
-	tempArray[vtx] = x_ainf[vtx_lid];
-      }
-    }
-    x_ainf[nvtcs_loc] = k;
-    x_asup[nvtcs_loc] = j;
-    
-    /* Allocate space for storing indices of A after conversion */
-    if ( !(ind_ainf = intMalloc_symbfact(x_ainf[nvtcs_loc])) && x_ainf[nvtcs_loc] != 0 )
-      ABORT("Malloc fails for ind_ainf[].");
-    if ( !(ind_asup = intMalloc_symbfact(x_asup[nvtcs_loc])) && x_asup[nvtcs_loc] != 0)
-      ABORT("Malloc fails for ind_asup[].");
-    
-    /* Copy the data into the row/column oriented storage */  
-    for (i = 0, p = 0; p < nprocs_num; p++) {
-      ainf_data = TRUE;
-      k = 0;
-      while (k < nnzToRecv[p]) {
-	j = rcv_aind[i + k];
-	if (ainf_data && j == EMPTY) {
-	  ainf_data = FALSE;
-	  k ++;
-	}
-	else {
-	  nelts = rcv_aind[i + k];
-	  vtx = rcv_aind[i + k + 1];
-	  vtx_lid = LOCAL_IND( globToLoc[vtx] );
-	  if (ainf_data) {
-	    /* traverse ainf data */
-	    ind = tempArray[vtx];
-	    for (j = i + k + 2; j < i + k + 2 + nelts; j++, ind ++) 
-	      ind_ainf[ind] = rcv_aind[j];
-	    tempArray[vtx] = ind;
-	  }
-	  else {
-	    /* traverse asup data */
-	    ind = x_asup[vtx_lid];
-	    for (j = i + k + 2; j < i + k + 2 + nelts; j++, ind ++) 
-	      ind_asup[ind] = rcv_aind[j];
-	  }
-	  k += nelts + 2;
-	}
-      }
-      i += nnzToRecv[p];
-    }
-    
-    /* ------------------------------------------------------------
-       DEALLOCATE TEMPORARY STORAGE
-       ------------------------------------------------------------*/
-    SUPERLU_FREE( ptr_toRcv );
-    if (rcv_aind) SUPERLU_FREE( rcv_aind );
-    if (nnzToRecv) SUPERLU_FREE( nnzToRecv );
-
-    AS->x_ainf = x_ainf;
-    AS->x_asup = x_asup;
-    AS->ind_ainf = ind_ainf;
-    AS->ind_asup = ind_asup;
-    
-    VInfo->nnz_asup_loc = x_asup[nvtcs_loc];
-    VInfo->nnz_ainf_loc = x_ainf[nvtcs_loc];
-  }
-}
-
-static
-float allocPrune_lvl
-(
- Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data
-				  structures */
- vtcsInfo_symbfact_t *VInfo,   /* Input -local info on vertices
-				  distribution */
- psymbfact_stat_t *PS          /* Input -statistics */
- )
-/*
- * Allocate storage for data structures necessary for pruned graphs.
- * For those unpredictable size, make a guess as FILL * n.
- * Return value:
- *     0 if enough memory was available;
- *     otherwise, return the amount of space intended to allocate 
- *     when memory allocation failure occurred.
- */
-{
-  int_t  lword;
-  int_t  nzlmaxPr, nzumaxPr, *xlsubPr, *xusubPr, *lsubPr, *usubPr;
-  int_t  nvtcs_loc, no_expand_pr, x_sz;
-  float  alpha = 1.5;
-  int_t  FILL = sp_ienv_dist(6);
-  
-  nvtcs_loc = VInfo->nvtcs_loc;
-  
-  no_expand_pr = 0;
-  lword     = (int_t) sizeof(int_t);
-  
-  /* free memory allocated for the domain symbolic factorization */
-  if (Llu_symbfact->szLsubPr)
-    SUPERLU_FREE( Llu_symbfact->lsubPr );
-  if (Llu_symbfact->szUsubPr)
-    SUPERLU_FREE( Llu_symbfact->usubPr );
-  if (Llu_symbfact->xlsubPr)
-    SUPERLU_FREE( Llu_symbfact->xlsubPr );
-  if (Llu_symbfact->xusubPr)
-    SUPERLU_FREE( Llu_symbfact->xusubPr );
-  
-  Llu_symbfact->xlsub_rcvd = intMalloc_symbfact (VInfo->maxSzBlk + 1);
-  Llu_symbfact->xusub_rcvd = intMalloc_symbfact (VInfo->maxSzBlk + 1);
-
-  /* allocate memory to use during superior levels of sep_tree */
-  x_sz = SUPERLU_MIN( VInfo->maxNvtcsNds_loc, VInfo->maxSzBlk);
-  nzlmaxPr = 2 * FILL * VInfo->maxNvtcsNds_loc;
-  nzumaxPr = 2 * FILL * VInfo->maxSzBlk;  
-
-  /* Integer pointers for L\U factors */
-  if (x_sz != 0) {
-    xlsubPr   = intMalloc_symbfact(VInfo->maxNvtcsNds_loc + 1);
-    xusubPr   = intMalloc_symbfact(VInfo->maxNvtcsNds_loc + 1);
-    
-    lsubPr = (int_t *) SUPERLU_MALLOC (nzlmaxPr * lword);
-    usubPr = (int_t *) SUPERLU_MALLOC (nzumaxPr * lword);
-    
-    while ( !lsubPr || !usubPr ) {
-      if ( lsubPr ) SUPERLU_FREE( lsubPr ); 
-      if ( usubPr ) SUPERLU_FREE( usubPr );
-      
-      nzlmaxPr /= 2;     nzlmaxPr = alpha * (float) nzlmaxPr;
-      nzumaxPr /= 2;     nzumaxPr = alpha * (float) nzumaxPr;
-      
-      if ( nzumaxPr < x_sz ) {
-	fprintf(stderr, "Not enough memory to perform factorization.\n");
-	return (PS->allocMem);
-      }
-      lsubPr  = (int_t *) SUPERLU_MALLOC(nzlmaxPr * lword);
-      usubPr  = (int_t *) SUPERLU_MALLOC(nzumaxPr * lword);
-      ++no_expand_pr;
-    }
-  }    
-  else {
-    xlsubPr = NULL; lsubPr = NULL;
-    xusubPr = NULL; usubPr = NULL;
-    nzlmaxPr = 0; nzumaxPr = 0;
-  }
-  
-  if (VInfo->maxNvtcsNds_loc)
-    Llu_symbfact->cntelt_vtcsA_lvl = 
-      (int_t *) SUPERLU_MALLOC (VInfo->maxNvtcsNds_loc * lword);
-
-  if (PS->maxSzLPr < Llu_symbfact->indLsubPr)
-    PS->maxSzLPr = Llu_symbfact->indLsubPr;
-  if (PS->maxSzUPr < Llu_symbfact->indUsubPr)
-    PS->maxSzUPr = Llu_symbfact->indUsubPr;
-  
-  Llu_symbfact->lsubPr   = lsubPr;
-  Llu_symbfact->xlsubPr  = xlsubPr;
-  Llu_symbfact->usubPr   = usubPr;
-  Llu_symbfact->xusubPr  = xusubPr;
-  Llu_symbfact->szLsubPr = nzlmaxPr;
-  Llu_symbfact->szUsubPr = nzumaxPr;
-  Llu_symbfact->indLsubPr = 0;
-  Llu_symbfact->indUsubPr = 0;
-
-  Llu_symbfact->no_expand_pr += no_expand_pr;
-  return 0;
-}
-
-static float 
-allocPrune_domain
-(
- int_t fstVtx,  /* Input - first vertex of current node */ 
- int_t lstVtx,  /* Input - last vertex of current node */
- Llu_symbfact_t *Llu_symbfact, /* Output - local L, U data
-				  structures */
- vtcsInfo_symbfact_t *VInfo,   /* Input -local info on vertices
-				  distribution */
- psymbfact_stat_t *PS           /* Input -statistics */
- )
-/*
- * Allocate storage for data structures necessary for pruned graphs.
- * For those unpredictable size, make a guess as FILL * n.
- * Return value:
- *     0 if enough memory was available;
- *     otherwise, return the amount of space intended to allocate 
- *     when memory allocation failure occurred.
- */
-{
-  int_t  lword;
-  int_t  nzlmaxPr, nzumaxPr, *xlsubPr, *xusubPr, *lsubPr, *usubPr;
-  int_t  nvtcs_loc, no_expand_pr, x_sz;
-  float  alpha = 1.5;
-  int_t  FILL = 2 * sp_ienv_dist(6);
-  
-  nvtcs_loc = VInfo->nvtcs_loc;
-  
-  no_expand_pr = 0;
-  lword     = (int_t) sizeof(int_t);
-  
-  /* allocate memory to use during domain_symbolic routine */
-  /* Guess for prune graph */
-  x_sz = lstVtx - fstVtx;
-  nzlmaxPr = nzumaxPr = 2*FILL * x_sz;
-  
-  /* Integer pointers for L\U factors */
-  if (x_sz != 0) {
-    xlsubPr   = intMalloc_symbfact(x_sz+1);
-    xusubPr   = intMalloc_symbfact(x_sz+1);
-    
-    lsubPr = (int_t *) SUPERLU_MALLOC (nzlmaxPr * lword);
-    usubPr = (int_t *) SUPERLU_MALLOC (nzumaxPr * lword);
-    
-    while ( !lsubPr || !usubPr ) {
-      if ( lsubPr ) SUPERLU_FREE(lsubPr); 
-      if ( usubPr ) SUPERLU_FREE(usubPr);
-      
-      nzlmaxPr /= 2;     nzlmaxPr = alpha * (float) nzlmaxPr;
-      nzumaxPr /= 2;     nzumaxPr = alpha * (float) nzumaxPr;
-      
-      if ( nzumaxPr < x_sz ) {
-	fprintf(stderr, "Not enough memory to perform factorization.\n");
-	return (PS->allocMem);
-      }
-      lsubPr  = (void *) SUPERLU_MALLOC(nzlmaxPr * lword);
-      usubPr  = (void *) SUPERLU_MALLOC(nzumaxPr * lword);
-      ++no_expand_pr;
-    }
-  }    
-  else {
-    xlsubPr = NULL;
-    xusubPr = NULL;
-  }
-  
-  Llu_symbfact->lsubPr   = lsubPr;
-  Llu_symbfact->xlsubPr  = xlsubPr;
-  Llu_symbfact->usubPr   = usubPr;
-  Llu_symbfact->xusubPr  = xusubPr;
-  Llu_symbfact->szLsubPr = nzlmaxPr;
-  Llu_symbfact->szUsubPr = nzumaxPr;
-  Llu_symbfact->indLsubPr = 0;
-  Llu_symbfact->indUsubPr = 0;
-  Llu_symbfact->xlsub_rcvd = NULL;
-  Llu_symbfact->xusub_rcvd = NULL;
-  Llu_symbfact->cntelt_vtcsA_lvl = NULL;
-
-  PS->maxSzLPr = Llu_symbfact->indLsubPr;
-  PS->maxSzUPr = Llu_symbfact->indUsubPr;
-
-  Llu_symbfact->no_expand_pr = no_expand_pr;
-  Llu_symbfact->no_expcp = 0;
-  return 0;
-}
-
-/************************************************************************/
-static
-int symbfact_alloc
-/************************************************************************/
-(
- int_t n,       /* Input - order of the matrix */
- int   nprocs,  /* Input - number of processors for the symbolic
-		   factorization */  
- Pslu_freeable_t *Pslu_freeable, 
- Llu_symbfact_t *Llu_symbfact, /* Output - local L, U data structures */
- vtcsInfo_symbfact_t *VInfo,   /* Input - local info on vertices
-				  distribution */
- comm_symbfact_t *CS, /* Input -information on communication */
- psymbfact_stat_t *PS /* Input -statistics */
- )
-/*
- * Allocate storage for the data structures common to symbolic factorization
- * routines. For those unpredictable size, make a guess as FILL * nnz(A).
- * Return value:
- *     0 if enough memory was available;
- *     otherwise, return the amount of space intended to allocate 
- *     when memory allocation failure occurred.
- */
-{
-  int    nlvls, p;  /* no of levels in the separator tree */
-  int_t  lword, no_expand;
-  int_t  *xsup, *supno;
-  int_t  *lsub, *xlsub;
-  int_t  *usub, *xusub;
-  int_t  nzlmax, nzumax, nnz_a_loc;
-  int_t  nvtcs_loc, *cntelt_vtcs;
-  float  alpha = 1.5;
-  int_t  FILL = sp_ienv_dist(6);
-  
-  nvtcs_loc = VInfo->nvtcs_loc;
-  nnz_a_loc = VInfo->nnz_ainf_loc + VInfo->nnz_asup_loc;
-  nlvls = (int) LOG2( nprocs ) + 1;
-  no_expand = 0;
-  lword     = sizeof(int_t);
-  
-  /* Guess for L\U factors */
-  nzlmax = nzumax = FILL * nnz_a_loc;
-  
-  /* Integer pointers for L\U factors */
-  supno  = intMalloc_symbfact(nvtcs_loc+1);
-  xlsub  = intMalloc_symbfact(nvtcs_loc+1);
-  xusub  = intMalloc_symbfact(nvtcs_loc+1);
-  
-  lsub = (void *) SUPERLU_MALLOC(nzlmax * lword);
-  usub = (void *) SUPERLU_MALLOC(nzumax * lword);
-  
-  while ( !lsub || !usub ) {
-    if (!lsub) SUPERLU_FREE(lsub); 
-    if (!usub) SUPERLU_FREE(usub);
-    
-    nzlmax /= 2;     nzlmax = alpha * nzlmax;
-    nzumax /= 2;     nzumax = alpha * nzumax;
-    
-    if ( nzumax < nnz_a_loc/2 ) {
-      fprintf(stderr, "Not enough memory to perform factorization.\n");
-      return (PS->allocMem);
-    }
-    lsub  = (void *) SUPERLU_MALLOC(nzlmax * lword);
-    usub  = (void *) SUPERLU_MALLOC(nzumax * lword);
-    ++no_expand;
-  }
-  
-  if (nprocs == 1)
-    cntelt_vtcs = NULL;
-  else 
-    cntelt_vtcs = intMalloc_symbfact (nvtcs_loc+1);
-  
-  /* allocate memory for communication data structures */
-  CS->rcv_interLvl = intMalloc_symbfact (2 * (int_t) nprocs + 1);
-  CS->snd_interLvl = intMalloc_symbfact (2 * (int_t) nprocs + 1);
-  CS->ptr_rcvBuf   = intMalloc_symbfact (2 * (int_t) nprocs );
-  CS->rcv_intraLvl = intMalloc_symbfact ((int_t) nprocs + 1);
-  CS->snd_intraLvl = intMalloc_symbfact ((int_t) nprocs + 1);
-  
-  CS->snd_interSz  = intMalloc_symbfact ((int_t) nlvls + 1);
-  CS->snd_LinterSz = intMalloc_symbfact ((int_t) nlvls + 1);  
-  CS->snd_vtxinter = intMalloc_symbfact ((int_t) nlvls + 1);  
-  CS->rcv_bufSz    = 0;
-  CS->rcv_buf      = NULL;
-  CS->snd_bufSz    = 0;
-  CS->snd_buf      = NULL;
-
-  for (p = 0; p < nprocs; p++) {
-    CS->rcv_interLvl[p] = EMPTY;
-    CS->snd_interLvl[p] = EMPTY;
-    CS->rcv_intraLvl[p] = EMPTY;
-    CS->snd_intraLvl[p] = EMPTY;
-  }
-  
-  for (p = 0; p <= nlvls; p++) {
-    CS->snd_vtxinter[p] = EMPTY;
-    CS->snd_interSz[p]  = 0;
-    CS->snd_LinterSz[p] = 0;
-  }
-  
-  Pslu_freeable->supno_loc   = supno;
-  Llu_symbfact->lsub   = lsub;
-  Llu_symbfact->xlsub  = xlsub;
-  Llu_symbfact->usub   = usub;
-  Llu_symbfact->xusub  = xusub;
-  Llu_symbfact->szLsub = nzlmax;
-  Llu_symbfact->szUsub = nzumax;
-  Llu_symbfact->cntelt_vtcs = cntelt_vtcs;
-  
-  Llu_symbfact->no_expand = no_expand;  
-  
-  return SUCCES_RET;
-} /* SYMBFACT_ALLOC */
-
-static int_t 
-symbfact_vtx
-(
- int_t n,         /* Input - order of the matrix */
- int   iam,       /* Input - my processor number */
- int_t vtx,       /* Input - vertex number to perform symbolic factorization */
- int_t vtx_lid,   /* Input - local vertex number */
- int_t vtx_prid,  /* Input - */
- int_t computeL,  /* Input - TRUE when compute column L(:,vtx)
-		             otheriwse compute row U(vtx, :) */
- int   domain_symb,  /* Input - if TRUE, computation corresponds to the independent
-			domain at the bottom of the separator tree */
- int_t fstVtx,       /* Input - first vertex of current node */ 
- int_t lstVtx,       /* Input - last vertex of current node */
- int_t snrep_lid,    /* local index of current supernode reprezentative */
- int_t szSn,         /* size of supernode with snrep_lid reprezentative */
- int_t *p_next,      /* next element in sub structure */
- int_t *marker,      
- int_t *sub_rcvd,    /* elements of node */
- int_t sub_rcvd_sz,  /* size of sub to be explored */
- Pslu_freeable_t *Pslu_freeable,
- Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
- vtcsInfo_symbfact_t *VInfo,    /* Input/Output - local info on vertices distribution */
- psymbfact_stat_t *PS,
- int_t *p_neltsVtxInit,
- int_t *p_neltsVtx,
- int_t *p_neltsVtx_CSep,
- int_t *p_neltsZrVtx,
- int_t *p_neltsMatched,
- int_t mark_vtx,
- int_t *p_prval_curvtx,
- int_t vtx_bel_othSn,
- int_t *p_vtx_bel_mySn
- )
-{ 
-  int_t x_aind_beg, x_aind_end;
-  int_t k, vtx_elt, ind, pr, pr_lid, mem_error, ii, jj, compRcvd;
-  int_t *xsub, *sub, *xsubPr, *subPr, *xsub_rcvd, *xsub_src, *sub_src;
-  int_t pr_elt, next, prval_curvtx, maxNvtcsPProc;
-  int_t  neltsVtx, neltsMatched, neltsZrVtx, neltsZrSn, neltsVtx_CSep;
-  int_t  neltsVtxInit, kk;
-  int   diagind, upd_lstSn;
-  
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  upd_lstSn     = FALSE;
-  diagind       = FALSE;
-  prval_curvtx  = *p_prval_curvtx;
-  neltsVtx_CSep = 0;
-  next = *p_next;
-  if (computeL) {
-    xsub = Llu_symbfact->xlsub; sub = Llu_symbfact->lsub;
-    xsub_rcvd = Llu_symbfact->xlsub_rcvd;
-    xsubPr = Llu_symbfact->xusubPr; subPr = Llu_symbfact->usubPr;
-  }
-  else {
-    xsub = Llu_symbfact->xusub; sub = Llu_symbfact->usub;
-    xsub_rcvd = Llu_symbfact->xusub_rcvd;
-    xsubPr = Llu_symbfact->xlsubPr; subPr = Llu_symbfact->lsubPr;
-  }
-
-  x_aind_beg = xsub[vtx_lid];
-  x_aind_end = xsub[vtx_lid + 1];
-  xsub[vtx_lid] = next;
-  k = x_aind_beg;
-  /* while (sub[k] != EMPTY && k < x_aind_end) { */
-  while (k < x_aind_end) {
-    if (sub[k] == EMPTY)
-      k = x_aind_end;
-    else {
-      vtx_elt = sub[k];
-      if (!computeL)
-	if (marker[vtx_elt] == mark_vtx - 2)
-	  if (vtx_elt < prval_curvtx)
-	    prval_curvtx = vtx_elt;
-      marker[vtx_elt] = mark_vtx;
-      if (computeL && vtx_elt == vtx)
-	diagind = TRUE;
-      if (!computeL && vtx_elt == vtx)
-	printf ("Pe[%d] ERROR diag elt in U part vtx %d dom_s %d fstV %d lstV %d\n", 
-		iam, vtx, domain_symb, fstVtx, lstVtx);
-      else {
-	sub[next] = vtx_elt; 
-	next ++;
-      }
-      if (vtx_elt < lstVtx) neltsVtx_CSep ++;
-      k++;
-    }
-  }
-  neltsVtxInit = k - x_aind_beg;
-  PS->nops += neltsVtxInit;
-  
-  if (domain_symb) {
-    if (computeL)
-      VInfo->nnz_ainf_loc -= x_aind_end - x_aind_beg;
-    else    
-      VInfo->nnz_asup_loc -= x_aind_end - x_aind_beg;
-  }
-
-#ifdef TEST_SYMB
-  printf ("compL %d vtx %d vtx_lid %d vtx_prid %d vtx_bel_othSn %d\n", 
-	  computeL, vtx, vtx_lid, vtx_prid, vtx_bel_othSn);
-  PrintInt10 ("A(:, v)", x_aind_end - x_aind_beg, &(sub[xsub[vtx_lid]]));
-#endif
-
-  ind = xsubPr[vtx_prid];
-  if (vtx_bel_othSn == vtx)
-    upd_lstSn = TRUE;
-
-  while (ind != EMPTY || upd_lstSn) {
-    if (upd_lstSn ) {
-      upd_lstSn = FALSE;
-      pr_lid = snrep_lid;
-    }
-    else {
-      pr_lid = subPr[ind];
-      ind = subPr[ind - 1];
-    }
-    
-    if (!computeL)
-      marker[vtx] = mark_vtx;
-    if (pr_lid >= VInfo->nvtcs_loc) {
-      compRcvd = TRUE;
-      xsub_src = xsub_rcvd; sub_src = sub_rcvd;
-      pr_lid -= VInfo->nvtcs_loc;
-      k = xsub_src[pr_lid] + RCVD_IND;
-    }
-    else {
-      compRcvd = FALSE;
-      xsub_src = xsub; sub_src = sub;
-      k = xsub_src[pr_lid];
-    }
-
-    PS->nops += xsub_src[pr_lid+1] - xsub_src[pr_lid];
-    for (; k < xsub_src[pr_lid+1]; k++) {
-      pr_elt = sub_src[k];
-      if (pr_elt >= vtx && marker[pr_elt] != mark_vtx) {
-
-	/* TEST available memory */
-	if (next >= x_aind_end) {	
-	  if (domain_symb) {
-	    if (mem_error =
-		psymbfact_LUXpandMem (iam, n, vtx, next, 0,
-				      computeL, DOMAIN_SYMB, 1, 
-				      Pslu_freeable, Llu_symbfact, VInfo, PS))
-	      return (mem_error);
-	  } else if (mem_error =
-		     psymbfact_LUXpand (iam, n, EMPTY, vtx, &next, 0, 
-					computeL, LL_SYMB, 1, 
-					Pslu_freeable, Llu_symbfact, VInfo, PS))
-	    return (mem_error);
-
-	  x_aind_end = xsub[vtx_lid + 1];
-	  if (computeL)   sub = Llu_symbfact->lsub; 
-	  else   sub = Llu_symbfact->usub; 
-	  if (!compRcvd) 
-	    sub_src = sub;	  
-	}
-
-	sub[next] = pr_elt; next ++;
-
-	if (pr_elt < lstVtx) neltsVtx_CSep ++;
-	if (computeL && pr_elt == vtx)
-	  diagind = TRUE;
-	if (!computeL)
-	  if (marker[pr_elt] == mark_vtx - 2)
-	    if (pr_elt < prval_curvtx)
-	      prval_curvtx = pr_elt;
-	marker[pr_elt] = mark_vtx;	
-      }
-    }
-  }
-
-  /* Abort if the diagonal element is zero */
-  if (computeL && diagind == FALSE) {
-    printf("Pe[%d] At column %d, ", iam, vtx);
-    ABORT("ParSymbFact() encounters zero diagonal");
-  } 
-
-  neltsVtx = next - xsub[vtx_lid];
-  neltsZrVtx = 0; /* number of zero elements which would
-		     be introduced in the vertex */
-  neltsZrSn = 0; /* -"- in the supernode */
-  neltsMatched = 0; 
-  if (vtx != fstVtx) {
-    for (k = xsub[snrep_lid]; k < xsub[snrep_lid+1]; k++) {
-      vtx_elt = sub[k];
-      if (vtx_elt >= vtx) {
-	if ((vtx_elt > vtx && !computeL) || 
-	    (vtx_elt >= vtx && computeL)) {
-	  if (marker[vtx_elt] != mark_vtx)
-	    neltsZrVtx ++;
-	  else {
-	    neltsMatched ++;
-	  }
-	}
-	if (computeL && vtx_elt == vtx)
-	  *p_vtx_bel_mySn = vtx;
-	if (!computeL && vtx_elt == vtx + 1)
-	  *p_vtx_bel_mySn = vtx + 1;
-      }
-    }
-  }
-  else {
-    neltsMatched = neltsVtx;
-    if (! computeL) 
-      for (k = xsub[vtx_lid]; k < next; k++) {
-	vtx_elt = sub[k];
-	if (vtx_elt == vtx + 1)
-	  *p_vtx_bel_mySn = vtx + 1;
-      }
-  }
-
-  *p_neltsVtxInit  = neltsVtxInit;
-  *p_neltsVtx      = neltsVtx;
-  *p_neltsVtx_CSep = neltsVtx_CSep;
-  *p_neltsZrVtx    = neltsZrVtx;
-  *p_neltsMatched  = neltsMatched;
-  *p_next          = next;
-  *p_prval_curvtx  = prval_curvtx;
-  return SUCCES_RET;
-}
-
-static int_t
-updateRcvd_prGraph
-(
- int_t n,         /* Input - order of the matrix */
- int   iam,       /* Input - my processor number */
- int_t *sub_rcvd,      /* elements of node */
- int_t sub_rcvd_sz,   /* Input - size of sub to be used in the update */
- int_t fstVtx_toUpd,  /* Input - first vertex to update */
- int_t lstVtx_toUpd,  /* Input - last vertex to update */
- int_t pr_offset,
- int   computeL,
- int_t *marker,
- Pslu_freeable_t *Pslu_freeable,
- Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
- vtcsInfo_symbfact_t *VInfo,   /* Input - local info on vertices distribution */
- psymbfact_stat_t *PS
- /*  marker: first elements of marker contain the nodes that will
-     be used in the updates */
-)
-{
-  int_t i, k, nelts, prVal, vtx_elt, vtx_elt_lid, ind;
-  int_t vtx, vtx_lid, fstVtx_toUpd_lid, fstVtx_srcUpd_lid;
-  int_t *xsub, *sub, *xsub_rcvd, *xsubPr, *subPr, szsubPr, *p_indsubPr;
-  int_t maxNvtcsPProc, *globToLoc, mem_error;
-  int_t nvtcs_toUpd, fstVtx_srcUpd, vtx_lid_p;
-  
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  globToLoc     = Pslu_freeable->globToLoc;
-  fstVtx_toUpd_lid = LOCAL_IND( globToLoc[fstVtx_toUpd] );
-  nvtcs_toUpd = lstVtx_toUpd - fstVtx_toUpd;
-  
-  if (computeL) {
-    xsub = Llu_symbfact->xlsub; sub = Llu_symbfact->lsub;
-    xsub_rcvd = Llu_symbfact->xlsub_rcvd;
-    xsubPr = Llu_symbfact->xlsubPr; subPr = Llu_symbfact->lsubPr;
-    p_indsubPr = &(Llu_symbfact->indLsubPr);
-    szsubPr = Llu_symbfact->szLsubPr;
-  }
-  else {
-    xsub = Llu_symbfact->xusub; sub = Llu_symbfact->usub;
-    xsub_rcvd = Llu_symbfact->xusub_rcvd;
-    xsubPr = Llu_symbfact->xusubPr; subPr = Llu_symbfact->usubPr;
-    p_indsubPr = &(Llu_symbfact->indUsubPr);
-    szsubPr = Llu_symbfact->szUsubPr;
-  }
-  
-  /* count number of elements in transpose representation of sub_rcvd */
-  /* use marker to count those elements */
-  for (i = 0; i < nvtcs_toUpd; i++)
-    marker[i] = 0;
-  for (i = 0; i <= VInfo->maxSzBlk; i++)
-    xsub_rcvd[i] = 0;
-  
-  i = 0;
-  fstVtx_srcUpd = EMPTY;
-  while (i < sub_rcvd_sz) {
-    vtx   = sub_rcvd[i + DIAG_IND];
-    nelts = sub_rcvd[i + NELTS_IND];
-    i += RCVD_IND;
-    prVal = sub_rcvd[i];
-    if (fstVtx_srcUpd == EMPTY) fstVtx_srcUpd = vtx;
-    xsub_rcvd[vtx - fstVtx_srcUpd] = i - RCVD_IND;
-    xsub_rcvd[vtx-fstVtx_srcUpd+1] = i + nelts;
-    for (k = i; k < i + nelts; k++) {
-      vtx_elt = sub_rcvd[k];
-      if (vtx_elt > prVal)
-	k = i + nelts;
-      else {
-	if (OWNER( globToLoc[vtx_elt] ) == iam) {
-	  if (vtx_elt >= fstVtx_toUpd && vtx_elt < lstVtx_toUpd) {
-	    vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] ) - 
-	      fstVtx_toUpd_lid;
-	    marker[vtx_elt_lid] ++;
-	  }
-	}
-      }
-    }
-    i += nelts;
-  }
-
-  vtx_lid = fstVtx_toUpd_lid - pr_offset;
-  ind = 0;
-  for (i = 0; i < nvtcs_toUpd; i++) {
-    if (marker[i] != 0) {
-      xsubPr[vtx_lid] = ind + 1;
-      ind += 2* marker[i];
-      marker[i] = xsubPr[vtx_lid] - 1;
-    }
-    vtx_lid ++;
-  }
-  
-  if (ind == 0) 
-    /* quick return if no update */
-    return;
-
-  /* test if enough memory in usubPr array */
-  if (ind >= szsubPr) {
-    if (mem_error = 
-	psymbfact_prLUXpand (iam, ind, computeL, Llu_symbfact, PS))
-      return (mem_error);
-    if (computeL) 
-      subPr = Llu_symbfact->lsubPr;  
-    else 
-      subPr = Llu_symbfact->usubPr;
-  }
-  *p_indsubPr = ind;
-  
-  i = 0;
-  while (i < sub_rcvd_sz) {
-    vtx   = sub_rcvd[i + DIAG_IND];
-    nelts = sub_rcvd[i + NELTS_IND];
-    i += RCVD_IND;
-    prVal = sub_rcvd[i];
-    for (k = i; k < i + nelts; k++) {
-      vtx_elt = sub_rcvd[k];
-      if (vtx_elt > prVal)
-	k = i + nelts;
-      else {
-	if (OWNER( globToLoc[vtx_elt] ) == iam) {
-	  if (vtx_elt >= fstVtx_toUpd && vtx_elt < lstVtx_toUpd) {
-	    vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] );
-	    vtx_lid_p = vtx_elt_lid - pr_offset;
-	    vtx_elt_lid -= fstVtx_toUpd_lid;
-	    /* add vtx to structure of pruned graph */
-	    if (marker[vtx_elt_lid] != xsubPr[vtx_lid_p] - 1) 
-	      subPr[marker[vtx_elt_lid] - 2] = marker[vtx_elt_lid] + 1;
-	    subPr[marker[vtx_elt_lid] + 1] = vtx - fstVtx_srcUpd + VInfo->nvtcs_loc;
-	    subPr[marker[vtx_elt_lid]] = EMPTY;
-	    marker[vtx_elt_lid] += 2;
-	  }
-	}
-      }
-    }
-    i += nelts;
-  }  
-  
-  for (i = fstVtx_toUpd; i < nvtcs_toUpd; i++)
-    marker[i] = 0;
-}
-
-static int_t
-update_prGraph 
-(
- int   iam, 
- int_t n,           /* order of the matrix */
- int_t fstVtx_blk,  /* first vertex in block to factorize */
- int_t lstVtx_blk,  /* last vertex in block to factorize */
- int_t snrep_lid,   /* local index of current supernode reprezentative */
- int_t pr_offset,   /* offset in the indexing of prune structure */
- int_t prval_cursn, /* prune value of current supernode reprezentative */
- int_t xsub_snp1,   /* denotes xsub[snrep_lid + 1] */
- int   computeL,    /* Input - if 1, compute column L(:,vtx)
-		               else compute row U(vtx, :) */
- Pslu_freeable_t *Pslu_freeable,
- Llu_symbfact_t *Llu_symbfact,   /* Input/Output - local L, U data structures */
- psymbfact_stat_t *PS
- )
-{
-  int_t k, mem_error;
-  int_t kmin, kmax, ktemp, maxElt;
-  int_t sn_elt, sn_elt_prid;
-  int_t *globToLoc, maxNvtcsPProc;
-  int_t *xsub, *sub, *xsubPr, *subPr;
-  int_t *p_indsubPr, szsubPr;
-  
-  globToLoc     = Pslu_freeable->globToLoc;
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-
-  if (computeL) {
-    xsub = Llu_symbfact->xlsub; sub = Llu_symbfact->lsub;
-    xsubPr = Llu_symbfact->xlsubPr; subPr = Llu_symbfact->lsubPr;
-    p_indsubPr = &(Llu_symbfact->indLsubPr);
-    szsubPr = Llu_symbfact->szLsubPr;
-  }
-  else {
-    xsub = Llu_symbfact->xusub; sub = Llu_symbfact->usub;
-    xsubPr = Llu_symbfact->xusubPr; subPr = Llu_symbfact->usubPr;
-    p_indsubPr = &(Llu_symbfact->indUsubPr);
-    szsubPr = Llu_symbfact->szUsubPr;
-  }
-  
-  kmin = xsub[snrep_lid];
-  kmax = xsub_snp1 - 1;
-  if (prval_cursn != n)
-    maxElt = prval_cursn;
-  else
-    maxElt = EMPTY;
-  while (kmin <= kmax) {
-    if (prval_cursn == n) {
-      /* compute maximum element of L(:, vtx) */
-      if (sub[kmin] > maxElt)
-	maxElt = sub[kmin];
-      kmin ++;
-    }
-    else {
-      /* Do a quicksort-type partition. */    
-      if (sub[kmax] > prval_cursn) 
-	kmax--;
-      else if (sub[kmin] <= prval_cursn)
-	kmin++;
-      else { /* kmin does'nt belong to G^s(L), and kmax belongs: 
-	      * 	   interchange the two subscripts
-	      */
-	ktemp = sub[kmin];
-	sub[kmin] = sub[kmax];
-	sub[kmax] = ktemp;
-	kmin ++;
-	kmax --;
-      }
-    }
-  }
-  k = xsub[snrep_lid];
-  while (sub[k] <= prval_cursn && k < xsub_snp1) {
-    sn_elt = sub[k];
-    if (sn_elt < lstVtx_blk) {
-      sn_elt_prid = LOCAL_IND( globToLoc[sn_elt] ) - pr_offset;
-      if ((*p_indsubPr) + 2 >= szsubPr) {
-	if (mem_error = 
-	    psymbfact_prLUXpand (iam, 0, computeL, Llu_symbfact, PS))
-	  return (mem_error);
-	if (computeL) {
-	  subPr = Llu_symbfact->lsubPr;  szsubPr = Llu_symbfact->szLsubPr;
-	}
-	else {
-	  subPr = Llu_symbfact->usubPr;  szsubPr = Llu_symbfact->szUsubPr;
-	}
-      }
-      /* add krow to structure of pruned graph */
-      subPr[(*p_indsubPr) + 1] = snrep_lid;
-      subPr[(*p_indsubPr)] = xsubPr[sn_elt_prid];
-      xsubPr[sn_elt_prid] = (*p_indsubPr) + 1;
-      (*p_indsubPr) += 2;
-    }
-    if (sn_elt == maxElt) {
-      /* move prune val in the first position */
-      sub[k] = sub[xsub[snrep_lid]];
-      sub[xsub[snrep_lid]] = sn_elt;
-    }
-    k ++; 
-  }
-  return SUCCES_RET;
-}
-
-static int_t
-blk_symbfact
-(SuperMatrix *A,
- int   iam,
- int   lvl, 
- int   szSep,
- int   ind_sizes1,
- int   ind_sizes2, 
- int_t *sizes,     /* Input - sizes of each node in the separator tree */
- int_t *fstVtxSep, /* Input - first vertex of each node in the tree */
- int_t fstVtx_loc, /* Input - first vertex local of the level */
- int_t fstVtx_blk,
- int_t lstVtx_blk,
- int_t *lsub_rcvd,      /* elements of node */
- int_t lsub_rcvd_sz,    /* size of sub to be explored */
- int_t *usub_rcvd,  
- int_t usub_rcvd_sz,
- Pslu_freeable_t *Pslu_freeable,   /* global LU data structures (modified) */
- Llu_symbfact_t *Llu_symbfact,   /* Input/Output - local L, U data structures */  
- vtcsInfo_symbfact_t *VInfo,  /* Input/Output - local info on vertices distribution */
- comm_symbfact_t *CS,
- psymbfact_stat_t *PS,
- int_t *marker,
- int_t *p_mark,    /* marker used to merge elements of vertices */
- int_t *p_nextl,   /* ptr to nextl in lsub structure */
- int_t *p_nextu,   /* ptr to nextu in usub structure */
- int_t *p_neltsZr, /* no of artificial zeros introduced so far */
- int_t *p_neltsTotal, /* no of nonzeros (including artificials) 
-			 computed so far */
- int_t *p_nsuper_loc
- )
-{
-  int szSep_tmp, lvl_tmp, ii, jj;
-  int_t  *xlsubPr, *xusubPr; 
-  int_t  *xsup, *supno, *lsub, *xlsub, *usub, *xusub;
-  int_t  vtx_lid, vtx_prid, vtx, vtx_super, vtx_elt, maxNvtcsPProc;
-  int_t  ind, pr, pr_elt, newnext, k, vtx_elt_lid;
-  int_t  nextl, nextu, nsuper_loc, nvtcs, n, mem_error;
-  int_t  x_aind_beg, x_aind_end, i, szLp, xlsub_snp1, xusub_snp1;
-  int_t  snrep, snrep_lid, szsn, vtxp1, *globToLoc, domain_symb;
-  int_t lstVtx, neltsCurSep, maxNeltsVtx, fstVtx_loc_lid;
-  /* supernode relaxation parameters */
-  int_t  neltsVtx_L, neltsZrVtx_L, neltsMatched_L, neltsVtx_CSep_L;
-  int_t  neltsVtx_U, neltsZrVtx_U, neltsMatched_U, neltsVtx_CSep_U;
-  int_t  neltsZrSn_L, neltsZrSn_U, neltsZr, neltsTotal, 
-    neltsZr_tmp, neltsTotal_tmp, neltsZrSn, neltsVtxInit_l, neltsVtxInit_u;
-  /* next vertex belongs to current supernode pruned structure */
-  int_t  vtx_bel_snL, vtx_bel_snU;
-  /* marker variables */
-  int_t  markl1_vtx, markl2_vtx, marku1_vtx, marku2_vtx;
-  /* prune structure variables */
-  int_t prval_cursn, prval_curvtx, pr_offset;
-  /* variables for comms info */
-  int_t neltSn_L, neltSn_U, lstVtx_tmp, stat;
-  float relax_param, relax_seps;
-
-  if (fstVtx_blk >= lstVtx_blk)
-    return;
-  
-  /* Initializations */
-  supno   = Pslu_freeable->supno_loc;
-  lsub    = Llu_symbfact->lsub;   xlsub    = Llu_symbfact->xlsub;
-  usub    = Llu_symbfact->usub;   xusub    = Llu_symbfact->xusub;
-  xusubPr  = Llu_symbfact->xusubPr; 
-  xlsubPr  = Llu_symbfact->xlsubPr;   
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  globToLoc     = Pslu_freeable->globToLoc;
-  maxNeltsVtx   = VInfo->maxNeltsVtx;
-  
-  n          = A->ncol;
-  nextl      = *p_nextl;
-  nextu      = *p_nextu;
-  neltsZr    = *p_neltsZr;
-  neltsTotal = *p_neltsTotal;
-  nsuper_loc = *p_nsuper_loc;
-  marku2_vtx = *p_mark;
-  lstVtx     = fstVtxSep[ind_sizes2] + sizes[ind_sizes2];
-
-  snrep = fstVtx_blk; 
-  snrep_lid = LOCAL_IND( globToLoc[fstVtx_blk] );
-  szsn = 1;
-  nvtcs = lstVtx_blk - fstVtx_blk;
-  prval_cursn = n;
-  vtx_bel_snL = EMPTY; vtx_bel_snU = EMPTY;
-  
-  /* set up to EMPTY xlsubPr[], xusubPr[] */
-  if (PS->maxSzLPr < Llu_symbfact->indLsubPr)
-    PS->maxSzLPr = Llu_symbfact->indLsubPr;
-  if (PS->maxSzUPr < Llu_symbfact->indUsubPr)
-    PS->maxSzUPr = Llu_symbfact->indUsubPr;
-  for (i = 0; i < nvtcs; i++) {
-    xlsubPr[i] = EMPTY;
-    xusubPr[i] = EMPTY;
-  }
-  Llu_symbfact->indLsubPr = 0;
-  Llu_symbfact->indUsubPr = 0;
-
-  if (ind_sizes1 == 0) 
-    domain_symb = TRUE;
-  else {
-    domain_symb = FALSE;
-    fstVtx_loc_lid = LOCAL_IND( globToLoc[fstVtx_loc] );
-  }
-  
-  vtx_prid = 0;
-  vtx_lid = LOCAL_IND( globToLoc[fstVtx_blk] );
-  pr_offset = vtx_lid;
-
-  if (lsub_rcvd != NULL) {
-    updateRcvd_prGraph (n, iam, lsub_rcvd, lsub_rcvd_sz,
-			fstVtx_blk, lstVtx_blk, pr_offset, 1, marker,
-			Pslu_freeable, Llu_symbfact, VInfo, PS);    
-    updateRcvd_prGraph (n, iam, usub_rcvd, usub_rcvd_sz,
-			fstVtx_blk, lstVtx_blk, pr_offset, 0, marker,
-			Pslu_freeable, Llu_symbfact, VInfo, PS);
-  }
-  
-  for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid ++, vtx_prid ++) {
-    vtxp1 = vtx + 1;
-    if (marku2_vtx +4 >= n) {
-      /* reset to EMPTY marker array */
-      for (i = 0; i < n; i++)
-	marker[i] = EMPTY;
-      marku2_vtx = EMPTY;
-    }
-    markl1_vtx = marku2_vtx + 1; markl2_vtx = markl1_vtx + 1;
-    marku1_vtx = markl2_vtx + 1; marku2_vtx = marku1_vtx + 1;    
-
-    prval_curvtx   = n;
-    /* Compute nonzero structure L(:,vtx) */
-    if (mem_error = 
-	symbfact_vtx (n, iam, vtx, vtx_lid, vtx_prid, 1, domain_symb, 
-		      fstVtx_blk,  lstVtx,
-		      snrep_lid, szsn, &nextl,
-		      marker, 
-		      lsub_rcvd, lsub_rcvd_sz,
-		      Pslu_freeable, Llu_symbfact, VInfo, PS, &neltsVtxInit_l,
-		      &neltsVtx_L, &neltsVtx_CSep_L, &neltsZrVtx_L, 
-		      &neltsMatched_L, markl1_vtx, &prval_curvtx, 
-		      vtx_bel_snU, &vtx_bel_snL))
-      return (mem_error);
-    lsub = Llu_symbfact->lsub;
-
-#ifdef TEST_SYMB
-    PrintInt10 ("L(:, %d)", nextl - xlsub[vtx_lid], &(lsub[xlsub[vtx_lid]]));
-#endif
-    
-    /* Compute nonzero structure of U(vtx,:) */
-    if (mem_error = 
-	symbfact_vtx (n, iam, vtx, vtx_lid, vtx_prid, 0, domain_symb, 
-		      fstVtx_blk, lstVtx,
-		      snrep_lid, szsn, &nextu,
-		      marker, 
-		      usub_rcvd, usub_rcvd_sz,
-		      Pslu_freeable, Llu_symbfact, VInfo, PS, &neltsVtxInit_u,
-		      &neltsVtx_U, &neltsVtx_CSep_U, &neltsZrVtx_U, 
-		      &neltsMatched_U, marku1_vtx, &prval_curvtx,
-		      vtx_bel_snL, &vtx_bel_snU))
-      return (mem_error);
-    usub = Llu_symbfact->usub;
-
-#ifdef TEST_SYMB
-    PrintInt10 ("U(%d, :)", nextu - xusub[vtx_lid], &(usub[xusub[vtx_lid]])); 
-#endif
-    
-    /* update statistics on fill-in */
-    if (!domain_symb) {
-      stat = CEILING( (neltsVtxInit_l + neltsVtxInit_u), 2);
-      if (Llu_symbfact->cntelt_vtcsA_lvl[vtx_lid - fstVtx_loc_lid] != stat) {
-	stat = CEILING(stat, Llu_symbfact->cntelt_vtcsA_lvl[vtx_lid - fstVtx_loc_lid]);
-	PS->fill_pelt[0] += (float) stat;
-	if ((float) stat > PS->fill_pelt[1]) PS->fill_pelt[1] = (float) stat;
-	PS->fill_pelt[2] += 1.;
-      }
-      stat = CEILING( (neltsVtx_L + neltsVtx_U), 2);
-      stat = CEILING( stat, Llu_symbfact->cntelt_vtcsA_lvl[vtx_lid - fstVtx_loc_lid] );
-      PS->fill_pelt[3] += (float) stat;
-      if ((float) stat > PS->fill_pelt[4]) PS->fill_pelt[4] = (float) stat;
-      PS->fill_pelt[5] += 1.;
-    }     
-
-    /* compute number of artificial zeros */
-    neltsTotal = 0;
-    neltsZr = 0;
-    neltsZrSn_L    = neltsVtx_L - neltsMatched_L;
-    neltsZrSn_U    = neltsVtx_U - neltsMatched_U;
-    neltsZrSn      = neltsZrVtx_L + neltsZrVtx_U +
-      (neltsZrSn_L + neltsZrSn_U) * szsn;
-    neltsZr_tmp    = neltsZr + neltsZrSn;
-    neltsTotal_tmp = neltsTotal + neltsZrSn + neltsVtx_L + neltsVtx_U;
-    if (neltsTotal_tmp == 0)
-      neltsTotal_tmp = 1;
-    relax_param = (float) (neltsTotal_tmp - neltsZr_tmp) / neltsTotal_tmp;
-
-#ifdef TEST_SYMB
-    printf ("[%d] vtx %d pr %d szsn %d nVtx_L %d nZrSn_L %d nZrVtx_L %d\n",
-	    iam, vtx, prval_curvtx, szsn,neltsVtx_L, neltsZrSn_L, neltsZrVtx_L);
-    printf ("  [%d] nVtx_U %d, nZrSn_U %d nZrVtx_U %d nextl %d nextu %d\n",
-	    iam, neltsVtx_U, neltsZrSn_U, neltsZrVtx_U, nextl, nextu);
-    printf ("  [%d] nZr %d nZr_tmp %d nTot %d nTot_tmp %d rel %f test %d\n\n", 
-	    iam, neltsZr, neltsZr_tmp, neltsTotal, neltsTotal_tmp,
-	    relax_param, i);
-#endif
-
-    /* Check to see if vtx belongs in the same supernode as vtx-1 */
-    supno[vtx_lid] = nsuper_loc;
-    if (vtx == fstVtx_blk) {
-      prval_cursn = prval_curvtx;
-      neltsTotal += neltsVtx_L + neltsVtx_U;
-    }
-    else {
-      if (maxNeltsVtx > 0) {
-	relax_seps = (float) neltsVtx_L / (float) maxNeltsVtx;
-	relax_seps *= (float) (neltsVtx_U+1) / (float) maxNeltsVtx;
-      } 
-      else
-	relax_seps = 0.0;
-
-      /* check if all upper separators are dense */
-      if (relax_seps >= PS->relax_seps ) {
-	VInfo->filledSep = FILLED_SEPS; 
-	*p_nextl      = xlsub[vtx_lid];
-	*p_nextu      = xusub[vtx_lid];
-	nsuper_loc   += 1;
-	*p_nsuper_loc = nsuper_loc;
-	if (mem_error =
-	    dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, 
-				sizes, fstVtxSep, vtx,
-				Llu_symbfact, Pslu_freeable, VInfo, CS, PS,
-				p_nextl, p_nextu, p_nsuper_loc))
-	  return (mem_error);
-	/* set up neltsZr and neltsTotal */
-	vtx = lstVtx_blk;
-	return 0;
-      } /* if all upper separators are dense */
-      else {
-	if (relax_param >= PS->relax_gen) {
-	  /* vertex belongs to the same supernode */
-	  if (prval_cursn > prval_curvtx || prval_cursn <= vtx)
-	    prval_cursn = prval_curvtx;
-	  neltsZr    = neltsZr_tmp;
-	  neltsTotal = neltsTotal_tmp;
-	  szsn ++;
-	  /* add artificial zeros at the structure of current supernode */
-	  newnext = xlsub[snrep_lid+1];
-	  if (neltsZrSn_L != 0) {
-	    for (k = xlsub[snrep_lid]; k < xlsub[snrep_lid+1]; k++) {
-	      vtx_elt = lsub[k];
-	      if (vtx_elt >= vtx) 
-		marker[vtx_elt] = markl2_vtx;
-	    }
-	    for (k = xlsub[vtx_lid]; k < nextl; k++) {
-	      vtx_elt = lsub[k];
-	      if (marker[vtx_elt] != markl2_vtx) {
-		/* add vtx_elt to the structure of snrep */
-		lsub[newnext] = vtx_elt; newnext ++;
-		marker[vtx_elt] = markl2_vtx;
-	      }
-	    }
-	    xlsub[snrep_lid+1] = newnext;
-	  }
-	  xlsub[vtx_lid] = newnext;
-	  nextl = newnext;
-	  neltsVtx_L += neltsZrVtx_L;
-	  
-	  newnext = xusub[snrep_lid+1];
-	  if (neltsZrSn_U != 0) {
-	    for (k = xusub[snrep_lid]; k < xusub[snrep_lid+1]; k++) {
-	      vtx_elt = usub[k];
-	      if (vtx_elt >= vtx) {
-		if (marker[vtx_elt] == markl2_vtx)
-		  if (prval_cursn > vtx_elt && vtx_elt != vtx)
-		    prval_cursn = vtx_elt;
-		marker[vtx_elt] = marku2_vtx;
-	      }
-	    }
-	    for (k = xusub[vtx_lid]; k < nextu; k++) {
-	      vtx_elt = usub[k];
-	      if (marker[vtx_elt] != marku2_vtx) {
-		/* add vtx_elt to the structure of snrep */
-		usub[newnext] = vtx_elt; newnext ++;
-		if (marker[vtx_elt] == markl2_vtx)
-		  if (prval_cursn > vtx_elt && vtx_elt != vtx)
-		    prval_cursn = vtx_elt;
-		marker[vtx_elt] = marku2_vtx;
-	      }
-	    }
-	    if (marker[vtxp1] == marku2_vtx)
-	      vtx_bel_snU = vtxp1;
-	    xusub[snrep_lid+1] = newnext;
-	  }
-	  xusub[vtx_lid] = newnext;
-	  nextu = newnext;
-	  neltsVtx_U += neltsZrVtx_U;
-	}  /* if ( relax_param >= PS->relax_param) */
-      }  /* if (VInfo->filledSep != FILLED_SEPS) */
-    } /* if (vtx != fstVtx_blk) */
-
-    if ((relax_param < PS->relax_gen || vtx == lstVtx_blk-1) 
-	&& VInfo->filledSep != FILLED_SEPS) {
-      /* if a new supernode starts or is the last vertex */
-      /* vtx starts a new supernode. Note we only store the
-       * subscript set of the first column of a supernode.  */
-      
-      if (marker[vtxp1] == marku1_vtx)
-	vtx_bel_snU = vtxp1;
-      /* build the pruned structure */
-      if (relax_param < PS->relax_gen
-	  && vtx == lstVtx_blk - 1 && vtx != fstVtx_blk) 
-	szLp = 2;
-      else
-	szLp = 1;
-      if (vtx == fstVtx_blk) {
-	xlsub_snp1 = nextl;
-	xusub_snp1 = nextu;
-      }
-      else {
-	xlsub_snp1 = xlsub[snrep_lid+1];
-	xusub_snp1 = xusub[snrep_lid+1];	
-      }
-      while (szLp > 0) {
-	szLp --;
-#ifdef TEST_SYMB
-	printf ("End sn %d szsn %d\n", nsuper_loc, szsn);
-	printf ("BLD pr vtx %d snrep %d prval %d szLp %d\n",
-		vtx, snrep, prval_cursn, szLp);
-#endif
-	
-	update_prGraph (iam, n, fstVtx_blk, lstVtx_blk,
-			snrep_lid, pr_offset, prval_cursn,
-			xlsub_snp1, 1,
-			Pslu_freeable, Llu_symbfact, PS);
-	update_prGraph (iam, n, fstVtx_blk, lstVtx_blk,
-			snrep_lid, pr_offset, prval_cursn,
-			xusub_snp1, 0,
-			Pslu_freeable, Llu_symbfact, PS);
-
-#ifdef TEST_SYMB
-	printf ("Adr lsub %p usub %p lsub %p pos %d usub %p pos %d\n", 
-		&(lsub[xlsub[snrep_lid]]), &(usub[xusub[snrep_lid]]),
-		lsub, xlsub[snrep_lid], usub, xusub[snrep_lid]);
-	PrintInt10 ("Lsn", xlsub_snp1 - xlsub[snrep_lid],
-		    &(lsub[xlsub[snrep_lid]]));
-	PrintInt10 ("Usn", xusub_snp1 - xusub[snrep_lid],
-		    &(usub[xusub[snrep_lid]]));
-#endif
-
-	if (prval_cursn >= lstVtx_blk) {
-	  neltSn_L = xlsub_snp1 - xlsub[snrep_lid];
-	  neltSn_U = xusub_snp1 - xusub[snrep_lid];
-	  if (ind_sizes1 != 0) {
-	    CS->snd_intraSz += neltSn_L + neltSn_U + 4;
-	    CS->snd_LintraSz += neltSn_L + 2;
-	  }
-	  if (prval_cursn >= lstVtx) {
-	    /* this supernode will be send to next layers of the tree */
-	    lvl_tmp = lvl;
-	    ii = ind_sizes1;
-	    jj = ind_sizes2;
-	    szSep_tmp = szSep;
-	    lstVtx_tmp = lstVtx;
-	    while (prval_cursn >= lstVtx_tmp && szSep_tmp != 1) {
-	      jj = ii + szSep_tmp + (jj - ii) / 2;
-	      ii += szSep_tmp;
-	      lvl_tmp ++;
-	      szSep_tmp = szSep_tmp / 2;
-	      lstVtx_tmp = fstVtxSep[jj] + sizes[jj];
-	      CS->snd_interSz[lvl_tmp] += neltSn_L + neltSn_U + 4;
-	      CS->snd_LinterSz[lvl_tmp] += neltSn_L + 2;
-	      if (CS->snd_vtxinter[lvl_tmp] == EMPTY)
-		CS->snd_vtxinter[lvl_tmp] = snrep;
-	    }
-	  }
-	}
-	snrep = vtx;
-	snrep_lid = vtx_lid;
-	prval_cursn = prval_curvtx;
-	szsn        = 1;
-	xlsub_snp1  = nextl;
-	xusub_snp1  = nextu;
-      }
-      if (relax_param < PS->relax_gen) {
-	neltsTotal += neltsVtx_L + neltsVtx_U;
-	nsuper_loc ++;	
-	supno[vtx_lid] = nsuper_loc;
-	if (marker[vtxp1] == marku1_vtx)
-	  vtx_bel_snU = vtxp1;
-	else
-	  vtx_bel_snU = EMPTY;
-      }
-    }
-    if (vtx == lstVtx_blk - 1)
-      nsuper_loc ++;
-    
-    /* check if current separator is dense */
-    if (!VInfo->filledSep) {
-      relax_seps = (float) neltsVtx_CSep_L / (float) (lstVtx - vtx);
-      relax_seps *= (float) (neltsVtx_CSep_U+1) / (float) (lstVtx - vtx);
-      if (relax_seps >= PS->relax_curSep ) 
-	VInfo->filledSep = FILLED_SEP;
-    }
-    maxNeltsVtx --;
-  }
-    
-  *p_mark       = marku2_vtx + 1;
-  *p_nextl      = nextl;
-  *p_nextu      = nextu;
-  *p_neltsZr    = neltsZr;
-  *p_neltsTotal = neltsTotal;
-  *p_nsuper_loc = nsuper_loc;
-
-  return 0;
-}
-
-static void
-domain_symbfact
-(SuperMatrix *A,
- int   iam,        /* Input - my processor number */  
- int   lvl,        /* Input - current level in the separator tree */
- int   szSep,      /* Input - size of the current separator (node) */
- int   ind_sizes1,
- int   ind_sizes2, 
- int_t *sizes,     /* Input - sizes of each node in the separator tree */
- int_t *fstVtxSep, /* Input - first vertex of each node in the tree */
- int_t fstVtx,     /* Input - first vertex of current node */ 
- int_t lstVtx,     /* Input - last vertex of current node */ 
- Pslu_freeable_t *Pslu_freeable,   /* global LU data structures (modified) */
- Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
- vtcsInfo_symbfact_t *VInfo,  /* Input/Output - local info on vertices distribution */
- comm_symbfact_t *CS,
- psymbfact_stat_t *PS,
- int_t *marker,
- int_t *p_mark,    /* marker used to merge elements of vertices */
- int_t *p_nextl,   /* ptr to nextl in lsub structure */
- int_t *p_nextu,   /* ptr to nextu in usub structure */
- int_t *p_neltsZr, /* no of artificial zeros introduced so far */
- int_t *p_neltsTotal, /* no of nonzeros (including artificials) 
-			 computed so far */
- int_t *p_nsuper_loc
- )
-{
-  int_t lstVtx_lid, maxNvtcsPProc; 
-
-  /* call blk_symbfact */
-  blk_symbfact (A, iam, lvl, 
-		szSep, ind_sizes1, ind_sizes2, sizes, fstVtxSep,
-		EMPTY, fstVtx, lstVtx, 
-		NULL, EMPTY, NULL, EMPTY,
-		Pslu_freeable, Llu_symbfact, VInfo, CS, PS,
-		marker, p_mark,
-		p_nextl, p_nextu, p_neltsZr, p_neltsTotal, 
-		p_nsuper_loc);
-
-  if (VInfo->filledSep != FILLED_SEPS) {
-    maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-    if (fstVtx >= lstVtx)
-      lstVtx_lid = 0;
-    else 
-      lstVtx_lid = LOCAL_IND( Pslu_freeable->globToLoc[lstVtx-1] ) + 1;
-    VInfo->xlsub_nextLvl  = Llu_symbfact->xlsub[lstVtx_lid];
-    Llu_symbfact->xlsub[lstVtx_lid] = *p_nextl;
-    VInfo->xusub_nextLvl  = Llu_symbfact->xusub[lstVtx_lid];
-    Llu_symbfact->xusub[lstVtx_lid] = *p_nextu;
-  }
-  VInfo->maxNeltsVtx -= lstVtx - fstVtx;
-}
-
-/*
- * Compute counts of rows/columns of current separator.
- * cntelt_vtcs[i] is 0 when i is nonzero before current separator
- * and n when i is zero before current separator.
- *
- * Set up nvtcsLvl_loc.
- */
-static void
-initLvl_symbfact
-(
- int_t n,       /* Input - order of the matrix */
- int   iam,     /* Input - my processor number */
- int_t fstVtx,  /* Input - first vertex of current node */   
- int_t lstVtx,  /* Input - last vertex of current node */   
- Pslu_freeable_t *Pslu_freeable,
- Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */
- vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */
- psymbfact_stat_t *PS,
- MPI_Comm ndComm,
- int_t  *marker,
- int_t  nextl,
- int_t  nextu
- ) 
-{
-  int_t *cntelt_vtcs, x_aind_beg, x_aind_end, x_aind_beg_l, x_aind_beg_u,
-    nelts_asup, nelts_ainf;
-  int_t nvtcsLvl_loc, fstVtx_loc, fstVtx_loc_lid, fstVtx_nextLvl;
-  int_t curblk_loc, nblks_loc, ind_blk;
-  int_t *lsub, *xlsub, *usub, *xusub;
-  int_t *begEndBlks_loc, code_err, mem_error;
-  int_t i, j, k, vtx, vtx_lid, fstVtx_blk, lstVtx_blk, vtx_elt, p, fill;
-  int_t nelts, nelts_fill_l, nelts_fill_u, nelts_cnts, maxNvtcsPProc, *globToLoc;
-  int_t use_fillcnts, cntelt_vtx_l, cntelt_vtx_u;
-  MPI_Status status;
-  
-  fill = PS->fill_par;
-  VInfo->filledSep = FALSE;
-  
-  /* Initializations */
-  maxNvtcsPProc  = Pslu_freeable->maxNvtcsPProc;
-  globToLoc      = Pslu_freeable->globToLoc;
-  curblk_loc     = VInfo->curblk_loc;
-  nblks_loc      = VInfo->nblks_loc;
-  begEndBlks_loc = VInfo->begEndBlks_loc;
-  cntelt_vtcs    = Llu_symbfact->cntelt_vtcs;
-  lsub    = Llu_symbfact->lsub;   xlsub    = Llu_symbfact->xlsub;
-  usub    = Llu_symbfact->usub;   xusub    = Llu_symbfact->xusub;
-  
-  /* compute nvtcsLvl_loc */
-  nvtcsLvl_loc = 0;  
-  ind_blk = curblk_loc;
-  while (fstVtx > begEndBlks_loc[ind_blk] && ind_blk < 2 * nblks_loc) {
-    ind_blk += 2;
-  }
-  curblk_loc = ind_blk;
-  fstVtx_loc = begEndBlks_loc[ind_blk];
-  while (begEndBlks_loc[ind_blk] < lstVtx && ind_blk < 2 * nblks_loc) {
-    nvtcsLvl_loc += begEndBlks_loc[ind_blk + 1] - 
-      begEndBlks_loc[ind_blk];
-    ind_blk += 2;
-  }
-  fstVtx_nextLvl = begEndBlks_loc[ind_blk];
-  VInfo->nvtcsLvl_loc = nvtcsLvl_loc;
-  VInfo->curblk_loc = curblk_loc;
-  
-  fstVtx_loc_lid = LOCAL_IND( globToLoc[fstVtx_loc] );
-  vtx_lid      = fstVtx_loc_lid;
-  x_aind_beg_l = VInfo->xlsub_nextLvl;
-  x_aind_beg_u = VInfo->xusub_nextLvl;
-  nelts_cnts   = 0;
-  nelts_fill_l = 0;
-  nelts_fill_u = 0;
-  ind_blk      = curblk_loc;
-  
-  while (begEndBlks_loc[ind_blk] < lstVtx && ind_blk < 2 * nblks_loc) {
-    fstVtx_blk = begEndBlks_loc[ind_blk];
-    lstVtx_blk = begEndBlks_loc[ind_blk + 1];
-    ind_blk += 2;
-    for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid ++) 
-      nelts_cnts += cntelt_vtcs[vtx_lid];      
-    nelts_fill_l += fill * (xlsub[vtx_lid] - x_aind_beg_l);
-    nelts_fill_u += fill * (xusub[vtx_lid] - x_aind_beg_u);
-    x_aind_beg_l = xlsub[vtx_lid];
-    x_aind_beg_u = xusub[vtx_lid];
-  }
-
-  if (nvtcsLvl_loc != 0) {
-    nelts_ainf = xlsub[vtx_lid] - VInfo->xlsub_nextLvl;
-    nelts_asup = xusub[vtx_lid] - VInfo->xusub_nextLvl;
-  }
-  else {
-    nelts_ainf = 0;
-    nelts_asup = 0;
-  }
-  
-  use_fillcnts = FALSE;
-  if (nextl + nelts_cnts >= Llu_symbfact->szLsub - nelts_ainf ||
-      nextu + nelts_cnts >= Llu_symbfact->szUsub - nelts_asup) { 
-    use_fillcnts = TRUE;
-  }
- 
-  use_fillcnts = TRUE; 
-  
-  if (use_fillcnts) {
-    if (nextl + nelts_fill_l >= Llu_symbfact->szLsub - nelts_ainf)
-      mem_error = 
-	psymbfact_LUXpandMem (iam, n, fstVtx, nextl,
-			      nextl + nelts_fill_l, LSUB,
-			      RL_SYMB, 1, 
-			      Pslu_freeable, Llu_symbfact, VInfo, PS);
-    lsub = Llu_symbfact->lsub;
-    if (nextu + nelts_fill_u >= Llu_symbfact->szUsub - nelts_asup) 
-      mem_error = 
-	psymbfact_LUXpandMem (iam, n, fstVtx, nextu,
-			      nextu + nelts_fill_u, USUB,
-			      RL_SYMB, 1, 
-			      Pslu_freeable, Llu_symbfact, VInfo, PS);      
-    usub = Llu_symbfact->usub;
-  }
-
-  /* init xlsub[fstVtx:lstVtx] and xusub[fstVtx:lstVtx] and
-     copy elements of A[fstVtx:lstVtx, fstVtx:lstVtx] in lsub and usub */
-  fstVtx_loc_lid = LOCAL_IND( globToLoc[fstVtx_loc] );
-  x_aind_beg_l = VInfo->xlsub_nextLvl;
-  x_aind_beg_u = VInfo->xusub_nextLvl;
-  vtx_lid = fstVtx_loc_lid;
-  ind_blk = curblk_loc;
-
-  while (begEndBlks_loc[ind_blk] < lstVtx && ind_blk < 2 * nblks_loc) {
-    fstVtx_blk = begEndBlks_loc[ind_blk];
-    lstVtx_blk = begEndBlks_loc[ind_blk + 1];
-    ind_blk += 2;
-
-    for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid ++) {
-      if (vtx_lid != fstVtx_loc_lid) {
-	x_aind_beg_l = xlsub[vtx_lid];
-	x_aind_beg_u = xusub[vtx_lid];
-      }
-      if (use_fillcnts) {
-	cntelt_vtx_l = fill * (xlsub[vtx_lid+1] - x_aind_beg_l);
-	cntelt_vtx_u = fill * (xusub[vtx_lid+1] - x_aind_beg_u);
-      }
-      else {
-	cntelt_vtx_l = cntelt_vtcs[vtx_lid];
-	cntelt_vtx_u = cntelt_vtcs[vtx_lid];
-      }
-      x_aind_end = xlsub[vtx_lid + 1];
-      Llu_symbfact->cntelt_vtcsA_lvl[vtx_lid - fstVtx_loc_lid] = 
-	CEILING( (xlsub[vtx_lid+1]-x_aind_beg_l + xusub[vtx_lid+1]-x_aind_beg_u), 2);
-      
-      xlsub[vtx_lid] = nextl;
-      nelts = 0;
-      for (k = x_aind_beg_l; k < x_aind_end; k++) {
-	lsub[nextl] = lsub[k]; nextl ++;
-	nelts ++;
-      }
-      if (nelts < cntelt_vtx_l) 
-	lsub[nextl] = EMPTY; 
-      nextl += cntelt_vtx_l - nelts;
-      x_aind_end = xusub[vtx_lid + 1];
-      xusub[vtx_lid] = nextu;
-      nelts = 0;
-      for (k = x_aind_beg_u; k < x_aind_end; k++) {
-	usub[nextu] = usub[k]; nextu ++;
-	nelts ++;
-      }
-      if (nelts < cntelt_vtx_u) 
-	usub[nextu] = EMPTY; 
-      nextu += cntelt_vtx_u - nelts;
-    }
-  }
- 
-  if (nvtcsLvl_loc == 0) {
-    if (curblk_loc == 0)
-      vtx_lid = 0;
-    else {
-      if (begEndBlks_loc[curblk_loc-1] == 0)
-	vtx_lid = 0;
-      else
-	vtx_lid = LOCAL_IND( globToLoc[begEndBlks_loc[curblk_loc-1] - 1] ) + 1;
-    }
-
-    xlsub[vtx_lid] = nextl;
-    xusub[vtx_lid] = nextu;
-  }
-  else {
-    VInfo->xlsub_nextLvl   = xlsub[vtx_lid];
-    xlsub[vtx_lid] = nextl;
-    VInfo->xusub_nextLvl   = xusub[vtx_lid];
-    xusub[vtx_lid] = nextu;
-    if (PS->estimLSz < nextl)
-      PS->estimLSz = nextl;
-    if (PS->estimUSz < nextu)
-      PS->estimUSz = nextu;
-    
-    VInfo->nnz_ainf_loc -= nelts_ainf;
-    VInfo->nnz_asup_loc -= nelts_asup;
-  }
-  VInfo->fstVtx_nextLvl = fstVtx_nextLvl;
-}
-
-
-static int_t
-expand_RL 
-(
- int_t computeRcvd, /* if = 1, then update from receive buffer,
-		       else update from own data */
- int_t n,
- int   iam,       /* process number */
- int_t *lsub_rcvd,      /* elements of node */
- int_t lsub_rcvd_sz,    /* size of sub to be explored */
- int_t *usub_rcvd,  
- int_t usub_rcvd_sz,
- int_t vtxXp,
- int_t vtx_upd_pr,    /* ind in pruned structure of upd vertex which 
-			 doesn't fit into the alloc memory */
- int_t lstVtx_upd_pr, /* ind in pruned structure of lst vtx to update */
- int_t fstVtx_srcUpd, /* first vertex source of the updates */
- int_t lstVtx_srcUpd, /* last vertex source of the updates */
- int_t fstVtx_toUpd,  /* first vertex to update */
- int_t lstVtx_toUpd,  /* last vertex to update */
- int_t nvtcs_toUpd,   /* no of vertices to update */
- int   computeL,
- int_t *pmarkl,
- int_t *marker,
- Pslu_freeable_t *Pslu_freeable,
- Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
- vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */
- psymbfact_stat_t *PS
- )
-{
-  int_t fstVtx_toUpd_lid, vtx_lid, vtx, vtx_elt, vtx_elt_lid, nextl, nelts_in;
-  int_t i, ii, j, nelts, nelts_vtx, mpnelts, lvtx_lid, elt, vtxXp_lid;
-  int_t *xusubPr, *usubPr, *xlsub, *lsub, *xusub, *usub;
-  int_t markl, *globToLoc, maxNvtcsPProc;
-  int_t mem_error, len_texp;
-  
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  globToLoc     = Pslu_freeable->globToLoc;
-
-  xusubPr = Llu_symbfact->xlsubPr; usubPr  = Llu_symbfact->lsubPr;
-  if (computeL) {
-    xlsub   = Llu_symbfact->xlsub;   lsub    = Llu_symbfact->lsub;
-    xusub   = Llu_symbfact->xusub;   usub    = Llu_symbfact->usub;
-  }
-  else {
-    xlsub   = Llu_symbfact->xusub;   lsub    = Llu_symbfact->usub;
-    xusub   = Llu_symbfact->xlsub;   usub    = Llu_symbfact->lsub;
-  }
-  markl = *pmarkl + 1;
-  fstVtx_toUpd_lid = LOCAL_IND( globToLoc[fstVtx_toUpd] );
-  vtxXp_lid = LOCAL_IND( globToLoc[vtxXp] );
-  nextl = xlsub[vtxXp_lid+1];
-    
-  lvtx_lid = EMPTY;
-  if (lstVtx_srcUpd != EMPTY)
-    lvtx_lid = LOCAL_IND( globToLoc[lstVtx_srcUpd - 1] );
-
-  /* count the number of new elements, and update Llu_symbfact->cntelt_vtcs */
-  vtx_lid = fstVtx_toUpd_lid;
-  vtx_lid += vtx_upd_pr;
-  len_texp = 0;
-  for (i = vtx_upd_pr; i < lstVtx_upd_pr; i++, vtx_lid ++) { 
-    nelts_vtx = xlsub[vtx_lid+1] - xlsub[vtx_lid];
-    if (xusubPr[i] != xusubPr[i+1]) {
-      j = xusubPr[i]; 
-      vtx = usubPr[j];
-      /* setup marker structure for already existing elements */
-      ii = xlsub[vtx_lid];
-      while (lsub[ii] != EMPTY && ii < xlsub[vtx_lid + 1]) {
-	marker[lsub[ii]] = markl;
-	ii ++;
-      }
-      nelts_vtx = ii - xlsub[vtx_lid];
-      for (j = xusubPr[i] + 1; j < xusubPr[i+1]; j++) {
-	vtx_elt = usubPr[j];
-	ii = marker[vtx_elt];
-	if (computeRcvd) {
-	  nelts = lsub_rcvd[ii + NELTS_IND];
-	  ii += RCVD_IND;
-	  mpnelts = marker[vtx_elt] + nelts + RCVD_IND;
-	}
-	else {
-	  vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] );
-	  if (vtx_elt_lid == lvtx_lid)
-	    nelts = lsub_rcvd_sz - ii;
-	  else
-	    nelts = xlsub[vtx_elt_lid+1] - xlsub[vtx_elt_lid];
-	  mpnelts = marker[vtx_elt] + nelts;
-	}
-	
-	if (!computeL)
-	  marker[vtx] = markl;
-	for (ii; ii < mpnelts; ii++) {
-	  elt = lsub_rcvd[ii];
-	  if (elt >= vtx) {
-	    if (marker[elt] != markl) {
-	      /* add elt to structure of vtx */
-	      marker[elt] = markl;
-	      nelts_vtx ++;
-	    }
-	  }
-	}
-      }
-      if (nelts_vtx != 0 && (nelts_vtx > xlsub[vtx_lid+1] - xlsub[vtx_lid])) {
-	nelts_in = xlsub[vtx_lid+1] - xlsub[vtx_lid];
-	if (nelts_in == 0) nelts_in = 1;
-	j = nelts_vtx / nelts_in;
-	if (nelts_vtx % nelts_in != 0) j++;
-	nelts_vtx = j * nelts_in;
-      }
-      else
-	nelts_vtx = xlsub[vtx_lid+1] - xlsub[vtx_lid];
-      markl ++;
-      if (markl == n) {
-	/* reset marker array */
-	for (j = fstVtx_toUpd; j < n; j++)
-	  marker[j] = EMPTY;
-	markl = 0;
-      }
-    }
-    Llu_symbfact->cntelt_vtcs[vtx_lid] = nelts_vtx;
-    len_texp += nelts_vtx;
-  }
-  for (; i < nvtcs_toUpd; i++, vtx_lid++) {
-    nelts_vtx = xlsub[vtx_lid+1] - xlsub[vtx_lid];    
-    Llu_symbfact->cntelt_vtcs[vtx_lid] = nelts_vtx;
-    len_texp += nelts_vtx;
-  }
-
-  *pmarkl = markl;
-  /* mark elements array */
-  for (i = xlsub[vtxXp_lid]; i < nextl; i++) {
-    marker[lsub[i]] = markl;
-  }
-
-  nextl = xlsub[vtxXp_lid+1];  
-  if (mem_error = 
-      psymbfact_LUXpand_RL (iam, n, vtxXp, nextl, len_texp, 
-			    computeL, Pslu_freeable, Llu_symbfact, VInfo, PS))
-    return (mem_error);		
-
-  return 0;
-}
-
-
-static int_t
-rl_update
-(
- int   computeRcvd, /* if = 1, then update from receive buffer,
-		       else update from own data */
- int_t n,
- int   iam,       /* process number */
- int_t *lsub_rcvd,      /* elements of node */
- int_t lsub_rcvd_sz,    /* size of sub to be explored */
- int_t *usub_rcvd,  
- int_t usub_rcvd_sz,
- int_t fstVtx_srcUpd, /* first vertex source of the updates */
- int_t lstVtx_srcUpd, /* last vertex source of the updates */
- int_t indBlk_srcUpd, /* block index of first vertex */
- int_t fstVtx_toUpd,  /* first vertex to update */
- int_t lstVtx_toUpd,  /* last vertex to update */
- int_t nvtcs_toUpd,   /* no of vertices to update */
- int   computeL,
- int_t *pmarkl,
- int_t *marker,
- Pslu_freeable_t *Pslu_freeable,
- Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
- vtcsInfo_symbfact_t *VInfo,  /* Input/Output - local info on vertices distribution */
- psymbfact_stat_t *PS
- /*  marker: first elements of marker contain the nodes that will
-     be used in the updates */
- )
-{
-  int_t i, j, k, prVal, nelts, ind, nextl, ii, mpnelts, mem_error;
-  int_t vtx, vtx_lid, vtx_elt, vtx_elt_lid, lvtx_lid;
-  int_t fstVtx_toUpd_lid, markl, elt, vtx_loc, ind_blk;
-  int_t *xusubPr, *usubPr, *xlsub, *lsub, *xusub, *usub;
-  int_t fstVtx_upd, lstVtx_upd, maxNvtcsPProc, *globToLoc;
-  int_t fstVtx_srcUpd_lid, nelts_vtx, expand;
-  
-  /* quick return */
-  if (fstVtx_toUpd >= lstVtx_toUpd)
-    return;
-
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  globToLoc     = Pslu_freeable->globToLoc;
-  
-  fstVtx_upd = EMPTY;
-  lstVtx_upd = EMPTY;
-  xusubPr = Llu_symbfact->xlsubPr; usubPr  = Llu_symbfact->lsubPr;
-  if (computeL) {
-    xlsub   = Llu_symbfact->xlsub;   lsub    = Llu_symbfact->lsub;
-    xusub   = Llu_symbfact->xusub;   usub    = Llu_symbfact->usub;
-  }
-  else {
-    xlsub   = Llu_symbfact->xusub;   lsub    = Llu_symbfact->usub;
-    xusub   = Llu_symbfact->xlsub;   usub    = Llu_symbfact->lsub;
-  }
-  markl = *pmarkl;
-  fstVtx_toUpd_lid = LOCAL_IND( globToLoc[fstVtx_toUpd] );
-
-  /* count number of elements in transpose representation of usub_rcvd */
-  /* use marker to count those elements */
-  for (i = 0; i < nvtcs_toUpd; i++)
-    marker[i] = 0;
-  
-  i = 0;
-  if (fstVtx_srcUpd != EMPTY) {
-    fstVtx_srcUpd_lid = LOCAL_IND( globToLoc[fstVtx_srcUpd] );
-    vtx_lid = fstVtx_srcUpd_lid;
-  }
-  lvtx_lid = EMPTY;
-  if (lstVtx_srcUpd != EMPTY)
-    lvtx_lid = LOCAL_IND( globToLoc[lstVtx_srcUpd - 1] );
-  
-  while (i < usub_rcvd_sz) {
-    if (computeRcvd) {
-      vtx   = usub_rcvd[i + DIAG_IND];
-      nelts = usub_rcvd[i + NELTS_IND];
-      i += RCVD_IND;
-    }
-    else {
-      if (vtx_lid == lvtx_lid)
-	nelts = usub_rcvd_sz - i;
-      else
-	nelts = xusub[vtx_lid + 1] - xusub[vtx_lid];
-      vtx_lid ++;
-    }
-    prVal = usub_rcvd[i];
-    for (k = i; k < i + nelts; k++) {
-      vtx_elt = usub_rcvd[k];
-      if (vtx_elt > prVal)
-	k = i + nelts;
-      else {
-	if (OWNER( globToLoc[vtx_elt] ) == iam) {
-	  if (vtx_elt >= fstVtx_toUpd && vtx_elt < lstVtx_toUpd) {
-	    vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] ) - 
-	      fstVtx_toUpd_lid;
-	    marker[vtx_elt_lid] ++;
-	  }
-	}
-      }
-    }
-    i += nelts;
-  }
-
-  ind = 0;
-  for (i = 0; i < nvtcs_toUpd; i++) {
-    if (marker[i] != 0) {
-      marker[i] ++;
-      if (fstVtx_upd == EMPTY)
-	fstVtx_upd = i;
-      lstVtx_upd = i;
-    }
-    xusubPr[i] = ind;
-    ind += marker[i];
-    marker[i] = xusubPr[i];
-  }
-  xusubPr[i] = ind;
-  lstVtx_upd ++;
-
-  if (ind == 0) 
-    /* quick return if no update */
-    return;
-
-  /* test if enough memory in usubPr array */
-  if (ind > Llu_symbfact->szLsubPr) {
-    if (mem_error = 
-	psymbfact_prLUXpand (iam, ind, LSUB_PR, Llu_symbfact, PS))
-      return (mem_error);
-    usubPr  = Llu_symbfact->lsubPr;
-  }
-  
-  i = 0;
-  if (fstVtx_srcUpd != EMPTY) {
-    vtx_loc = fstVtx_srcUpd;
-    vtx_lid = LOCAL_IND( globToLoc[vtx_loc] );
-    ind_blk = indBlk_srcUpd;
-  }
-  while (i < usub_rcvd_sz) {
-    if (computeRcvd) {
-      vtx   = usub_rcvd[i + DIAG_IND];
-      nelts = usub_rcvd[i + NELTS_IND];
-      i += RCVD_IND;
-    }
-    else {
-      vtx = vtx_loc;
-      if (vtx_lid == lvtx_lid)
-	nelts = usub_rcvd_sz - i;
-      else
-	nelts = xusub[vtx_lid + 1] - xusub[vtx_lid];
-      vtx_lid ++;
-      vtx_loc ++;
-      if (ind_blk != EMPTY)
-	if (vtx_loc == VInfo->begEndBlks_loc[ind_blk+1]) {
-	  ind_blk += 2;
-	  vtx_loc = VInfo->begEndBlks_loc[ind_blk];
-	}
-    }
-
-    prVal = usub_rcvd[i];
-    for (k = i; k < i + nelts; k++) {
-      vtx_elt = usub_rcvd[k];
-      if (vtx_elt > prVal)
-	k = i + nelts;
-      else {
-	if (OWNER( globToLoc[vtx_elt]) == iam) {
-	  if (vtx_elt >= fstVtx_toUpd && vtx_elt < lstVtx_toUpd) {
-	    vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] ) - fstVtx_toUpd_lid;
-	    /* add vtx_elt to the pruned structure */
-	    if (marker[vtx_elt_lid] == xusubPr[vtx_elt_lid]) {
-	      usubPr[marker[vtx_elt_lid]] = vtx_elt;
-	      marker[vtx_elt_lid] ++;
-	    }
-	    usubPr[marker[vtx_elt_lid]] = vtx;
-	    marker[vtx_elt_lid] ++;
-	  }
-	}
-      }
-    }
-    i += nelts;
-  }
-  /* reset marker array */
-  for (i = 0; i < nvtcs_toUpd; i++)
-    marker[i] = EMPTY;
-  if (fstVtx_srcUpd != EMPTY) {
-    vtx_loc = fstVtx_srcUpd;
-    vtx_lid = LOCAL_IND( globToLoc[vtx_loc] );
-    ind_blk = indBlk_srcUpd;
-  }
-  i = 0;
-  while (i < lsub_rcvd_sz) {
-    if (computeRcvd) {
-      vtx   = lsub_rcvd[i + DIAG_IND];
-      nelts = lsub_rcvd[i + NELTS_IND];
-      marker[vtx] = i;
-      i += RCVD_IND;
-    }
-    else {
-      vtx = vtx_loc;
-      if (vtx_lid == lvtx_lid)
-	nelts = lsub_rcvd_sz - i;
-      else
-	nelts = xlsub[vtx_lid + 1] - xlsub[vtx_lid];
-      vtx_lid ++;
-      marker[vtx] = i;
-      vtx_loc ++;
-      if (ind_blk != EMPTY)
-	if (vtx_loc == VInfo->begEndBlks_loc[ind_blk+1]) {
-	  ind_blk += 2;
-	  vtx_loc = VInfo->begEndBlks_loc[ind_blk];
-	}
-    }
-    i += nelts;
-  }
-
-  /* use the pruned structure to update symbolic factorization */
-  vtx_lid = fstVtx_toUpd_lid;
-  vtx_lid += fstVtx_upd;
-  for (i = fstVtx_upd; i < lstVtx_upd; i++, vtx_lid ++) { 
-    if (xusubPr[i] != xusubPr[i+1]) {
-      j = xusubPr[i]; 
-      vtx = usubPr[j];
-      /* setup marker structure for already existing elements */
-      ii = xlsub[vtx_lid];
-      while (lsub[ii] != EMPTY && ii < xlsub[vtx_lid + 1]) {
-	marker[lsub[ii]] = markl;
-	ii ++;
-      }
-      PS->nops += ii - xlsub[vtx_lid];
-      nextl = ii;
-      for (j = xusubPr[i] + 1; j < xusubPr[i+1]; j++) {
-	vtx_elt = usubPr[j];
-	ii = marker[vtx_elt];
-	if (computeRcvd) {
-	  nelts = lsub_rcvd[ii + NELTS_IND];
-	  ii += RCVD_IND;
-	  mpnelts = marker[vtx_elt] + nelts + RCVD_IND;
-	}
-	else {
-	  vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] );
-	  if (vtx_elt_lid == lvtx_lid)
-	    nelts = lsub_rcvd_sz - ii;
-	  else
-	    nelts = xlsub[vtx_elt_lid+1] - xlsub[vtx_elt_lid];
-	  mpnelts = marker[vtx_elt] + nelts;
-	}
-		
-	if (!computeL)
-	  marker[vtx] = markl;
-	PS->nops += mpnelts - ii;
-	for (ii; ii < mpnelts; ii++) {
-	  elt = lsub_rcvd[ii];
-	  if (elt >= vtx) {
-	    if (marker[elt] != markl) {
-	      /* add elt to structure of vtx */
-	      if (nextl >= xlsub[vtx_lid + 1]) {
-		if (mem_error = 
-		    expand_RL (computeRcvd, n, iam, lsub_rcvd, lsub_rcvd_sz,
-			       usub_rcvd, usub_rcvd_sz, vtx, i,
-			       lstVtx_upd, fstVtx_srcUpd, lstVtx_srcUpd,
-			       fstVtx_toUpd, lstVtx_toUpd, nvtcs_toUpd, computeL,
-			       &markl, marker, Pslu_freeable, Llu_symbfact, VInfo, PS))
-		    return (mem_error);
-		if (computeL) {
-		  lsub    = Llu_symbfact->lsub;
-		  if (!computeRcvd) 
-		    lsub_rcvd    = 
-		      &(Llu_symbfact->lsub[Llu_symbfact->xlsub[fstVtx_srcUpd_lid]]);
-		} else {
-		  marker[vtx] = markl;
-		  lsub    = Llu_symbfact->usub;
-		  if (!computeRcvd) 
-		    lsub_rcvd = 
-		      &(Llu_symbfact->usub[Llu_symbfact->xusub[fstVtx_srcUpd_lid]]);
-		}
-	      }
-	      lsub[nextl] = elt; nextl ++;
-	      marker[elt] = markl;
-	    }
-	  }
-	}
-      }
-      if (nextl < xlsub[vtx_lid+1])
-	lsub[nextl] = EMPTY;
-      markl ++;
-      if (markl == n) {
-	/* reset marker array */
-	for (j = fstVtx_toUpd; j < n; j++)
-	  marker[j] = EMPTY;
-	markl = 0;
-      }
-    }
-  }
-  *pmarkl = markl;
-
-  return 0;
-}
-
-static int_t
-dnsUpSeps_symbfact
-(
- int_t n,
- int   iam,      /* my processor number */
- int   szSep, 
- int   ind_sizes1,
- int   ind_sizes2, 
- int_t *sizes,     /* Input - sizes of each node in the separator tree */
- int_t *fstVtxSep, /* Input - first vertex of each node in the tree */
- int_t fstVtx_dns,
- Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
- Pslu_freeable_t *Pslu_freeable,
- vtcsInfo_symbfact_t *VInfo,  /* Input/Output - local info on vertices distribution */
- comm_symbfact_t *CS,
- psymbfact_stat_t *PS,
- int_t *p_nextl,   /* ptr to nextl in lsub structure */
- int_t *p_nextu,   /* ptr to nextu in usub structure */
- int_t *p_nsuper_loc
- )
-{
-  int_t nextl, nextu, nsuper_loc, curblk_loc, mem_error;
-  int_t vtx_elt, ind_blk, vtx, k;
-  int_t *xlsub, *xusub, *lsub, *usub;
-  int_t fstVtx_blk, fstVtx_blk_lid, vtx_lid, lstVtx_blk, fstVtx_lvl, lstVtx_lvl;
-  int_t *globToLoc, maxNvtcsPProc;
-  
-  /* Initialization */
-  xlsub = Llu_symbfact->xlsub; lsub = Llu_symbfact->lsub;
-  xusub = Llu_symbfact->xusub; usub = Llu_symbfact->usub;
-
-  globToLoc  = Pslu_freeable->globToLoc;
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  nextl      = *p_nextl;
-  nextu      = *p_nextu;
-  nsuper_loc = *p_nsuper_loc;
-  curblk_loc = VInfo->curblk_loc;
-  VInfo->nnz_ainf_loc = 0;
-  VInfo->nnz_asup_loc = 0;
-
-  if (fstVtx_dns == EMPTY)
-    fstVtx_blk     = VInfo->begEndBlks_loc[curblk_loc];
-  else 
-    fstVtx_blk  = fstVtx_dns;
-  if (fstVtx_blk == n)
-    return 0;
-  fstVtx_blk_lid = LOCAL_IND( globToLoc[fstVtx_blk] );
-  vtx_lid        = fstVtx_blk_lid;
-  xlsub[vtx_lid] = nextl;
-  xusub[vtx_lid] = nextu;
-  PS->nDnsUpSeps = 0; 
-  
-  while (szSep >= 1) {
-    PS->nDnsUpSeps++; 
-    fstVtx_lvl = fstVtxSep[ind_sizes2];
-    lstVtx_lvl = fstVtxSep[ind_sizes2] + sizes[ind_sizes2];
-    if (fstVtx_blk > fstVtx_lvl)
-      vtx_elt = fstVtx_blk;
-    else 
-      vtx_elt = fstVtx_lvl;
-    if (nextl + lstVtx_lvl - vtx_elt >= Llu_symbfact->szLsub) {
-      if (mem_error =
-	  psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextl, 
-				nextl + fstVtx_lvl - vtx_elt,
-				LSUB, DNS_UPSEPS, 1,
-				Pslu_freeable, Llu_symbfact, VInfo, PS))
-	return (mem_error);
-      lsub = Llu_symbfact->lsub;
-    }
-    if (nextu + lstVtx_lvl - vtx_elt >= Llu_symbfact->szUsub) {
-      if (mem_error =
-	  psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextu, 
-				nextu + fstVtx_lvl - vtx_elt,
-				LSUB, DNS_UPSEPS, 1,
-				Pslu_freeable, Llu_symbfact, VInfo, PS))
-	return (mem_error);
-      usub = Llu_symbfact->usub;
-    }
-    PS->nops += 2 * (lstVtx_lvl - vtx_elt);
-    for (; vtx_elt < lstVtx_lvl; vtx_elt++) {
-      lsub[nextl] = vtx_elt; nextl++;
-      usub[nextu] = vtx_elt; nextu++;
-    }
-    ind_sizes2 = ind_sizes1 + szSep + (ind_sizes2 - ind_sizes1) / 2;
-    ind_sizes1 += szSep;
-    szSep = szSep / 2;
-  }
-  /* delete the diagonal element from the U structure */
-  usub[xusub[fstVtx_blk_lid]] = usub[nextu - 1];
-  nextu --;
-  xlsub[fstVtx_blk_lid+1] = nextl;
-  xusub[fstVtx_blk_lid+1] = nextu;
-
-  vtx_lid = fstVtx_blk_lid;
-  ind_blk = curblk_loc;
-  while (ind_blk < 2 * VInfo->nblks_loc) {
-    if (ind_blk != curblk_loc) {
-      fstVtx_blk = VInfo->begEndBlks_loc[ind_blk];
-
-      xlsub[vtx_lid] = nextl;
-      xusub[vtx_lid] = nextu;
-
-      for (k = xlsub[fstVtx_blk_lid]; k < xlsub[fstVtx_blk_lid+1]; k++) 
-	if (lsub[k] >= fstVtx_blk) {
-	  lsub[nextl] = lsub[k]; nextl ++;
-	  if (nextl >= MEM_LSUB( Llu_symbfact, VInfo ))
- 	    if (mem_error =
-		psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextl, 0,
-				      LSUB, DNS_UPSEPS, 1,
-				      Pslu_freeable, Llu_symbfact, VInfo, PS))
-	      return (mem_error);
-	  lsub = Llu_symbfact->lsub;
-	}
-      for (k = xusub[fstVtx_blk_lid]; k < xusub[fstVtx_blk_lid+1]; k++)
-	if (usub[k] > fstVtx_blk) {
-	  usub[nextu] = usub[k]; nextu ++;
-	  if (nextu >= MEM_USUB( Llu_symbfact, VInfo ))
-	    if (mem_error =
-		psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextu, 0,
-				      USUB, DNS_UPSEPS, 1,
-				      Pslu_freeable, Llu_symbfact, VInfo, PS))
-	      return (mem_error);
-	  usub = Llu_symbfact->usub;
-	}
-      PS->nops += xlsub[fstVtx_blk_lid+1] - xlsub[fstVtx_blk_lid];
-      PS->nops += xusub[fstVtx_blk_lid+1] - xusub[fstVtx_blk_lid];
-    }
-    lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1];
-    for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid++) {
-      Pslu_freeable->supno_loc[vtx_lid] = nsuper_loc;
-      if (vtx > fstVtx_blk) {
-	xlsub[vtx_lid] = nextl;
-	xusub[vtx_lid] = nextu;
-      }
-    }
-    ind_blk += 2;
-    nsuper_loc ++;
-  }
-  
-  *p_nextl = nextl;
-  *p_nextu = nextu;
-  *p_nsuper_loc = nsuper_loc;
-/*   VInfo->curblk_loc = ind_blk; */
-  
-  return 0;
-}
-
-static int_t
-dnsCurSep_symbfact
-(
- int_t n,          /* Input - order of the matrix */
- int   iam,        /* Input - my processor number */
- int   ind_sizes1,
- int   ind_sizes2,
- int_t *sizes,     /* Input - sizes of each node in the separator tree */
- int_t *fstVtxSep, /* Input - first vertex of each node in the tree */
- int   szSep,
- int   npNode,
- int_t rcvd_dnsSep,
- int_t *p_nextl,     
- int_t *p_nextu, 
- int_t *p_mark,
- int_t *p_nsuper_loc,
- int_t *marker,   /* temporary array of size n */
- MPI_Comm ndCom,
- Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
- Pslu_freeable_t *Pslu_freeable,
- vtcsInfo_symbfact_t *VInfo,  /* Input/Output - local info on vertices distribution */
- comm_symbfact_t *CS,
- psymbfact_stat_t *PS
- )
-{
-  int_t fstVtx_blk, fstVtx_dns, fstVtx_dns_lid, lstVtx_blk, 
-    fstVtx, lstVtx, lstVtx_dns_lid;
-  int_t ind_blk, i, vtx, vtx_lid, vtx_lid_x, nvtcs_upd, save_cnt, mem_error;
-  int_t computeL, computeU, vtx_elt, j, cur_blk, snlid, snrep;
-  int_t *sub, *xsub, *minElt_vtx, *cntelt_vtcs;
-  int_t mark, next, *x_newelts, *x_newelts_L, *x_newelts_U;
-  int_t *newelts_L, *newelts_U, *newelts;
-  int_t *globToLoc, maxNvtcsPProc, lvl;
-  int_t prval, kmin, kmax, maxElt, ktemp, prpos;
-  float mem_dnsCS;
-
-  if (!rcvd_dnsSep)
-    VInfo->curblk_loc += 2;
-  
-  computeL = TRUE; computeU = TRUE;
-  lstVtx_dns_lid = EMPTY;
-  globToLoc = Pslu_freeable->globToLoc;
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  fstVtx = fstVtxSep[ind_sizes2];
-  lstVtx = fstVtx + sizes[ind_sizes2];
-  cur_blk = VInfo->curblk_loc;
-  fstVtx_dns = VInfo->begEndBlks_loc[cur_blk];
-  fstVtx_dns_lid = LOCAL_IND( globToLoc[fstVtx_dns] );
-  lvl = (int_t) LOG2( npNode );
-  x_newelts_U = NULL;
-  newelts_L = NULL;
-  newelts_U = NULL;
-  mem_dnsCS = 0.;
-  
-  PS->nDnsCurSep ++;
-
-  if (CS->rcv_bufSz > n - fstVtx_dns)
-    minElt_vtx = CS->rcv_buf;
-  else {
-    if (!(minElt_vtx = intMalloc_symbfact(n - fstVtx_dns)))
-      ABORT("Malloc fails for minElt_vtx[].");
-    mem_dnsCS += n - fstVtx_dns;
-  }
-  
-  while (computeL || computeU) {
-    if (computeL) {
-      sub = Llu_symbfact->lsub; xsub = Llu_symbfact->xlsub;
-      x_newelts = Llu_symbfact->cntelt_vtcs;
-      x_newelts_L = x_newelts;
-    }
-    else {
-      sub = Llu_symbfact->usub; xsub = Llu_symbfact->xusub;
-    }
-
-    /* use minElt_vtx to determine starting vertex of each nonzero element */
-    for (i = 0; i < n - fstVtx_dns; i++)
-      minElt_vtx[i] = n;
-
-    ind_blk = cur_blk;
-    vtx_lid = fstVtx_dns_lid;
-    nvtcs_upd = 0;
-    while (VInfo->begEndBlks_loc[ind_blk] < lstVtx && 
-	   ind_blk < 2 * VInfo->nblks_loc) {	  
-      fstVtx_blk = VInfo->begEndBlks_loc[ind_blk];
-      lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1];
-      ind_blk += 2;
-      nvtcs_upd += lstVtx_blk - fstVtx_blk;
-      for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid++) {
-	j = xsub[vtx_lid];
-	while (j < xsub[vtx_lid+1] && sub[j] != EMPTY) {
-	  PS->nops ++;
-	  vtx_elt = sub[j] - fstVtx_dns;
-	  if (minElt_vtx[vtx_elt] == n) {
-	    minElt_vtx[vtx_elt] = vtx;
-	  }
-	  j ++;
-	}
-      }	  
-    }
-    if (!computeL) {
-      if (!(x_newelts_U = intMalloc_symbfact(nvtcs_upd + 1)))
-	ABORT("Malloc fails for x_newelts_U[].");
-      mem_dnsCS += nvtcs_upd + 1;
-      x_newelts = x_newelts_U;
-    }
-    else {
-      /* save the value in cntelt_vtcs[lstVtx_blk_lid] */
-      save_cnt = x_newelts[vtx_lid];
-      lstVtx_dns_lid = vtx_lid;
-    }
-    
-    MPI_Allreduce (&(minElt_vtx[lstVtx - fstVtx_dns]), &(marker[lstVtx]), 
-		   n - lstVtx, mpi_int_t, MPI_MIN, ndCom);
-
-#if ( PRNTlevel>=1 )
-    PS->no_msgsCol += (float) (2 * (int_t) LOG2( npNode ));
-    PS->sz_msgsCol += (float) (n - lstVtx);
-    if (PS->maxsz_msgCol < n - lstVtx) 
-      PS->maxsz_msgCol = n - lstVtx;      
-#endif
-    
-    /* use x_newelts to determine counts of elements starting in each vertex */
-    for (vtx_lid = 0; vtx_lid < nvtcs_upd; vtx_lid++)
-      x_newelts[vtx_lid] = 0;
-    
-    for (vtx = lstVtx; vtx < n; vtx++) {
-      if (marker[vtx] != n) {
-	vtx_elt = marker[vtx];
-	if (OWNER( globToLoc[vtx_elt] ) == iam) {
-	  x_newelts[ LOCAL_IND( globToLoc[vtx_elt] ) - fstVtx_dns_lid ] ++;
-	}
-	else {
-	  /* find the first vertex > vtx_elt which belongs to iam */
-	  ind_blk = cur_blk;
-	  vtx_lid = 0;
-	  while (vtx_elt > VInfo->begEndBlks_loc[ind_blk] &&
-		 ind_blk < 2 * VInfo->nblks_loc) {
-	    vtx_lid += VInfo->begEndBlks_loc[ind_blk+1] -
-	      VInfo->begEndBlks_loc[ind_blk];
-	    ind_blk += 2;
-	  }
-	  if (VInfo->begEndBlks_loc[ind_blk] < lstVtx) {
-	    x_newelts[vtx_lid] ++;		
-	    marker[vtx] = VInfo->begEndBlks_loc[ind_blk];
-	  }			    
-	  else
-	    marker[vtx] = n;
-	}
-      }
-    }
-    
-    /* set up beginning of new elements for each local vtx */
-    i = 0;
-    for (vtx_lid = 0; vtx_lid < nvtcs_upd; vtx_lid++) {
-      j = x_newelts[vtx_lid];
-      x_newelts[vtx_lid] = i;
-      i += j;
-    }
-    x_newelts[vtx_lid] = i;
-    newelts = NULL;
-    if (i != 0) {
-      if (!(newelts = intMalloc_symbfact(x_newelts[vtx_lid])))
-	ABORT("Malloc fails for newelts[].");    
-      mem_dnsCS += x_newelts[vtx_lid];
-      
-      for (vtx = lstVtx; vtx < n; vtx++) {
-	if (marker[vtx] != n) {
-	  vtx_elt = marker[vtx];
-	  vtx_lid = LOCAL_IND( globToLoc[vtx_elt] ) - fstVtx_dns_lid;	  
-	  newelts[x_newelts[vtx_lid]] = vtx;
-	  x_newelts[vtx_lid] ++;
-	}
-      }
-    }
-    /* reset beginning of new elements for each local vertex */
-    i = 0;
-    for (vtx_lid = 0; vtx_lid < nvtcs_upd; vtx_lid++) {
-      j = x_newelts[vtx_lid];
-      x_newelts[vtx_lid] = i;
-      i = j;
-    }
-
-    if (computeL == TRUE) {
-      computeL = FALSE;
-      newelts_L = newelts;
-    }
-    else {
-      computeU = FALSE;
-      newelts_U = newelts;
-    }
-  }
-  
-  for (i = fstVtx_dns; i < n; i++)
-    marker[i] = EMPTY;
-  mark = 0;
-  
-  /* update vertices */
-  prval = n; 	    
-  ind_blk = cur_blk;
-  fstVtx_dns = VInfo->begEndBlks_loc[ind_blk];
-  vtx_lid = LOCAL_IND( globToLoc[fstVtx_dns] );
-  while (VInfo->begEndBlks_loc[ind_blk] < lstVtx && 
-	 ind_blk < 2 * VInfo->nblks_loc) {	  
-    fstVtx_blk = VInfo->begEndBlks_loc[ind_blk];
-    lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1];
-    ind_blk += 2;
-    for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid++) {
-      vtx_lid_x = vtx_lid - fstVtx_dns_lid;
-      Llu_symbfact->xlsub[vtx_lid] = *p_nextl;
-      Llu_symbfact->xusub[vtx_lid] = *p_nextu;
-      if (vtx == fstVtx_blk || x_newelts_L[vtx_lid_x+1] != x_newelts_L[vtx_lid_x] ||
-	  x_newelts_U[vtx_lid_x+1] != x_newelts_U[vtx_lid_x]) {
-	/* a new supernode starts */
-	snlid = vtx_lid;
-	snrep = vtx;
-	if (mark + 2 > n) {
-	  /* reset to EMPTY marker array */
-	  for (i = 0; i < n; i++)
-	    marker[i] = EMPTY;
-	  mark = 0;
-	}
-
-	computeL = TRUE;
-	computeU = FALSE;
-	while (computeL || computeU) {
-	  if (computeL) {
-	    sub = Llu_symbfact->lsub; xsub = Llu_symbfact->xlsub;
-	    x_newelts = x_newelts_L; newelts = newelts_L;
-	    next = *p_nextl;
-	  }
-	  else {
-	    sub = Llu_symbfact->usub; xsub = Llu_symbfact->xusub;
-	    x_newelts = x_newelts_U; newelts = newelts_U;
-	    next = *p_nextu;
-	  }
-	  xsub[vtx_lid] = next;
-
-	  /* TEST available memory */
-	  j = x_newelts[vtx_lid_x+1] + lstVtx - vtx;
-	  if ((computeL && next+j >= MEM_LSUB(Llu_symbfact, VInfo)) ||
-	      (computeU && next+j >= MEM_USUB(Llu_symbfact, VInfo))) {
-	    if (mem_error =
-		psymbfact_LUXpandMem (iam, n, vtx, next, next + j,
-				      computeL, DNS_CURSEP, 1,
-				      Pslu_freeable, Llu_symbfact, VInfo, PS))
-	      return (mem_error);
-	    if (computeL) sub = Llu_symbfact->lsub;
-	    else sub = Llu_symbfact->usub; 
-	  }
-	  
-	  if (computeL)  i = vtx;
-	  else           i = vtx+1;
-	  while (i < lstVtx) {
-	    sub[next] = i; next ++;
-	    i ++;
-	  }
-	  PS->nops += x_newelts[vtx_lid_x+1];
-	  for (i = 0; i < x_newelts[vtx_lid_x+1]; i++) {
-	    vtx_elt = newelts[i];
-	    sub[next] = vtx_elt; next ++;
-	    if (computeU && vtx_elt < prval 
-		&& marker[vtx_elt] == mark-1)
-	      prval = vtx_elt;
-	    marker[vtx_elt] = mark;
-	  }
-	  if (computeL) {
-	    computeL = FALSE; computeU = TRUE;
-	    *p_nextl = next;
-	  }
-	  else {
-	    computeU = FALSE;
-	    *p_nextu = next;
-	  }
-	  mark ++;	  
-	}	  
-	if (vtx != fstVtx_blk)
-	  (*p_nsuper_loc) ++;
-      } /* a new supernode starts */
-      /* vtx belongs to the curent supernode */
-      Pslu_freeable->supno_loc[vtx_lid] = *p_nsuper_loc;
-    } 
-    (*p_nsuper_loc) ++;
-  }
-  
-  if (ind_blk > 0) {
-    /* if iam owns blocks of this level */
-    i = *p_nextl - Llu_symbfact->xlsub[snlid];
-    j = *p_nextu - Llu_symbfact->xusub[snlid];
-    
-    if (VInfo->begEndBlks_loc[ind_blk - 1] == lstVtx && i > 1 && j > 0) {
-      /* if iam the last processor owning a block of this level */
-      computeL = TRUE; computeU = FALSE;
-      /* prune the structure */
-      while (computeL || computeU) {	   
-	if (computeL) {
-	  sub = Llu_symbfact->lsub; xsub = Llu_symbfact->xlsub;
-	  next = *p_nextl;
-	  computeL = FALSE; computeU = TRUE;
-	}
-	else {
-	  sub = Llu_symbfact->usub; xsub = Llu_symbfact->xusub;
-	  next = *p_nextu;
-	  computeU = FALSE;
-	}
-	
-	kmin = xsub[snlid];
-	kmax = next - 1;
-	if (prval != n) {
-	  maxElt = prval;
-	  while (kmin <= kmax) {
-	    /* Do a quicksort-type partition. */    
-	    if (sub[kmax] > prval) 
-	      kmax--;
-	    else if (sub[kmin] <= prval) {
-	      kmin++;
-	    }
-	    else { /* kmin does'nt belong to G^s(L), and kmax belongs: 
-		    * 	   interchange the two subscripts
-		    */
-	      ktemp = sub[kmin];
-	      sub[kmin] = sub[kmax];
-	      sub[kmax] = ktemp;
-	      kmin ++;
-	      kmax --;
-	    }
-	    if (sub[kmin-1] == prval)
-	      prpos = kmin - 1;
-	  }
-	}
-	else {
-	  maxElt = EMPTY;
-	  while (kmin <= kmax) {
-	    /* compute maximum element of L(:, vtx) */
-	    if (sub[kmin] > maxElt) {
-	      maxElt = sub[kmin];
-	      prpos = kmin;
-	    }
-	    kmin ++;	      
-	  }
-	}
-	ktemp = sub[xsub[snlid]];
-	sub[xsub[snlid]] = maxElt;
-	sub[prpos] = ktemp;
-      }
-      
-      /* setup snd_interSz information */
-      prval = Llu_symbfact->lsub[Llu_symbfact->xlsub[snlid]];
-      if (prval >= lstVtx) {
-	/* this supernode will be send to next layers of the tree */
-	while (prval >= lstVtx && szSep != 1) {
-	  ind_sizes2 = ind_sizes1 + szSep + (ind_sizes2 - ind_sizes1) / 2;
-	  ind_sizes1 += szSep;
-	  lvl ++;
-	  szSep = szSep / 2;
-	  lstVtx = fstVtxSep[ind_sizes2] + sizes[ind_sizes2];
-	  CS->snd_interSz[lvl] += i + j + 4;
-	  CS->snd_LinterSz[lvl] += i + 2;
-	  if (CS->snd_vtxinter[lvl] == EMPTY)
-	    CS->snd_vtxinter[lvl] = snrep;
-	}
-      }
-    }
-  }
-
-  /* restore value in cntelt_vtcs */
-  if (lstVtx_dns_lid != EMPTY)
-    Llu_symbfact->cntelt_vtcs[lstVtx_dns_lid] = save_cnt;
-  *p_mark = mark;
-  if (minElt_vtx != CS->rcv_buf)
-    SUPERLU_FREE (minElt_vtx);  
-  SUPERLU_FREE (x_newelts_U);
-  if (newelts_L) SUPERLU_FREE (newelts_L);
-  if (newelts_U) SUPERLU_FREE (newelts_U);
-  if (PS->szDnsSep < mem_dnsCS)
-    PS->szDnsSep = mem_dnsCS;
-}
-
-/* all processors affected to current node must call this routine
-   when VInfo->filledSep == FILLED_SEP
-   This is necessary since subsequent routines called from here use 
-   MPI_allreduce among all processors affected to curent node */
-
-static int_t
-denseSep_symbfact 
-(
- int   rcvd_dnsSep, /* =1 if processor received info that the separator
-		       became dense,
-		       =0 if myPE determined that separator is full */
- int_t n,           /* Input - order of the matrix */
- int   iam,         /* Input - my processor number */
- int   ind_sizes1,
- int   ind_sizes2,
- int_t *sizes,     /* Input - sizes of each separator in the separator tree */
- int_t *fstVtxSep, /* Input - first vertex of each node in the tree */
- int   szSep, 
- int   fstP,        /* first pe affected current node */
- int   lstP,        /* last pe affected current node */
- int_t fstVtx_blkCyc, 
- int_t nblk_loc,    /* block number in the block cyclic distribution of current
-		       supernode */
- int_t *p_nextl,
- int_t *p_nextu,
- int_t *p_mark,
- int_t *p_nsuper_loc,
- int_t *marker,
- MPI_Comm ndCom,
- MPI_Comm *symb_comm, /* Input - communicator for symbolic factorization */
- Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
- Pslu_freeable_t *Pslu_freeable,
- vtcsInfo_symbfact_t *VInfo,  /* Input - local info on vertices distribution */
- comm_symbfact_t *CS,
- psymbfact_stat_t *PS
-) 
-{
-  int   nprocsLvl, p, prvP, tag;
-  int_t nmsgsToSnd, nmsgsToRcv;
-  int_t ind_blk, mem_error;
-  int_t *rcv_intraLvl;
-  int_t fstVtx, lstVtx, cur_blk, lstVtx_blk, fstVtx_blk;
-  int_t *globToLoc, maxNvtcsPProc;
-  MPI_Status status;
-  
-  globToLoc = Pslu_freeable->globToLoc;
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  fstVtx = fstVtxSep[ind_sizes2];
-  lstVtx = fstVtx + sizes[ind_sizes2];
-  rcv_intraLvl = CS->rcv_intraLvl;
-  cur_blk   = VInfo->curblk_loc;
-  nprocsLvl = lstP - fstP;
-  
-  if (nblk_loc == 0) {
-    nmsgsToSnd = 2; nmsgsToRcv = 1;
-  }
-  else {
-    nmsgsToSnd = 1; nmsgsToRcv = 0;
-    if (!rcvd_dnsSep) nmsgsToRcv ++;
-  }
-  if (iam == fstP && rcvd_dnsSep && nblk_loc == 1) 
-    nmsgsToRcv ++;
-  
-  /* first exchange msgs with all processors affected to current node */
-  ind_blk = cur_blk;
-  while ((nmsgsToSnd || nmsgsToRcv) && VInfo->begEndBlks_loc[ind_blk] < lstVtx) {
-    tag = (int) (tag_intraLvl + nblk_loc);
-    if (nmsgsToSnd) {
-      lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1];
-      if (lstVtx_blk != lstVtx) {
-	p = OWNER( globToLoc[lstVtx_blk]);
-	MPI_Send (&(rcv_intraLvl[fstP]), nprocsLvl, mpi_int_t, p,
-		  tag, (*symb_comm));
-#if ( PRNTlevel>=1 )
-	PS->no_shmSnd += (float) 1;
-#endif
-      }
-      nmsgsToSnd --;
-    }
-    ind_blk += 2;
-    nblk_loc ++;
-    tag = tag_intraLvl + nblk_loc;
-    fstVtx_blk = VInfo->begEndBlks_loc[ind_blk];
-    if (nmsgsToRcv && fstVtx_blk < lstVtx) {
-      if (iam == fstP) tag --;
-      prvP = OWNER( globToLoc[fstVtx_blk - 1]);
-      MPI_Recv (&(rcv_intraLvl[fstP]), nprocsLvl, mpi_int_t, prvP,
-		tag, (*symb_comm), &status);
-#if ( PRNTlevel>=1 )
-      PS->no_shmRcvd += (float) 1;
-#endif
-      nmsgsToRcv --;
-    }
-  }
-
-  if (VInfo->filledSep == FILLED_SEP) {
-    if (mem_error = 
-	dnsCurSep_symbfact (n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep, 
-			    szSep, lstP - fstP, rcvd_dnsSep, p_nextl, 
-			    p_nextu, p_mark, p_nsuper_loc, marker, ndCom,
-			    Llu_symbfact, Pslu_freeable, VInfo, CS, PS))
-      return (mem_error);
-  }
-  else if (rcvd_dnsSep) 
-    if (mem_error = 
-	dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, 
-			    sizes, fstVtxSep, EMPTY,
-			    Llu_symbfact, Pslu_freeable, VInfo, CS, PS,
-			    p_nextl, p_nextu, p_nsuper_loc))
-      return (mem_error);
-  return 0;
-}
-
-
-static int_t
-interLvl_symbfact
-(
- SuperMatrix *A, /* Input - input matrix A */
- int   iam,      /* Input - my processor number */  
- int   lvl,      /* Input - current level in the separator tree */ 
- int   szSep,    /* Input - size of the current separator (node) */
- int   fstP,     /* Input - first processor assigned to current node */
- int   lstP,     /* Input - last processor assigned to current node */
- int   ind_sizes1,
- int   ind_sizes2, 
- int_t *sizes,     /* Input - sizes of each node in the separator tree */
- int_t *fstVtxSep, /* Input - first vertex of each node in the tree */
- int_t *p_nextl,
- int_t *p_nextu,
- int_t *p_nsuper_loc,
- int_t *pmark,   /* mark for symbfact */
- int_t *marker,  /* temp array used for marking */
- Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
- Pslu_freeable_t *Pslu_freeable,
- comm_symbfact_t *CS,/* infos on communication data structures */
- vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */
- psymbfact_stat_t *PS,
- MPI_Comm ndComm,
- MPI_Comm    *symb_comm /* Input - communicator for symbolic factorization */
- )
-{
-  MPI_Status  *status; 
-  MPI_Request *request_snd, *request_rcv;
-  
-  int   nprocsLvl, rcvdP, p, filledSep_lvl;
-  int   toSend, toSendL, toSendU;
-  int_t *rcv_interLvl;
-  int_t *snd_interLvl, *snd_interLvl1, *snd_interLvl2,
-    snd_interLvlSz, snd_LinterLvlSz, snd_vtxLvl;
-  int_t  vtx_elt, update_loc, code_err;
-  int_t *lsub, *xlsub, *usub, *xusub;
-  int_t *lsub_rcvd, lsub_rcvd_sz, *usub_rcvd, usub_rcvd_sz;
-  int_t  n, mark, max_rcvSz; 
-  int_t nextl, nextu, ind_blk, vtx_lid, k, count, nelts, 
-    lstVtxLvl_loc, lstVtxLvl_loc_lid, mem_error;
-  int_t fstVtx_blk, lstVtx_blk, i, j, vtx, prElt_L, prElt_U, 
-    snd_indBlk, prElt_ind;
-  int_t fstVtxLvl_loc, nvtcsLvl_loc, maxNvtcsPProc, *globToLoc, 
-    fstVtx, lstVtx;
-  int  ind1, nprocsToRcv, nprocsToSnd, ind2, ind_l, ind_u, ij, ik;
-  int_t req_ind, sent_msgs, req_ind_snd;
-  int_t initInfo_loc[2], initInfo_gl[2];
-
-  /* Initialization */
-  n = A->ncol;
-  fstVtx          = fstVtxSep[ind_sizes2];
-  lstVtx          = fstVtx + sizes[ind_sizes2];
-  maxNvtcsPProc   = Pslu_freeable->maxNvtcsPProc;
-  globToLoc       = Pslu_freeable->globToLoc;
-  nprocsLvl       = lstP - fstP;
-  rcv_interLvl    = CS->rcv_interLvl;
-  snd_interLvl    = CS->snd_interLvl;
-  snd_interLvlSz  = CS->snd_interSz[lvl];
-  snd_LinterLvlSz = CS->snd_LinterSz[lvl];
-  snd_vtxLvl      = CS->snd_vtxinter[lvl];
-  fstVtxLvl_loc   = VInfo->begEndBlks_loc[VInfo->curblk_loc];
-  nvtcsLvl_loc    = VInfo->nvtcsLvl_loc;
-  request_snd = NULL;
-  request_rcv = NULL;
-  status = NULL;
-  mark = *pmark;
-  
-  lsub    = Llu_symbfact->lsub;   xlsub    = Llu_symbfact->xlsub;
-  usub    = Llu_symbfact->usub;   xusub    = Llu_symbfact->xusub;
-
-  /* snd_vtxLvl denotes the first vertex from which iam needs
-     to send data.  
-     snd_interLvlSz denotes maximum size of the send data,
-     snd_LinterLvlSz denotes send data corresponding to L part */
-
-  /* determine maximum size of receive buffer and information
-   on filled sep */
-  if (snd_interLvlSz != 0) {
-    if (snd_LinterLvlSz == 0) 
-      snd_interLvlSz = 0;
-    if (snd_interLvlSz - snd_LinterLvlSz == 0)
-      snd_interLvlSz = 0;
-  }
-  
-  initInfo_loc[0] = snd_interLvlSz;
-  initInfo_loc[1] = (int_t) VInfo->filledSep;
-  MPI_Allreduce (initInfo_loc, initInfo_gl, 2, 
-		 mpi_int_t, MPI_MAX, ndComm);
-#if ( PRNTlevel>=1 )
-  PS->no_msgsCol += (float) (2 * (int_t) LOG2( nprocsLvl ));
-  PS->sz_msgsCol += 2;
-  if (PS->maxsz_msgCol < 2) 
-    PS->maxsz_msgCol = 2;      
-#endif  
-  max_rcvSz = initInfo_gl[0];
-  filledSep_lvl = (int) initInfo_gl[1];
-
-  if (filledSep_lvl == FILLED_SEPS) {
-    /* quick return if all upper separators are dense */
-    if (VInfo->filledSep != FILLED_SEPS) {
-      VInfo->filledSep = FILLED_SEPS;
-      if (mem_error = 
-	  dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, sizes, 
-			      fstVtxSep,
-			      EMPTY, Llu_symbfact, Pslu_freeable, VInfo, CS, PS,
-			      p_nextl, p_nextu, p_nsuper_loc))
-	return (mem_error);
-    }
-    return 0;
-  }
-
-  if (max_rcvSz == 0)
-    /* quick return if no communication necessary */
-    return 0; 
-  
-  /* allocate data for the send buffer */  
-  if (snd_interLvlSz)
-    if (CS->snd_bufSz < snd_interLvlSz) {
-      PS->maxSzBuf += snd_interLvlSz - CS->snd_bufSz;
-      if (CS->snd_bufSz != 0)
-	/* not first time allocate memory */
-	SUPERLU_FREE (CS->snd_buf);
-      CS->snd_bufSz = snd_interLvlSz;
-      if (!(CS->snd_buf = intMalloc_symbfact (snd_interLvlSz))) {
-	ABORT("Malloc fails for snd_buf[].");
-      }
-    }
-    
-  /* snd_interLvl : to which processors the data need to be send 
-   * information setup during the copy of data to be send in the buffer  
-   * rcv_interLvl : from which processors iam receives update data  */
-  for (p = 2*fstP; p < 2*lstP; p++)
-    snd_interLvl[p] = EMPTY;
-
-  if (snd_interLvlSz == 0 && nvtcsLvl_loc == 0) {
-    code_err = MPI_Alltoall (&(snd_interLvl[2*fstP]), 2, mpi_int_t,
-			     &(rcv_interLvl[2*fstP]), 2, mpi_int_t,
-			     ndComm);
-#if ( PRNTlevel>=1 )
-    PS->no_msgsCol += (float) (2 * (int_t) LOG2( nprocsLvl ));
-    PS->sz_msgsCol += 2;
-    if (PS->maxsz_msgCol < 2) 
-      PS->maxsz_msgCol = 2;      
-#endif  
-    return 0;
-  }
-  
-  /* in interLvlInfos, 
-   * obtain from which processors iam receives update information */
-  update_loc = FALSE;
-  nextl = 0; 
-  nextu = snd_LinterLvlSz;
-  if (snd_interLvlSz != 0) {
-    /* copy data to be send */
-    /* find index block from where to send data */
-    ind_blk = VInfo->curblk_loc;
-    while (snd_vtxLvl < VInfo->begEndBlks_loc[ind_blk]) {
-      ind_blk -= 2;
-    }
-    snd_indBlk = ind_blk;
-    vtx_lid = LOCAL_IND( globToLoc[snd_vtxLvl] );
-    for (; ind_blk < VInfo->curblk_loc; ind_blk += 2) {
-      fstVtx_blk = VInfo->begEndBlks_loc[ind_blk];
-      if (ind_blk == snd_indBlk)
-	fstVtx_blk = snd_vtxLvl;
-      lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1];
-      for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid ++) {
-	toSendL = FALSE; toSendU = FALSE;
-	if (xlsub[vtx_lid] != xlsub[vtx_lid+1] && 
-	    xusub[vtx_lid] != xusub[vtx_lid+1]) {
-	  k = xlsub[vtx_lid];
-	  prElt_L = lsub[k];
-	  j = xusub[vtx_lid];
-	  prElt_U = usub[j];
-	  if (prElt_L >= fstVtx || prElt_U >= fstVtx) {
-	    if (prElt_L >= fstVtx)
-	      while (lsub[k] <= prElt_L && k < xlsub[vtx_lid + 1]) {
-		vtx_elt = lsub[k];
-		if (vtx_elt >= fstVtx && vtx_elt < lstVtx) {
-		  p = OWNER( globToLoc[vtx_elt] );
-		  if (p != iam) {
-		    /* vtx will be send to another processor */
-		    snd_interLvl[2*p] = TRUE;
-		    toSendL = TRUE;
-		  }
-		  else
-		    update_loc = TRUE;
-		}
-		k++;
-	      }
-	    if (prElt_U >= fstVtx)
-	      while (usub[j] <= prElt_U && j < xusub[vtx_lid + 1]) {
-		vtx_elt = usub[j];
-		if (vtx_elt >= fstVtx && vtx_elt < lstVtx) {
-		  p = OWNER( globToLoc[vtx_elt] );
-		  if (p != iam) {
-		    /* vtx will be send to another processor */
-		    snd_interLvl[2*p+1] = TRUE;
-		    toSendU = TRUE;
-		  }
-		  else
-		    update_loc = TRUE;
-		}
-		j ++;
-	      }
-	    if (toSendL || toSendU) {
-	      /* L(:, vtx) and U(vtx, :) will be send to processors */
-	      CS->snd_buf[nextu + DIAG_IND]  = vtx;
-	      nelts = xusub[vtx_lid+1] - xusub[vtx_lid];
-	      CS->snd_buf[nextu + NELTS_IND] = nelts;
-	      nextu += 2;
-	      for (j = xusub[vtx_lid]; j < xusub[vtx_lid+1]; j++, nextu ++) {
-		CS->snd_buf[nextu] = usub[j]; 
-	      }
-	      CS->snd_buf[nextl + DIAG_IND] = vtx;
-	      nelts = xlsub[vtx_lid+1] - xlsub[vtx_lid];
-	      CS->snd_buf[nextl + NELTS_IND] = nelts; 
-	      nextl += 2;
-	      for (j = xlsub[vtx_lid]; j < xlsub[vtx_lid+1]; j++, nextl ++) {
-		CS->snd_buf[nextl] = lsub[j];
-	      }
-	    }
-	  }
-	}
-      }
-    }
-    lstVtxLvl_loc = vtx;
-    lstVtxLvl_loc_lid = vtx_lid;
-  }
-  
-  if (nextl == 0 || nextu - snd_LinterLvlSz == 0) {
-    for (p = 2*fstP; p < 2*lstP; p++)
-      snd_interLvl[p] = EMPTY;
-  }
-  
-  nprocsToSnd = 0;
-  for (p = 2*fstP; p < 2*lstP; p +=2) {
-    if (snd_interLvl[p] != EMPTY || snd_interLvl[p+1] != EMPTY) {
-      snd_interLvl[p] = nextl;
-      snd_interLvl[p+1] = nextu - snd_LinterLvlSz;
-      nprocsToSnd ++;
-    }
-  }
-  
-  MPI_Alltoall (&(snd_interLvl[2*fstP]), 2, mpi_int_t,
-		&(rcv_interLvl[2*fstP]), 2, mpi_int_t, ndComm);    
-#if ( PRNTlevel>=1 )
-  PS->no_msgsCol += (float) (2 * (int_t) LOG2( nprocsLvl ));
-  PS->sz_msgsCol += 2 * nprocsLvl;
-  if (PS->maxsz_msgCol < 2 * nprocsLvl) 
-    PS->maxsz_msgCol = 2 * nprocsLvl;      
-#endif    
-
-  max_rcvSz = 0;
-  nprocsToRcv = 0;
-  for (p = 2*fstP; p < 2*lstP; p +=2) {
-    CS->ptr_rcvBuf[p] = max_rcvSz;
-    if (rcv_interLvl[p] != EMPTY) 
-      max_rcvSz += rcv_interLvl[p];
-    CS->ptr_rcvBuf[p+1] = max_rcvSz;
-    if (rcv_interLvl[p+1] != EMPTY) 
-      max_rcvSz += rcv_interLvl[p+1];
-    if (rcv_interLvl[p] != EMPTY || rcv_interLvl[p+1] != EMPTY) 
-      nprocsToRcv ++;
-  }
-
-  /* allocate data for the receive buffer */  
-  if (CS->rcv_bufSz < max_rcvSz) {
-    PS->maxSzBuf += max_rcvSz - CS->rcv_bufSz;
-    if (CS->rcv_bufSz != 0) /* not first time allocate memory */
-      SUPERLU_FREE (CS->rcv_buf);
-    CS->rcv_bufSz = max_rcvSz;
-    if (!(CS->rcv_buf = intMalloc_symbfact (max_rcvSz))) {
-      ABORT("Malloc fails for rcv_buf[].");
-    }
-  }
-  
-  /* allocate memory for status arrays */
-  if (nprocsToSnd)
-    if ( !(request_snd = (MPI_Request*) 
-	   SUPERLU_MALLOC(2 * nprocsToSnd * sizeof(MPI_Request))))
-      ABORT("Not enough memory when allocating MPI_Request");
-  if (nprocsToRcv)
-    if ( !(request_rcv = (MPI_Request*) 
-	   SUPERLU_MALLOC(2 * nprocsToRcv * sizeof(MPI_Request))))
-      ABORT("Not enough memory when allocating MPI_Request");
-  if (nprocsToRcv || nprocsToSnd)
-    if ( !(status = (MPI_Status*) 
-	   SUPERLU_MALLOC(2 * (lstP-fstP) * sizeof(MPI_Status))))
-      ABORT("Not enough memory when allocating MPI_Request");
-  
-  /* determine if we have to send data */
-  i = 0;
-  for (toSend = fstP, p = 2*fstP; p < 2*lstP; toSend++, p+=2) 
-    if (snd_interLvl[p] != EMPTY && toSend != iam) {
-      MPI_Isend (CS->snd_buf, nextl, mpi_int_t, toSend,
-		 tag_interLvl_LData, (*symb_comm), &(request_snd[2*i]));
-      MPI_Isend (&(CS->snd_buf[snd_LinterLvlSz]), 
-		 nextu - snd_LinterLvlSz, mpi_int_t, toSend,
-		 tag_interLvl_UData, (*symb_comm), &(request_snd[2*i+1]));
-      i++;
-#if ( PRNTlevel>=1 )
-      PS->no_msgsSnd += (float) 2;
-      PS->sz_msgsSnd += (float) (nextl + nextu - snd_LinterLvlSz);
-      if (PS->maxsz_msgSnd < nextl) PS->maxsz_msgSnd = nextl;
-      if (PS->maxsz_msgSnd < nextu - snd_LinterLvlSz) 
-	PS->maxsz_msgSnd = nextu - snd_LinterLvlSz;      
-#endif
-    }
-  
-  if (update_loc) {
-    /* use own data to update symbolic factorization */
-    vtx_lid = LOCAL_IND( globToLoc[snd_vtxLvl] );
-    lsub_rcvd    = &(lsub[xlsub[vtx_lid]]);
-    lsub_rcvd_sz = xlsub[lstVtxLvl_loc_lid] - xlsub[vtx_lid];
-    usub_rcvd    = &(usub[xusub[vtx_lid]]);
-    usub_rcvd_sz = xusub[lstVtxLvl_loc_lid] - xusub[vtx_lid];
-    
-    mem_error = 
-      rl_update (0, n, iam, lsub_rcvd, lsub_rcvd_sz,
-		 usub_rcvd, usub_rcvd_sz, snd_vtxLvl, EMPTY, snd_indBlk,
-		 fstVtxLvl_loc, lstVtx, nvtcsLvl_loc,
-		 1, &mark, marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
-
-    lsub_rcvd    = &(Llu_symbfact->lsub[xlsub[vtx_lid]]);
-    lsub_rcvd_sz = xlsub[lstVtxLvl_loc_lid] - xlsub[vtx_lid];
-    usub_rcvd    = &(Llu_symbfact->usub[xusub[vtx_lid]]);
-    usub_rcvd_sz = xusub[lstVtxLvl_loc_lid] - xusub[vtx_lid];
-    lsub = Llu_symbfact->lsub; usub = Llu_symbfact->usub;
-    mem_error = 
-      rl_update (0, n, iam, usub_rcvd, usub_rcvd_sz,
-		 lsub_rcvd, lsub_rcvd_sz, snd_vtxLvl, EMPTY, snd_indBlk,
-		 fstVtxLvl_loc, lstVtx, nvtcsLvl_loc,
-		 0, &mark, marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
-    lsub = Llu_symbfact->lsub; usub = Llu_symbfact->usub;
-  }
-
-  /* post non-blocking receives for all the incoming messages */
-  i = 0;
-  for (rcvdP = fstP, p = 2*fstP; p < 2*lstP; rcvdP++, p += 2) 
-    if (rcv_interLvl[p] != EMPTY) {
-      lsub_rcvd    = &(CS->rcv_buf[CS->ptr_rcvBuf[p]]);
-      MPI_Irecv (lsub_rcvd, rcv_interLvl[p], mpi_int_t, rcvdP,
-		 tag_interLvl_LData, (*symb_comm), &(request_rcv[i]));
-      usub_rcvd    = &(CS->rcv_buf[CS->ptr_rcvBuf[p+1]]);
-      MPI_Irecv (usub_rcvd, rcv_interLvl[p+1], mpi_int_t, rcvdP,
-		 tag_interLvl_UData, (*symb_comm), &(request_rcv[i+1]));
-      i += 2;
-#if ( PRNTlevel>=1 )
-      PS->no_msgsRcvd += (float) 2;
-      PS->sz_msgsRcvd += (float) (rcv_interLvl[p] + rcv_interLvl[p+1]);
-      if (PS->maxsz_msgRcvd < rcv_interLvl[p])
-	PS->maxsz_msgRcvd = rcv_interLvl[p];
-      if (PS->maxsz_msgRcvd < rcv_interLvl[p+1])
-	PS->maxsz_msgRcvd = rcv_interLvl[p+1];
-#endif
-    }
-  
-  /* wait until messages are received and update local data */
-  for (i = 0; i < nprocsToRcv; i++) {
-    MPI_Waitany (2*nprocsToRcv, request_rcv, &ind1, status);
-    ij = 0;
-    for (p = fstP; p < lstP; p++)
-      if (rcv_interLvl[2*p] != EMPTY) {
-	if (ij <= ind1 && ind1 < ij+2) {
-	  rcvdP = p; p = lstP;
-	  if (ind1 == ij) ind2 = ij+1;
-	  else ind2 = ind1 - 1;
-	  ind_l = ij; ind_u = ij+1;
-	}
-	ij += 2;
-      }
-    MPI_Get_count (status, mpi_int_t, &ij);
-    MPI_Wait (&(request_rcv[ind2]), status);
-    MPI_Get_count (status, mpi_int_t, &ik);    
-    if (ind1 == ind_l) {
-      lsub_rcvd_sz = ij;
-      usub_rcvd_sz = ik;
-    } else {
-      lsub_rcvd_sz = ik;
-      usub_rcvd_sz = ij;
-    }
-    lsub_rcvd    = &(CS->rcv_buf[CS->ptr_rcvBuf[2*rcvdP]]);
-    usub_rcvd    = &(CS->rcv_buf[CS->ptr_rcvBuf[2*rcvdP+1]]);
-    
-    /* use received data to update symbolic factorization information */
-    mem_error = 
-      rl_update (1, n, iam, lsub_rcvd, lsub_rcvd_sz,
-		 usub_rcvd, usub_rcvd_sz, EMPTY, EMPTY, EMPTY,
-		 fstVtxLvl_loc, lstVtx, nvtcsLvl_loc,
-		 1, &mark, marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
-    lsub = Llu_symbfact->lsub;
-    mem_error = 
-      rl_update (1, n, iam, usub_rcvd, usub_rcvd_sz,
-		 lsub_rcvd, lsub_rcvd_sz, EMPTY, EMPTY, EMPTY,
-		 fstVtxLvl_loc, lstVtx, nvtcsLvl_loc,
-		 0, &mark, marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
-    usub = Llu_symbfact->usub;      
-  }
-  
-  if (nprocsToSnd)
-    MPI_Waitall (2*nprocsToSnd, request_snd, status);
-
-  *pmark = mark;
-  if (request_snd != NULL) SUPERLU_FREE (request_snd);
-  if (request_rcv != NULL) SUPERLU_FREE (request_rcv);
-  if (status != NULL) SUPERLU_FREE (status);
-}
-
-static void
-freeComm
-(
- int   iam,          /* Input -my processor number */
- int   nprocs,       /* Input -number of procs for the symbolic fact. */
- MPI_Comm *commLvls, /* Input -communicators for the nodes in the sep tree */
- MPI_Comm *symb_comm /* Input - communicator for symbolic factorization */
- )
-{
-  int szSep, i, j, k;
-  int np, npNode, fstP, lstP, ind;
-
-  i = 2 * nprocs - 2;
-  MPI_Comm_free (&(commLvls[i]));
-  
-  szSep = 2;
-  i -= szSep;
-  
-  while (i > 0) {
-    /* for each level in the separator tree */
-    npNode = nprocs / szSep; 
-    fstP = 0; 
-    /* for each node in the level */
-    for (j = i; j < i + szSep; j++) {
-      lstP = fstP + npNode;
-      if (fstP <= iam && iam < lstP) {
-	ind = j;
-      }
-      fstP += npNode;
-    }
-    MPI_Comm_free ( &(commLvls[ind]) );
-    szSep *= 2;
-    i -= szSep;
-  }  
-}
-
-static void
-createComm 
-(
- int   iam,          /* Input -my processor number */
- int   nprocs,       /* Input -number of procs for the symbolic factorization */
- MPI_Comm *commLvls, /* Output -communicators for the nodes in the sep tree */
- MPI_Comm *symb_comm
- )
-{
-  int szSep, i, j, jj, k, *pranks;
-  int np, npNode, fstP, lstP, p, code_err, ind, col, key;
-  
-  for (i=0; i < 2*nprocs; i++)
-    commLvls[i] = MPI_COMM_NULL;
-
-  /* Make a list of the processes in the new communicator. */
-  pranks = (int *) SUPERLU_MALLOC( nprocs * sizeof(int) );
-  
-  i = 2 * nprocs - 2;
-  MPI_Comm_dup ((*symb_comm), &(commLvls[i]));
-  szSep = 2;
-  i -= szSep;
-
-  while (i > 0) {
-    /* for each level in the separator tree */
-    npNode = nprocs / szSep; 
-    fstP = 0; 
-    /* for each node in the level */
-    for (j = i; j < i + szSep; j++) {
-      lstP = fstP + npNode;
-      if (fstP <= iam && iam < lstP) {
-	ind = j;
-	key = iam - fstP;
-	col = fstP;
-      }
-      fstP += npNode;
-    }
-    MPI_Comm_split ((*symb_comm), col, key, &(commLvls[ind]) );
-    
-    szSep *= 2;
-    i -= szSep;
-  }
-  
-  SUPERLU_FREE (pranks);
-}
-
-static void
-intraLvl_symbfact 
-(
- SuperMatrix *A, /* Input - original matrix A  */
- int   iam,      /* Input - my processor number */
- int   lvl,      /* Input - current level in the separator tree */
- int   szSep,    /* Input - size of the current separator(node) */
- int   ind_sizes1,
- int   ind_sizes2, 
- int_t *sizes,     /* Input - sizes of each node in the separator tree */
- int_t *fstVtxSep, /* Input - first vertex of each node in the tree */
- int   fstP,     /* Input - first processor assigned to current node */
- int   lstP,     /* Input - last processor assigned to current node */
- int_t fstVtx,   /* Input - first vertex of current node */
- int_t lstVtx,   /* Input - last vertex of current node */
- Pslu_freeable_t *Pslu_freeable,   /* global LU data structures (modified) */
- Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
- vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */
- comm_symbfact_t *CS,
- psymbfact_stat_t *PS,
- int_t *marker,
- int_t *p_mark,    /* marker used to merge elements of vertices */
- int_t *p_nextl,   /* ptr to nextl in lsub structure */
- int_t *p_nextu,   /* ptr to nextu in usub structure */
- int_t *p_neltsZr, /* no of artificial zeros introduced so far */
- int_t *p_neltsTotal, /* no of nonzeros (including artificials) 
-			 computed so far */
- int_t *p_nsuper_loc,
- MPI_Comm ndComm,
- MPI_Comm    *symb_comm /* Input - communicator for symbolic factorization */
- )
-{
-  int nprocsLvl, p, prvP, rcvP;
-  int toSend, rcvd_prvP, index_req[2];
-  int_t fstVtx_loc_lid, fstVtx_loc, vtx, vtxLvl, curblk_loc, denseSep;
-  int_t fstVtx_blk, fstVtx_blk_lid, lstVtx_blk, lstVtx_blk_lid, tag;
-  int_t nvtcs_blk, xusub_end, xlsub_end, prv_fstVtx_blk;
-  int_t n;
-  int_t *rcv_intraLvl, *snd_intraLvl;
-  int_t *lsub_rcvd, lsub_rcvd_sz, *usub_rcvd, usub_rcvd_sz;
-  int_t nmsgsRcvd, nmsgsTRcv, sz_msg;
-  int_t nvtcsLvl_loc, nextl, nextu, ind_blk, snd_vtxLvl, maxNeltsVtx_in;
-  int_t count, vtx_loc, mem_error, lstBlkRcvd;
-  int_t fstVtx_blk_loc, fstBlk, vtx_lid, prElt, nelts, j, nvtcs_toUpd;
-  int_t snd_LinterLvlSz, fstVtx_blk_loc_lid, prElt_ind, maxNmsgsToRcv;
-  int_t *xlsub, *xusub, *lsub, *usub;
-  int_t *globToLoc, maxNvtcsPProc, nblk_loc, upd_myD, r, fstVtx_blkCyc;
-  int_t k, prElt_L, prElt_U, vtx_elt, fstVtx_toUpd;
-  int intSzMsg;
-
-  MPI_Status status[4];
-  MPI_Request request[4];
-  
-  /* Initializations */
-  lsub    = Llu_symbfact->lsub;   xlsub    = Llu_symbfact->xlsub;
-  usub    = Llu_symbfact->usub;   xusub    = Llu_symbfact->xusub;
-  
-  /* max number of msgs this processor can receive during 
-     intraLvl_symbfact routine */
-  maxNmsgsToRcv  = (lstVtx - fstVtx) / VInfo->maxSzBlk + 1;
-  maxNeltsVtx_in = VInfo->maxNeltsVtx;
-  globToLoc      = Pslu_freeable->globToLoc;
-  maxNvtcsPProc  = Pslu_freeable->maxNvtcsPProc;
-  n = A->ncol;
-  nprocsLvl       = lstP - fstP;
-  rcv_intraLvl    = CS->rcv_intraLvl;
-  snd_intraLvl    = CS->snd_intraLvl;
-  nvtcsLvl_loc    = VInfo->nvtcsLvl_loc;
-  nmsgsTRcv       = 0;
-  nmsgsRcvd       = 0;
-  nblk_loc        = 0;
-  nvtcs_toUpd     = nvtcsLvl_loc;
-  fstVtx_blk      = fstVtx;
-  denseSep        = FALSE;
-
-  /* determine first vertex that belongs to fstP */
-  k = fstVtx;
-  fstVtx_blkCyc = n;
-  while (k < lstVtx && fstVtx_blkCyc == n) {
-    p = OWNER( globToLoc[k] );
-    if (p == fstP)
-      fstVtx_blkCyc = k;
-    k += VInfo->maxSzBlk;
-  }
-
-  for (p = fstP; p < lstP; p++)
-    rcv_intraLvl[p] = 0;
-
-  for (r = 0; r < 3; r++) 
-    request[r] = MPI_REQUEST_NULL;
-
-  fstVtx_loc = VInfo->begEndBlks_loc[VInfo->curblk_loc];
-  fstVtx_loc_lid = LOCAL_IND( globToLoc[fstVtx_loc] ); 
-  vtx = fstVtx_loc;
-  if (fstVtx_loc >= fstVtx_blkCyc)
-    nblk_loc = 1;
-  while (VInfo->begEndBlks_loc[VInfo->curblk_loc] < lstVtx && !VInfo->filledSep) {
-    CS->snd_intraSz  = 0;
-    CS->snd_LintraSz = 0;
-
-    lstBlkRcvd     = FALSE;
-    prv_fstVtx_blk = fstVtx_blk;
-    fstVtx_blk     = VInfo->begEndBlks_loc[VInfo->curblk_loc];
-    lstVtx_blk     = VInfo->begEndBlks_loc[VInfo->curblk_loc + 1];
-    fstVtx_toUpd   = VInfo->begEndBlks_loc[VInfo->curblk_loc + 2];
-    fstVtx_blk_lid = LOCAL_IND( globToLoc[fstVtx_blk] );
-    lstVtx_blk_lid = LOCAL_IND( globToLoc[lstVtx_blk - 1] + 1);
-    nvtcs_blk      = lstVtx_blk - fstVtx_blk;
-    nvtcs_toUpd   -= nvtcs_blk;
-    nmsgsTRcv      = n;
-    VInfo->maxNeltsVtx -= fstVtx_blk - prv_fstVtx_blk;
-
-    index_req[0] = EMPTY;
-    for (r = 0; r < 3; r++) 
-      request[r] = MPI_REQUEST_NULL;
-    if (fstVtx_blk != fstVtx) {
-      /* if not the first vertex of the level */
-      prvP           = OWNER( globToLoc[fstVtx_blk - 1] );
-      rcvd_prvP      = FALSE;
-      /* receive info on number messages to receive */
-      tag = tag_intraLvl + nblk_loc;
-      if (iam == fstP)  tag --;
-      
-      MPI_Irecv (&(rcv_intraLvl[fstP]), nprocsLvl, mpi_int_t, prvP,
-		 tag, (*symb_comm), &(request[1]));
-
-      while (!rcvd_prvP || nmsgsRcvd < nmsgsTRcv) {
-	if (index_req[0] != 1) {
-	  MPI_Irecv (&sz_msg, 1, mpi_int_t, 
-		     MPI_ANY_SOURCE, tag_intraLvl_szMsg, 
-		     (*symb_comm), &(request[0]));  
-	  if (sz_msg > INT_MAX)
-	    ABORT("ERROR in intraLvl_symbfact size to send > INT_MAX\n");
-	}
-	MPI_Waitany (2, request, index_req, status);
-	if (index_req[0] == 1) {
-	  /* receive information on no msgs to receive */
-#if ( PRNTlevel>=1 )
-	  PS->no_shmRcvd ++;
-#endif
-	  rcvd_prvP = TRUE;
-	  nmsgsTRcv = rcv_intraLvl[iam];
-	  /* if dense separator was detected by one of the 
-	     previous processors ... */
-	  if (nmsgsTRcv > maxNmsgsToRcv) {
-	    VInfo->filledSep = (int) nmsgsTRcv / maxNmsgsToRcv;
-	    nmsgsTRcv = nmsgsTRcv % maxNmsgsToRcv;
-	  }
-	  
-	  if (nmsgsTRcv == nmsgsRcvd) {
-	    /* MPI_Cancel (&(request[0])); */
-	    MPI_Send (&r, 1, mpi_int_t, iam, 
-		      tag_intraLvl_szMsg, (*symb_comm));
-	    MPI_Wait (&(request[0]), status);	    
-	  }
-	}
-	if (index_req[0] == 0) {
-	  nmsgsRcvd ++;
-	  if (nmsgsTRcv == nmsgsRcvd)  lstBlkRcvd = TRUE; 
-	  rcvP = status->MPI_SOURCE;
-
-	  /* allocate enough space to receive data */
-	  if (CS->rcv_bufSz < sz_msg) {
-	    PS->maxSzBuf += sz_msg - CS->rcv_bufSz;
-	    if (CS->rcv_bufSz != 0)
-	      /* not first time allocate memory */
-	      SUPERLU_FREE (CS->rcv_buf);
-	    CS->rcv_bufSz = sz_msg;
-	    if (!(CS->rcv_buf = intMalloc_symbfact (sz_msg))) {
-	      ABORT("Malloc fails for rcv_buf[].");
-	    }
-	  }
-	  
-	  /* use received data to update symbolic factorization */
-	  lsub_rcvd = CS->rcv_buf;
-	  MPI_Recv (lsub_rcvd, sz_msg, mpi_int_t, 
-		    rcvP, tag_intraLvl_LData, (*symb_comm), status);
-	  MPI_Get_count (status, mpi_int_t, &intSzMsg);
-	  lsub_rcvd_sz = intSzMsg;
-	  usub_rcvd    = &(CS->rcv_buf[lsub_rcvd_sz]);
-	  MPI_Recv (usub_rcvd, sz_msg - lsub_rcvd_sz, 
-		    mpi_int_t, rcvP,
-		    tag_intraLvl_UData, (*symb_comm), status);
-	  MPI_Get_count (status, mpi_int_t, &intSzMsg);
-	  usub_rcvd_sz = intSzMsg;
-#if ( PRNTlevel>=1 )
-	  PS->no_shmRcvd ++;
-	  PS->no_msgsRcvd += (float) 2;
-	  PS->sz_msgsRcvd += (float) sz_msg;
-	  if (PS->maxsz_msgRcvd < lsub_rcvd_sz) PS->maxsz_msgRcvd = lsub_rcvd_sz;
-	  if (PS->maxsz_msgRcvd < usub_rcvd_sz) PS->maxsz_msgRcvd = usub_rcvd_sz;
-#endif
-
-	  if (!lstBlkRcvd) {
-	    mem_error = 
-	      rl_update (1, n, iam, lsub_rcvd, lsub_rcvd_sz,
-			 usub_rcvd, usub_rcvd_sz, EMPTY, EMPTY, EMPTY,
-			 fstVtx_blk, lstVtx, nvtcs_blk + nvtcs_toUpd,
-			 1, p_mark,
-			 marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
-	    lsub = Llu_symbfact->lsub;
-	    mem_error = 
-	      rl_update (1, n, iam, usub_rcvd, usub_rcvd_sz,
-			 lsub_rcvd, lsub_rcvd_sz, EMPTY, EMPTY, EMPTY, 
-			 fstVtx_blk, lstVtx, nvtcs_blk + nvtcs_toUpd,
-			 0, p_mark,
-			 marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
-	    usub = Llu_symbfact->usub;
-	  }
-	}
-      }
-    }
-  
-    if (VInfo->filledSep) {
-      mem_error = 
-	denseSep_symbfact (1, n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep,
-			   szSep, fstP, lstP, fstVtx_blkCyc, nblk_loc,
-			   p_nextl, p_nextu, p_mark, p_nsuper_loc, marker,
-			   ndComm, symb_comm, Llu_symbfact, Pslu_freeable, VInfo, CS, PS);
-    }
-    else {
-      /* compute symbolic factorization for this block */
-      if (!lstBlkRcvd) {
-	lsub_rcvd = NULL; usub_rcvd = NULL;
-      }
-
-      blk_symbfact (A, iam, lvl, 
-		    szSep, ind_sizes1, ind_sizes2, sizes, fstVtxSep,
-		    fstVtx_loc, fstVtx_blk, lstVtx_blk, 
-		    lsub_rcvd, lsub_rcvd_sz, usub_rcvd, usub_rcvd_sz,
-		    Pslu_freeable, Llu_symbfact, VInfo, CS, PS,
-		    marker, p_mark,
-		    p_nextl, p_nextu, p_neltsZr, p_neltsTotal, 
-		    p_nsuper_loc);
-      lsub = Llu_symbfact->lsub;
-      usub = Llu_symbfact->usub; 	 
-      
-      if (lstVtx_blk != lstVtx) {
-	/* if this is not the last block of the level */
-	if (VInfo->filledSep == FILLED_SEPS ||
-	    ( VInfo->filledSep == FILLED_SEP && 
-	      ((lstVtx - lstVtx_blk > VInfo->maxSzBlk * nprocsLvl && nblk_loc > 0) ||
-	       (lstVtx - fstVtx_blkCyc > VInfo->maxSzBlk * nprocsLvl && nblk_loc == 0))))
-	  /* if current separator is dense and this is not the last block, 
-	     then ... */
-	  denseSep = TRUE;
-	else
-	  /* separator dense but not enough uncomputed blocks 
-	     in the separator to take advantage of it */
-	  VInfo->filledSep = FALSE;
-	
-	if (VInfo->filledSep == FILLED_SEPS) {
-	  for (p = fstP; p < lstP; p++)
-	    rcv_intraLvl[p] = maxNmsgsToRcv * VInfo->filledSep + rcv_intraLvl[p];
-	  denseSep_symbfact (0, n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep,
-			     szSep, fstP, lstP, fstVtx_blkCyc, nblk_loc,
-			     p_nextl, p_nextu, p_mark, p_nsuper_loc, marker, ndComm, 
-			     symb_comm, Llu_symbfact, Pslu_freeable, VInfo, CS, PS);
-	}
-	else {
-	  /* send blk to next procs and update the rest of my own blocks */
-	  if (lstBlkRcvd) {
-	    mem_error = 
-	      rl_update (1, n, iam, lsub_rcvd, lsub_rcvd_sz,
-			 usub_rcvd, usub_rcvd_sz, EMPTY, EMPTY, EMPTY,
-			 fstVtx_toUpd, lstVtx, nvtcs_toUpd,
-			 1, p_mark,
-			 marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
-	    lsub = Llu_symbfact->lsub;
-	    mem_error = 
-	      rl_update (1, n, iam, usub_rcvd, usub_rcvd_sz,
-			 lsub_rcvd, lsub_rcvd_sz, EMPTY, EMPTY, EMPTY, 
-			 fstVtx_toUpd, lstVtx, nvtcs_toUpd,
-			 0, p_mark,
-			 marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
-	    usub = Llu_symbfact->usub;
-	  }
-
-	  upd_myD = FALSE;
-	  /* determine processors to which send this block
-	     and copy data to be sent */
-	  for (p = fstP; p < lstP; p++)
-	    snd_intraLvl[p] = FALSE;
-	  nextl = 0; 
-	  nextu = nextl + CS->snd_LintraSz;
-	  
-	  /* allocate enough space to receive data */
-	  if (CS->rcv_bufSz < CS->snd_intraSz) {
-	    PS->maxSzBuf += CS->snd_intraSz - CS->rcv_bufSz;
-	    if (CS->rcv_bufSz != 0)
-	      /* not first time allocate memory */
-	      SUPERLU_FREE (CS->rcv_buf);
-	    CS->rcv_bufSz = CS->snd_intraSz;
-	    if (!(CS->rcv_buf = intMalloc_symbfact (CS->snd_intraSz))) {
-	      ABORT("Malloc fails for rcv_buf[].");
-	    }
-	  }
-
-	  for (vtx = fstVtx_blk, vtx_lid = fstVtx_blk_lid; 
-	       vtx < lstVtx_blk; vtx++, vtx_lid ++) {
-	    toSend = FALSE;
-	    k = xlsub[vtx_lid];
-	    prElt_L = lsub[k];
-	    j = xusub[vtx_lid];
-	    prElt_U = usub[j];
-
-	    if (prElt_L >= lstVtx_blk || prElt_U >= lstVtx_blk) {
-	      if (vtx == lstVtx_blk - 1) {
-		xlsub_end = *p_nextl;
-		xusub_end = *p_nextu;
-	      }
-	      else {
-		xlsub_end = xlsub[vtx_lid + 1];
-		xusub_end = xusub[vtx_lid + 1];
-	      }
-	      if (prElt_L >= lstVtx_blk) {
-		while (lsub[k] <= prElt_L && k < xlsub_end) {
-		  vtx_elt = lsub[k];
-		  if (vtx_elt >= lstVtx_blk && vtx_elt < lstVtx) {
-		    p = OWNER( globToLoc[vtx_elt] );
-		    if (p != iam) {
-		      /* vtx will be send to another processor */
-		      snd_intraLvl[p] = TRUE;
-		      toSend = TRUE;
-		    }
-		    else {
-		      upd_myD = TRUE;
-		    }
-		  }
-		  k++;
-		}
-	      }
-	      if (prElt_U >= lstVtx_blk) {
-		while (usub[j] <= prElt_U && j < xusub_end) {
-		  vtx_elt = usub[j];
-		  if (vtx_elt >= lstVtx_blk && vtx_elt < lstVtx) {
-		    p = OWNER( globToLoc[vtx_elt] );
-		    if (p != iam) {
-		      /* vtx will be send to another processor */
-		      snd_intraLvl[p] = TRUE;
-		      toSend = TRUE;
-		    }
-		    else {
-		      upd_myD = TRUE;
-		    }
-		  }
-		  j ++;
-		}
-	      }
-	      if (toSend) {
-		/* L(:, vtx) and U(vtx, :) will be send to processors */
-		nelts = xusub_end - xusub[vtx_lid];
-		CS->rcv_buf[nextu + DIAG_IND]  = vtx;
-		CS->rcv_buf[nextu + NELTS_IND] = nelts;
-		nextu += 2;
-		for (j = xusub[vtx_lid]; j < xusub_end; j++) {
-		  CS->rcv_buf[nextu] = usub[j]; nextu ++;
-		}
-		
-		nelts = xlsub_end - xlsub[vtx_lid];
-		CS->rcv_buf[nextl + DIAG_IND] = vtx;
-		CS->rcv_buf[nextl + NELTS_IND] = nelts; 
-		nextl += 2;
-		for (j = xlsub[vtx_lid]; j < xlsub_end; j++) {
-		  CS->rcv_buf[nextl] = lsub[j]; nextl ++;
-		}
-	      }
-	    }
-	  }
-	  for (p = fstP; p < lstP; p++) 
-	    if (snd_intraLvl[p])
-	      rcv_intraLvl[p] ++;
-
-	  if (VInfo->filledSep == FILLED_SEP) {
-	    for (p = fstP; p < lstP; p++)
-	      rcv_intraLvl[p] = maxNmsgsToRcv * VInfo->filledSep + 
-		rcv_intraLvl[p];
-	  }
-	  else {
-	    /* send to the owner of the next block info on no of messages */
-	    p = OWNER( globToLoc[lstVtx_blk] );
-	    tag = tag_intraLvl + nblk_loc;
-	    
-	    MPI_Isend (&(rcv_intraLvl[fstP]), nprocsLvl, mpi_int_t, p,
-		       tag, (*symb_comm), request);
-#if ( PRNTlevel>=1 )
-	    PS->no_shmSnd ++;
-#endif
-	  }
-
-	  /* there is data to be send */
-	  sz_msg = nextl + nextu - CS->snd_LintraSz;
-	  for (p = fstP; p < lstP; p++) {
-	    if (p != iam && snd_intraLvl[p]) {
-	      MPI_Isend (&sz_msg, 1, mpi_int_t, p,
-			 tag_intraLvl_szMsg, (*symb_comm), &(request[1]));
-	      MPI_Isend (CS->rcv_buf, nextl, mpi_int_t, p,
-			 tag_intraLvl_LData, (*symb_comm), &(request[2]));
-	      MPI_Isend (&(CS->rcv_buf[CS->snd_LintraSz]), 
-			 nextu - CS->snd_LintraSz, mpi_int_t, p,
-			 tag_intraLvl_UData, (*symb_comm), &(request[3]));
-	      MPI_Waitall(3, &(request[1]), &(status[1]));
-#if ( PRNTlevel>=1 )
-	      PS->no_shmSnd ++;
-	      PS->no_msgsSnd += (float) 2;
-	      PS->sz_msgsSnd += (float) sz_msg;
-	      if (PS->maxsz_msgSnd < nextl) PS->maxsz_msgSnd = nextl;
-	      if (PS->maxsz_msgSnd < nextu - CS->snd_LintraSz) 
-		PS->maxsz_msgSnd = nextu - CS->snd_LintraSz;
-#endif
-	    }
-	  }
-	  if (VInfo->filledSep != FILLED_SEP) {
-	    MPI_Wait (request, status);      
-	  }
-
-	  /* update rest of vertices */
-	  if (upd_myD) {
-	    lsub_rcvd_sz = (*p_nextl) - xlsub[fstVtx_blk_lid];
-	    lsub_rcvd    = &(lsub[xlsub[fstVtx_blk_lid]]);
-	    usub_rcvd_sz = (*p_nextu) - xusub[fstVtx_blk_lid];
-	    usub_rcvd    = &(usub[xusub[fstVtx_blk_lid]]);
-	    
-	    mem_error =
-	      rl_update (0, n, iam, lsub_rcvd, lsub_rcvd_sz,
-			 usub_rcvd, usub_rcvd_sz, fstVtx_blk, lstVtx_blk,
-			 EMPTY,
-			 fstVtx_toUpd, lstVtx, nvtcs_toUpd,
-			 1, p_mark,
-			 marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
-	    lsub = Llu_symbfact->lsub;
-	    lsub_rcvd    = &(lsub[xlsub[fstVtx_blk_lid]]);
-	    mem_error =
-	      rl_update (0, n, iam, usub_rcvd, usub_rcvd_sz,
-			 lsub_rcvd, lsub_rcvd_sz, fstVtx_blk, lstVtx_blk,
-			 EMPTY,
-			 fstVtx_toUpd, lstVtx, nvtcs_toUpd,
-			 0, p_mark,
-			 marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
-	    usub = Llu_symbfact->usub;
-	  }
-	  if (VInfo->filledSep == FILLED_SEP)
-	    denseSep_symbfact (0, n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep,
-			       szSep, fstP, lstP, fstVtx_blkCyc, nblk_loc,
-			       p_nextl, p_nextu, p_mark, p_nsuper_loc, marker, ndComm, 
-			       symb_comm, Llu_symbfact, Pslu_freeable, VInfo, CS, PS);
-	}
-      }
-    }
-    VInfo->curblk_loc += 2;
-    nblk_loc ++;
-  }
-  
-  /* update maxNeltsVtx */
-  VInfo->maxNeltsVtx = maxNeltsVtx_in - lstVtx + fstVtx;
-  
-  /* if current separator dense, then reset value of filledSep */
-  if (VInfo->filledSep == FILLED_SEP)
-    VInfo->filledSep = FALSE;
-}
-
-static void
-symbfact_free 
-(
- int   iam,    /* Input - my processor number */
- int   nprocs, /* Input - number of processors for the symbolic factorization */
- Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
- vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */
- comm_symbfact_t *CS
- )
-{
-  /* free memory corresponding to prune structure */
-  if (Llu_symbfact->szLsubPr != 0)
-    SUPERLU_FREE( Llu_symbfact->lsubPr );
-  if (Llu_symbfact->szUsubPr != 0)
-    SUPERLU_FREE( Llu_symbfact->usubPr );
-  if (Llu_symbfact->xlsubPr != NULL)
-    SUPERLU_FREE( Llu_symbfact->xlsubPr );
-  if (Llu_symbfact->xusubPr != NULL)
-    SUPERLU_FREE( Llu_symbfact->xusubPr );
-  
-  if (Llu_symbfact->xlsub_rcvd != NULL)
-    SUPERLU_FREE( Llu_symbfact->xlsub_rcvd);
-  if (Llu_symbfact->xusub_rcvd != NULL)
-    SUPERLU_FREE( Llu_symbfact->xusub_rcvd);
-  
-  if (Llu_symbfact->cntelt_vtcs != NULL)
-    SUPERLU_FREE( Llu_symbfact->cntelt_vtcs);
-  if (Llu_symbfact->cntelt_vtcsA_lvl != NULL)
-    SUPERLU_FREE( Llu_symbfact->cntelt_vtcsA_lvl);
-  
-  if (CS->rcv_bufSz != 0)
-    SUPERLU_FREE( CS->rcv_buf );
-  if (CS->snd_bufSz != 0)
-    SUPERLU_FREE( CS->snd_buf );
-  
-  SUPERLU_FREE( VInfo->begEndBlks_loc);
-  SUPERLU_FREE( CS->rcv_interLvl);
-  SUPERLU_FREE( CS->snd_interLvl);
-  SUPERLU_FREE( CS->ptr_rcvBuf);
-  SUPERLU_FREE( CS->rcv_intraLvl);
-  SUPERLU_FREE( CS->snd_intraLvl);
-  SUPERLU_FREE( CS->snd_interSz);
-  SUPERLU_FREE( CS->snd_LinterSz);
-  SUPERLU_FREE( CS->snd_vtxinter);  
-}
-
-static void
-estimate_memUsage
-(
- int_t n,  /* Input - order of the matrix */
- int iam,  /* Input - my processor number */
- mem_usage_t *symb_mem_usage,
- float *p_totalMemLU,   /* Output -memory used for symbolic factorization */
- float *p_overestimMem, /* Output -memory allocated during to right looking 
-			   overestimation memory usage */
- Pslu_freeable_t *Pslu_freeable,   /* global LU data structures (modified) */
- Llu_symbfact_t *Llu_symbfact,  /* Input - local L, U data structures */
- vtcsInfo_symbfact_t *VInfo, /* Input - local info on vertices distribution */
- comm_symbfact_t *CS,
- psymbfact_stat_t *PS
- )
-{
-  int_t nvtcs_loc, lword, nsuper_loc;
-  float lu_mem, other_mem, overestimMem;
-  
-  nvtcs_loc = VInfo->nvtcs_loc;
-  nsuper_loc = Pslu_freeable->supno_loc[nvtcs_loc];
-  lword     = sizeof(int_t);
-  
-  /* memory for xlsub, xusub, supno_loc, cntelt_vtcs */
-  lu_mem = 4.0 * (float) nvtcs_loc * (float) lword;
-  /* memory for xlsubPr, xusubPr */
-  lu_mem += 2.0 * (float) VInfo->maxNvtcsNds_loc * (float) lword;
-  
-  if (PS->estimLSz < Llu_symbfact->xlsub[nvtcs_loc])
-    PS->estimLSz = Llu_symbfact->xlsub[nvtcs_loc];
-  if (PS->estimUSz < Llu_symbfact->xusub[nvtcs_loc])
-    PS->estimUSz = Llu_symbfact->xusub[nvtcs_loc];
-  
-  lu_mem += (float) PS->estimLSz * lword;
-  lu_mem += (float) PS->estimUSz * lword;
-  lu_mem += (float) PS->maxSzLPr * lword;
-  lu_mem += (float) PS->maxSzUPr * lword;
-  lu_mem += (float) PS->szDnsSep * lword;
-  /* memory for globToLoc, tempArray */
-  lu_mem += (float) 2* (float) n * lword;
-  lu_mem += (float) PS->maxSzBuf * lword;
-  
-  overestimMem  = (float) (PS->estimLSz - Llu_symbfact->xlsub[nvtcs_loc]) * lword;
-  overestimMem += (float) (PS->estimUSz - Llu_symbfact->xusub[nvtcs_loc]) * lword;
-  
-  *p_totalMemLU = lu_mem;  
-  *p_overestimMem = overestimMem;
-  
-  symb_mem_usage->for_lu = (float) ((3 * nvtcs_loc + 2 * nsuper_loc) * lword);
-  symb_mem_usage->for_lu += (float) (Llu_symbfact->xlsub[nvtcs_loc] * lword); 
-  symb_mem_usage->for_lu += (float) (Llu_symbfact->xusub[nvtcs_loc] * lword);   
-  symb_mem_usage->total = lu_mem;
-}
-
-
-static int_t *
-intMalloc_symbfact(int_t n)
-{
-  int_t *buf;
-  if (n == 0)
-    buf = NULL;
-  else
-    buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t));
-  return buf;
-}
-
-static int_t *
-intCalloc_symbfact(int_t n)
-{
-  int_t *buf;
-  register int_t i;
-
-  if (n == 0)
-    buf = NULL;
-  else
-    buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t));
-  if ( buf )
-    for (i = 0; i < n; i++) buf[i] = 0;
-  return (buf);
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/psymbfact.c.old hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/psymbfact.c.old
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/psymbfact.c.old	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/psymbfact.c.old	1970-01-01 00:00:00.000000000 +0000
@@ -1,5125 +0,0 @@
-
-/*
- * -- Parallel symbolic factorization routine  (version 2.1) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley - July 2003
- * INRIA France - January 2004
- * Laura Grigori
- *
- * November 1, 2007
- */
-
-
-/* 
- * The function symbfact_dist implements the parallel symbolic factorization
- * algorithm described in the paper:
- *
- * Parallel Symbolic Factorization for Sparse LU with Static Pivoting,
- * Laura Grigori, James W. Demmel and Xiaoye S. Li,
- * Pages 1289-1314, SIAM Journal on Scientific Computing, Volume 29, Issue 3.
- *
- */
-
-/* limits.h:  the largest positive integer (INT_MAX) */
-#include <limits.h>
-#include <math.h>
-#include "superlu_ddefs.h"
-#include "psymbfact.h"
-
-/*
- * Internal protypes
- */
-
-static int_t *
-intMalloc_symbfact(int_t );
-
-static int_t *
-intCalloc_symbfact(int_t );
-
-static int_t
-initParmsAndStats
-(psymbfact_stat_t *PS);
-
-static void
-estimate_memUsage
-(int_t, int, mem_usage_t *, float *, float *,
- Pslu_freeable_t *, Llu_symbfact_t *,
- vtcsInfo_symbfact_t *, comm_symbfact_t *, psymbfact_stat_t *);
-
-static void
-symbfact_free 
-(int, int, Llu_symbfact_t *, vtcsInfo_symbfact_t *, comm_symbfact_t *);
-
-static int_t
-denseSep_symbfact 
-(int , int_t, int, int, int, int_t *, int_t *, int, 
- int,  int, int_t, int_t, int_t *, int_t *, int_t *,
- int_t *, int_t *, MPI_Comm, MPI_Comm *, Llu_symbfact_t *,
- Pslu_freeable_t *_freeable, vtcsInfo_symbfact_t *, 
- comm_symbfact_t *, psymbfact_stat_t * );
-
-static int_t
-dnsUpSeps_symbfact
-(int_t, int, int, int, int, int_t *, int_t *, int_t,
- Llu_symbfact_t *, Pslu_freeable_t *, vtcsInfo_symbfact_t *, 
- comm_symbfact_t *, psymbfact_stat_t *, int_t *, int_t *, int_t *);
-
-static void
-intraLvl_symbfact 
-(SuperMatrix *, int, int, int, int, int, int_t *, int_t *, int, 
- int, int_t, int_t,  Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, 
- comm_symbfact_t *, psymbfact_stat_t *, int_t *, int_t *, int_t *, int_t *, 
- int_t *, int_t *, int_t *, MPI_Comm, MPI_Comm *);
-
-static void
-initLvl_symbfact
-(int_t, int, int_t, int_t, Pslu_freeable_t *, 
- Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *, MPI_Comm, 
- int_t *, int_t, int_t);
-
-static void
-createComm (int, int, MPI_Comm *, MPI_Comm *);
-
-static void
-domain_symbfact
-(SuperMatrix *, int, int, int,  int, int, int_t *, int_t *,
- int_t, int_t, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *,
- comm_symbfact_t *, psymbfact_stat_t *, int_t *, int_t *, int_t *, int_t *, 
- int_t *, int_t *, int_t *);
-
-static float
-allocPrune_domain
-(int_t, int_t, Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *);
-
-static float
-allocPrune_lvl
-(Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *);
-
-static int
-symbfact_alloc
-(int_t, int, Pslu_freeable_t *, Llu_symbfact_t *, 
- vtcsInfo_symbfact_t *, comm_symbfact_t *, psymbfact_stat_t *);
-
-static float 
-symbfact_mapVtcs
-(int, int, int, SuperMatrix *, int_t *, int_t *, 
- Pslu_freeable_t *, vtcsInfo_symbfact_t *, int_t *, int_t, psymbfact_stat_t *);
-
-static void 
-symbfact_distributeMatrix 
-(int, int, int, SuperMatrix *, int_t *, int_t *, matrix_symbfact_t *, 
- Pslu_freeable_t *, vtcsInfo_symbfact_t *, int_t *, MPI_Comm *);
-
-static int_t
-interLvl_symbfact
-(SuperMatrix *, int, int, int, int, int, int, int, 
- int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t *,
- Llu_symbfact_t *, Pslu_freeable_t*, comm_symbfact_t *, vtcsInfo_symbfact_t *,
- psymbfact_stat_t *, MPI_Comm, MPI_Comm *);
-
-static float
-cntsVtcs 
-(int_t, int, int, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, 
- int_t *, int_t *, int_t *, psymbfact_stat_t *, MPI_Comm *);
-
-/************************************************************************/
-float symbfact_dist
-/************************************************************************/
-(
- int         nprocs_num,  /* Input - no of processors */
- int         nprocs_symb, /* Input - no of processors for the symbolic
-			     factorization */
- SuperMatrix *A,          /* Input - distributed input matrix */
- int_t       *perm_c,     /* Input - column permutation */
- int_t       *perm_r,     /* Input - row permutation */
- int_t       *sizes,      /* Input - sizes of each node in the separator tree */
- int_t       *fstVtxSep,  /* Input - first vertex of each node in the tree */
- Pslu_freeable_t *Pslu_freeable, /* Output - local L and U structure, 
-				    global to local indexing information */
- MPI_Comm    *num_comm,   /* Input - communicator for numerical factorization */
- MPI_Comm    *symb_comm,  /* Input - communicator for symbolic factorization */
- mem_usage_t *symb_mem_usage
- )
-{
-/* 
- * Purpose
- * =======
- *   symbfact_dist() performs symbolic factorization of matrix A suitable
- *   for performing the supernodal Gaussian elimination with no pivoting (GEPP). 
- *   This routine computes the structure of one column of L and one row of U 
- *   at a time.  It uses:
- *        o distributed input matrix
- *        o supernodes
- *        o symmetric structure pruning
- *
- *
- * Arguments
- * =========
- *
- * nprocs_num (input) int
- *         Number of processors SuperLU_DIST is executed on, and the input 
- *         matrix is distributed on.
- *
- * nprocs_symb (input) int
- *         Number of processors on which the symbolic factorization is
- *         performed.  It is equal to the number of independent domains
- *         idenfied in the graph partitioning algorithm executed
- *         previously and has to be a power of 2.  It corresponds to
- *         number of leaves in the separator tree.
- *
- * A       (input) SuperMatrix*
- *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The
- *         number of the linear equations is A->nrow.  Matrix A is
- *         distributed in NRformat_loc format.
- *         Matrix A is not yet permuted by perm_c.
- *
- * perm_c  (input) int_t*
- *	   Column permutation vector of size A->ncol, which defines the 
- *         permutation matrix Pc; perm_c[i] = j means column i of A is 
- *         in position j in A*Pc.
- *
- * perm_r  (input) int_t*
- *	   Row permutation vector of size A->nrow, which defines the 
- *         permutation matrix Pr; perm_r[i] = j means column i of A is 
- *         in position j in Pr*A.
- *
- * sizes   (input) int_t*
- *         Contains the number of vertices in each separator.
- *
- * fstVtxSep (input) int_t*
- *         Contains first vertex for each separator.
- *
- * Pslu_freeable (output) Pslu_freeable_t*
- *         Returns the local L and U structure, and global to local
- *         information on the indexing of the vertices.  Contains all
- *         the information necessary for performing the data
- *         distribution towards the numeric factorization.
- *				    
- * num_comm (input) MPI_Comm*
- *         Communicator for numerical factorization 
- *
- * symb_comm (input) MPI_Comm*
- *         Communicator for symbolic factorization 
- *
- * symb_mem_usage (input) mem_usage_t *
- *         Statistics on memory usage.
- *
- * Return value
- * ============
- *   < 0, number of bytes allocated on return from the symbolic factorization.
- *   > 0, number of bytes allocated when out of memory.
- *
- * Sketch of the algorithm
- * =======================
- *
- *  Distrbute the vertices on the processors using a subtree to
- *  subcube algorithm.
- *
- *  Redistribute the structure of the input matrix A according to the
- *  subtree to subcube computed previously for the symbolic
- *  factorization routine.  This implies in particular a distribution
- *  from nprocs_num processors to nprocs_symb processors.
- *
- *  Perform symbolic factorization guided by the separator tree provided by
- *  a graph partitioning algorithm.  The symbolic factorization uses a 
- *  combined left-looking, right-looking approach. 
- *
- */
-  NRformat_loc *Astore;
-  int iam, szSep, fstP, lstP, npNode, nlvls, lvl, p, iSep, jSep;
-  int iinfo; /* return code */
-  int_t m, n;
-  int_t nextl, nextu, neltsZr, neltsTotal, nsuper_loc, szLGr, szUGr;
-  int_t ind_blk, nsuper, vtx, min_mn, nnzL, nnzU, szsn;
-  float stat_loc[23], stat_glob[23], mem_glob[15];
-  
-  Llu_symbfact_t Llu_symbfact; /* local L and U and pruned L and U data structures */
-  vtcsInfo_symbfact_t VInfo; /* local information on number of blocks,
-				number of vertices in a block etc */
-  matrix_symbfact_t   AS; /* temporary storage for the input matrix after redistribution */
-  comm_symbfact_t CS;  /* information on communication */
-  /* relaxation parameters (for future release) and 
-     statistics collected during the symbolic factorization */
-  psymbfact_stat_t PS; 
-  /* temp array of size n, used as a marker by the subroutines */
-  int_t *tempArray; 
-  int_t i, j, k;
-  int_t fstVtx, lstVtx, mark, fstVtx_lid, vtx_lid, maxNvtcsPProc;
-  int_t nnz_asup_loc, nnz_ainf_loc, fill_rcmd;
-  float totalMemLU, overestimMem;
-  MPI_Comm *commLvls;  
-
-  /* maximum block size */
-  int_t  maxSzBlk;
-  float flinfo;
-#if ( PRNTlevel >= 1)
-  float stat_msgs_l[10], stat_msgs_g[10]; 
-#endif  
-#if ( PROFlevel>=1 )
-  double t, t_symbFact[3], t_symbFact_loc[3];
-  double *time_lvlsT, *time_lvls, t1, t2, time_lvlsg[9];
-#endif
-  
-  /* Initialization */
-  MPI_Comm_rank ((*num_comm), &iam);
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Enter psymbfact()");
-#endif
-  initParmsAndStats (&PS);
-  if (nprocs_symb != 1) {
-    if (!(commLvls = (MPI_Comm *) SUPERLU_MALLOC(2*nprocs_symb*sizeof(MPI_Comm)))) {
-      fprintf (stderr, "Malloc fails for commLvls[].");  
-      return (PS.allocMem);
-    }
-    PS.allocMem += 2 * nprocs_symb * sizeof(MPI_Comm);
-  }
-  else {
-    commLvls = NULL;
-  }
-  
-  nlvls = (int) log2( (double)nprocs_num ) + 1;
-#if ( PROFlevel>=1 )
-  time_lvlsT = (double *) SUPERLU_MALLOC(3*nprocs_symb*(nlvls+1) 
-					 * sizeof(double));
-  time_lvls  = (double *) SUPERLU_MALLOC(3*(nlvls+1) * sizeof(double));
-  if (!time_lvls || !time_lvlsT) {
-    fprintf (stderr, "Malloc fails for time_lvls[].");  
-    return (PS.allocMem);
-  }
-  PS.allocMem += (3*nprocs_symb*(nlvls+1) + 3*(nlvls+1)) * sizeof(double);
-#endif
-  
-  VInfo.xlsub_nextLvl  = 0;
-  VInfo.xusub_nextLvl  = 0;
-  VInfo.maxSzBlk = sp_ienv_dist(3);
-  maxSzBlk = VInfo.maxSzBlk;
-  
-  mark = EMPTY;
-  nsuper_loc = 0;
-  nextl   = 0; nextu      = 0;
-  neltsZr = 0; neltsTotal = 0;
-  
-  m = A->nrow;
-  n = A->ncol;
-  min_mn = SUPERLU_MIN( m, n );
-  
-  if (!(tempArray = intMalloc_symbfact(n))) {
-    fprintf (stderr, "Malloc fails for tempArray[].\n");  
-    return (PS.allocMem);
-  }
-  PS.allocMem += n * sizeof(int_t);
-  
-#if ( PROFlevel>=1 )  
-  t = SuperLU_timer_();
-#endif
-  
-  /* Distribute vertices on processors */
-  if ((flinfo = 
-       symbfact_mapVtcs (iam, nprocs_num, nprocs_symb, A, fstVtxSep, sizes, 
-			 Pslu_freeable, &VInfo, tempArray, maxSzBlk, &PS)) > 0) 
-    return (flinfo);
-
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  
-  /* Redistribute matrix A on processors following the distribution found
-     in symbfact_mapVtcs.  Store the redistributed A temporarily into AS */
-  symbfact_distributeMatrix (iam, nprocs_num, nprocs_symb,  A, 
-			     perm_c, perm_r, &AS, 
-			     Pslu_freeable, &VInfo, tempArray, num_comm);
-  
-  /* THE REST OF THE SYMBOLIC FACTORIZATION IS EXECUTED ONLY BY NPROCS_SYMB
-     PROCESSORS */
-  if ( iam < nprocs_symb ) {
-    
-#if ( PROFlevel>=1 )
-    t_symbFact_loc[0] = SuperLU_timer_() - t;
-    t = SuperLU_timer_();
-    t_symbFact_loc[1] = t;
-#endif
-
-    /* Allocate storage common to the symbolic factor routines */
-    if (iinfo = symbfact_alloc (n, nprocs_symb, Pslu_freeable, 
-				&Llu_symbfact, &VInfo, &CS, &PS)) 
-      return (PS.allocMem);
-  
-    /* Copy the redistributed input matrix AS at the end of the memory buffer
-       allocated to store L and U.  That is, copy (AS.x_ainf, AS.ind_ainf) in
-       (xlsub, lsub), (AS.x_asup, AS.ind_asup) in (xusub, usub).  Free the
-       memory used to store the input matrix */
-    nnz_ainf_loc = VInfo.nnz_ainf_loc;
-    nnz_asup_loc = VInfo.nnz_asup_loc;
-    j = Llu_symbfact.szUsub - VInfo.nnz_asup_loc;
-    k = Llu_symbfact.szLsub - VInfo.nnz_ainf_loc;
-    for (i = 0; i <= VInfo.nvtcs_loc; i++) {
-      Llu_symbfact.xusub[i] = AS.x_asup[i] + j;
-      Llu_symbfact.xlsub[i] = AS.x_ainf[i] + k;
-    }
-    
-    for (i = 0; i < VInfo.nnz_asup_loc; i++, j++) 
-      Llu_symbfact.usub[j] = AS.ind_asup[i];
-    for (i = 0; i < VInfo.nnz_ainf_loc; i++, k++)   
-      Llu_symbfact.lsub[k] = AS.ind_ainf[i];
-    SUPERLU_FREE( AS.x_ainf );
-    SUPERLU_FREE( AS.x_asup );  
-    SUPERLU_FREE( AS.ind_ainf );  
-    SUPERLU_FREE( AS.ind_asup );  
-
-    if (nprocs_symb != 1) {
-      createComm (iam, nprocs_symb, commLvls, symb_comm);    
-#if ( PROFlevel>=1 )
-      t_symbFact_loc[2] = SuperLU_timer_();
-#endif
-      if ((flinfo = cntsVtcs (n, iam, nprocs_symb, Pslu_freeable, &Llu_symbfact, 
-			      &VInfo, tempArray, fstVtxSep, sizes, &PS, commLvls)) > 0) 
-	return (flinfo);
-			     
-#if ( PROFlevel>=1 )
-      t_symbFact_loc[2] = SuperLU_timer_() - t_symbFact_loc[2];
-#endif
-    }
-    
-    /* set to EMPTY marker[] array */
-    for (i = 0; i < n; i++)
-      tempArray[i] = EMPTY;
-    
-    szSep = nprocs_symb;
-    iSep = 0;
-    lvl = 0;
-    while (szSep >= 1) {
-      /* for each level in the separator tree */
-      npNode = nprocs_symb / szSep; 
-      fstP = 0; 
-      /* for each node in the level */
-      for (jSep = iSep; jSep < iSep + szSep; jSep++) {
-	fstVtx = fstVtxSep[jSep];
-	lstVtx  = fstVtx + sizes[jSep];
-	/* if this is the first level */
-	if (szSep == nprocs_symb) {
-	  /* compute symbolic factorization for my domain */
-	  if (fstP == iam) {
-	    /* allocate storage for the pruned structures */
-#if ( PROFlevel>=1 )
-	    t1 = SuperLU_timer_();
-#endif
-	    if ((flinfo = allocPrune_domain (fstVtx, lstVtx, 
-					     &Llu_symbfact, &VInfo, &PS)) > 0)
-	      return (flinfo);
-	    if (fstVtx < lstVtx)
-	      VInfo.fstVtx_nextLvl = VInfo.begEndBlks_loc[2];
-	    
-	    domain_symbfact 
-	      (A, iam, lvl, szSep, iSep, jSep, sizes, fstVtxSep, fstVtx, lstVtx, 
-	       Pslu_freeable, &Llu_symbfact, &VInfo, &CS, &PS, tempArray, 
-	       &mark, &nextl, &nextu, &neltsZr, &neltsTotal, &nsuper_loc);
-	    
-	    PS.estimLSz = nextl;
-	    PS.estimUSz = nextu;
-	    if (nprocs_symb != 1) 
-	      if((flinfo = allocPrune_lvl (&Llu_symbfact, &VInfo, &PS)) > 0)
-		return (flinfo);
-#if ( PROFlevel>=1 )
-	    t2 = SuperLU_timer_();
-	    time_lvls[lvl] = 0.; time_lvls[lvl+1] = 0.;
-	    time_lvls[lvl + 2] = t2 - t1;
-#endif
-	  }
-	}
-	else {
-	  lstP = fstP + npNode;
-	  if (fstP <= iam && iam < lstP) {
-#if ( PROFlevel>=1 )
-	    t1 = SuperLU_timer_();	  
-#endif
-	    if (VInfo.filledSep != FILLED_SEPS)
-	      initLvl_symbfact(n, iam, fstVtx, lstVtx,
-			       Pslu_freeable, &Llu_symbfact, &VInfo, &PS, commLvls[jSep], 
-			       tempArray, nextl, nextu);
-#if ( PROFlevel>=1 )
-	    t2 = SuperLU_timer_();
-	    time_lvls[3*lvl] = t2 - t1;
-#endif
-	    interLvl_symbfact (A, iam, lvl, szSep, fstP, lstP,
-			       iSep, jSep, sizes, fstVtxSep, 
-			       &nextl, &nextu, &nsuper_loc, &mark, tempArray,
-			       &Llu_symbfact, Pslu_freeable, &CS, &VInfo, &PS,
-			       commLvls[jSep], symb_comm);
-#if ( PROFlevel>=1 )
-	    t1 = SuperLU_timer_();
-	    time_lvls[3*lvl+1] = t1 - t2;
-#endif
-	    if (VInfo.filledSep != FILLED_SEPS)
-	      intraLvl_symbfact 
-		(A, iam, lvl, szSep, iSep, jSep, sizes, fstVtxSep, fstP, lstP, 
-		 fstVtx, lstVtx, Pslu_freeable, &Llu_symbfact, &VInfo, &CS, &PS,
-		 tempArray, &mark, &nextl, &nextu, &neltsZr, &neltsTotal, 
-		 &nsuper_loc, commLvls[jSep], symb_comm);
-#if ( PROFlevel>=1 )
-	    t2 = SuperLU_timer_();
-	    time_lvls[3*lvl+2] = t2 - t1;		 
-#endif
-	  }
-	}
-	fstP += npNode;
-      }
-      iSep += szSep;
-      szSep = szSep / 2;
-      lvl ++;
-    }
-  
-    SUPERLU_FREE( tempArray );
-#if 0
-    SUPERLU_FREE( commLvls );
-#else /* XSL correction */
-    if ( commLvls ) SUPERLU_FREE( commLvls );
-#endif
-    
-    /* Set up global information and collect statistics */
-    if (PS.maxSzLPr < Llu_symbfact.indLsubPr)
-      PS.maxSzLPr = Llu_symbfact.indLsubPr;
-    if (PS.maxSzUPr < Llu_symbfact.indUsubPr)
-      PS.maxSzUPr = Llu_symbfact.indUsubPr;
-    
-    Llu_symbfact.xlsub[VInfo.nvtcs_loc] = nextl;
-    Llu_symbfact.xusub[VInfo.nvtcs_loc] = nextu;
-    fill_rcmd = SUPERLU_MAX( nextl / nnz_ainf_loc, nextu / nnz_asup_loc) + 1;
-    Pslu_freeable->xsup_beg_loc = intMalloc_dist (nsuper_loc+1);
-    Pslu_freeable->xsup_end_loc = intMalloc_dist (nsuper_loc+1);
-    if (!Pslu_freeable->xsup_beg_loc || !Pslu_freeable->xsup_end_loc) {
-      fprintf (stderr, "Malloc fails for xsup_beg_loc, xsup_end_loc.");
-      return (PS.allocMem);
-    }
-    PS.allocMem += 2 * (nsuper_loc+1) * sizeof(int_t);
-    maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-    nnzL = 0; nnzU = 0;
-    
-    i = 0;
-    nsuper = 0;
-    ind_blk = 0;
-    for (ind_blk = 0; ind_blk < VInfo.nblks_loc; ind_blk ++) {
-      fstVtx = VInfo.begEndBlks_loc[2 * ind_blk];
-      lstVtx = VInfo.begEndBlks_loc[2 * ind_blk + 1];
-      fstVtx_lid = LOCAL_IND( Pslu_freeable->globToLoc[fstVtx] );
-      nsuper = Pslu_freeable->supno_loc[fstVtx_lid];
-      Pslu_freeable->xsup_beg_loc[nsuper] = fstVtx;
-      szsn = 1;
-      if (INT_MAX - nnzL <= Llu_symbfact.xlsub[fstVtx_lid + 1] - 
-	  Llu_symbfact.xlsub[fstVtx_lid])
-	printf ("PE[%d] ERR nnzL %d\n", iam, nnzL); 
-      if (INT_MAX - nnzU <= Llu_symbfact.xusub[fstVtx_lid + 1] - 
-	  Llu_symbfact.xusub[fstVtx_lid])
-	printf ("PE[%d] ERR nnzU %d\n", iam, nnzU);
-      
-      j = Llu_symbfact.xlsub[fstVtx_lid + 1] - Llu_symbfact.xlsub[fstVtx_lid];
-      k = Llu_symbfact.xusub[fstVtx_lid + 1] - Llu_symbfact.xusub[fstVtx_lid];
-      nnzL += j;
-      nnzU += k;
-
-      for (vtx = fstVtx + 1, vtx_lid = fstVtx_lid + 1; 
-	   vtx < lstVtx; vtx++, vtx_lid ++) {
-	if (Pslu_freeable->supno_loc[vtx_lid] != nsuper) {
-	  nsuper = Pslu_freeable->supno_loc[vtx_lid];
-	  Pslu_freeable->xsup_end_loc[nsuper-1] = vtx;
-	  Pslu_freeable->xsup_beg_loc[nsuper] = vtx;
-	  szsn = 1;
-	  j = Llu_symbfact.xlsub[vtx_lid + 1] - Llu_symbfact.xlsub[vtx_lid];
-	  k = Llu_symbfact.xusub[vtx_lid + 1] - Llu_symbfact.xusub[vtx_lid];
-	}
-	else {
-	  szsn ++;
-	}
-	nnzL += j - szsn + 1;
-	nnzU += k - szsn + 1;
-      }
-      Pslu_freeable->xsup_end_loc[nsuper] = lstVtx;
-    }
-    Pslu_freeable->supno_loc[VInfo.nvtcs_loc] = nsuper_loc;
-    Pslu_freeable->nvtcs_loc = VInfo.nvtcs_loc; 
-
-    /* set up xsup data */
-    Pslu_freeable->lsub = Llu_symbfact.lsub;
-    Pslu_freeable->xlsub = Llu_symbfact.xlsub;
-    Pslu_freeable->usub = Llu_symbfact.usub;
-    Pslu_freeable->xusub = Llu_symbfact.xusub;
-    Pslu_freeable->szLsub = Llu_symbfact.szLsub;
-    Pslu_freeable->szUsub = Llu_symbfact.szUsub;
-    
-#if ( PROFlevel>=1 )
-    t_symbFact_loc[1] = SuperLU_timer_() - t_symbFact_loc[1];
-#endif  
-
-#if ( PRNTlevel>=1 )
-    estimate_memUsage (n, iam,  symb_mem_usage, 
-		       &totalMemLU, &overestimMem, 
-		       Pslu_freeable, &Llu_symbfact, &VInfo, &CS, &PS);
-    stat_loc[0] = (float) nnzL;
-    stat_loc[1] = (float) nnzU;  
-    stat_loc[2] = (float) nsuper_loc;
-    stat_loc[3] = (float) Pslu_freeable->xlsub[VInfo.nvtcs_loc];
-    stat_loc[4] = (float) Pslu_freeable->xusub[VInfo.nvtcs_loc];
-    stat_loc[5] = totalMemLU;
-    stat_loc[6] = overestimMem;
-    stat_loc[7] = totalMemLU - overestimMem;
-    stat_loc[8] = (float) PS.maxSzBuf;
-    stat_loc[9] = (float) PS.nDnsUpSeps;
-    stat_loc[10] = (float) PS.nDnsCurSep;
-    stat_loc[11] = (float) (Llu_symbfact.no_expand + Llu_symbfact.no_expcp +
-			    Llu_symbfact.no_expand_pr);
-    stat_loc[12] = (float) Llu_symbfact.no_expand;
-    stat_loc[13] = (float) Llu_symbfact.no_expcp;
-    stat_loc[14] = (float) Llu_symbfact.no_expand_pr;
-    stat_loc[15] = (float) fill_rcmd;
-    stat_loc[16] = PS.nops;
-    stat_loc[17] = PS.fill_pelt[1];
-    stat_loc[18] = PS.fill_pelt[4];
-    stat_loc[19] = PS.fill_pelt[0];
-    stat_loc[20] = PS.fill_pelt[2];
-    stat_loc[21] = PS.fill_pelt[3];
-    stat_loc[22] = PS.fill_pelt[5];
-    
-    MPI_Reduce (stat_loc, stat_glob, 23, MPI_FLOAT, 
-		MPI_SUM, 0, (*symb_comm));
-    MPI_Reduce (&(stat_loc[5]), mem_glob, 14, MPI_FLOAT, 
-		MPI_MAX, 0, (*symb_comm));
-    fill_rcmd = (int_t) mem_glob[10];
-    PS.fill_pelt[0] = stat_glob[19];
-    PS.fill_pelt[1] = mem_glob[12];
-    PS.fill_pelt[2] = stat_glob[20];
-    PS.fill_pelt[3] = stat_glob[21];
-    PS.fill_pelt[4] = mem_glob[13];
-    PS.fill_pelt[5] = stat_glob[22];
-    if (PS.fill_pelt[2] == 0.) PS.fill_pelt[2] = 1.;
-    if (PS.fill_pelt[5] == 0.) PS.fill_pelt[5] = 1.;
-    
-#if ( PROFlevel>=1 )
-    MPI_Reduce (t_symbFact_loc, t_symbFact, 3, MPI_DOUBLE,
-		MPI_MAX, 0, (*symb_comm));
-    MPI_Gather (time_lvls, 3 * nlvls, MPI_DOUBLE,
-		time_lvlsT, 3 * nlvls , MPI_DOUBLE,
-		0, (*symb_comm));
-#endif
-    
-    stat_msgs_l[0] = (float) PS.maxsz_msgSnd;
-    stat_msgs_l[1] = (float) PS.maxsz_msgSnd;
-    if (PS.maxsz_msgSnd < PS.maxsz_msgCol)
-      stat_msgs_l[1] = PS.maxsz_msgCol;
-    stat_msgs_l[2] = PS.no_shmSnd + PS.no_msgsSnd + 
-      PS.no_shmRcvd + PS.no_msgsRcvd;
-    stat_msgs_l[3] = stat_msgs_l[2] + PS.no_msgsCol;
-    stat_msgs_l[4] = stat_msgs_l[2];
-    stat_msgs_l[5] = stat_msgs_l[3]; 
-    stat_msgs_l[6] = PS.no_msgsSnd;
-    stat_msgs_l[7] = PS.no_msgsSnd + PS.no_msgsCol;  
-    stat_msgs_l[8] = PS.sz_msgsSnd;
-    stat_msgs_l[9] = PS.sz_msgsSnd + PS.sz_msgsCol;
-    MPI_Reduce (stat_msgs_l, stat_msgs_g, 4, MPI_FLOAT,
-		MPI_MAX, 0, (*symb_comm));
-    MPI_Reduce (&(stat_msgs_l[4]), &(stat_msgs_g[4]), 6, MPI_FLOAT,
-		MPI_SUM, 0, (*symb_comm));
-    if (stat_msgs_g[6] == 0) stat_msgs_g[6] = 1;
-    if (stat_msgs_g[7] == 0) stat_msgs_g[7] = 1;
-    
-    if (!iam) {
-      nnzL   = (int_t) stat_glob[0]; nnzU  = (int_t) stat_glob[1];
-      nsuper = (int_t) stat_glob[2];
-      szLGr  = (int_t) stat_glob[3]; szUGr = (int_t) stat_glob[4];
-      printf("\tMax szBlk          %ld\n", VInfo.maxSzBlk);
-#if ( PRNTlevel>=2 )
-      printf("\t relax_gen %.2f, relax_curSep %.2f, relax_seps %.2f\n",
-	     PS.relax_gen, PS.relax_curSep, PS.relax_seps);
-#endif
-      printf("\tParameters: fill mem %ld fill pelt %ld\n",
-	     sp_ienv_dist(6), PS.fill_par);
-      printf("\tNonzeros in L       %ld\n", nnzL);
-      printf("\tNonzeros in U       %ld\n", nnzU);
-      printf("\tnonzeros in L+U-I   %ld\n", 
-	     nnzL + nnzU);
-      printf("\tNo of supers   %ld\n", nsuper);
-      printf("\tSize of G(L)   %ld\n", szLGr);
-      printf("\tSize of G(U)   %ld\n", szUGr);
-      printf("\tSize of G(L+U) %ld\n", szLGr+szUGr);
-
-      printf("\tParSYMBfact (MB)      :\tL\\U MAX %.2f\tAVG %.2f\n",
-	     mem_glob[0]*1e-6, 
-	     stat_glob[5]/nprocs_symb*1e-6);
-#if ( PRNTlevel>=2 )
-      printf("\tRL overestim (MB):\tL\\U MAX %.2f\tAVG %.2f\n",
-	     mem_glob[1]*1e-6, 
-	     stat_glob[6]/nprocs_symb*1e-6);
-      printf("\tsnd/rcv buffers (MB):\tL\\U MAX %.2f\tAVG %.2f\n",
-	     mem_glob[3]*1e-6, 
-	     stat_glob[8]/nprocs_symb*1e-6);
-      printf("\tSYMBfact 2*n+4*nvtcs_loc+2*maxNvtcsNds_loc:\tL\\U %.2f\n",
-	     (float) (2 * n * sizeof(int_t)) *1e-6);
-      printf("\tint_t %d, int %d, long int %d, short %d, float %d, double %d\n", 
-	     sizeof(int_t), sizeof(int),  sizeof(long int), sizeof(short), sizeof(float),
-	     sizeof(double));
-      printf("\tDNS ALLSEPS:\t MAX %d\tAVG %.2f\n",
-	     (int_t) mem_glob[4], stat_glob[9]/nprocs_symb);
-      printf("\tDNS CURSEP:\t MAX %d\tAVG %.2f\n\n",
-	     (int_t) mem_glob[5], stat_glob[10]/nprocs_symb);
-
-      printf("\t MAX FILL Mem(L+U) / Mem(A) per processor %ld\n", fill_rcmd);    
-      printf("\t      Per elt MAX %ld AVG %ld\n", 
-	     (int_t) PS.fill_pelt[4], (int_t)(PS.fill_pelt[3]/PS.fill_pelt[5]));
-      printf("\t      Per elt RL MAX %ld AVG %ld\n",
-	     (int_t) PS.fill_pelt[1], (int_t)(PS.fill_pelt[0]/PS.fill_pelt[2]));
-      printf("\tM Nops:\t MAX %.2f\tAVG %.2f\n",
-	     mem_glob[11]*1e-6, (stat_glob[16]/nprocs_symb)*1e-6);
-      
-      
-      printf("\tEXPANSIONS: MAX/AVG\n");
-      printf("\tTOTAL: %d / %.2f\n",
-	     (int_t) mem_glob[6], stat_glob[11]/nprocs_symb);
-      printf("\tREALLOC: %.f / %.2f RL_CP %.f / %.2f PR_CP %.f / %.2f\n",
-	     mem_glob[7], stat_glob[12]/nprocs_symb,
-	     mem_glob[8], stat_glob[13]/nprocs_symb,
-	     mem_glob[9], stat_glob[14]/nprocs_symb);
-      
-      printf ("\n\tDATA MSGS  noMsgs*10^3 %.3f/%.3f size (MB) %.3f/%.3f \n",
-	      stat_msgs_g[2]*1e-3, stat_msgs_g[4]/nprocs_symb*1e-3,
-	      stat_msgs_g[0]*1e-6, stat_msgs_g[8] / stat_msgs_g[6]*1e-6);
-      printf ("\tTOTAL MSGS noMsgs*10^3 %.3f/%.3f size (MB) %.3f/%.3f \n",
-	      stat_msgs_g[3]*1e-3, stat_msgs_g[5]/nprocs_symb*1e-3,
-	      stat_msgs_g[1]*1e-6, stat_msgs_g[9]/stat_msgs_g[7]*1e-6);
-#endif      
-
-#if ( PROFlevel>=1 )
-      printf("Distribute matrix time = %8.3f\n", t_symbFact[0]);
-      printf("Count vertices time    = %8.3f\n", t_symbFact[2]);
-      printf("Symbfact DIST time     = %8.3f\n", t_symbFact[1]);
-      
-      printf("\nLvl\t    Time\t    Init\t   Inter\t    Intra\n");
-      time_lvlsg[0] = 0.;
-      for (i = 0; i < nlvls; i++) {
-	for (j = 1; j < 9; j++)
-	  time_lvlsg[j] = 0.;
-	for (p = 0; p < nprocs_symb; p++) {
-	  k = p * 3 * nlvls;
-	  t = time_lvlsT[i*3+k] + time_lvlsT[i*3+k+1] + time_lvlsT[i*3+k+2];
-	  if (t > time_lvlsg[1]) {
-	    time_lvlsg[1] = t; j = p;
-	  }
-	  time_lvlsg[2] += t;
-	  if (time_lvlsT[i*3+k] > time_lvlsg[3])
-	    time_lvlsg[3] = time_lvlsT[i*3+k];
-	  time_lvlsg[4] += time_lvlsT[i*3+k];
-	  if (time_lvlsT[i*3+k+1] > time_lvlsg[5])
-	    time_lvlsg[5] = time_lvlsT[i*3+k+1];
-	  time_lvlsg[6] += time_lvlsT[i*3+k+1];
-	  if (time_lvlsT[i*3+k+2] > time_lvlsg[7])
-	    time_lvlsg[7] = time_lvlsT[i*3+k+2];
-	  time_lvlsg[8] += time_lvlsT[i*3+k+2];
-	}
-	time_lvlsg[0] += time_lvlsg[1];
-	printf ("%d \t%.3f/%.3f\t%.3f/%.3f\t%.3f/%.3f\t%.3f/%.3f\n", i,
-		time_lvlsg[1], time_lvlsg[2] / nprocs_symb,
-		time_lvlsg[3], time_lvlsg[4] / nprocs_symb,
-		time_lvlsg[5], time_lvlsg[6] /nprocs_symb,
-		time_lvlsg[7], time_lvlsg[8] / nprocs_symb); 
-      }
-      printf("\t   %8.3f \n", time_lvlsg[0]);    
-#endif
-    }
-#endif
-#if ( PROFlevel>=1 )
-    SUPERLU_FREE (time_lvls);
-    SUPERLU_FREE (time_lvlsT);
-#endif
-    symbfact_free (iam, nprocs_symb, &Llu_symbfact, &VInfo, &CS);
-  } /* if (iam < nprocs_symb) */  
-  else {
-    /* update Pslu_freeable before returning */
-    Pslu_freeable->nvtcs_loc = 0; 
-    Pslu_freeable->xlsub = NULL; Pslu_freeable->lsub = NULL; 
-    Pslu_freeable->xusub = NULL; Pslu_freeable->usub = NULL; 
-    Pslu_freeable->supno_loc = NULL;
-    Pslu_freeable->xsup_beg_loc = NULL;     
-    Pslu_freeable->xsup_end_loc = NULL;
-    
-    SUPERLU_FREE( tempArray );
-    PS.allocMem -= n * sizeof(int_t);
-  }
-  
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Exit psymbfact()");
-#endif
-
-  return (- PS.allocMem);
-} /* SYMBFACT_DIST */
-
-
-static int_t
-initParmsAndStats
-(
- psymbfact_stat_t *PS /* Output -statistics*/
-)
-/* 
- * Purpose
- * =======
- * Initialize relaxation parameters and statistics variables
- */
-{
-  int  i;
-
-  PS->nDnsCurSep = 0;
-  PS->nDnsUpSeps = 0;
-  
-  PS->relax_gen = 1.0;
-  PS->relax_curSep = 1.0;
-  PS->relax_seps = 1.0;
-  PS->fill_par = sp_ienv_dist(6);
-  PS->nops = 0.;
-  PS->no_shmSnd = 0.;
-  PS->no_msgsSnd = 0.;
-  PS->maxsz_msgSnd = 0;
-  PS->sz_msgsSnd = 0.;
-  PS->no_shmRcvd = 0.;
-  PS->no_msgsRcvd = 0.;
-  PS->maxsz_msgRcvd = 0;
-  PS->sz_msgsRcvd = 0.;
-  PS->no_msgsCol = 0.;
-  PS->maxsz_msgCol = 0;
-  PS->sz_msgsCol = 0.;
-
-  for (i = 0; i < 6; i++)
-    PS->fill_pelt[i] = 0.;
-
-  PS->estimUSz = 0;
-  PS->estimLSz = 0;
-  PS->maxSzLPr = 0;
-  PS->maxSzUPr = 0;
-  PS->maxSzBuf = 0;
-  PS->szDnsSep = 0;  
-  PS->allocMem = 0;
-}
-
-static float
-cntsVtcs 
-(
- int_t  n,           /* Input - order of the input matrix */
- int    iam,         /* Input - my processor number */
- int    nprocs_symb, /* Input - no of processors for symbolic factorization */
- Pslu_freeable_t *Pslu_freeable, /* Input -globToLoc and maxNvtcsPProc */
- Llu_symbfact_t  *Llu_symbfact, /* Input/Output -local L, U data structures */
- vtcsInfo_symbfact_t *VInfo,  /* Input - local info on vertices distribution */
- int_t            *tempArray, /* Input - temporary storage */
- int_t            *fstVtxSep, /* Input - first vertex of each node in the tree */
- int_t            *sizes,     /* Input - sizes of each node in the tree */
- psymbfact_stat_t *PS,  /* Input/Output -statistics */
- MPI_Comm         *commLvls
- )
-/* 
- * Purpose
- * =======
- * 
- * Computes an estimation of the number of elements in columns of L
- * and rows of U.  Stores this information in cntelt_vtcs, and it will
- * be used in the right-looking symbolic factorization.
- */
-{
-  int   fstP, lstP, szSep, npNode, i, j;
-  int_t nvtcs_loc, ind_blk, vtx, vtx_lid, ii, jj, lv, vtx_elt, cur_blk;
-  int_t fstVtx, lstVtx, fstVtx_blk, lstVtx_blk;
-  int_t nelts, nelts_new_blk;
-  int_t *xlsub, *lsub, *xusub, *usub, *globToLoc, maxNvtcsPProc;
-  int_t *minElt_vtx, *cntelt_vtcs;
-  
-  /* Initialization */
-  xlsub = Llu_symbfact->xlsub; lsub = Llu_symbfact->lsub;
-  xusub = Llu_symbfact->xusub; usub = Llu_symbfact->usub;
-  cntelt_vtcs = Llu_symbfact->cntelt_vtcs;
-  globToLoc = Pslu_freeable->globToLoc;
-  nvtcs_loc = VInfo->nvtcs_loc;
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  if (Llu_symbfact->szLsub - VInfo->nnz_ainf_loc > n)
-    minElt_vtx = lsub;
-  else { 
-    /* allocate memory for minElt_vtx */
-    if (!(minElt_vtx = intMalloc_dist(n))) {
-      fprintf(stderr, "Malloc fails for minElt_vtx[].");
-      return (PS->allocMem);
-    }
-    PS->allocMem += n * sizeof (int_t);
-  } 
-  
-  for (ii = 0; ii < n; ii++) 
-    tempArray[ii] = n;
-  for (ii = 0; ii < nvtcs_loc; ii++)
-    cntelt_vtcs[ii] = 0;
-
-  szSep = nprocs_symb;
-  i = 0;
-  cur_blk = 0;
-  vtx_lid = 0;
-  while (szSep >= 1) {
-    /* for each level in the separator tree */
-    npNode = nprocs_symb / szSep; 
-    fstP = 0; 
-    /* for each node in the level */
-    for (j = i; j < i + szSep; j++) {
-      fstVtx = fstVtxSep[j];
-      lstVtx  = fstVtx + sizes[j];
-      lstP = fstP + npNode;
-
-      if (fstP <= iam && iam < lstP) {      
-	ind_blk = cur_blk;
-	ii = vtx_lid;
-	while (VInfo->begEndBlks_loc[ind_blk] < lstVtx && 
-	       ind_blk < 2 * VInfo->nblks_loc) {	  
-	  fstVtx_blk = VInfo->begEndBlks_loc[ind_blk];
-	  lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1];
-	  ind_blk += 2;
-	  for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, ii++) {
-	    for (jj = xlsub[ii]; jj < xlsub[ii+1]; jj++) {
-	      vtx_elt = lsub[jj];
-	      if (tempArray[vtx_elt] == n) {
-		tempArray[vtx_elt] = vtx;
-	      }
-	    }
-	    for (jj = xusub[ii]; jj < xusub[ii+1]; jj++) {
-	      vtx_elt = usub[jj];
-	      if (tempArray[vtx_elt] == n) {
-		tempArray[vtx_elt] = vtx;
-	      }
-	    }
-	  }	  
-	} 
-	if (szSep == nprocs_symb) 
-	  vtx_lid = ii;
-	else {
-	  MPI_Allreduce (&(tempArray[fstVtx]), &(minElt_vtx[fstVtx]), 
-			 (int) (n - fstVtx), mpi_int_t, MPI_MIN, commLvls[j]);
-#if ( PRNTlevel>=1 )
-	  PS->no_msgsCol += (float) (2 * (int) log2( (double) npNode ));
-	  PS->sz_msgsCol += (float) (n - fstVtx);
-	  if (PS->maxsz_msgCol < n - fstVtx) 
-	    PS->maxsz_msgCol = n - fstVtx;      
-#endif
-
-	  nelts = 0;
-	  for (ii = fstVtx; ii < lstVtx; ii++)
-	    tempArray[ii] = 0;
-	  for (ii = fstVtx; ii < n; ii++) {
-	    if (minElt_vtx[ii] != n) {
-	      if (minElt_vtx[ii] < fstVtx)
-		nelts ++;
-	      else
-		tempArray[minElt_vtx[ii]] ++;
-	      if (ii > lstVtx)
-		tempArray[ii] = minElt_vtx[ii];
-	    }
-	  }
-	
-	  ind_blk = cur_blk;
-	  lv = fstVtx;
-	  while (VInfo->begEndBlks_loc[ind_blk] < lstVtx && 
-		 ind_blk < 2 * VInfo->nblks_loc) {	  
-	    fstVtx_blk = VInfo->begEndBlks_loc[ind_blk];
-	    lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1];
-	    ind_blk += 2;
-	    
-	    for (ii = lv; ii < fstVtx_blk; ii++)
-	      nelts += tempArray[ii];
-	    lv = lstVtx_blk;
-
-	    nelts_new_blk = 0;
-	    for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid++) {
-	      nelts_new_blk += tempArray[vtx];
-	      cntelt_vtcs[vtx_lid] = nelts;
-	    }
-	    nelts += nelts_new_blk;
-	  }
-	} /* if (szSep != nprocs_symb) */
-	cur_blk = ind_blk;
-      }
-      fstP += npNode;
-    }
-    i += szSep;
-    szSep = szSep / 2;
-  }
-  /* free memory */
-  if (minElt_vtx != lsub) {
-    SUPERLU_FREE (minElt_vtx);
-    PS->allocMem -= n * sizeof(int_t);
-  }
-  return (SUCCES_RET);
-}
-
-static float
-symbfact_mapVtcs
-(
- int iam,             /* Input -process number */
- int nprocs_num,      /* Input -number of processors */
- int nprocs_symb,     /* Input -number of procs for symbolic factorization */
- SuperMatrix *A,      /* Input -input distributed matrix A */
- int_t *fstVtxSep,    /* Input -first vertex in each separator */
- int_t *sizes,        /* Input -size of each separator in the separator tree */
- Pslu_freeable_t *Pslu_freeable, /* Output -globToLoc and maxNvtcsPProc 
-				    computed */
- vtcsInfo_symbfact_t *VInfo, /* Output -local info on vertices distribution */
- int_t *tempArray,    /* Input -temp array of size n = order of the matrix */
- int_t  maxSzBlk,     /* Input -maximum number of vertices in a block */
- psymbfact_stat_t *PS /* Input/Output -statistics */
- ) 
-{
-/* 
- * Purpose
- * =======
- *
- *  symbfact_mapVtcs maps the vertices of the graph of the input
- *  matrix A on nprocs_symb processors, using the separator tree
- *  returned by a graph partitioning algorithm from the previous step
- *  of the symbolic factorization.  The number of processors
- *  nprocs_symb must be a power of 2.
- *
- * Description of the algorithm
- * ============================
- *
- *  A subtree to subcube algorithm is used first to map the processors
- *  on the nodes of the separator tree.
- *
- *  For each node of the separator tree, its corresponding vertices
- *  are distributed on the processors affected to this node, using a
- *  block cyclic distribution.
- *
- *  After the distribution, fields of the VInfo structure are
- *  computed.  The array globToLoc and maxNvtcsPProc of Pslu_freeable
- *  are also computed.
- *
- */
-  int szSep, npNode, firstP, p, iSep, jSep, ind_ap_s, ind_ap_d;
-  int_t k, n, kk;
-  int_t fstVtx, lstVtx;
-  int_t fstVtxBlk, ind_blk;
-  int_t noVtcsProc, noBlk;
-  int_t nvtcs_loc; /* number of vertices owned by process iam */
-  int_t nblks_loc; /* no of blocks owned by process iam */
-  int_t *globToLoc;    /* global indexing to local indexing */
-  int_t maxNvtcsPProc, maxNvtcsNds_loc, nvtcsNds_loc, maxNeltsVtx;
-  int_t *begEndBlks_loc; /* begin and end vertex of each local block */
-  int_t *vtcs_pe;  /* contains the number of vertices on each processor */
-  int   *avail_pes; /* contains the processors to be used at each level */
-  
-  n = A->ncol;
-  /* allocate memory */
-  if (!(globToLoc = intMalloc_dist(n + 1))) {
-    fprintf (stderr, "Malloc fails for globToLoc[].");
-    return (PS->allocMem);
-  }
-  PS->allocMem += (n+1) * sizeof(int_t);
-  if (!(avail_pes = (int *) SUPERLU_MALLOC(nprocs_symb*sizeof(int)))) {
-    fprintf (stderr, "Malloc fails for avail_pes[].");
-    return (PS->allocMem);
-  }
-  PS->allocMem += nprocs_symb*sizeof(int);
-  if (!(vtcs_pe = (int_t *) SUPERLU_MALLOC(nprocs_symb*sizeof(int_t)))) {
-    fprintf (stderr, "Malloc fails for vtcs_pe[].");
-    return (PS->allocMem);
-  }
-  PS->allocMem += nprocs_symb*sizeof(int_t);
-  
-  /* Initialization */
-  globToLoc[n] = n;  
-  for (p = 0; p < nprocs_symb; p++) {
-    vtcs_pe[p] = 0;
-    avail_pes[p] = EMPTY;
-  }
-  nvtcs_loc = 0;
-  nblks_loc = 0;
-  maxNvtcsNds_loc = 0;
-  maxNeltsVtx     = 0;
-  
-  /* distribute data among processors */
-  szSep = nprocs_symb;
-  iSep = 0;
-  while (szSep >= 1) {
-    /* for each level in the separator tree */
-    npNode = nprocs_symb / szSep; 
-    firstP = 0; 
-    nvtcsNds_loc = 0;
-    
-    for (jSep = iSep; jSep < iSep + szSep; jSep++) {
-      /* for each node in the level */
-      fstVtx = fstVtxSep[jSep];
-      lstVtx = fstVtx + sizes[jSep];
-      if (firstP <= iam && iam < firstP + npNode)
-	maxNeltsVtx += lstVtx - fstVtx;
-
-      if (szSep == nprocs_symb) {
-	/* leaves of the separator tree */
-	for (k = fstVtx; k < lstVtx; k++) {
-	  globToLoc[k] = (int_t) firstP;
-	  vtcs_pe[firstP] ++;
-	}
-	if (firstP == iam) {	  
-	  nvtcs_loc += lstVtx - fstVtx;
-	  nblks_loc ++;
-	}
-      }
-      else {
-	/* superior levels of the separator tree */
-	k = fstVtx;
-	noVtcsProc = maxSzBlk;
-	fstVtxBlk = fstVtx;
-	if ((jSep - iSep) % 2 == 0) ind_ap_d = (jSep - iSep) * npNode;
-	/* first allocate processors from previous levels */	
-	for (ind_ap_s = (jSep-iSep) * npNode; ind_ap_s < (jSep-iSep+1) * npNode; ind_ap_s ++) {
-	  p = avail_pes[ind_ap_s];
-	  if (p != EMPTY && k < lstVtx) {
-	    /* for each column in the separator */	  
-	    avail_pes[ind_ap_s] = EMPTY;
-	    kk = 0;
-	    while (kk < noVtcsProc && k < lstVtx) {
-	      globToLoc[k] = p;
-	      vtcs_pe[p] ++;
-	      k ++;
-	      kk ++;
-	    }
-	    if (p == iam) {
-	      nvtcs_loc += kk;
-	      nblks_loc ++;
-	      nvtcsNds_loc += kk;
-	    }
-	  }
-	  else {
-	    if (p != EMPTY && k == lstVtx) {
-	      avail_pes[ind_ap_s] = EMPTY;
-	      avail_pes[ind_ap_d] = p; ind_ap_d ++;
-	    }
-	  }
-	} 
-	noBlk = 0;
-	p = firstP + npNode;
-	while (k < lstVtx) {
-	  /* for each column in the separator */
-	  kk = 0;
-	  p = (int) (noBlk % (int_t) npNode) + firstP;
-	  while (kk < noVtcsProc && k < lstVtx) {
-	    globToLoc[k] = p;
-	    vtcs_pe[p] ++;
-	    k ++;
-	    kk ++;
-	  }
-	  if (p == iam) {
-	    nvtcs_loc += kk;
-	    nblks_loc ++;
-	    nvtcsNds_loc += kk;
-	  }
-	  noBlk ++;
-	} /* while (k < lstVtx) */
-	/* Add the unused processors to the avail_pes list of pes */
-	for (p = p + 1; p < firstP + npNode; p ++) {
-	  avail_pes[ind_ap_d] = p; ind_ap_d ++;
-	}
-      }
-      firstP += npNode;
-    }
-    if (maxNvtcsNds_loc < nvtcsNds_loc && szSep != nprocs_symb)
-      maxNvtcsNds_loc = nvtcsNds_loc;
-    iSep += szSep;
-    szSep = szSep / 2;
-  }
-  
-#if ( PRNTlevel>=2 )
-  if (!iam)
-    PrintInt10 (" novtcs_pe", nprocs_symb, vtcs_pe);
-#endif
-  /* determine maximum number of vertices among processors */
-  maxNvtcsPProc = vtcs_pe[0];
-  vtcs_pe[0] = 0;
-  for (p = 1; p < nprocs_symb; p++) {
-    if (maxNvtcsPProc < vtcs_pe[p])
-      maxNvtcsPProc = vtcs_pe[p];
-    vtcs_pe[p] = 0;
-  }
-#if ( PRNTlevel>=2 )
-  if (!iam)
-    printf ("  MaxNvtcsPerProc %d MaxNvtcs/Avg %e\n\n", 
-	    maxNvtcsPProc, ((float) maxNvtcsPProc * nprocs_symb)/(float)n);
-#endif
-
-  if (iam < nprocs_symb)
-    if (!(begEndBlks_loc = intMalloc_symbfact(2 * nblks_loc + 1)))
-      ABORT("Malloc fails for begEndBlks_loc[].");
-  
-  ind_blk = 0;
-  k = 0;
-  while (k < n) {
-    p = globToLoc[k];
-    if (p == iam) 
-      begEndBlks_loc[ind_blk] = k;
-    while (globToLoc[k] == p && k < n) {
-      globToLoc[k] = globToLoc[k] * maxNvtcsPProc + vtcs_pe[p];
-      vtcs_pe[p] ++;
-      k ++;
-    }
-    if (p == iam) {
-      begEndBlks_loc[ind_blk + 1] = k;
-      ind_blk += 2;
-    }
-  }
-  if (iam < nprocs_symb)
-    begEndBlks_loc[2 * nblks_loc] = n;
- 
-  SUPERLU_FREE (avail_pes);
-  
-  Pslu_freeable->maxNvtcsPProc   = maxNvtcsPProc;
-  Pslu_freeable->globToLoc       = globToLoc;
-  if (iam < nprocs_symb) {
-    VInfo->maxNvtcsNds_loc = maxNvtcsNds_loc;
-    VInfo->nblks_loc       = nblks_loc;
-    VInfo->nvtcs_loc       = nvtcs_loc;
-    VInfo->curblk_loc      = 0;
-    VInfo->maxNeltsVtx     = maxNeltsVtx;
-    VInfo->filledSep       = FALSE;
-    VInfo->xlsub_nextLvl   = 0;
-    VInfo->xusub_nextLvl   = 0;
-    VInfo->begEndBlks_loc  = begEndBlks_loc;
-    VInfo->fstVtx_nextLvl  = begEndBlks_loc[0];
-  }
-  return SUCCES_RET;
-}
-
-static void 
-symbfact_distributeMatrix
-(
- int   iam,             /* Input - my processor number */  
- int   nprocs_num,      /* Input - number of processors */
- int   nprocs_symb,     /* Input - number of processors for the
-			   symbolic factorization */
- SuperMatrix *A,        /* Input - input matrix A */
- int_t *perm_c,         /* Input - column permutation */
- int_t *perm_r,         /* Input - row permutation */
- matrix_symbfact_t *AS, /* Output - temporary storage for the
-			   redistributed matrix */
- Pslu_freeable_t *Pslu_freeable, /* Input - global to local information */
- vtcsInfo_symbfact_t *VInfo,  /* Input - local info on vertices
-				 distribution */
- int_t  *tempArray,     /* Input/Output - temporary array of size n
-			   (order of the matrix) */
- MPI_Comm    *num_comm  /* Input - communicator for nprocs_num procs */
- )
-{
-/*
- * Purpose 
- * =======
- *
- * Distribute input matrix A for the symbolic factorization routine.
- * Only structural information is distributed.  The redistributed
- * matrix has its rows and columns permuted according to perm_r and
- * perm_c. A is not modified during this routine.
- *
- */
-/* Notations:
- * Ainf : inferior part of A, including diagonal.
- * Asup : superior part of A.
- */
-  int p, p_irow, code_err, ainf_data;
-  int_t n, m_loc, fst_row;
-  int_t i, j, k, irow, jcol;
-  NRformat_loc *Astore;
-  int_t  nnz_loc, nnz_iam;    /* number of local nonzeros */
-  int_t  nnz_remote; /* number of remote nonzeros to be sent */
-  int_t  SendCnt; /* number of remote nonzeros to be sent */
-  int_t  RecvCnt; /* number of remote nonzeros to be received */
-  /* number of nonzeros to send/receive per processor */
-  int_t  *nnzToSend, *nnzToRecv; 
-  int_t *nnzAinf_toSnd; /* nnz in Ainf to send */
-  /* VInfo data structures */
-  int_t *globToLoc, *begEndBlks_loc, nblks_loc, nvtcs_loc, maxNvtcsPProc;
-  
-  int_t neltsRow, vtx, vtx_lid, nelts, ind;
-  int_t *snd_aind, *rcv_aind;
-  int_t *ptr_toSnd, *buf, *ptr_toRcv;
-  /* matrix_symbfact_t *As data */
-  int_t *x_ainf, *x_asup, *ind_ainf, *ind_asup;
-  int  *intBuf1, *intBuf2, *intBuf3, *intBuf4;
-  /* ------------------------------------------------------------
-     INITIALIZATION.
-     ------------------------------------------------------------*/
-  Astore = (NRformat_loc *) A->Store;
-  n = A->ncol;
-  m_loc = Astore->m_loc;
-  fst_row = Astore->fst_row;
-  globToLoc = Pslu_freeable->globToLoc;
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  nnzToRecv = intCalloc_symbfact(3 * (int_t)nprocs_num);
-  nnzToSend = nnzToRecv + nprocs_num;
-  nnzAinf_toSnd = nnzToRecv + 2 * nprocs_num;
-
-  /* --------------------------------------------------------------------- 
-    COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, THEN ALLOCATE
-    SPACE.  THIS ACCOUNTS FOR THE FIRST PASS OF A.
-    ----------------------------------------------------------------------*/
-  /* tempArray stores the number of nonzeros in each column of ainf */
-  for (i = 0; i < n; i++)
-    tempArray[i] = 0;
-  for (i = 0; i < m_loc; i++) {
-    irow   = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
-    p_irow = OWNER(globToLoc[irow]);
-    neltsRow = 0;
-
-    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; j++) {
-      jcol = perm_c[Astore->colind[j]];
-      if (jcol <= irow) {
-	p = OWNER(globToLoc[jcol]);
-	if (tempArray[jcol] == 0) {
-	  nnzToSend[p] += 2;
-	  nnzAinf_toSnd[p] += 2;
-	}
-	tempArray[jcol] ++;
-	nnzAinf_toSnd[p] ++;
-      }
-      else {
-	p = p_irow;
-	neltsRow ++;
-      }
-      nnzToSend[p] ++; 
-    }
-    if (neltsRow != 0) {
-      nnzToSend[p_irow] += 2;
-    }
-  }
-  
-  /* add one entry which will separate columns of Ainf from rows
-     of Asup */
-  for (p = 0; p < nprocs_num; p++)
-    if (nnzToSend[p] != 0)
-      nnzToSend[p] ++;
-  
-  /* All-to-all communication */
-  MPI_Alltoall (nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t,
-		(*num_comm));
-
-  nnz_loc = SendCnt = RecvCnt = 0;
-  for (p = 0; p < nprocs_num; p++) {
-    if ( p != iam ) {
-      SendCnt += nnzToSend[p];
-      RecvCnt += nnzToRecv[p];
-    } else {
-      nnz_loc += nnzToRecv[p];
-      nnzToSend[p] = 0;
-    }
-  }
-  nnz_iam = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */
-  
-  /* Allocate temporary storage for sending/receiving the A triplets. */
-  if (!(snd_aind = intMalloc_symbfact(SendCnt)) && SendCnt != 0)
-    ABORT("Malloc fails for snd_aind[].");
-  if ( !(rcv_aind = intMalloc_symbfact(nnz_iam + 1)))
-    ABORT("Malloc fails for rcv_aind[].");
-  if ( !(ptr_toSnd = intCalloc_symbfact((int_t) nprocs_num)) )
-    ABORT("Malloc fails for ptr_toSnd[].");
-  if ( !(ptr_toRcv = intCalloc_symbfact((int_t) nprocs_num)) )
-    ABORT("Malloc fails for ptr_toRcv[].");
-
-  /* setup ptr_toSnd[p] to point to data in snd_aind to be send to 
-   processor p */
-  for (i = 0, j = 0, p = 0; p < nprocs_num; p++) {
-    if ( p != iam ) 
-      ptr_toSnd[p] = i;
-    else
-      ptr_toSnd[p] = j;
-    i += nnzToSend[p]; 
-    j += nnzToRecv[p];
-  }
-
-  for (i = 0; i < n; i++) {
-    if (tempArray[i] != 0) {
-      /* column i of Ainf will be send to a processor  */
-      p = OWNER( globToLoc[i] );
-      if (p == iam) {
-	buf = &(rcv_aind[ptr_toSnd[p]]);
-      }
-      else {
-	buf = &(snd_aind[ptr_toSnd[p]]);
-      }
-      buf[0] = tempArray[i];
-      buf[1] = i;
-      tempArray[i] = ptr_toSnd[p] + 2;
-      ptr_toSnd[p] += 2 + buf[0];
-    }
-  }
-
-  /* set ptr_toSnd to point to Asup data (stored by rows) */
-  for (i = 0, j = 0, p = 0; p < nprocs_num; p++) {
-    if ( p != iam ) {
-      if (nnzToSend[p] != 0) { 
-	snd_aind[i + nnzAinf_toSnd[p]] = EMPTY;
-	ptr_toSnd[p] = i + nnzAinf_toSnd[p] + 1;
-      }
-    }
-    else {
-      if (nnzToRecv[p] != 0) {
-	rcv_aind[j + nnzAinf_toSnd[p]] = EMPTY;
-	ptr_toSnd[p] = j + nnzAinf_toSnd[p] + 1;
-      }
-    }
-    i += nnzToSend[p]; 
-    j += nnzToRecv[p];
-  }
-
-  /* ------------------------------------------------------------
-     LOAD THE ENTRIES OF A INTO THE snd_aind STRUCTURE TO SEND.
-     THIS ACCOUNTS FOR THE SECOND PASS OF A.
-     For each processor, we store first the columns to be sent,
-     and then the rows to be sent. For each row/column sent:
-     entry 0            : x = number of elements in that row/column
-     entry 1            : row/column number
-     entries 2 .. x + 2 : row/column indices.
-     ------------------------------------------------------------*/
-  for (i = 0; i < m_loc; i++) {
-    irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*A */
-    p_irow = OWNER( globToLoc[irow] );
-    ptr_toSnd[p_irow] +=2;
-    neltsRow = 0;
-    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; j++) {
-      jcol = perm_c[Astore->colind[j]];
-      if (jcol <= irow) {
-	p = OWNER( globToLoc[jcol] );
-	k = tempArray[jcol];
-	tempArray[jcol] ++;
-	if (p == iam) { /* local */
-	  rcv_aind[k] = irow; 
-	}
-	else {
-	  snd_aind[k] = irow;
-	}
-      }
-      else {
-	p = p_irow;
-	neltsRow ++;
-	k = ptr_toSnd[p];
-	ptr_toSnd[p] ++;
-	if (p == iam) { /* local */
-	  rcv_aind[k] = jcol;
-	}
-	else {
-	  snd_aind[k] = jcol;
-	}
-      }
-    }
-
-    if (neltsRow == 0)
-      ptr_toSnd[p_irow] -= 2;
-    else {
-      /* store entry 0 and entry 1 */
-      if (p_irow == iam) { /* local */
-	rcv_aind[ptr_toSnd[p_irow] - neltsRow - 2] = neltsRow;
-	rcv_aind[ptr_toSnd[p_irow] - neltsRow - 1] = irow;
-      }
-      else { /* remote */
-	snd_aind[ptr_toSnd[p_irow] - neltsRow - 2] = neltsRow;
-	snd_aind[ptr_toSnd[p_irow] - neltsRow - 1] = irow;
-      }
-    }
-  }
-  
-  /* reset ptr_toSnd to point to the beginning of the data for
-     each processor (structure needed in MPI_Alltoallv */
-  for (i = 0, j = 0, p = 0; p < nprocs_num; p++) {
-    ptr_toSnd[p] = i;
-    i += nnzToSend[p];
-    ptr_toRcv[p] = j;
-    j += nnzToRecv[p];
-  }
-  
-  /* ------------------------------------------------------------
-     PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION.
-     Note: it uses MPI_Alltoallv.
-     ------------------------------------------------------------*/
-  if (nprocs_num > 1) {
-#if defined (_LONGINT)
-    intBuf1 = (int *) SUPERLU_MALLOC(4 * nprocs_num * sizeof(int));
-    intBuf2 = intBuf1 + nprocs_num;
-    intBuf3 = intBuf1 + 2 * nprocs_num;
-    intBuf4 = intBuf1 + 3 * nprocs_num;
-    
-    for (p=0; p<nprocs_num; p++) {
-      if (nnzToSend[p] > INT_MAX || ptr_toSnd[p] > INT_MAX ||
-	  nnzToRecv[p] > INT_MAX || ptr_toRcv[p] > INT_MAX)
-	ABORT("ERROR in symbfact_distributeMatrix size to send > INT_MAX\n");
-      intBuf1[p] = (int) nnzToSend[p];
-      intBuf2[p] = (int) ptr_toSnd[p];
-      intBuf3[p] = (int) nnzToRecv[p];
-      intBuf4[p] = (int) ptr_toRcv[p];
-    }
-#else  /* Default */
-    intBuf1 = nnzToSend;  intBuf2 = ptr_toSnd;
-    intBuf3 = nnzToRecv;  intBuf4 = ptr_toRcv;
-#endif
-
-    MPI_Alltoallv (snd_aind, intBuf1, intBuf2, mpi_int_t, 
-		   rcv_aind, intBuf3, intBuf4, mpi_int_t,
-		   (*num_comm));
-#if defined (_LONGINT)
-    SUPERLU_FREE (intBuf1);
-#endif
-  }
-  
-  /* ------------------------------------------------------------
-     DEALLOCATE SEND STORAGE
-     ------------------------------------------------------------*/
-  if (snd_aind) SUPERLU_FREE( snd_aind );
-  SUPERLU_FREE( ptr_toSnd );
-
-  /* ------------------------------------------------------------
-     CONVERT THE RECEIVED FORMAT INTO THE SYMBOLIC FORMAT.
-     THIS IS PERFORMED ONLY BY NPROCS_SYMB PROCESSORS
-     ------------------------------------------------------------*/
-  if (iam < nprocs_symb) {
-    nblks_loc = VInfo->nblks_loc;
-    begEndBlks_loc = VInfo->begEndBlks_loc;
-    nvtcs_loc = VInfo->nvtcs_loc;
-    /* ------------------------------------------------------------
-       Allocate space for storing indices of A after redistribution.
-       ------------------------------------------------------------*/
-    if (!(x_ainf = intCalloc_symbfact (nvtcs_loc + 1)))
-      ABORT("Malloc fails for x_ainf[].");
-    if (!(x_asup = intCalloc_symbfact (nvtcs_loc + 1)))
-      ABORT("Malloc fails for x_asup[].");
-    
-    /* Initialize the array of columns/rows pointers */
-    for (i = 0, p = 0; p < nprocs_num; p++) {
-      ainf_data = TRUE;
-      k = 0;
-      while (k < nnzToRecv[p]) {
-	j = rcv_aind[i + k];
-	if (j == EMPTY) {
-	  ainf_data = FALSE;
-	  k ++;
-	}
-	else {
-	  nelts = rcv_aind[i + k];
-	  vtx = rcv_aind[i + k + 1];
-	  vtx_lid = LOCAL_IND( globToLoc[vtx] );
-	  k += nelts + 2;
-	  if (ainf_data) 
-	    x_ainf[vtx_lid] += nelts; 
-	  else 
-	    x_asup[vtx_lid] = nelts;
-	}
-      }
-      i += nnzToRecv[p];
-    }
-    
-    /* copy received information */
-    vtx_lid = 0;
-    for (i = 0, k = 0, j = 0; i < nblks_loc; i++) {
-      for (vtx = begEndBlks_loc[2*i]; vtx < begEndBlks_loc[2*i+1]; vtx++, vtx_lid ++) {
-	nelts = x_ainf[vtx_lid];
-	x_ainf[vtx_lid] = k;
-	k += nelts;
-	nelts = x_asup[vtx_lid];
-	x_asup[vtx_lid] = j;
-	j += nelts;
-	tempArray[vtx] = x_ainf[vtx_lid];
-      }
-    }
-    x_ainf[nvtcs_loc] = k;
-    x_asup[nvtcs_loc] = j;
-    
-    /* Allocate space for storing indices of A after conversion */
-    if ( !(ind_ainf = intMalloc_symbfact(x_ainf[nvtcs_loc])) && x_ainf[nvtcs_loc] != 0 )
-      ABORT("Malloc fails for ind_ainf[].");
-    if ( !(ind_asup = intMalloc_symbfact(x_asup[nvtcs_loc])) && x_asup[nvtcs_loc] != 0)
-      ABORT("Malloc fails for ind_asup[].");
-    
-    /* Copy the data into the row/column oriented storage */  
-    for (i = 0, p = 0; p < nprocs_num; p++) {
-      ainf_data = TRUE;
-      k = 0;
-      while (k < nnzToRecv[p]) {
-	j = rcv_aind[i + k];
-	if (ainf_data && j == EMPTY) {
-	  ainf_data = FALSE;
-	  k ++;
-	}
-	else {
-	  nelts = rcv_aind[i + k];
-	  vtx = rcv_aind[i + k + 1];
-	  vtx_lid = LOCAL_IND( globToLoc[vtx] );
-	  if (ainf_data) {
-	    /* traverse ainf data */
-	    ind = tempArray[vtx];
-	    for (j = i + k + 2; j < i + k + 2 + nelts; j++, ind ++) 
-	      ind_ainf[ind] = rcv_aind[j];
-	    tempArray[vtx] = ind;
-	  }
-	  else {
-	    /* traverse asup data */
-	    ind = x_asup[vtx_lid];
-	    for (j = i + k + 2; j < i + k + 2 + nelts; j++, ind ++) 
-	      ind_asup[ind] = rcv_aind[j];
-	  }
-	  k += nelts + 2;
-	}
-      }
-      i += nnzToRecv[p];
-    }
-    
-    /* ------------------------------------------------------------
-       DEALLOCATE TEMPORARY STORAGE
-       ------------------------------------------------------------*/
-    if (nprocs_symb > 1) {
-      SUPERLU_FREE( ptr_toRcv );
-      if (rcv_aind) SUPERLU_FREE( rcv_aind );
-      if (nnzToRecv) SUPERLU_FREE( nnzToRecv );
-    }
-
-    AS->x_ainf = x_ainf;
-    AS->x_asup = x_asup;
-    AS->ind_ainf = ind_ainf;
-    AS->ind_asup = ind_asup;
-    
-    VInfo->nnz_asup_loc = x_asup[nvtcs_loc];
-    VInfo->nnz_ainf_loc = x_ainf[nvtcs_loc];
-  }
-}
-
-static
-float allocPrune_lvl
-(
- Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data
-				  structures */
- vtcsInfo_symbfact_t *VInfo,   /* Input -local info on vertices
-				  distribution */
- psymbfact_stat_t *PS          /* Input -statistics */
- )
-/*
- * Allocate storage for data structures necessary for pruned graphs.
- * For those unpredictable size, make a guess as FILL * n.
- * Return value:
- *     0 if enough memory was available;
- *     otherwise, return the amount of space intended to allocate 
- *     when memory allocation failure occurred.
- */
-{
-  int_t  lword;
-  int_t  nzlmaxPr, nzumaxPr, *xlsubPr, *xusubPr, *lsubPr, *usubPr;
-  int_t  nvtcs_loc, no_expand_pr, x_sz;
-  float  alpha = 1.5;
-  int_t  FILL = sp_ienv_dist(6);
-  
-  nvtcs_loc = VInfo->nvtcs_loc;
-  
-  no_expand_pr = 0;
-  lword     = (int_t) sizeof(int_t);
-  
-  /* free memory allocated for the domain symbolic factorization */
-  if (Llu_symbfact->szLsubPr)
-    SUPERLU_FREE( Llu_symbfact->lsubPr );
-  if (Llu_symbfact->szUsubPr)
-    SUPERLU_FREE( Llu_symbfact->usubPr );
-  if (Llu_symbfact->xlsubPr)
-    SUPERLU_FREE( Llu_symbfact->xlsubPr );
-  if (Llu_symbfact->xusubPr)
-    SUPERLU_FREE( Llu_symbfact->xusubPr );
-  
-  Llu_symbfact->xlsub_rcvd = intMalloc_symbfact (VInfo->maxSzBlk + 1);
-  Llu_symbfact->xusub_rcvd = intMalloc_symbfact (VInfo->maxSzBlk + 1);
-
-  /* allocate memory to use during superior levels of sep_tree */
-  x_sz = SUPERLU_MIN( VInfo->maxNvtcsNds_loc, VInfo->maxSzBlk);
-  nzlmaxPr = 2 * FILL * VInfo->maxNvtcsNds_loc;
-  nzumaxPr = 2 * FILL * VInfo->maxSzBlk;  
-
-  /* Integer pointers for L\U factors */
-  if (x_sz != 0) {
-    xlsubPr   = intMalloc_symbfact(VInfo->maxNvtcsNds_loc + 1);
-    xusubPr   = intMalloc_symbfact(VInfo->maxNvtcsNds_loc + 1);
-    
-    lsubPr = (int_t *) SUPERLU_MALLOC (nzlmaxPr * lword);
-    usubPr = (int_t *) SUPERLU_MALLOC (nzumaxPr * lword);
-    
-    while ( !lsubPr || !usubPr ) {
-      if (!lsubPr) SUPERLU_FREE(lsubPr); 
-      if (!usubPr) SUPERLU_FREE(usubPr);
-      
-      nzlmaxPr /= 2;     nzlmaxPr = alpha * (float) nzlmaxPr;
-      nzumaxPr /= 2;     nzumaxPr = alpha * (float) nzumaxPr;
-      
-      if ( nzumaxPr < x_sz ) {
-	fprintf(stderr, "Not enough memory to perform factorization.\n");
-	return (PS->allocMem);
-      }
-      lsubPr  = (int_t *) SUPERLU_MALLOC(nzlmaxPr * lword);
-      usubPr  = (int_t *) SUPERLU_MALLOC(nzumaxPr * lword);
-      ++no_expand_pr;
-    }
-  }    
-  else {
-    xlsubPr = NULL; lsubPr = NULL;
-    xusubPr = NULL; usubPr = NULL;
-  }
-  
-  if (VInfo->maxNvtcsNds_loc)
-    Llu_symbfact->cntelt_vtcsA_lvl = 
-      (int_t *) SUPERLU_MALLOC (VInfo->maxNvtcsNds_loc * lword);
-
-  if (PS->maxSzLPr < Llu_symbfact->indLsubPr)
-    PS->maxSzLPr = Llu_symbfact->indLsubPr;
-  if (PS->maxSzUPr < Llu_symbfact->indUsubPr)
-    PS->maxSzUPr = Llu_symbfact->indUsubPr;
-  
-  Llu_symbfact->lsubPr   = lsubPr;
-  Llu_symbfact->xlsubPr  = xlsubPr;
-  Llu_symbfact->usubPr   = usubPr;
-  Llu_symbfact->xusubPr  = xusubPr;
-  Llu_symbfact->szLsubPr = nzlmaxPr;
-  Llu_symbfact->szUsubPr = nzumaxPr;
-  Llu_symbfact->indLsubPr = 0;
-  Llu_symbfact->indUsubPr = 0;
-
-  Llu_symbfact->no_expand_pr += no_expand_pr;
-  return 0;
-}
-
-static float 
-allocPrune_domain
-(
- int_t fstVtx,  /* Input - first vertex of current node */ 
- int_t lstVtx,  /* Input - last vertex of current node */
- Llu_symbfact_t *Llu_symbfact, /* Output - local L, U data
-				  structures */
- vtcsInfo_symbfact_t *VInfo,   /* Input -local info on vertices
-				  distribution */
- psymbfact_stat_t *PS           /* Input -statistics */
- )
-/*
- * Allocate storage for data structures necessary for pruned graphs.
- * For those unpredictable size, make a guess as FILL * n.
- * Return value:
- *     0 if enough memory was available;
- *     otherwise, return the amount of space intended to allocate 
- *     when memory allocation failure occurred.
- */
-{
-  int_t  lword;
-  int_t  nzlmaxPr, nzumaxPr, *xlsubPr, *xusubPr, *lsubPr, *usubPr;
-  int_t  nvtcs_loc, no_expand_pr, x_sz;
-  float  alpha = 1.5;
-  int_t  FILL = 2 * sp_ienv_dist(6);
-  
-  nvtcs_loc = VInfo->nvtcs_loc;
-  
-  no_expand_pr = 0;
-  lword     = (int_t) sizeof(int_t);
-  
-  /* allocate memory to use during domain_symbolic routine */
-  /* Guess for prune graph */
-  x_sz = lstVtx - fstVtx;
-  nzlmaxPr = nzumaxPr = 2*FILL * x_sz;
-  
-  /* Integer pointers for L\U factors */
-  if (x_sz != 0) {
-    xlsubPr   = intMalloc_symbfact(x_sz+1);
-    xusubPr   = intMalloc_symbfact(x_sz+1);
-    
-    lsubPr = (int_t *) SUPERLU_MALLOC (nzlmaxPr * lword);
-    usubPr = (int_t *) SUPERLU_MALLOC (nzumaxPr * lword);
-    
-    while ( !lsubPr || !usubPr ) {
-#if 0
-      if (!lsubPr) SUPERLU_FREE(lsubPr); 
-      if (!usubPr) SUPERLU_FREE(usubPr);
-#else /* XSL correction */
-      if ( lsubPr ) SUPERLU_FREE(lsubPr); 
-      if ( usubPr ) SUPERLU_FREE(usubPr);
-#endif      
-      
-      nzlmaxPr /= 2;     nzlmaxPr = alpha * (float) nzlmaxPr;
-      nzumaxPr /= 2;     nzumaxPr = alpha * (float) nzumaxPr;
-      
-      if ( nzumaxPr < x_sz ) {
-	fprintf(stderr, "Not enough memory to perform factorization.\n");
-	return (PS->allocMem);
-      }
-      lsubPr  = (void *) SUPERLU_MALLOC(nzlmaxPr * lword);
-      usubPr  = (void *) SUPERLU_MALLOC(nzumaxPr * lword);
-      ++no_expand_pr;
-    }
-  }    
-  else {
-    xlsubPr = NULL;
-    xusubPr = NULL;
-  }
-  
-  Llu_symbfact->lsubPr   = lsubPr;
-  Llu_symbfact->xlsubPr  = xlsubPr;
-  Llu_symbfact->usubPr   = usubPr;
-  Llu_symbfact->xusubPr  = xusubPr;
-  Llu_symbfact->szLsubPr = nzlmaxPr;
-  Llu_symbfact->szUsubPr = nzumaxPr;
-  Llu_symbfact->indLsubPr = 0;
-  Llu_symbfact->indUsubPr = 0;
-  Llu_symbfact->xlsub_rcvd = NULL;
-  Llu_symbfact->xusub_rcvd = NULL;
-  Llu_symbfact->cntelt_vtcsA_lvl = NULL;
-
-  PS->maxSzLPr = Llu_symbfact->indLsubPr;
-  PS->maxSzUPr = Llu_symbfact->indUsubPr;
-
-  Llu_symbfact->no_expand_pr = no_expand_pr;
-  Llu_symbfact->no_expcp = 0;
-  return 0;
-}
-
-/************************************************************************/
-static
-int symbfact_alloc
-/************************************************************************/
-(
- int_t n,       /* Input - order of the matrix */
- int   nprocs,  /* Input - number of processors for the symbolic
-		   factorization */  
- Pslu_freeable_t *Pslu_freeable, 
- Llu_symbfact_t *Llu_symbfact, /* Output - local L, U data structures */
- vtcsInfo_symbfact_t *VInfo,   /* Input - local info on vertices
-				  distribution */
- comm_symbfact_t *CS, /* Input -information on communication */
- psymbfact_stat_t *PS /* Input -statistics */
- )
-/*
- * Allocate storage for the data structures common to symbolic factorization
- * routines. For those unpredictable size, make a guess as FILL * nnz(A).
- * Return value:
- *     0 if enough memory was available;
- *     otherwise, return the amount of space intended to allocate 
- *     when memory allocation failure occurred.
- */
-{
-  int    nlvls, p;  /* no of levels in the separator tree */
-  int_t  lword, no_expand;
-  int_t  *xsup, *supno;
-  int_t  *lsub, *xlsub;
-  int_t  *usub, *xusub;
-  int_t  nzlmax, nzumax, nnz_a_loc;
-  int_t  nvtcs_loc, *cntelt_vtcs;
-  float  alpha = 1.5;
-  int_t  FILL = sp_ienv_dist(6);
-  
-  nvtcs_loc = VInfo->nvtcs_loc;
-  nnz_a_loc = VInfo->nnz_ainf_loc + VInfo->nnz_asup_loc;
-  nlvls = (int) log2( (double) nprocs ) + 1;
-  no_expand = 0;
-  lword     = sizeof(int_t);
-  
-  /* Guess for L\U factors */
-  nzlmax = nzumax = FILL * nnz_a_loc;
-  
-  /* Integer pointers for L\U factors */
-  supno  = intMalloc_symbfact(nvtcs_loc+1);
-  xlsub  = intMalloc_symbfact(nvtcs_loc+1);
-  xusub  = intMalloc_symbfact(nvtcs_loc+1);
-  
-  lsub = (void *) SUPERLU_MALLOC(nzlmax * lword);
-  usub = (void *) SUPERLU_MALLOC(nzumax * lword);
-  
-  while ( !lsub || !usub ) {
-    if (!lsub) SUPERLU_FREE(lsub); 
-    if (!usub) SUPERLU_FREE(usub);
-    
-    nzlmax /= 2;     nzlmax = alpha * nzlmax;
-    nzumax /= 2;     nzumax = alpha * nzumax;
-    
-    if ( nzumax < nnz_a_loc/2 ) {
-      fprintf(stderr, "Not enough memory to perform factorization.\n");
-      return (PS->allocMem);
-    }
-    lsub  = (void *) SUPERLU_MALLOC(nzlmax * lword);
-    usub  = (void *) SUPERLU_MALLOC(nzumax * lword);
-    ++no_expand;
-  }
-  
-  if (nprocs == 1)
-    cntelt_vtcs = NULL;
-  else 
-    cntelt_vtcs = intMalloc_symbfact (nvtcs_loc+1);
-  
-  /* allocate memory for communication data structures */
-  CS->rcv_interLvl = intMalloc_symbfact (2 * (int_t) nprocs + 1);
-  CS->snd_interLvl = intMalloc_symbfact (2 * (int_t) nprocs + 1);
-  CS->ptr_rcvBuf   = intMalloc_symbfact (2 * (int_t) nprocs );
-  CS->rcv_intraLvl = intMalloc_symbfact ((int_t) nprocs + 1);
-  CS->snd_intraLvl = intMalloc_symbfact ((int_t) nprocs + 1);
-  
-  CS->snd_interSz  = intMalloc_symbfact ((int_t) nlvls + 1);
-  CS->snd_LinterSz = intMalloc_symbfact ((int_t) nlvls + 1);  
-  CS->snd_vtxinter = intMalloc_symbfact ((int_t) nlvls + 1);  
-  CS->rcv_bufSz    = 0;
-  CS->rcv_buf      = NULL;
-  CS->snd_bufSz    = 0;
-  CS->snd_buf      = NULL;
-
-  for (p = 0; p < nprocs; p++) {
-    CS->rcv_interLvl[p] = EMPTY;
-    CS->snd_interLvl[p] = EMPTY;
-    CS->rcv_intraLvl[p] = EMPTY;
-    CS->snd_intraLvl[p] = EMPTY;
-  }
-  
-  for (p = 0; p <= nlvls; p++) {
-    CS->snd_vtxinter[p] = EMPTY;
-    CS->snd_interSz[p]  = 0;
-    CS->snd_LinterSz[p] = 0;
-  }
-  
-  Pslu_freeable->supno_loc   = supno;
-  Llu_symbfact->lsub   = lsub;
-  Llu_symbfact->xlsub  = xlsub;
-  Llu_symbfact->usub   = usub;
-  Llu_symbfact->xusub  = xusub;
-  Llu_symbfact->szLsub = nzlmax;
-  Llu_symbfact->szUsub = nzumax;
-  Llu_symbfact->cntelt_vtcs = cntelt_vtcs;
-  
-  Llu_symbfact->no_expand = no_expand;  
-  
-  return SUCCES_RET;
-} /* SYMBFACT_ALLOC */
-
-static int_t 
-symbfact_vtx
-(
- int_t n,         /* Input - order of the matrix */
- int   iam,       /* Input - my processor number */
- int_t vtx,       /* Input - vertex number to perform symbolic factorization */
- int_t vtx_lid,   /* Input - local vertex number */
- int_t vtx_prid,  /* Input - */
- int_t computeL,  /* Input - TRUE when compute column L(:,vtx)
-		             otheriwse compute row U(vtx, :) */
- int   domain_symb,  /* Input - if TRUE, computation corresponds to the independent
-			domain at the bottom of the separator tree */
- int_t fstVtx,       /* Input - first vertex of current node */ 
- int_t lstVtx,       /* Input - last vertex of current node */
- int_t snrep_lid,    /* local index of current supernode reprezentative */
- int_t szSn,         /* size of supernode with snrep_lid reprezentative */
- int_t *p_next,      /* next element in sub structure */
- int_t *marker,      
- int_t *sub_rcvd,    /* elements of node */
- int_t sub_rcvd_sz,  /* size of sub to be explored */
- Pslu_freeable_t *Pslu_freeable,
- Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
- vtcsInfo_symbfact_t *VInfo,    /* Input/Output - local info on vertices distribution */
- psymbfact_stat_t *PS,
- int_t *p_neltsVtxInit,
- int_t *p_neltsVtx,
- int_t *p_neltsVtx_CSep,
- int_t *p_neltsZrVtx,
- int_t *p_neltsMatched,
- int_t mark_vtx,
- int_t *p_prval_curvtx,
- int_t vtx_bel_othSn,
- int_t *p_vtx_bel_mySn
- )
-{ 
-  int_t x_aind_beg, x_aind_end, upd_lstSn;
-  int_t k, vtx_elt, ind, pr, pr_lid, mem_error, ii, jj, compRcvd;
-  int_t *xsub, *sub, *xsubPr, *subPr, *xsub_rcvd, *xsub_src, *sub_src;
-  int_t pr_elt, next, prval_curvtx, maxNvtcsPProc;
-  int_t  neltsVtx, neltsMatched, neltsZrVtx, neltsZrSn, neltsVtx_CSep;
-  int_t  neltsVtxInit, kk;
-
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  upd_lstSn     = FALSE;
-  prval_curvtx  = *p_prval_curvtx;
-  neltsVtx_CSep = 0;
-  next = *p_next;
-  if (computeL) {
-    xsub = Llu_symbfact->xlsub; sub = Llu_symbfact->lsub;
-    xsub_rcvd = Llu_symbfact->xlsub_rcvd;
-    xsubPr = Llu_symbfact->xusubPr; subPr = Llu_symbfact->usubPr;
-  }
-  else {
-    xsub = Llu_symbfact->xusub; sub = Llu_symbfact->usub;
-    xsub_rcvd = Llu_symbfact->xusub_rcvd;
-    xsubPr = Llu_symbfact->xlsubPr; subPr = Llu_symbfact->lsubPr;
-  }
-
-  x_aind_beg = xsub[vtx_lid];
-  x_aind_end = xsub[vtx_lid + 1];
-  xsub[vtx_lid] = next;
-  k = x_aind_beg;
-  /* while (sub[k] != EMPTY && k < x_aind_end) { */
-  while (k < x_aind_end) {
-    if (sub[k] == EMPTY)
-      k = x_aind_end;
-    else {
-      vtx_elt = sub[k];
-      if (!computeL)
-	if (marker[vtx_elt] == mark_vtx - 2)
-	  if (vtx_elt < prval_curvtx)
-	    prval_curvtx = vtx_elt;
-      marker[vtx_elt] = mark_vtx;
-      if (!computeL && vtx_elt == vtx)
-	printf ("Pe[%d] ERROR diag elt in U part vtx %d dom_s %d fstV %d lstV %d\n", 
-		iam, vtx, domain_symb, fstVtx, lstVtx);
-      else {
-	sub[next] = vtx_elt; 
-	next ++;
-      }
-      if (vtx_elt < lstVtx) neltsVtx_CSep ++;
-      k++;
-    }
-  }
-  neltsVtxInit = k - x_aind_beg;
-  PS->nops += neltsVtxInit;
-  
-  if (domain_symb) {
-    if (computeL)
-      VInfo->nnz_ainf_loc -= x_aind_end - x_aind_beg;
-    else    
-      VInfo->nnz_asup_loc -= x_aind_end - x_aind_beg;
-  }
-
-#ifdef TEST_SYMB 
-  printf ("compL %d vtx %d vtx_lid %d vtx_prid %d vtx_bel_othSn %d\n", 
-	  computeL, vtx, vtx_lid, vtx_prid, vtx_bel_othSn);
-  PrintInt10 ("A(:, v)", x_aind_end - x_aind_beg, &(sub[xsub[vtx_lid]]));
-#endif
-
-  ind = xsubPr[vtx_prid];
-  if (vtx_bel_othSn == vtx)
-    upd_lstSn = TRUE;
-
-  while (ind != EMPTY || upd_lstSn) {
-    if (upd_lstSn ) {
-      upd_lstSn = FALSE;
-      pr_lid = snrep_lid;
-    }
-    else {
-      pr_lid = subPr[ind];
-      ind = subPr[ind - 1];
-    }
-    
-    if (!computeL)
-      marker[vtx] = mark_vtx;
-    if (pr_lid >= VInfo->nvtcs_loc) {
-      compRcvd = TRUE;
-      xsub_src = xsub_rcvd; sub_src = sub_rcvd;
-      pr_lid -= VInfo->nvtcs_loc;
-      k = xsub_src[pr_lid] + RCVD_IND;
-    }
-    else {
-      compRcvd = FALSE;
-      xsub_src = xsub; sub_src = sub;
-      k = xsub_src[pr_lid];
-    }
-
-    PS->nops += xsub_src[pr_lid+1] - xsub_src[pr_lid];
-    for (; k < xsub_src[pr_lid+1]; k++) {
-      pr_elt = sub_src[k];
-      if (pr_elt >= vtx && marker[pr_elt] != mark_vtx) {
-
-	/* TEST available memory */
-	if (next >= x_aind_end) {	
-	  if (domain_symb) {
-	    if (mem_error =
-		psymbfact_LUXpandMem (iam, n, vtx, next, 0,
-				      computeL, DOMAIN_SYMB, 1, 
-				      Pslu_freeable, Llu_symbfact, VInfo, PS))
-	      return (mem_error);
-	  } else if (mem_error =
-		     psymbfact_LUXpand (iam, n, EMPTY, vtx, &next, 0, 
-					computeL, LL_SYMB, 1, 
-					Pslu_freeable, Llu_symbfact, VInfo, PS))
-	    return (mem_error);
-
-	  x_aind_end = xsub[vtx_lid + 1];
-	  if (computeL)   sub = Llu_symbfact->lsub; 
-	  else   sub = Llu_symbfact->usub; 
-	  if (!compRcvd) 
-	    sub_src = sub;	  
-	}
-
-	sub[next] = pr_elt; next ++;
-
-	if (pr_elt < lstVtx) neltsVtx_CSep ++;
-	if (!computeL)
-	  if (marker[pr_elt] == mark_vtx - 2)
-	    if (pr_elt < prval_curvtx)
-	      prval_curvtx = pr_elt;
-	marker[pr_elt] = mark_vtx;	
-      }
-    }
-  }
-
-  neltsVtx = next - xsub[vtx_lid];
-  neltsZrVtx = 0; /* number of zero elements which would
-		     be introduced in the vertex */
-  neltsZrSn = 0; /* -"- in the supernode */
-  neltsMatched = 0; 
-  if (vtx != fstVtx) {
-    for (k = xsub[snrep_lid]; k < xsub[snrep_lid+1]; k++) {
-      vtx_elt = sub[k];
-      if (vtx_elt >= vtx) {
-	if ((vtx_elt > vtx && !computeL) || 
-	    (vtx_elt >= vtx && computeL)) {
-	  if (marker[vtx_elt] != mark_vtx)
-	    neltsZrVtx ++;
-	  else {
-	    neltsMatched ++;
-	  }
-	}
-	if (computeL && vtx_elt == vtx)
-	  *p_vtx_bel_mySn = vtx;
-	if (!computeL && vtx_elt == vtx + 1)
-	  *p_vtx_bel_mySn = vtx + 1;
-      }
-    }
-  }
-  else {
-    neltsMatched = neltsVtx;
-    if (! computeL) 
-      for (k = xsub[vtx_lid]; k < next; k++) {
-	vtx_elt = sub[k];
-	if (vtx_elt == vtx + 1)
-	  *p_vtx_bel_mySn = vtx + 1;
-      }
-  }
-
-  *p_neltsVtxInit  = neltsVtxInit;
-  *p_neltsVtx      = neltsVtx;
-  *p_neltsVtx_CSep = neltsVtx_CSep;
-  *p_neltsZrVtx    = neltsZrVtx;
-  *p_neltsMatched  = neltsMatched;
-  *p_next          = next;
-  *p_prval_curvtx  = prval_curvtx;
-  return SUCCES_RET;
-}
-
-static int_t
-updateRcvd_prGraph
-(
- int_t n,         /* Input - order of the matrix */
- int   iam,       /* Input - my processor number */
- int_t *sub_rcvd,      /* elements of node */
- int_t sub_rcvd_sz,   /* Input - size of sub to be used in the update */
- int_t fstVtx_toUpd,  /* Input - first vertex to update */
- int_t lstVtx_toUpd,  /* Input - last vertex to update */
- int_t pr_offset,
- int   computeL,
- int_t *marker,
- Pslu_freeable_t *Pslu_freeable,
- Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
- vtcsInfo_symbfact_t *VInfo,   /* Input - local info on vertices distribution */
- psymbfact_stat_t *PS
- /*  marker: first elements of marker contain the nodes that will
-     be used in the updates */
-)
-{
-  int_t i, k, nelts, prVal, vtx_elt, vtx_elt_lid, ind;
-  int_t vtx, vtx_lid, fstVtx_toUpd_lid, fstVtx_srcUpd_lid;
-  int_t *xsub, *sub, *xsub_rcvd, *xsubPr, *subPr, szsubPr, *p_indsubPr;
-  int_t maxNvtcsPProc, *globToLoc, mem_error;
-  int_t nvtcs_toUpd, fstVtx_srcUpd, vtx_lid_p;
-  
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  globToLoc     = Pslu_freeable->globToLoc;
-  fstVtx_toUpd_lid = LOCAL_IND( globToLoc[fstVtx_toUpd] );
-  nvtcs_toUpd = lstVtx_toUpd - fstVtx_toUpd;
-  
-  if (computeL) {
-    xsub = Llu_symbfact->xlsub; sub = Llu_symbfact->lsub;
-    xsub_rcvd = Llu_symbfact->xlsub_rcvd;
-    xsubPr = Llu_symbfact->xlsubPr; subPr = Llu_symbfact->lsubPr;
-    p_indsubPr = &(Llu_symbfact->indLsubPr);
-    szsubPr = Llu_symbfact->szLsubPr;
-  }
-  else {
-    xsub = Llu_symbfact->xusub; sub = Llu_symbfact->usub;
-    xsub_rcvd = Llu_symbfact->xusub_rcvd;
-    xsubPr = Llu_symbfact->xusubPr; subPr = Llu_symbfact->usubPr;
-    p_indsubPr = &(Llu_symbfact->indUsubPr);
-    szsubPr = Llu_symbfact->szUsubPr;
-  }
-  
-  /* count number of elements in transpose representation of sub_rcvd */
-  /* use marker to count those elements */
-  for (i = 0; i < nvtcs_toUpd; i++)
-    marker[i] = 0;
-  for (i = 0; i <= VInfo->maxSzBlk; i++)
-    xsub_rcvd[i] = 0;
-  
-  i = 0;
-  fstVtx_srcUpd = EMPTY;
-  while (i < sub_rcvd_sz) {
-    vtx   = sub_rcvd[i + DIAG_IND];
-    nelts = sub_rcvd[i + NELTS_IND];
-    i += RCVD_IND;
-    prVal = sub_rcvd[i];
-    if (fstVtx_srcUpd == EMPTY) fstVtx_srcUpd = vtx;
-    xsub_rcvd[vtx - fstVtx_srcUpd] = i - RCVD_IND;
-    xsub_rcvd[vtx-fstVtx_srcUpd+1] = i + nelts;
-    for (k = i; k < i + nelts; k++) {
-      vtx_elt = sub_rcvd[k];
-      if (vtx_elt > prVal)
-	k = i + nelts;
-      else {
-	if (OWNER( globToLoc[vtx_elt] ) == iam) {
-	  if (vtx_elt >= fstVtx_toUpd && vtx_elt < lstVtx_toUpd) {
-	    vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] ) - 
-	      fstVtx_toUpd_lid;
-	    marker[vtx_elt_lid] ++;
-	  }
-	}
-      }
-    }
-    i += nelts;
-  }
-
-  vtx_lid = fstVtx_toUpd_lid - pr_offset;
-  ind = 0;
-  for (i = 0; i < nvtcs_toUpd; i++) {
-    if (marker[i] != 0) {
-      xsubPr[vtx_lid] = ind + 1;
-      ind += 2* marker[i];
-      marker[i] = xsubPr[vtx_lid] - 1;
-    }
-    vtx_lid ++;
-  }
-  
-  if (ind == 0) 
-    /* quick return if no update */
-    return;
-
-  /* test if enough memory in usubPr array */
-  if (ind >= szsubPr) {
-    if (mem_error = 
-	psymbfact_prLUXpand (iam, ind, computeL, Llu_symbfact, PS))
-      return (mem_error);
-    if (computeL) 
-      subPr = Llu_symbfact->lsubPr;  
-    else 
-      subPr = Llu_symbfact->usubPr;
-  }
-  *p_indsubPr = ind;
-  
-  i = 0;
-  while (i < sub_rcvd_sz) {
-    vtx   = sub_rcvd[i + DIAG_IND];
-    nelts = sub_rcvd[i + NELTS_IND];
-    i += RCVD_IND;
-    prVal = sub_rcvd[i];
-    for (k = i; k < i + nelts; k++) {
-      vtx_elt = sub_rcvd[k];
-      if (vtx_elt > prVal)
-	k = i + nelts;
-      else {
-	if (OWNER( globToLoc[vtx_elt] ) == iam) {
-	  if (vtx_elt >= fstVtx_toUpd && vtx_elt < lstVtx_toUpd) {
-	    vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] );
-	    vtx_lid_p = vtx_elt_lid - pr_offset;
-	    vtx_elt_lid -= fstVtx_toUpd_lid;
-	    /* add vtx to structure of pruned graph */
-	    if (marker[vtx_elt_lid] != xsubPr[vtx_lid_p] - 1) 
-	      subPr[marker[vtx_elt_lid] - 2] = marker[vtx_elt_lid] + 1;
-	    subPr[marker[vtx_elt_lid] + 1] = vtx - fstVtx_srcUpd + VInfo->nvtcs_loc;
-	    subPr[marker[vtx_elt_lid]] = EMPTY;
-	    marker[vtx_elt_lid] += 2;
-	  }
-	}
-      }
-    }
-    i += nelts;
-  }  
-  
-  for (i = fstVtx_toUpd; i < nvtcs_toUpd; i++)
-    marker[i] = 0;
-}
-
-static int_t
-update_prGraph 
-(
- int   iam, 
- int_t n,           /* order of the matrix */
- int_t fstVtx_blk,  /* first vertex in block to factorize */
- int_t lstVtx_blk,  /* last vertex in block to factorize */
- int_t snrep_lid,   /* local index of current supernode reprezentative */
- int_t pr_offset,   /* offset in the indexing of prune structure */
- int_t prval_cursn, /* prune value of current supernode reprezentative */
- int_t xsub_snp1,   /* denotes xsub[snrep_lid + 1] */
- int   computeL,    /* Input - if 1, compute column L(:,vtx)
-		               else compute row U(vtx, :) */
- Pslu_freeable_t *Pslu_freeable,
- Llu_symbfact_t *Llu_symbfact,   /* Input/Output - local L, U data structures */
- psymbfact_stat_t *PS
- )
-{
-  int_t k, mem_error;
-  int_t kmin, kmax, ktemp, maxElt;
-  int_t sn_elt, sn_elt_prid;
-  int_t *globToLoc, maxNvtcsPProc;
-  int_t *xsub, *sub, *xsubPr, *subPr;
-  int_t *p_indsubPr, szsubPr;
-  
-  globToLoc     = Pslu_freeable->globToLoc;
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-
-  if (computeL) {
-    xsub = Llu_symbfact->xlsub; sub = Llu_symbfact->lsub;
-    xsubPr = Llu_symbfact->xlsubPr; subPr = Llu_symbfact->lsubPr;
-    p_indsubPr = &(Llu_symbfact->indLsubPr);
-    szsubPr = Llu_symbfact->szLsubPr;
-  }
-  else {
-    xsub = Llu_symbfact->xusub; sub = Llu_symbfact->usub;
-    xsubPr = Llu_symbfact->xusubPr; subPr = Llu_symbfact->usubPr;
-    p_indsubPr = &(Llu_symbfact->indUsubPr);
-    szsubPr = Llu_symbfact->szUsubPr;
-  }
-  
-  kmin = xsub[snrep_lid];
-  kmax = xsub_snp1 - 1;
-  if (prval_cursn != n)
-    maxElt = prval_cursn;
-  else
-    maxElt = EMPTY;
-  while (kmin <= kmax) {
-    if (prval_cursn == n) {
-      /* compute maximum element of L(:, vtx) */
-      if (sub[kmin] > maxElt)
-	maxElt = sub[kmin];
-      kmin ++;
-    }
-    else {
-      /* Do a quicksort-type partition. */    
-      if (sub[kmax] > prval_cursn) 
-	kmax--;
-      else if (sub[kmin] <= prval_cursn)
-	kmin++;
-      else { /* kmin does'nt belong to G^s(L), and kmax belongs: 
-	      * 	   interchange the two subscripts
-	      */
-	ktemp = sub[kmin];
-	sub[kmin] = sub[kmax];
-	sub[kmax] = ktemp;
-	kmin ++;
-	kmax --;
-      }
-    }
-  }
-  k = xsub[snrep_lid];
-  while (sub[k] <= prval_cursn && k < xsub_snp1) {
-    sn_elt = sub[k];
-    if (sn_elt < lstVtx_blk) {
-      sn_elt_prid = LOCAL_IND( globToLoc[sn_elt] ) - pr_offset;
-      if ((*p_indsubPr) + 2 >= szsubPr) {
-	if (mem_error = 
-	    psymbfact_prLUXpand (iam, 0, computeL, Llu_symbfact, PS))
-	  return (mem_error);
-	if (computeL) {
-	  subPr = Llu_symbfact->lsubPr;  szsubPr = Llu_symbfact->szLsubPr;
-	}
-	else {
-	  subPr = Llu_symbfact->usubPr;  szsubPr = Llu_symbfact->szUsubPr;
-	}
-      }
-      /* add krow to structure of pruned graph */
-      subPr[(*p_indsubPr) + 1] = snrep_lid;
-      subPr[(*p_indsubPr)] = xsubPr[sn_elt_prid];
-      xsubPr[sn_elt_prid] = (*p_indsubPr) + 1;
-      (*p_indsubPr) += 2;
-    }
-    if (sn_elt == maxElt) {
-      /* move prune val in the first position */
-      sub[k] = sub[xsub[snrep_lid]];
-      sub[xsub[snrep_lid]] = sn_elt;
-    }
-    k ++; 
-  }
-  return SUCCES_RET;
-}
-
-static int_t
-blk_symbfact
-(SuperMatrix *A,
- int   iam,
- int   lvl, 
- int   szSep,
- int   ind_sizes1,
- int   ind_sizes2, 
- int_t *sizes,     /* Input - sizes of each node in the separator tree */
- int_t *fstVtxSep, /* Input - first vertex of each node in the tree */
- int_t fstVtx_loc, /* Input - first vertex local of the level */
- int_t fstVtx_blk,
- int_t lstVtx_blk,
- int_t *lsub_rcvd,      /* elements of node */
- int_t lsub_rcvd_sz,    /* size of sub to be explored */
- int_t *usub_rcvd,  
- int_t usub_rcvd_sz,
- Pslu_freeable_t *Pslu_freeable,   /* global LU data structures (modified) */
- Llu_symbfact_t *Llu_symbfact,   /* Input/Output - local L, U data structures */  
- vtcsInfo_symbfact_t *VInfo,  /* Input/Output - local info on vertices distribution */
- comm_symbfact_t *CS,
- psymbfact_stat_t *PS,
- int_t *marker,
- int_t *p_mark,    /* marker used to merge elements of vertices */
- int_t *p_nextl,   /* ptr to nextl in lsub structure */
- int_t *p_nextu,   /* ptr to nextu in usub structure */
- int_t *p_neltsZr, /* no of artificial zeros introduced so far */
- int_t *p_neltsTotal, /* no of nonzeros (including artificials) 
-			 computed so far */
- int_t *p_nsuper_loc
- )
-{
-  int szSep_tmp, lvl_tmp, ii, jj;
-  int_t  *xlsubPr, *xusubPr; 
-  int_t  *xsup, *supno, *lsub, *xlsub, *usub, *xusub;
-  int_t  vtx_lid, vtx_prid, vtx, vtx_super, vtx_elt, maxNvtcsPProc;
-  int_t  ind, pr, pr_elt, newnext, k, vtx_elt_lid;
-  int_t  nextl, nextu, nsuper_loc, nvtcs, n, mem_error;
-  int_t  x_aind_beg, x_aind_end, i, szLp, xlsub_snp1, xusub_snp1;
-  int_t  snrep, snrep_lid, szsn, vtxp1, *globToLoc, domain_symb;
-  int_t lstVtx, neltsCurSep, maxNeltsVtx, fstVtx_loc_lid;
-  /* supernode relaxation parameters */
-  int_t  neltsVtx_L, neltsZrVtx_L, neltsMatched_L, neltsVtx_CSep_L;
-  int_t  neltsVtx_U, neltsZrVtx_U, neltsMatched_U, neltsVtx_CSep_U;
-  int_t  neltsZrSn_L, neltsZrSn_U, neltsZr, neltsTotal, 
-    neltsZr_tmp, neltsTotal_tmp, neltsZrSn, neltsVtxInit_l, neltsVtxInit_u;
-  /* next vertex belongs to current supernode pruned structure */
-  int_t  vtx_bel_snL, vtx_bel_snU;
-  /* marker variables */
-  int_t  markl1_vtx, markl2_vtx, marku1_vtx, marku2_vtx;
-  /* prune structure variables */
-  int_t prval_cursn, prval_curvtx, pr_offset;
-  /* variables for comms info */
-  int_t neltSn_L, neltSn_U, lstVtx_tmp, stat;
-  float relax_param, relax_seps;
-
-  if (fstVtx_blk >= lstVtx_blk)
-    return;
-  
-  /* Initializations */
-  supno   = Pslu_freeable->supno_loc;
-  lsub    = Llu_symbfact->lsub;   xlsub    = Llu_symbfact->xlsub;
-  usub    = Llu_symbfact->usub;   xusub    = Llu_symbfact->xusub;
-  xusubPr  = Llu_symbfact->xusubPr; 
-  xlsubPr  = Llu_symbfact->xlsubPr;   
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  globToLoc     = Pslu_freeable->globToLoc;
-  maxNeltsVtx   = VInfo->maxNeltsVtx;
-  
-  n          = A->ncol;
-  nextl      = *p_nextl;
-  nextu      = *p_nextu;
-  neltsZr    = *p_neltsZr;
-  neltsTotal = *p_neltsTotal;
-  nsuper_loc = *p_nsuper_loc;
-  marku2_vtx = *p_mark;
-  lstVtx     = fstVtxSep[ind_sizes2] + sizes[ind_sizes2];
-
-  snrep = fstVtx_blk; 
-  snrep_lid = LOCAL_IND( globToLoc[fstVtx_blk] );
-  szsn = 1;
-  nvtcs = lstVtx_blk - fstVtx_blk;
-  prval_cursn = n;
-  vtx_bel_snL = EMPTY; vtx_bel_snU = EMPTY;
-  
-  /* set up to EMPTY xlsubPr[], xusubPr[] */
-  if (PS->maxSzLPr < Llu_symbfact->indLsubPr)
-    PS->maxSzLPr = Llu_symbfact->indLsubPr;
-  if (PS->maxSzUPr < Llu_symbfact->indUsubPr)
-    PS->maxSzUPr = Llu_symbfact->indUsubPr;
-  for (i = 0; i < nvtcs; i++) {
-    xlsubPr[i] = EMPTY;
-    xusubPr[i] = EMPTY;
-  }
-  Llu_symbfact->indLsubPr = 0;
-  Llu_symbfact->indUsubPr = 0;
-
-  if (ind_sizes1 == 0) 
-    domain_symb = TRUE;
-  else {
-    domain_symb = FALSE;
-    fstVtx_loc_lid = LOCAL_IND( globToLoc[fstVtx_loc] );
-  }
-  
-  vtx_prid = 0;
-  vtx_lid = LOCAL_IND( globToLoc[fstVtx_blk] );
-  pr_offset = vtx_lid;
-
-  if (lsub_rcvd != NULL) {
-    updateRcvd_prGraph (n, iam, lsub_rcvd, lsub_rcvd_sz,
-			fstVtx_blk, lstVtx_blk, pr_offset, 1, marker,
-			Pslu_freeable, Llu_symbfact, VInfo, PS);    
-    updateRcvd_prGraph (n, iam, usub_rcvd, usub_rcvd_sz,
-			fstVtx_blk, lstVtx_blk, pr_offset, 0, marker,
-			Pslu_freeable, Llu_symbfact, VInfo, PS);
-  }
-  
-  for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid ++, vtx_prid ++) {
-    vtxp1 = vtx + 1;
-    if (marku2_vtx +4 >= n) {
-      /* reset to EMPTY marker array */
-      for (i = 0; i < n; i++)
-	marker[i] = EMPTY;
-      marku2_vtx = EMPTY;
-    }
-    markl1_vtx = marku2_vtx + 1; markl2_vtx = markl1_vtx + 1;
-    marku1_vtx = markl2_vtx + 1; marku2_vtx = marku1_vtx + 1;    
-
-    prval_curvtx   = n;
-    /* Compute nonzero structure L(:,vtx) */
-    if (mem_error = 
-	symbfact_vtx (n, iam, vtx, vtx_lid, vtx_prid, 1, domain_symb, 
-		      fstVtx_blk,  lstVtx,
-		      snrep_lid, szsn, &nextl,
-		      marker, 
-		      lsub_rcvd, lsub_rcvd_sz,
-		      Pslu_freeable, Llu_symbfact, VInfo, PS, &neltsVtxInit_l,
-		      &neltsVtx_L, &neltsVtx_CSep_L, &neltsZrVtx_L, 
-		      &neltsMatched_L, markl1_vtx, &prval_curvtx, 
-		      vtx_bel_snU, &vtx_bel_snL))
-      return (mem_error);
-    lsub = Llu_symbfact->lsub;
-
-#ifdef TEST_SYMB
-    PrintInt10 ("L(:, %d)", nextl - xlsub[vtx_lid], &(lsub[xlsub[vtx_lid]]));
-#endif
-    
-    /* Compute nonzero structure of U(vtx,:) */
-    if (mem_error = 
-	symbfact_vtx (n, iam, vtx, vtx_lid, vtx_prid, 0, domain_symb, 
-		      fstVtx_blk, lstVtx,
-		      snrep_lid, szsn, &nextu,
-		      marker, 
-		      usub_rcvd, usub_rcvd_sz,
-		      Pslu_freeable, Llu_symbfact, VInfo, PS, &neltsVtxInit_u,
-		      &neltsVtx_U, &neltsVtx_CSep_U, &neltsZrVtx_U, 
-		      &neltsMatched_U, marku1_vtx, &prval_curvtx,
-		      vtx_bel_snL, &vtx_bel_snU))
-      return (mem_error);
-    usub = Llu_symbfact->usub;
-
-#ifdef TEST_SYMB
-    PrintInt10 ("U(%d, :)", nextu - xusub[vtx_lid], &(usub[xusub[vtx_lid]])); 
-#endif
-    
-    /* update statistics on fill-in */
-    if (!domain_symb) {
-      stat = CEILING( (neltsVtxInit_l + neltsVtxInit_u), 2);
-      if (Llu_symbfact->cntelt_vtcsA_lvl[vtx_lid - fstVtx_loc_lid] != stat) {
-	stat = CEILING(stat, Llu_symbfact->cntelt_vtcsA_lvl[vtx_lid - fstVtx_loc_lid]);
-	PS->fill_pelt[0] += (float) stat;
-	if ((float) stat > PS->fill_pelt[1]) PS->fill_pelt[1] = (float) stat;
-	PS->fill_pelt[2] += 1.;
-      }
-      stat = CEILING( (neltsVtx_L + neltsVtx_U), 2);
-      stat = CEILING( stat, Llu_symbfact->cntelt_vtcsA_lvl[vtx_lid - fstVtx_loc_lid] );
-      PS->fill_pelt[3] += (float) stat;
-      if ((float) stat > PS->fill_pelt[4]) PS->fill_pelt[4] = (float) stat;
-      PS->fill_pelt[5] += 1.;
-    }     
-
-    /* compute number of artificial zeros */
-    neltsTotal = 0;
-    neltsZr = 0;
-    neltsZrSn_L    = neltsVtx_L - neltsMatched_L;
-    neltsZrSn_U    = neltsVtx_U - neltsMatched_U;
-    neltsZrSn      = neltsZrVtx_L + neltsZrVtx_U +
-      (neltsZrSn_L + neltsZrSn_U) * szsn;
-    neltsZr_tmp    = neltsZr + neltsZrSn;
-    neltsTotal_tmp = neltsTotal + neltsZrSn + neltsVtx_L + neltsVtx_U;
-    if (neltsTotal_tmp == 0)
-      neltsTotal_tmp = 1;
-    relax_param = (float) (neltsTotal_tmp - neltsZr_tmp) / neltsTotal_tmp;
-
-#ifdef TEST_SYMB
-    printf ("[%d] vtx %d pr %d szsn %d nVtx_L %d nZrSn_L %d nZrVtx_L %d\n",
-	    iam, vtx, prval_curvtx, szsn,neltsVtx_L, neltsZrSn_L, neltsZrVtx_L);
-    printf ("  [%d] nVtx_U %d, nZrSn_U %d nZrVtx_U %d nextl %d nextu %d\n",
-	    iam, neltsVtx_U, neltsZrSn_U, neltsZrVtx_U, nextl, nextu);
-    printf ("  [%d] nZr %d nZr_tmp %d nTot %d nTot_tmp %d rel %f test %d\n\n", 
-	    iam, neltsZr, neltsZr_tmp, neltsTotal, neltsTotal_tmp,
-	    relax_param, i);
-#endif
-
-    /* Check to see if vtx belongs in the same supernode as vtx-1 */
-    supno[vtx_lid] = nsuper_loc;
-    if (vtx == fstVtx_blk) {
-      prval_cursn = prval_curvtx;
-      neltsTotal += neltsVtx_L + neltsVtx_U;
-    }
-    else {
-      if (maxNeltsVtx > 0) {
-	relax_seps = (float) neltsVtx_L / (float) maxNeltsVtx;
-	relax_seps *= (float) (neltsVtx_U+1) / (float) maxNeltsVtx;
-      } 
-      else
-	relax_seps = 0.0;
-
-      /* check if all upper separators are dense */
-      if (relax_seps >= PS->relax_seps ) {
-	VInfo->filledSep = FILLED_SEPS; 
-	*p_nextl      = xlsub[vtx_lid];
-	*p_nextu      = xusub[vtx_lid];
-	nsuper_loc   += 1;
-	*p_nsuper_loc = nsuper_loc;
-	if (mem_error =
-	    dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, 
-				sizes, fstVtxSep, vtx,
-				Llu_symbfact, Pslu_freeable, VInfo, CS, PS,
-				p_nextl, p_nextu, p_nsuper_loc))
-	  return (mem_error);
-	/* set up neltsZr and neltsTotal */
-	vtx = lstVtx_blk;
-	return 0;
-      } /* if all upper separators are dense */
-      else {
-	if (relax_param >= PS->relax_gen) {
-	  /* vertex belongs to the same supernode */
-	  if (prval_cursn > prval_curvtx || prval_cursn <= vtx)
-	    prval_cursn = prval_curvtx;
-	  neltsZr    = neltsZr_tmp;
-	  neltsTotal = neltsTotal_tmp;
-	  szsn ++;
-	  /* add artificial zeros at the structure of current supernode */
-	  newnext = xlsub[snrep_lid+1];
-	  if (neltsZrSn_L != 0) {
-	    for (k = xlsub[snrep_lid]; k < xlsub[snrep_lid+1]; k++) {
-	      vtx_elt = lsub[k];
-	      if (vtx_elt >= vtx) 
-		marker[vtx_elt] = markl2_vtx;
-	    }
-	    for (k = xlsub[vtx_lid]; k < nextl; k++) {
-	      vtx_elt = lsub[k];
-	      if (marker[vtx_elt] != markl2_vtx) {
-		/* add vtx_elt to the structure of snrep */
-		lsub[newnext] = vtx_elt; newnext ++;
-		marker[vtx_elt] = markl2_vtx;
-	      }
-	    }
-	    xlsub[snrep_lid+1] = newnext;
-	  }
-	  xlsub[vtx_lid] = newnext;
-	  nextl = newnext;
-	  neltsVtx_L += neltsZrVtx_L;
-	  
-	  newnext = xusub[snrep_lid+1];
-	  if (neltsZrSn_U != 0) {
-	    for (k = xusub[snrep_lid]; k < xusub[snrep_lid+1]; k++) {
-	      vtx_elt = usub[k];
-	      if (vtx_elt >= vtx) {
-		if (marker[vtx_elt] == markl2_vtx)
-		  if (prval_cursn > vtx_elt && vtx_elt != vtx)
-		    prval_cursn = vtx_elt;
-		marker[vtx_elt] = marku2_vtx;
-	      }
-	    }
-	    for (k = xusub[vtx_lid]; k < nextu; k++) {
-	      vtx_elt = usub[k];
-	      if (marker[vtx_elt] != marku2_vtx) {
-		/* add vtx_elt to the structure of snrep */
-		usub[newnext] = vtx_elt; newnext ++;
-		if (marker[vtx_elt] == markl2_vtx)
-		  if (prval_cursn > vtx_elt && vtx_elt != vtx)
-		    prval_cursn = vtx_elt;
-		marker[vtx_elt] = marku2_vtx;
-	      }
-	    }
-	    if (marker[vtxp1] == marku2_vtx)
-	      vtx_bel_snU = vtxp1;
-	    xusub[snrep_lid+1] = newnext;
-	  }
-	  xusub[vtx_lid] = newnext;
-	  nextu = newnext;
-	  neltsVtx_U += neltsZrVtx_U;
-	}  /* if ( relax_param >= PS->relax_param) */
-      }  /* if (VInfo->filledSep != FILLED_SEPS) */
-    } /* if (vtx != fstVtx_blk) */
-
-    if ((relax_param < PS->relax_gen || vtx == lstVtx_blk-1) 
-	&& VInfo->filledSep != FILLED_SEPS) {
-      /* if a new supernode starts or is the last vertex */
-      /* vtx starts a new supernode. Note we only store the
-       * subscript set of the first column of a supernode.  */
-      
-      if (marker[vtxp1] == marku1_vtx)
-	vtx_bel_snU = vtxp1;
-      /* build the pruned structure */
-      if (relax_param < PS->relax_gen
-	  && vtx == lstVtx_blk - 1 && vtx != fstVtx_blk) 
-	szLp = 2;
-      else
-	szLp = 1;
-      if (vtx == fstVtx_blk) {
-	xlsub_snp1 = nextl;
-	xusub_snp1 = nextu;
-      }
-      else {
-	xlsub_snp1 = xlsub[snrep_lid+1];
-	xusub_snp1 = xusub[snrep_lid+1];	
-      }
-      while (szLp > 0) {
-	szLp --;
-#ifdef TEST_SYMB
-	printf ("End sn %d szsn %d\n", nsuper_loc, szsn);
-	printf ("BLD pr vtx %d snrep %d prval %d szLp %d\n",
-		vtx, snrep, prval_cursn, szLp);
-#endif
-	
-	update_prGraph (iam, n, fstVtx_blk, lstVtx_blk,
-			snrep_lid, pr_offset, prval_cursn,
-			xlsub_snp1, 1,
-			Pslu_freeable, Llu_symbfact, PS);
-	update_prGraph (iam, n, fstVtx_blk, lstVtx_blk,
-			snrep_lid, pr_offset, prval_cursn,
-			xusub_snp1, 0,
-			Pslu_freeable, Llu_symbfact, PS);
-
-#ifdef TEST_SYMB
-	printf ("Adr lsub %p usub %p lsub %p pos %d usub %p pos %d\n", 
-		&(lsub[xlsub[snrep_lid]]), &(usub[xusub[snrep_lid]]),
-		lsub, xlsub[snrep_lid], usub, xusub[snrep_lid]);
-	PrintInt10 ("Lsn", xlsub_snp1 - xlsub[snrep_lid],
-		    &(lsub[xlsub[snrep_lid]]));
-	PrintInt10 ("Usn", xusub_snp1 - xusub[snrep_lid],
-		    &(usub[xusub[snrep_lid]]));
-#endif
-
-	if (prval_cursn >= lstVtx_blk) {
-	  neltSn_L = xlsub_snp1 - xlsub[snrep_lid];
-	  neltSn_U = xusub_snp1 - xusub[snrep_lid];
-	  if (ind_sizes1 != 0) {
-	    CS->snd_intraSz += neltSn_L + neltSn_U + 4;
-	    CS->snd_LintraSz += neltSn_L + 2;
-	  }
-	  if (prval_cursn >= lstVtx) {
-	    /* this supernode will be send to next layers of the tree */
-	    lvl_tmp = lvl;
-	    ii = ind_sizes1;
-	    jj = ind_sizes2;
-	    szSep_tmp = szSep;
-	    lstVtx_tmp = lstVtx;
-	    while (prval_cursn >= lstVtx_tmp && szSep_tmp != 1) {
-	      jj = ii + szSep_tmp + (jj - ii) / 2;
-	      ii += szSep_tmp;
-	      lvl_tmp ++;
-	      szSep_tmp = szSep_tmp / 2;
-	      lstVtx_tmp = fstVtxSep[jj] + sizes[jj];
-	      CS->snd_interSz[lvl_tmp] += neltSn_L + neltSn_U + 4;
-	      CS->snd_LinterSz[lvl_tmp] += neltSn_L + 2;
-	      if (CS->snd_vtxinter[lvl_tmp] == EMPTY)
-		CS->snd_vtxinter[lvl_tmp] = snrep;
-	    }
-	  }
-	}
-	snrep = vtx;
-	snrep_lid = vtx_lid;
-	prval_cursn = prval_curvtx;
-	szsn        = 1;
-	xlsub_snp1  = nextl;
-	xusub_snp1  = nextu;
-      }
-      if (relax_param < PS->relax_gen) {
-	neltsTotal += neltsVtx_L + neltsVtx_U;
-	nsuper_loc ++;	
-	supno[vtx_lid] = nsuper_loc;
-	if (marker[vtxp1] == marku1_vtx)
-	  vtx_bel_snU = vtxp1;
-	else
-	  vtx_bel_snU = EMPTY;
-      }
-    }
-    if (vtx == lstVtx_blk - 1)
-      nsuper_loc ++;
-    
-    /* check if current separator is dense */
-    if (!VInfo->filledSep) {
-      relax_seps = (float) neltsVtx_CSep_L / (float) (lstVtx - vtx);
-      relax_seps *= (float) (neltsVtx_CSep_U+1) / (float) (lstVtx - vtx);
-      if (relax_seps >= PS->relax_curSep ) 
-	VInfo->filledSep = FILLED_SEP;
-    }
-    maxNeltsVtx --;
-  }
-    
-  *p_mark       = marku2_vtx + 1;
-  *p_nextl      = nextl;
-  *p_nextu      = nextu;
-  *p_neltsZr    = neltsZr;
-  *p_neltsTotal = neltsTotal;
-  *p_nsuper_loc = nsuper_loc;
-
-  return 0;
-}
-
-static void
-domain_symbfact
-(SuperMatrix *A,
- int   iam,        /* Input - my processor number */  
- int   lvl,        /* Input - current level in the separator tree */
- int   szSep,      /* Input - size of the current separator (node) */
- int   ind_sizes1,
- int   ind_sizes2, 
- int_t *sizes,     /* Input - sizes of each node in the separator tree */
- int_t *fstVtxSep, /* Input - first vertex of each node in the tree */
- int_t fstVtx,     /* Input - first vertex of current node */ 
- int_t lstVtx,     /* Input - last vertex of current node */ 
- Pslu_freeable_t *Pslu_freeable,   /* global LU data structures (modified) */
- Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
- vtcsInfo_symbfact_t *VInfo,  /* Input/Output - local info on vertices distribution */
- comm_symbfact_t *CS,
- psymbfact_stat_t *PS,
- int_t *marker,
- int_t *p_mark,    /* marker used to merge elements of vertices */
- int_t *p_nextl,   /* ptr to nextl in lsub structure */
- int_t *p_nextu,   /* ptr to nextu in usub structure */
- int_t *p_neltsZr, /* no of artificial zeros introduced so far */
- int_t *p_neltsTotal, /* no of nonzeros (including artificials) 
-			 computed so far */
- int_t *p_nsuper_loc
- )
-{
-  int_t lstVtx_lid, maxNvtcsPProc; 
-
-  /* call blk_symbfact */
-  blk_symbfact (A, iam, lvl, 
-		szSep, ind_sizes1, ind_sizes2, sizes, fstVtxSep,
-		EMPTY, fstVtx, lstVtx, 
-		NULL, EMPTY, NULL, EMPTY,
-		Pslu_freeable, Llu_symbfact, VInfo, CS, PS,
-		marker, p_mark,
-		p_nextl, p_nextu, p_neltsZr, p_neltsTotal, 
-		p_nsuper_loc);
-
-  if (VInfo->filledSep != FILLED_SEPS) {
-    maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-    if (fstVtx >= lstVtx)
-      lstVtx_lid = 0;
-    else 
-      lstVtx_lid = LOCAL_IND( Pslu_freeable->globToLoc[lstVtx-1] ) + 1;
-    VInfo->xlsub_nextLvl  = Llu_symbfact->xlsub[lstVtx_lid];
-    Llu_symbfact->xlsub[lstVtx_lid] = *p_nextl;
-    VInfo->xusub_nextLvl  = Llu_symbfact->xusub[lstVtx_lid];
-    Llu_symbfact->xusub[lstVtx_lid] = *p_nextu;
-  }
-  VInfo->maxNeltsVtx -= lstVtx - fstVtx;
-}
-
-/*
- * Compute counts of rows/columns of current separator.
- * cntelt_vtcs[i] is 0 when i is nonzero before current separator
- * and n when i is zero before current separator.
- *
- * Set up nvtcsLvl_loc.
- */
-static void
-initLvl_symbfact
-(
- int_t n,       /* Input - order of the matrix */
- int   iam,     /* Input - my processor number */
- int_t fstVtx,  /* Input - first vertex of current node */   
- int_t lstVtx,  /* Input - last vertex of current node */   
- Pslu_freeable_t *Pslu_freeable,
- Llu_symbfact_t *Llu_symbfact, /* Input/Output - local L, U data structures */
- vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */
- psymbfact_stat_t *PS,
- MPI_Comm ndComm,
- int_t  *marker,
- int_t  nextl,
- int_t  nextu
- ) 
-{
-  int_t *cntelt_vtcs, x_aind_beg, x_aind_end, x_aind_beg_l, x_aind_beg_u,
-    nelts_asup, nelts_ainf;
-  int_t nvtcsLvl_loc, fstVtx_loc, fstVtx_loc_lid, fstVtx_nextLvl;
-  int_t curblk_loc, nblks_loc, ind_blk;
-  int_t *lsub, *xlsub, *usub, *xusub;
-  int_t *begEndBlks_loc, code_err, mem_error;
-  int_t i, j, k, vtx, vtx_lid, fstVtx_blk, lstVtx_blk, vtx_elt, p, fill;
-  int_t nelts, nelts_fill_l, nelts_fill_u, nelts_cnts, maxNvtcsPProc, *globToLoc;
-  int_t use_fillcnts, cntelt_vtx_l, cntelt_vtx_u;
-  MPI_Status status;
-  
-  fill = PS->fill_par;
-  VInfo->filledSep = FALSE;
-  
-  /* Initializations */
-  maxNvtcsPProc  = Pslu_freeable->maxNvtcsPProc;
-  globToLoc      = Pslu_freeable->globToLoc;
-  curblk_loc     = VInfo->curblk_loc;
-  nblks_loc      = VInfo->nblks_loc;
-  begEndBlks_loc = VInfo->begEndBlks_loc;
-  cntelt_vtcs    = Llu_symbfact->cntelt_vtcs;
-  lsub    = Llu_symbfact->lsub;   xlsub    = Llu_symbfact->xlsub;
-  usub    = Llu_symbfact->usub;   xusub    = Llu_symbfact->xusub;
-  
-  /* compute nvtcsLvl_loc */
-  nvtcsLvl_loc = 0;  
-  ind_blk = curblk_loc;
-  while (fstVtx > begEndBlks_loc[ind_blk] && ind_blk < 2 * nblks_loc) {
-    ind_blk += 2;
-  }
-  curblk_loc = ind_blk;
-  fstVtx_loc = begEndBlks_loc[ind_blk];
-  while (begEndBlks_loc[ind_blk] < lstVtx && ind_blk < 2 * nblks_loc) {
-    nvtcsLvl_loc += begEndBlks_loc[ind_blk + 1] - 
-      begEndBlks_loc[ind_blk];
-    ind_blk += 2;
-  }
-  fstVtx_nextLvl = begEndBlks_loc[ind_blk];
-  VInfo->nvtcsLvl_loc = nvtcsLvl_loc;
-  VInfo->curblk_loc = curblk_loc;
-  
-  fstVtx_loc_lid = LOCAL_IND( globToLoc[fstVtx_loc] );
-  vtx_lid      = fstVtx_loc_lid;
-  x_aind_beg_l = VInfo->xlsub_nextLvl;
-  x_aind_beg_u = VInfo->xusub_nextLvl;
-  nelts_cnts   = 0;
-  nelts_fill_l = 0;
-  nelts_fill_u = 0;
-  ind_blk      = curblk_loc;
-  
-  while (begEndBlks_loc[ind_blk] < lstVtx && ind_blk < 2 * nblks_loc) {
-    fstVtx_blk = begEndBlks_loc[ind_blk];
-    lstVtx_blk = begEndBlks_loc[ind_blk + 1];
-    ind_blk += 2;
-    for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid ++) 
-      nelts_cnts += cntelt_vtcs[vtx_lid];      
-    nelts_fill_l += fill * (xlsub[vtx_lid] - x_aind_beg_l);
-    nelts_fill_u += fill * (xusub[vtx_lid] - x_aind_beg_u);
-    x_aind_beg_l = xlsub[vtx_lid];
-    x_aind_beg_u = xusub[vtx_lid];
-  }
-
-  if (nvtcsLvl_loc != 0) {
-    nelts_ainf = xlsub[vtx_lid] - VInfo->xlsub_nextLvl;
-    nelts_asup = xusub[vtx_lid] - VInfo->xusub_nextLvl;
-  }
-  else {
-    nelts_ainf = 0;
-    nelts_asup = 0;
-  }
-  
-  use_fillcnts = FALSE;
-  if (nextl + nelts_cnts >= Llu_symbfact->szLsub - nelts_ainf ||
-      nextu + nelts_cnts >= Llu_symbfact->szUsub - nelts_asup) { 
-    use_fillcnts = TRUE;
-  }
- 
-  use_fillcnts = TRUE; 
-  
-  if (use_fillcnts) {
-    if (nextl + nelts_fill_l >= Llu_symbfact->szLsub - nelts_ainf)
-      mem_error = 
-	psymbfact_LUXpandMem (iam, n, fstVtx, nextl,
-			      nextl + nelts_fill_l, LSUB,
-			      RL_SYMB, 1, 
-			      Pslu_freeable, Llu_symbfact, VInfo, PS);
-    lsub = Llu_symbfact->lsub;
-    if (nextu + nelts_fill_u >= Llu_symbfact->szUsub - nelts_asup) 
-      mem_error = 
-	psymbfact_LUXpandMem (iam, n, fstVtx, nextu,
-			      nextu + nelts_fill_u, USUB,
-			      RL_SYMB, 1, 
-			      Pslu_freeable, Llu_symbfact, VInfo, PS);      
-    usub = Llu_symbfact->usub;
-  }
-
-  /* init xlsub[fstVtx:lstVtx] and xusub[fstVtx:lstVtx] and
-     copy elements of A[fstVtx:lstVtx, fstVtx:lstVtx] in lsub and usub */
-  fstVtx_loc_lid = LOCAL_IND( globToLoc[fstVtx_loc] );
-  x_aind_beg_l = VInfo->xlsub_nextLvl;
-  x_aind_beg_u = VInfo->xusub_nextLvl;
-  vtx_lid = fstVtx_loc_lid;
-  ind_blk = curblk_loc;
-
-  while (begEndBlks_loc[ind_blk] < lstVtx && ind_blk < 2 * nblks_loc) {
-    fstVtx_blk = begEndBlks_loc[ind_blk];
-    lstVtx_blk = begEndBlks_loc[ind_blk + 1];
-    ind_blk += 2;
-
-    for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid ++) {
-      if (vtx_lid != fstVtx_loc_lid) {
-	x_aind_beg_l = xlsub[vtx_lid];
-	x_aind_beg_u = xusub[vtx_lid];
-      }
-      if (use_fillcnts) {
-	cntelt_vtx_l = fill * (xlsub[vtx_lid+1] - x_aind_beg_l);
-	cntelt_vtx_u = fill * (xusub[vtx_lid+1] - x_aind_beg_u);
-      }
-      else {
-	cntelt_vtx_l = cntelt_vtcs[vtx_lid];
-	cntelt_vtx_u = cntelt_vtcs[vtx_lid];
-      }
-      x_aind_end = xlsub[vtx_lid + 1];
-      Llu_symbfact->cntelt_vtcsA_lvl[vtx_lid - fstVtx_loc_lid] = 
-	CEILING( (xlsub[vtx_lid+1]-x_aind_beg_l + xusub[vtx_lid+1]-x_aind_beg_u), 2);
-      
-      xlsub[vtx_lid] = nextl;
-      nelts = 0;
-      for (k = x_aind_beg_l; k < x_aind_end; k++) {
-	lsub[nextl] = lsub[k]; nextl ++;
-	nelts ++;
-      }
-      if (nelts < cntelt_vtx_l) 
-	lsub[nextl] = EMPTY; 
-      nextl += cntelt_vtx_l - nelts;
-      x_aind_end = xusub[vtx_lid + 1];
-      xusub[vtx_lid] = nextu;
-      nelts = 0;
-      for (k = x_aind_beg_u; k < x_aind_end; k++) {
-	usub[nextu] = usub[k]; nextu ++;
-	nelts ++;
-      }
-      if (nelts < cntelt_vtx_u) 
-	usub[nextu] = EMPTY; 
-      nextu += cntelt_vtx_u - nelts;
-    }
-  }
- 
-  if (nvtcsLvl_loc == 0) {
-    if (curblk_loc == 0)
-      vtx_lid = 0;
-    else {
-      if (begEndBlks_loc[curblk_loc-1] == 0)
-	vtx_lid = 0;
-      else
-	vtx_lid = LOCAL_IND( globToLoc[begEndBlks_loc[curblk_loc-1] - 1] ) + 1;
-    }
-
-    xlsub[vtx_lid] = nextl;
-    xusub[vtx_lid] = nextu;
-  }
-  else {
-    VInfo->xlsub_nextLvl   = xlsub[vtx_lid];
-    xlsub[vtx_lid] = nextl;
-    VInfo->xusub_nextLvl   = xusub[vtx_lid];
-    xusub[vtx_lid] = nextu;
-    if (PS->estimLSz < nextl)
-      PS->estimLSz = nextl;
-    if (PS->estimUSz < nextu)
-      PS->estimUSz = nextu;
-    
-    VInfo->nnz_ainf_loc -= nelts_ainf;
-    VInfo->nnz_asup_loc -= nelts_asup;
-  }
-  VInfo->fstVtx_nextLvl = fstVtx_nextLvl;
-}
-
-
-static int_t
-expand_RL 
-(
- int_t computeRcvd, /* if = 1, then update from receive buffer,
-		       else update from own data */
- int_t n,
- int   iam,       /* process number */
- int_t *lsub_rcvd,      /* elements of node */
- int_t lsub_rcvd_sz,    /* size of sub to be explored */
- int_t *usub_rcvd,  
- int_t usub_rcvd_sz,
- int_t vtxXp,
- int_t vtx_upd_pr,    /* ind in pruned structure of upd vertex which 
-			 doesn't fit into the alloc memory */
- int_t lstVtx_upd_pr, /* ind in pruned structure of lst vtx to update */
- int_t fstVtx_srcUpd, /* first vertex source of the updates */
- int_t lstVtx_srcUpd, /* last vertex source of the updates */
- int_t fstVtx_toUpd,  /* first vertex to update */
- int_t lstVtx_toUpd,  /* last vertex to update */
- int_t nvtcs_toUpd,   /* no of vertices to update */
- int   computeL,
- int_t *pmarkl,
- int_t *marker,
- Pslu_freeable_t *Pslu_freeable,
- Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
- vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */
- psymbfact_stat_t *PS
- )
-{
-  int_t fstVtx_toUpd_lid, vtx_lid, vtx, vtx_elt, vtx_elt_lid, nextl, nelts_in;
-  int_t i, ii, j, nelts, nelts_vtx, mpnelts, lvtx_lid, elt, vtxXp_lid;
-  int_t *xusubPr, *usubPr, *xlsub, *lsub, *xusub, *usub;
-  int_t markl, *globToLoc, maxNvtcsPProc;
-  int_t mem_error, len_texp;
-  
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  globToLoc     = Pslu_freeable->globToLoc;
-
-  xusubPr = Llu_symbfact->xlsubPr; usubPr  = Llu_symbfact->lsubPr;
-  if (computeL) {
-    xlsub   = Llu_symbfact->xlsub;   lsub    = Llu_symbfact->lsub;
-    xusub   = Llu_symbfact->xusub;   usub    = Llu_symbfact->usub;
-  }
-  else {
-    xlsub   = Llu_symbfact->xusub;   lsub    = Llu_symbfact->usub;
-    xusub   = Llu_symbfact->xlsub;   usub    = Llu_symbfact->lsub;
-  }
-  markl = *pmarkl + 1;
-  fstVtx_toUpd_lid = LOCAL_IND( globToLoc[fstVtx_toUpd] );
-  vtxXp_lid = LOCAL_IND( globToLoc[vtxXp] );
-  nextl = xlsub[vtxXp_lid+1];
-    
-  lvtx_lid = EMPTY;
-  if (lstVtx_srcUpd != EMPTY)
-    lvtx_lid = LOCAL_IND( globToLoc[lstVtx_srcUpd - 1] );
-
-  /* count the number of new elements, and update Llu_symbfact->cntelt_vtcs */
-  vtx_lid = fstVtx_toUpd_lid;
-  vtx_lid += vtx_upd_pr;
-  len_texp = 0;
-  for (i = vtx_upd_pr; i < lstVtx_upd_pr; i++, vtx_lid ++) { 
-    nelts_vtx = xlsub[vtx_lid+1] - xlsub[vtx_lid];
-    if (xusubPr[i] != xusubPr[i+1]) {
-      j = xusubPr[i]; 
-      vtx = usubPr[j];
-      /* setup marker structure for already existing elements */
-      ii = xlsub[vtx_lid];
-      while (lsub[ii] != EMPTY && ii < xlsub[vtx_lid + 1]) {
-	marker[lsub[ii]] = markl;
-	ii ++;
-      }
-      nelts_vtx = ii - xlsub[vtx_lid];
-      for (j = xusubPr[i] + 1; j < xusubPr[i+1]; j++) {
-	vtx_elt = usubPr[j];
-	ii = marker[vtx_elt];
-	if (computeRcvd) {
-	  nelts = lsub_rcvd[ii + NELTS_IND];
-	  ii += RCVD_IND;
-	  mpnelts = marker[vtx_elt] + nelts + RCVD_IND;
-	}
-	else {
-	  vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] );
-	  if (vtx_elt_lid == lvtx_lid)
-	    nelts = lsub_rcvd_sz - ii;
-	  else
-	    nelts = xlsub[vtx_elt_lid+1] - xlsub[vtx_elt_lid];
-	  mpnelts = marker[vtx_elt] + nelts;
-	}
-	
-	if (!computeL)
-	  marker[vtx] = markl;
-	for (ii; ii < mpnelts; ii++) {
-	  elt = lsub_rcvd[ii];
-	  if (elt >= vtx) {
-	    if (marker[elt] != markl) {
-	      /* add elt to structure of vtx */
-	      marker[elt] = markl;
-	      nelts_vtx ++;
-	    }
-	  }
-	}
-      }
-      if (nelts_vtx != 0 && (nelts_vtx > xlsub[vtx_lid+1] - xlsub[vtx_lid])) {
-	nelts_in = xlsub[vtx_lid+1] - xlsub[vtx_lid];
-	if (nelts_in == 0) nelts_in = 1;
-	j = nelts_vtx / nelts_in;
-	if (nelts_vtx % nelts_in != 0) j++;
-	nelts_vtx = j * nelts_in;
-      }
-      else
-	nelts_vtx = xlsub[vtx_lid+1] - xlsub[vtx_lid];
-      markl ++;
-      if (markl == n) {
-	/* reset marker array */
-	for (j = fstVtx_toUpd; j < n; j++)
-	  marker[j] = EMPTY;
-	markl = 0;
-      }
-    }
-    Llu_symbfact->cntelt_vtcs[vtx_lid] = nelts_vtx;
-    len_texp += nelts_vtx;
-  }
-  for (; i < nvtcs_toUpd; i++, vtx_lid++) {
-    nelts_vtx = xlsub[vtx_lid+1] - xlsub[vtx_lid];    
-    Llu_symbfact->cntelt_vtcs[vtx_lid] = nelts_vtx;
-    len_texp += nelts_vtx;
-  }
-
-  *pmarkl = markl;
-  /* mark elements array */
-  for (i = xlsub[vtxXp_lid]; i < nextl; i++) {
-    marker[lsub[i]] = markl;
-  }
-
-  nextl = xlsub[vtxXp_lid+1];  
-  if (mem_error = 
-      psymbfact_LUXpand_RL (iam, n, vtxXp, nextl, len_texp, 
-			    computeL, Pslu_freeable, Llu_symbfact, VInfo, PS))
-    return (mem_error);		
-
-  return 0;
-}
-
-
-static int_t
-rl_update
-(
- int   computeRcvd, /* if = 1, then update from receive buffer,
-		       else update from own data */
- int_t n,
- int   iam,       /* process number */
- int_t *lsub_rcvd,      /* elements of node */
- int_t lsub_rcvd_sz,    /* size of sub to be explored */
- int_t *usub_rcvd,  
- int_t usub_rcvd_sz,
- int_t fstVtx_srcUpd, /* first vertex source of the updates */
- int_t lstVtx_srcUpd, /* last vertex source of the updates */
- int_t indBlk_srcUpd, /* block index of first vertex */
- int_t fstVtx_toUpd,  /* first vertex to update */
- int_t lstVtx_toUpd,  /* last vertex to update */
- int_t nvtcs_toUpd,   /* no of vertices to update */
- int   computeL,
- int_t *pmarkl,
- int_t *marker,
- Pslu_freeable_t *Pslu_freeable,
- Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
- vtcsInfo_symbfact_t *VInfo,  /* Input/Output - local info on vertices distribution */
- psymbfact_stat_t *PS
- /*  marker: first elements of marker contain the nodes that will
-     be used in the updates */
- )
-{
-  int_t i, j, k, prVal, nelts, ind, nextl, ii, mpnelts, mem_error;
-  int_t vtx, vtx_lid, vtx_elt, vtx_elt_lid, lvtx_lid;
-  int_t fstVtx_toUpd_lid, markl, elt, vtx_loc, ind_blk;
-  int_t *xusubPr, *usubPr, *xlsub, *lsub, *xusub, *usub;
-  int_t fstVtx_upd, lstVtx_upd, maxNvtcsPProc, *globToLoc;
-  int_t fstVtx_srcUpd_lid, nelts_vtx, expand;
-  
-  /* quick return */
-  if (fstVtx_toUpd >= lstVtx_toUpd)
-    return;
-
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  globToLoc     = Pslu_freeable->globToLoc;
-  
-  fstVtx_upd = EMPTY;
-  lstVtx_upd = EMPTY;
-  xusubPr = Llu_symbfact->xlsubPr; usubPr  = Llu_symbfact->lsubPr;
-  if (computeL) {
-    xlsub   = Llu_symbfact->xlsub;   lsub    = Llu_symbfact->lsub;
-    xusub   = Llu_symbfact->xusub;   usub    = Llu_symbfact->usub;
-  }
-  else {
-    xlsub   = Llu_symbfact->xusub;   lsub    = Llu_symbfact->usub;
-    xusub   = Llu_symbfact->xlsub;   usub    = Llu_symbfact->lsub;
-  }
-  markl = *pmarkl;
-  fstVtx_toUpd_lid = LOCAL_IND( globToLoc[fstVtx_toUpd] );
-
-  /* count number of elements in transpose representation of usub_rcvd */
-  /* use marker to count those elements */
-  for (i = 0; i < nvtcs_toUpd; i++)
-    marker[i] = 0;
-  
-  i = 0;
-  if (fstVtx_srcUpd != EMPTY) {
-    fstVtx_srcUpd_lid = LOCAL_IND( globToLoc[fstVtx_srcUpd] );
-    vtx_lid = fstVtx_srcUpd_lid;
-  }
-  lvtx_lid = EMPTY;
-  if (lstVtx_srcUpd != EMPTY)
-    lvtx_lid = LOCAL_IND( globToLoc[lstVtx_srcUpd - 1] );
-  
-  while (i < usub_rcvd_sz) {
-    if (computeRcvd) {
-      vtx   = usub_rcvd[i + DIAG_IND];
-      nelts = usub_rcvd[i + NELTS_IND];
-      i += RCVD_IND;
-    }
-    else {
-      if (vtx_lid == lvtx_lid)
-	nelts = usub_rcvd_sz - i;
-      else
-	nelts = xusub[vtx_lid + 1] - xusub[vtx_lid];
-      vtx_lid ++;
-    }
-    prVal = usub_rcvd[i];
-    for (k = i; k < i + nelts; k++) {
-      vtx_elt = usub_rcvd[k];
-      if (vtx_elt > prVal)
-	k = i + nelts;
-      else {
-	if (OWNER( globToLoc[vtx_elt] ) == iam) {
-	  if (vtx_elt >= fstVtx_toUpd && vtx_elt < lstVtx_toUpd) {
-	    vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] ) - 
-	      fstVtx_toUpd_lid;
-	    marker[vtx_elt_lid] ++;
-	  }
-	}
-      }
-    }
-    i += nelts;
-  }
-
-  ind = 0;
-  for (i = 0; i < nvtcs_toUpd; i++) {
-    if (marker[i] != 0) {
-      marker[i] ++;
-      if (fstVtx_upd == EMPTY)
-	fstVtx_upd = i;
-      lstVtx_upd = i;
-    }
-    xusubPr[i] = ind;
-    ind += marker[i];
-    marker[i] = xusubPr[i];
-  }
-  xusubPr[i] = ind;
-  lstVtx_upd ++;
-
-  if (ind == 0) 
-    /* quick return if no update */
-    return;
-
-  /* test if enough memory in usubPr array */
-  if (ind > Llu_symbfact->szLsubPr) {
-    if (mem_error = 
-	psymbfact_prLUXpand (iam, ind, LSUB_PR, Llu_symbfact, PS))
-      return (mem_error);
-    usubPr  = Llu_symbfact->lsubPr;
-  }
-  
-  i = 0;
-  if (fstVtx_srcUpd != EMPTY) {
-    vtx_loc = fstVtx_srcUpd;
-    vtx_lid = LOCAL_IND( globToLoc[vtx_loc] );
-    ind_blk = indBlk_srcUpd;
-  }
-  while (i < usub_rcvd_sz) {
-    if (computeRcvd) {
-      vtx   = usub_rcvd[i + DIAG_IND];
-      nelts = usub_rcvd[i + NELTS_IND];
-      i += RCVD_IND;
-    }
-    else {
-      vtx = vtx_loc;
-      if (vtx_lid == lvtx_lid)
-	nelts = usub_rcvd_sz - i;
-      else
-	nelts = xusub[vtx_lid + 1] - xusub[vtx_lid];
-      vtx_lid ++;
-      vtx_loc ++;
-      if (ind_blk != EMPTY)
-	if (vtx_loc == VInfo->begEndBlks_loc[ind_blk+1]) {
-	  ind_blk += 2;
-	  vtx_loc = VInfo->begEndBlks_loc[ind_blk];
-	}
-    }
-
-    prVal = usub_rcvd[i];
-    for (k = i; k < i + nelts; k++) {
-      vtx_elt = usub_rcvd[k];
-      if (vtx_elt > prVal)
-	k = i + nelts;
-      else {
-	if (OWNER( globToLoc[vtx_elt]) == iam) {
-	  if (vtx_elt >= fstVtx_toUpd && vtx_elt < lstVtx_toUpd) {
-	    vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] ) - fstVtx_toUpd_lid;
-	    /* add vtx_elt to the pruned structure */
-	    if (marker[vtx_elt_lid] == xusubPr[vtx_elt_lid]) {
-	      usubPr[marker[vtx_elt_lid]] = vtx_elt;
-	      marker[vtx_elt_lid] ++;
-	    }
-	    usubPr[marker[vtx_elt_lid]] = vtx;
-	    marker[vtx_elt_lid] ++;
-	  }
-	}
-      }
-    }
-    i += nelts;
-  }
-  /* reset marker array */
-  for (i = 0; i < nvtcs_toUpd; i++)
-    marker[i] = EMPTY;
-  if (fstVtx_srcUpd != EMPTY) {
-    vtx_loc = fstVtx_srcUpd;
-    vtx_lid = LOCAL_IND( globToLoc[vtx_loc] );
-    ind_blk = indBlk_srcUpd;
-  }
-  i = 0;
-  while (i < lsub_rcvd_sz) {
-    if (computeRcvd) {
-      vtx   = lsub_rcvd[i + DIAG_IND];
-      nelts = lsub_rcvd[i + NELTS_IND];
-      marker[vtx] = i;
-      i += RCVD_IND;
-    }
-    else {
-      vtx = vtx_loc;
-      if (vtx_lid == lvtx_lid)
-	nelts = lsub_rcvd_sz - i;
-      else
-	nelts = xlsub[vtx_lid + 1] - xlsub[vtx_lid];
-      vtx_lid ++;
-      marker[vtx] = i;
-      vtx_loc ++;
-      if (ind_blk != EMPTY)
-	if (vtx_loc == VInfo->begEndBlks_loc[ind_blk+1]) {
-	  ind_blk += 2;
-	  vtx_loc = VInfo->begEndBlks_loc[ind_blk];
-	}
-    }
-    i += nelts;
-  }
-
-  /* use the pruned structure to update symbolic factorization */
-  vtx_lid = fstVtx_toUpd_lid;
-  vtx_lid += fstVtx_upd;
-  for (i = fstVtx_upd; i < lstVtx_upd; i++, vtx_lid ++) { 
-    if (xusubPr[i] != xusubPr[i+1]) {
-      j = xusubPr[i]; 
-      vtx = usubPr[j];
-      /* setup marker structure for already existing elements */
-      ii = xlsub[vtx_lid];
-      while (lsub[ii] != EMPTY && ii < xlsub[vtx_lid + 1]) {
-	marker[lsub[ii]] = markl;
-	ii ++;
-      }
-      PS->nops += ii - xlsub[vtx_lid];
-      nextl = ii;
-      for (j = xusubPr[i] + 1; j < xusubPr[i+1]; j++) {
-	vtx_elt = usubPr[j];
-	ii = marker[vtx_elt];
-	if (computeRcvd) {
-	  nelts = lsub_rcvd[ii + NELTS_IND];
-	  ii += RCVD_IND;
-	  mpnelts = marker[vtx_elt] + nelts + RCVD_IND;
-	}
-	else {
-	  vtx_elt_lid = LOCAL_IND( globToLoc[vtx_elt] );
-	  if (vtx_elt_lid == lvtx_lid)
-	    nelts = lsub_rcvd_sz - ii;
-	  else
-	    nelts = xlsub[vtx_elt_lid+1] - xlsub[vtx_elt_lid];
-	  mpnelts = marker[vtx_elt] + nelts;
-	}
-		
-	if (!computeL)
-	  marker[vtx] = markl;
-	PS->nops += mpnelts - ii;
-	for (ii; ii < mpnelts; ii++) {
-	  elt = lsub_rcvd[ii];
-	  if (elt >= vtx) {
-	    if (marker[elt] != markl) {
-	      /* add elt to structure of vtx */
-	      if (nextl >= xlsub[vtx_lid + 1]) {
-		if (mem_error = 
-		    expand_RL (computeRcvd, n, iam, lsub_rcvd, lsub_rcvd_sz,
-			       usub_rcvd, usub_rcvd_sz, vtx, i,
-			       lstVtx_upd, fstVtx_srcUpd, lstVtx_srcUpd,
-			       fstVtx_toUpd, lstVtx_toUpd, nvtcs_toUpd, computeL,
-			       &markl, marker, Pslu_freeable, Llu_symbfact, VInfo, PS))
-		    return (mem_error);
-		if (computeL) {
-		  lsub    = Llu_symbfact->lsub;
-		  if (!computeRcvd) 
-		    lsub_rcvd    = 
-		      &(Llu_symbfact->lsub[Llu_symbfact->xlsub[fstVtx_srcUpd_lid]]);
-		} else {
-		  marker[vtx] = markl;
-		  lsub    = Llu_symbfact->usub;
-		  if (!computeRcvd) 
-		    lsub_rcvd = 
-		      &(Llu_symbfact->usub[Llu_symbfact->xusub[fstVtx_srcUpd_lid]]);
-		}
-	      }
-	      lsub[nextl] = elt; nextl ++;
-	      marker[elt] = markl;
-	    }
-	  }
-	}
-      }
-      if (nextl < xlsub[vtx_lid+1])
-	lsub[nextl] = EMPTY;
-      markl ++;
-      if (markl == n) {
-	/* reset marker array */
-	for (j = fstVtx_toUpd; j < n; j++)
-	  marker[j] = EMPTY;
-	markl = 0;
-      }
-    }
-  }
-  *pmarkl = markl;
-
-  return 0;
-}
-
-static int_t
-dnsUpSeps_symbfact
-(
- int_t n,
- int   iam,      /* my processor number */
- int   szSep, 
- int   ind_sizes1,
- int   ind_sizes2, 
- int_t *sizes,     /* Input - sizes of each node in the separator tree */
- int_t *fstVtxSep, /* Input - first vertex of each node in the tree */
- int_t fstVtx_dns,
- Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
- Pslu_freeable_t *Pslu_freeable,
- vtcsInfo_symbfact_t *VInfo,  /* Input/Output - local info on vertices distribution */
- comm_symbfact_t *CS,
- psymbfact_stat_t *PS,
- int_t *p_nextl,   /* ptr to nextl in lsub structure */
- int_t *p_nextu,   /* ptr to nextu in usub structure */
- int_t *p_nsuper_loc
- )
-{
-  int_t nextl, nextu, nsuper_loc, curblk_loc, mem_error;
-  int_t vtx_elt, ind_blk, vtx, k;
-  int_t *xlsub, *xusub, *lsub, *usub;
-  int_t fstVtx_blk, fstVtx_blk_lid, vtx_lid, lstVtx_blk, fstVtx_lvl, lstVtx_lvl;
-  int_t *globToLoc, maxNvtcsPProc;
-  
-  /* Initialization */
-  xlsub = Llu_symbfact->xlsub; lsub = Llu_symbfact->lsub;
-  xusub = Llu_symbfact->xusub; usub = Llu_symbfact->usub;
-
-  globToLoc  = Pslu_freeable->globToLoc;
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  nextl      = *p_nextl;
-  nextu      = *p_nextu;
-  nsuper_loc = *p_nsuper_loc;
-  curblk_loc = VInfo->curblk_loc;
-  VInfo->nnz_ainf_loc = 0;
-  VInfo->nnz_asup_loc = 0;
-
-  if (fstVtx_dns == EMPTY)
-    fstVtx_blk     = VInfo->begEndBlks_loc[curblk_loc];
-  else 
-    fstVtx_blk  = fstVtx_dns;
-  if (fstVtx_blk == n)
-    return 0;
-  fstVtx_blk_lid = LOCAL_IND( globToLoc[fstVtx_blk] );
-  vtx_lid        = fstVtx_blk_lid;
-  xlsub[vtx_lid] = nextl;
-  xusub[vtx_lid] = nextu;
-  PS->nDnsUpSeps = 0; 
-  
-  while (szSep >= 1) {
-    PS->nDnsUpSeps++; 
-    fstVtx_lvl = fstVtxSep[ind_sizes2];
-    lstVtx_lvl = fstVtxSep[ind_sizes2] + sizes[ind_sizes2];
-    if (fstVtx_blk > fstVtx_lvl)
-      vtx_elt = fstVtx_blk;
-    else 
-      vtx_elt = fstVtx_lvl;
-    if (nextl + lstVtx_lvl - vtx_elt >= Llu_symbfact->szLsub) {
-      if (mem_error =
-	  psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextl, 
-				nextl + fstVtx_lvl - vtx_elt,
-				LSUB, DNS_UPSEPS, 1,
-				Pslu_freeable, Llu_symbfact, VInfo, PS))
-	return (mem_error);
-      lsub = Llu_symbfact->lsub;
-    }
-    if (nextu + lstVtx_lvl - vtx_elt >= Llu_symbfact->szUsub) {
-      if (mem_error =
-	  psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextu, 
-				nextu + fstVtx_lvl - vtx_elt,
-				LSUB, DNS_UPSEPS, 1,
-				Pslu_freeable, Llu_symbfact, VInfo, PS))
-	return (mem_error);
-      usub = Llu_symbfact->usub;
-    }
-    PS->nops += 2 * (lstVtx_lvl - vtx_elt);
-    for (; vtx_elt < lstVtx_lvl; vtx_elt++) {
-      lsub[nextl] = vtx_elt; nextl++;
-      usub[nextu] = vtx_elt; nextu++;
-    }
-    ind_sizes2 = ind_sizes1 + szSep + (ind_sizes2 - ind_sizes1) / 2;
-    ind_sizes1 += szSep;
-    szSep = szSep / 2;
-  }
-  /* delete the diagonal element from the U structure */
-  usub[xusub[fstVtx_blk_lid]] = usub[nextu - 1];
-  nextu --;
-  xlsub[fstVtx_blk_lid+1] = nextl;
-  xusub[fstVtx_blk_lid+1] = nextu;
-
-  vtx_lid = fstVtx_blk_lid;
-  ind_blk = curblk_loc;
-  while (ind_blk < 2 * VInfo->nblks_loc) {
-    if (ind_blk != curblk_loc) {
-      fstVtx_blk = VInfo->begEndBlks_loc[ind_blk];
-
-      xlsub[vtx_lid] = nextl;
-      xusub[vtx_lid] = nextu;
-
-      for (k = xlsub[fstVtx_blk_lid]; k < xlsub[fstVtx_blk_lid+1]; k++) 
-	if (lsub[k] >= fstVtx_blk) {
-	  lsub[nextl] = lsub[k]; nextl ++;
-	  if (nextl >= MEM_LSUB( Llu_symbfact, VInfo ))
- 	    if (mem_error =
-		psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextl, 0,
-				      LSUB, DNS_UPSEPS, 1,
-				      Pslu_freeable, Llu_symbfact, VInfo, PS))
-	      return (mem_error);
-	  lsub = Llu_symbfact->lsub;
-	}
-      for (k = xusub[fstVtx_blk_lid]; k < xusub[fstVtx_blk_lid+1]; k++)
-	if (usub[k] > fstVtx_blk) {
-	  usub[nextu] = usub[k]; nextu ++;
-	  if (nextu >= MEM_USUB( Llu_symbfact, VInfo ))
-	    if (mem_error =
-		psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextu, 0,
-				      USUB, DNS_UPSEPS, 1,
-				      Pslu_freeable, Llu_symbfact, VInfo, PS))
-	      return (mem_error);
-	  usub = Llu_symbfact->usub;
-	}
-      PS->nops += xlsub[fstVtx_blk_lid+1] - xlsub[fstVtx_blk_lid];
-      PS->nops += xusub[fstVtx_blk_lid+1] - xusub[fstVtx_blk_lid];
-    }
-    lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1];
-    for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid++) {
-      Pslu_freeable->supno_loc[vtx_lid] = nsuper_loc;
-      if (vtx > fstVtx_blk) {
-	xlsub[vtx_lid] = nextl;
-	xusub[vtx_lid] = nextu;
-      }
-    }
-    ind_blk += 2;
-    nsuper_loc ++;
-  }
-  
-  *p_nextl = nextl;
-  *p_nextu = nextu;
-  *p_nsuper_loc = nsuper_loc;
-/*   VInfo->curblk_loc = ind_blk; */
-  
-  return 0;
-}
-
-static int_t
-dnsCurSep_symbfact
-(
- int_t n,          /* Input - order of the matrix */
- int   iam,        /* Input - my processor number */
- int   ind_sizes1,
- int   ind_sizes2,
- int_t *sizes,     /* Input - sizes of each node in the separator tree */
- int_t *fstVtxSep, /* Input - first vertex of each node in the tree */
- int   szSep,
- int   npNode,
- int_t rcvd_dnsSep,
- int_t *p_nextl,     
- int_t *p_nextu, 
- int_t *p_mark,
- int_t *p_nsuper_loc,
- int_t *marker,   /* temporary array of size n */
- MPI_Comm ndCom,
- Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
- Pslu_freeable_t *Pslu_freeable,
- vtcsInfo_symbfact_t *VInfo,  /* Input/Output - local info on vertices distribution */
- comm_symbfact_t *CS,
- psymbfact_stat_t *PS
- )
-{
-  int_t fstVtx_blk, fstVtx_dns, fstVtx_dns_lid, lstVtx_blk, 
-    fstVtx, lstVtx, lstVtx_dns_lid;
-  int_t ind_blk, i, vtx, vtx_lid, vtx_lid_x, nvtcs_upd, save_cnt, mem_error;
-  int_t computeL, computeU, vtx_elt, j, cur_blk, snlid, snrep;
-  int_t *sub, *xsub, *minElt_vtx, *cntelt_vtcs;
-  int_t mark, next, *x_newelts, *x_newelts_L, *x_newelts_U;
-  int_t *newelts_L, *newelts_U, *newelts;
-  int_t *globToLoc, maxNvtcsPProc, lvl;
-  int_t prval, kmin, kmax, maxElt, ktemp, prpos;
-  float mem_dnsCS;
-
-  if (!rcvd_dnsSep)
-    VInfo->curblk_loc += 2;
-  
-  computeL = TRUE; computeU = TRUE;
-  lstVtx_dns_lid = EMPTY;
-  globToLoc = Pslu_freeable->globToLoc;
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  fstVtx = fstVtxSep[ind_sizes2];
-  lstVtx = fstVtx + sizes[ind_sizes2];
-  cur_blk = VInfo->curblk_loc;
-  fstVtx_dns = VInfo->begEndBlks_loc[cur_blk];
-  fstVtx_dns_lid = LOCAL_IND( globToLoc[fstVtx_dns] );
-  lvl = (int_t) log2( (double) npNode );
-  x_newelts_U = NULL;
-  newelts_L = NULL;
-  newelts_U = NULL;
-  mem_dnsCS = 0.;
-  
-  PS->nDnsCurSep ++;
-
-  if (CS->rcv_bufSz > n - fstVtx_dns)
-    minElt_vtx = CS->rcv_buf;
-  else {
-    if (!(minElt_vtx = intMalloc_symbfact(n - fstVtx_dns)))
-      ABORT("Malloc fails for minElt_vtx[].");
-    mem_dnsCS += n - fstVtx_dns;
-  }
-  
-  while (computeL || computeU) {
-    if (computeL) {
-      sub = Llu_symbfact->lsub; xsub = Llu_symbfact->xlsub;
-      x_newelts = Llu_symbfact->cntelt_vtcs;
-      x_newelts_L = x_newelts;
-    }
-    else {
-      sub = Llu_symbfact->usub; xsub = Llu_symbfact->xusub;
-    }
-
-    /* use minElt_vtx to determine starting vertex of each nonzero element */
-    for (i = 0; i < n - fstVtx_dns; i++)
-      minElt_vtx[i] = n;
-
-    ind_blk = cur_blk;
-    vtx_lid = fstVtx_dns_lid;
-    nvtcs_upd = 0;
-    while (VInfo->begEndBlks_loc[ind_blk] < lstVtx && 
-	   ind_blk < 2 * VInfo->nblks_loc) {	  
-      fstVtx_blk = VInfo->begEndBlks_loc[ind_blk];
-      lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1];
-      ind_blk += 2;
-      nvtcs_upd += lstVtx_blk - fstVtx_blk;
-      for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid++) {
-	j = xsub[vtx_lid];
-	while (j < xsub[vtx_lid+1] && sub[j] != EMPTY) {
-	  PS->nops ++;
-	  vtx_elt = sub[j] - fstVtx_dns;
-	  if (minElt_vtx[vtx_elt] == n) {
-	    minElt_vtx[vtx_elt] = vtx;
-	  }
-	  j ++;
-	}
-      }	  
-    }
-    if (!computeL) {
-      if (!(x_newelts_U = intMalloc_symbfact(nvtcs_upd + 1)))
-	ABORT("Malloc fails for x_newelts_U[].");
-      mem_dnsCS += nvtcs_upd + 1;
-      x_newelts = x_newelts_U;
-    }
-    else {
-      /* save the value in cntelt_vtcs[lstVtx_blk_lid] */
-      save_cnt = x_newelts[vtx_lid];
-      lstVtx_dns_lid = vtx_lid;
-    }
-    
-    MPI_Allreduce (&(minElt_vtx[lstVtx - fstVtx_dns]), &(marker[lstVtx]), 
-		   n - lstVtx, mpi_int_t, MPI_MIN, ndCom);
-
-#if ( PRNTlevel>=1 )
-    PS->no_msgsCol += (float) (2 * (int_t) log2( (double) npNode ));
-    PS->sz_msgsCol += (float) (n - lstVtx);
-    if (PS->maxsz_msgCol < n - lstVtx) 
-      PS->maxsz_msgCol = n - lstVtx;      
-#endif
-    
-    /* use x_newelts to determine counts of elements starting in each vertex */
-    for (vtx_lid = 0; vtx_lid < nvtcs_upd; vtx_lid++)
-      x_newelts[vtx_lid] = 0;
-    
-    for (vtx = lstVtx; vtx < n; vtx++) {
-      if (marker[vtx] != n) {
-	vtx_elt = marker[vtx];
-	if (OWNER( globToLoc[vtx_elt] ) == iam) {
-	  x_newelts[ LOCAL_IND( globToLoc[vtx_elt] ) - fstVtx_dns_lid ] ++;
-	}
-	else {
-	  /* find the first vertex > vtx_elt which belongs to iam */
-	  ind_blk = cur_blk;
-	  vtx_lid = 0;
-	  while (vtx_elt > VInfo->begEndBlks_loc[ind_blk] &&
-		 ind_blk < 2 * VInfo->nblks_loc) {
-	    vtx_lid += VInfo->begEndBlks_loc[ind_blk+1] -
-	      VInfo->begEndBlks_loc[ind_blk];
-	    ind_blk += 2;
-	  }
-	  if (VInfo->begEndBlks_loc[ind_blk] < lstVtx) {
-	    x_newelts[vtx_lid] ++;		
-	    marker[vtx] = VInfo->begEndBlks_loc[ind_blk];
-	  }			    
-	  else
-	    marker[vtx] = n;
-	}
-      }
-    }
-    
-    /* set up beginning of new elements for each local vtx */
-    i = 0;
-    for (vtx_lid = 0; vtx_lid < nvtcs_upd; vtx_lid++) {
-      j = x_newelts[vtx_lid];
-      x_newelts[vtx_lid] = i;
-      i += j;
-    }
-    x_newelts[vtx_lid] = i;
-    newelts = NULL;
-    if (i != 0) {
-      if (!(newelts = intMalloc_symbfact(x_newelts[vtx_lid])))
-	ABORT("Malloc fails for newelts[].");    
-      mem_dnsCS += x_newelts[vtx_lid];
-      
-      for (vtx = lstVtx; vtx < n; vtx++) {
-	if (marker[vtx] != n) {
-	  vtx_elt = marker[vtx];
-	  vtx_lid = LOCAL_IND( globToLoc[vtx_elt] ) - fstVtx_dns_lid;	  
-	  newelts[x_newelts[vtx_lid]] = vtx;
-	  x_newelts[vtx_lid] ++;
-	}
-      }
-    }
-    /* reset beginning of new elements for each local vertex */
-    i = 0;
-    for (vtx_lid = 0; vtx_lid < nvtcs_upd; vtx_lid++) {
-      j = x_newelts[vtx_lid];
-      x_newelts[vtx_lid] = i;
-      i = j;
-    }
-
-    if (computeL == TRUE) {
-      computeL = FALSE;
-      newelts_L = newelts;
-    }
-    else {
-      computeU = FALSE;
-      newelts_U = newelts;
-    }
-  }
-  
-  for (i = fstVtx_dns; i < n; i++)
-    marker[i] = EMPTY;
-  mark = 0;
-  
-  /* update vertices */
-  prval = n; 	    
-  ind_blk = cur_blk;
-  fstVtx_dns = VInfo->begEndBlks_loc[ind_blk];
-  vtx_lid = LOCAL_IND( globToLoc[fstVtx_dns] );
-  while (VInfo->begEndBlks_loc[ind_blk] < lstVtx && 
-	 ind_blk < 2 * VInfo->nblks_loc) {	  
-    fstVtx_blk = VInfo->begEndBlks_loc[ind_blk];
-    lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1];
-    ind_blk += 2;
-    for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid++) {
-      vtx_lid_x = vtx_lid - fstVtx_dns_lid;
-      Llu_symbfact->xlsub[vtx_lid] = *p_nextl;
-      Llu_symbfact->xusub[vtx_lid] = *p_nextu;
-      if (vtx == fstVtx_blk || x_newelts_L[vtx_lid_x+1] != x_newelts_L[vtx_lid_x] ||
-	  x_newelts_U[vtx_lid_x+1] != x_newelts_U[vtx_lid_x]) {
-	/* a new supernode starts */
-	snlid = vtx_lid;
-	snrep = vtx;
-	if (mark + 2 > n) {
-	  /* reset to EMPTY marker array */
-	  for (i = 0; i < n; i++)
-	    marker[i] = EMPTY;
-	  mark = 0;
-	}
-
-	computeL = TRUE;
-	computeU = FALSE;
-	while (computeL || computeU) {
-	  if (computeL) {
-	    sub = Llu_symbfact->lsub; xsub = Llu_symbfact->xlsub;
-	    x_newelts = x_newelts_L; newelts = newelts_L;
-	    next = *p_nextl;
-	  }
-	  else {
-	    sub = Llu_symbfact->usub; xsub = Llu_symbfact->xusub;
-	    x_newelts = x_newelts_U; newelts = newelts_U;
-	    next = *p_nextu;
-	  }
-	  xsub[vtx_lid] = next;
-
-	  /* TEST available memory */
-	  j = x_newelts[vtx_lid_x+1] + lstVtx - vtx;
-	  if ((computeL && next+j >= MEM_LSUB(Llu_symbfact, VInfo)) ||
-	      (computeU && next+j >= MEM_USUB(Llu_symbfact, VInfo))) {
-	    if (mem_error =
-		psymbfact_LUXpandMem (iam, n, vtx, next, next + j,
-				      computeL, DNS_CURSEP, 1,
-				      Pslu_freeable, Llu_symbfact, VInfo, PS))
-	      return (mem_error);
-	    if (computeL) sub = Llu_symbfact->lsub;
-	    else sub = Llu_symbfact->usub; 
-	  }
-	  
-	  if (computeL)  i = vtx;
-	  else           i = vtx+1;
-	  while (i < lstVtx) {
-	    sub[next] = i; next ++;
-	    i ++;
-	  }
-	  PS->nops += x_newelts[vtx_lid_x+1];
-	  for (i = 0; i < x_newelts[vtx_lid_x+1]; i++) {
-	    vtx_elt = newelts[i];
-	    sub[next] = vtx_elt; next ++;
-	    if (computeU && vtx_elt < prval 
-		&& marker[vtx_elt] == mark-1)
-	      prval = vtx_elt;
-	    marker[vtx_elt] = mark;
-	  }
-	  if (computeL) {
-	    computeL = FALSE; computeU = TRUE;
-	    *p_nextl = next;
-	  }
-	  else {
-	    computeU = FALSE;
-	    *p_nextu = next;
-	  }
-	  mark ++;	  
-	}	  
-	if (vtx != fstVtx_blk)
-	  (*p_nsuper_loc) ++;
-      } /* a new supernode starts */
-      /* vtx belongs to the curent supernode */
-      Pslu_freeable->supno_loc[vtx_lid] = *p_nsuper_loc;
-    } 
-    (*p_nsuper_loc) ++;
-  }
-  
-  if (ind_blk > 0) {
-    /* if iam owns blocks of this level */
-    i = *p_nextl - Llu_symbfact->xlsub[snlid];
-    j = *p_nextu - Llu_symbfact->xusub[snlid];
-    
-    if (VInfo->begEndBlks_loc[ind_blk - 1] == lstVtx && i > 1 && j > 0) {
-      /* if iam the last processor owning a block of this level */
-      computeL = TRUE; computeU = FALSE;
-      /* prune the structure */
-      while (computeL || computeU) {	   
-	if (computeL) {
-	  sub = Llu_symbfact->lsub; xsub = Llu_symbfact->xlsub;
-	  next = *p_nextl;
-	  computeL = FALSE; computeU = TRUE;
-	}
-	else {
-	  sub = Llu_symbfact->usub; xsub = Llu_symbfact->xusub;
-	  next = *p_nextu;
-	  computeU = FALSE;
-	}
-	
-	kmin = xsub[snlid];
-	kmax = next - 1;
-	if (prval != n) {
-	  maxElt = prval;
-	  while (kmin <= kmax) {
-	    /* Do a quicksort-type partition. */    
-	    if (sub[kmax] > prval) 
-	      kmax--;
-	    else if (sub[kmin] <= prval) {
-	      kmin++;
-	    }
-	    else { /* kmin does'nt belong to G^s(L), and kmax belongs: 
-		    * 	   interchange the two subscripts
-		    */
-	      ktemp = sub[kmin];
-	      sub[kmin] = sub[kmax];
-	      sub[kmax] = ktemp;
-	      kmin ++;
-	      kmax --;
-	    }
-	    if (sub[kmin-1] == prval)
-	      prpos = kmin - 1;
-	  }
-	}
-	else {
-	  maxElt = EMPTY;
-	  while (kmin <= kmax) {
-	    /* compute maximum element of L(:, vtx) */
-	    if (sub[kmin] > maxElt) {
-	      maxElt = sub[kmin];
-	      prpos = kmin;
-	    }
-	    kmin ++;	      
-	  }
-	}
-	ktemp = sub[xsub[snlid]];
-	sub[xsub[snlid]] = maxElt;
-	sub[prpos] = ktemp;
-      }
-      
-      /* setup snd_interSz information */
-      prval = Llu_symbfact->lsub[Llu_symbfact->xlsub[snlid]];
-      if (prval >= lstVtx) {
-	/* this supernode will be send to next layers of the tree */
-	while (prval >= lstVtx && szSep != 1) {
-	  ind_sizes2 = ind_sizes1 + szSep + (ind_sizes2 - ind_sizes1) / 2;
-	  ind_sizes1 += szSep;
-	  lvl ++;
-	  szSep = szSep / 2;
-	  lstVtx = fstVtxSep[ind_sizes2] + sizes[ind_sizes2];
-	  CS->snd_interSz[lvl] += i + j + 4;
-	  CS->snd_LinterSz[lvl] += i + 2;
-	  if (CS->snd_vtxinter[lvl] == EMPTY)
-	    CS->snd_vtxinter[lvl] = snrep;
-	}
-      }
-    }
-  }
-
-  /* restore value in cntelt_vtcs */
-  if (lstVtx_dns_lid != EMPTY)
-    Llu_symbfact->cntelt_vtcs[lstVtx_dns_lid] = save_cnt;
-  *p_mark = mark;
-  if (minElt_vtx != CS->rcv_buf)
-    SUPERLU_FREE (minElt_vtx);  
-  SUPERLU_FREE (x_newelts_U);
-  if (newelts_L) SUPERLU_FREE (newelts_L);
-  if (newelts_U) SUPERLU_FREE (newelts_U);
-  if (PS->szDnsSep < mem_dnsCS)
-    PS->szDnsSep = mem_dnsCS;
-}
-
-/* all processors affected to current node must call this routine
-   when VInfo->filledSep == FILLED_SEP
-   This is necessary since subsequent routines called from here use 
-   MPI_allreduce among all processors affected to curent node */
-
-static int_t
-denseSep_symbfact 
-(
- int   rcvd_dnsSep, /* =1 if processor received info that the separator
-		       became dense,
-		       =0 if myPE determined that separator is full */
- int_t n,           /* Input - order of the matrix */
- int   iam,         /* Input - my processor number */
- int   ind_sizes1,
- int   ind_sizes2,
- int_t *sizes,     /* Input - sizes of each separator in the separator tree */
- int_t *fstVtxSep, /* Input - first vertex of each node in the tree */
- int   szSep, 
- int   fstP,        /* first pe affected current node */
- int   lstP,        /* last pe affected current node */
- int_t fstVtx_blkCyc, 
- int_t nblk_loc,    /* block number in the block cyclic distribution of current
-		       supernode */
- int_t *p_nextl,
- int_t *p_nextu,
- int_t *p_mark,
- int_t *p_nsuper_loc,
- int_t *marker,
- MPI_Comm ndCom,
- MPI_Comm *symb_comm, /* Input - communicator for symbolic factorization */
- Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
- Pslu_freeable_t *Pslu_freeable,
- vtcsInfo_symbfact_t *VInfo,  /* Input - local info on vertices distribution */
- comm_symbfact_t *CS,
- psymbfact_stat_t *PS
-) 
-{
-  int   nprocsLvl, p, prvP, tag;
-  int_t nmsgsToSnd, nmsgsToRcv;
-  int_t ind_blk, mem_error;
-  int_t *rcv_intraLvl;
-  int_t fstVtx, lstVtx, cur_blk, lstVtx_blk, fstVtx_blk;
-  int_t *globToLoc, maxNvtcsPProc;
-  MPI_Status status;
-  
-  globToLoc = Pslu_freeable->globToLoc;
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  fstVtx = fstVtxSep[ind_sizes2];
-  lstVtx = fstVtx + sizes[ind_sizes2];
-  rcv_intraLvl = CS->rcv_intraLvl;
-  cur_blk   = VInfo->curblk_loc;
-  nprocsLvl = lstP - fstP;
-  
-  if (nblk_loc == 0) {
-    nmsgsToSnd = 2; nmsgsToRcv = 1;
-  }
-  else {
-    nmsgsToSnd = 1; nmsgsToRcv = 0;
-    if (!rcvd_dnsSep) nmsgsToRcv ++;
-  }
-  if (iam == fstP && rcvd_dnsSep && nblk_loc == 1) 
-    nmsgsToRcv ++;
-  
-  /* first exchange msgs with all processors affected to current node */
-  ind_blk = cur_blk;
-  while ((nmsgsToSnd || nmsgsToRcv) && VInfo->begEndBlks_loc[ind_blk] < lstVtx) {
-    tag = (int) (tag_intraLvl + nblk_loc);
-    if (nmsgsToSnd) {
-      lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1];
-      if (lstVtx_blk != lstVtx) {
-	p = OWNER( globToLoc[lstVtx_blk]);
-	MPI_Send (&(rcv_intraLvl[fstP]), nprocsLvl, mpi_int_t, p,
-		  tag, (*symb_comm));
-#if ( PRNTlevel>=1 )
-	PS->no_shmSnd += (float) 1;
-#endif
-      }
-      nmsgsToSnd --;
-    }
-    ind_blk += 2;
-    nblk_loc ++;
-    tag = tag_intraLvl + nblk_loc;
-    fstVtx_blk = VInfo->begEndBlks_loc[ind_blk];
-    if (nmsgsToRcv && fstVtx_blk < lstVtx) {
-      if (iam == fstP) tag --;
-      prvP = OWNER( globToLoc[fstVtx_blk - 1]);
-      MPI_Recv (&(rcv_intraLvl[fstP]), nprocsLvl, mpi_int_t, prvP,
-		tag, (*symb_comm), &status);
-#if ( PRNTlevel>=1 )
-      PS->no_shmRcvd += (float) 1;
-#endif
-      nmsgsToRcv --;
-    }
-  }
-
-  if (VInfo->filledSep == FILLED_SEP) {
-    if (mem_error = 
-	dnsCurSep_symbfact (n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep, 
-			    szSep, lstP - fstP, rcvd_dnsSep, p_nextl, 
-			    p_nextu, p_mark, p_nsuper_loc, marker, ndCom,
-			    Llu_symbfact, Pslu_freeable, VInfo, CS, PS))
-      return (mem_error);
-  }
-  else if (rcvd_dnsSep) 
-    if (mem_error = 
-	dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, 
-			    sizes, fstVtxSep, EMPTY,
-			    Llu_symbfact, Pslu_freeable, VInfo, CS, PS,
-			    p_nextl, p_nextu, p_nsuper_loc))
-      return (mem_error);
-  return 0;
-}
-
-
-static int_t
-interLvl_symbfact
-(
- SuperMatrix *A, /* Input - input matrix A */
- int   iam,      /* Input - my processor number */  
- int   lvl,      /* Input - current level in the separator tree */ 
- int   szSep,    /* Input - size of the current separator (node) */
- int   fstP,     /* Input - first processor assigned to current node */
- int   lstP,     /* Input - last processor assigned to current node */
- int   ind_sizes1,
- int   ind_sizes2, 
- int_t *sizes,     /* Input - sizes of each node in the separator tree */
- int_t *fstVtxSep, /* Input - first vertex of each node in the tree */
- int_t *p_nextl,
- int_t *p_nextu,
- int_t *p_nsuper_loc,
- int_t *pmark,   /* mark for symbfact */
- int_t *marker,  /* temp array used for marking */
- Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
- Pslu_freeable_t *Pslu_freeable,
- comm_symbfact_t *CS,/* infos on communication data structures */
- vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */
- psymbfact_stat_t *PS,
- MPI_Comm ndComm,
- MPI_Comm    *symb_comm /* Input - communicator for symbolic factorization */
- )
-{
-  MPI_Status  *status; 
-  MPI_Request *request_snd, *request_rcv;
-  
-  int   nprocsLvl, rcvdP, p, filledSep_lvl;
-  int   toSend, toSendL, toSendU;
-  int_t *rcv_interLvl;
-  int_t *snd_interLvl, *snd_interLvl1, *snd_interLvl2,
-    snd_interLvlSz, snd_LinterLvlSz, snd_vtxLvl;
-  int_t  vtx_elt, update_loc, code_err;
-  int_t *lsub, *xlsub, *usub, *xusub;
-  int_t *lsub_rcvd, lsub_rcvd_sz, *usub_rcvd, usub_rcvd_sz;
-  int_t  n, mark, max_rcvSz; 
-  int_t nextl, nextu, ind_blk, vtx_lid, k, count, nelts, 
-    lstVtxLvl_loc, lstVtxLvl_loc_lid, mem_error;
-  int_t fstVtx_blk, lstVtx_blk, i, j, vtx, prElt_L, prElt_U, 
-    snd_indBlk, prElt_ind;
-  int_t fstVtxLvl_loc, nvtcsLvl_loc, maxNvtcsPProc, *globToLoc, 
-    fstVtx, lstVtx;
-  int  ind1, nprocsToRcv, nprocsToSnd, ind2, ind_l, ind_u, ij, ik;
-  int_t req_ind, sent_msgs, req_ind_snd;
-  int_t initInfo_loc[2], initInfo_gl[2];
-
-  /* Initialization */
-  n = A->ncol;
-  fstVtx          = fstVtxSep[ind_sizes2];
-  lstVtx          = fstVtx + sizes[ind_sizes2];
-  maxNvtcsPProc   = Pslu_freeable->maxNvtcsPProc;
-  globToLoc       = Pslu_freeable->globToLoc;
-  nprocsLvl       = lstP - fstP;
-  rcv_interLvl    = CS->rcv_interLvl;
-  snd_interLvl    = CS->snd_interLvl;
-  snd_interLvlSz  = CS->snd_interSz[lvl];
-  snd_LinterLvlSz = CS->snd_LinterSz[lvl];
-  snd_vtxLvl      = CS->snd_vtxinter[lvl];
-  fstVtxLvl_loc   = VInfo->begEndBlks_loc[VInfo->curblk_loc];
-  nvtcsLvl_loc    = VInfo->nvtcsLvl_loc;
-  request_snd = NULL;
-  request_rcv = NULL;
-  status = NULL;
-  mark = *pmark;
-  
-  lsub    = Llu_symbfact->lsub;   xlsub    = Llu_symbfact->xlsub;
-  usub    = Llu_symbfact->usub;   xusub    = Llu_symbfact->xusub;
-
-  /* snd_vtxLvl denotes the first vertex from which iam needs
-     to send data.  
-     snd_interLvlSz denotes maximum size of the send data,
-     snd_LinterLvlSz denotes send data corresponding to L part */
-
-  /* determine maximum size of receive buffer and information
-   on filled sep */
-  if (snd_interLvlSz != 0) {
-    if (snd_LinterLvlSz == 0) 
-      snd_interLvlSz = 0;
-    if (snd_interLvlSz - snd_LinterLvlSz == 0)
-      snd_interLvlSz = 0;
-  }
-  
-  initInfo_loc[0] = snd_interLvlSz;
-  initInfo_loc[1] = (int_t) VInfo->filledSep;
-  MPI_Allreduce (initInfo_loc, initInfo_gl, 2, 
-		 mpi_int_t, MPI_MAX, ndComm);
-#if ( PRNTlevel>=1 )
-  PS->no_msgsCol += (float) (2 * (int_t) log2( (double) nprocsLvl ));
-  PS->sz_msgsCol += 2;
-  if (PS->maxsz_msgCol < 2) 
-    PS->maxsz_msgCol = 2;      
-#endif  
-  max_rcvSz = initInfo_gl[0];
-  filledSep_lvl = (int) initInfo_gl[1];
-
-  if (filledSep_lvl == FILLED_SEPS) {
-    /* quick return if all upper separators are dense */
-    if (VInfo->filledSep != FILLED_SEPS) {
-      VInfo->filledSep = FILLED_SEPS;
-      if (mem_error = 
-	  dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, sizes, 
-			      fstVtxSep,
-			      EMPTY, Llu_symbfact, Pslu_freeable, VInfo, CS, PS,
-			      p_nextl, p_nextu, p_nsuper_loc))
-	return (mem_error);
-    }
-    return 0;
-  }
-
-  if (max_rcvSz == 0)
-    /* quick return if no communication necessary */
-    return 0; 
-  
-  /* allocate data for the send buffer */  
-  if (snd_interLvlSz)
-    if (CS->snd_bufSz < snd_interLvlSz) {
-      PS->maxSzBuf += snd_interLvlSz - CS->snd_bufSz;
-      if (CS->snd_bufSz != 0)
-	/* not first time allocate memory */
-	SUPERLU_FREE (CS->snd_buf);
-      CS->snd_bufSz = snd_interLvlSz;
-      if (!(CS->snd_buf = intMalloc_symbfact (snd_interLvlSz))) {
-	ABORT("Malloc fails for snd_buf[].");
-      }
-    }
-    
-  /* snd_interLvl : to which processors the data need to be send 
-   * information setup during the copy of data to be send in the buffer  
-   * rcv_interLvl : from which processors iam receives update data  */
-  for (p = 2*fstP; p < 2*lstP; p++)
-    snd_interLvl[p] = EMPTY;
-
-  if (snd_interLvlSz == 0 && nvtcsLvl_loc == 0) {
-    code_err = MPI_Alltoall (&(snd_interLvl[2*fstP]), 2, mpi_int_t,
-			     &(rcv_interLvl[2*fstP]), 2, mpi_int_t,
-			     ndComm);
-#if ( PRNTlevel>=1 )
-    PS->no_msgsCol += (float) (2 * (int_t) log2( (double) nprocsLvl ));
-    PS->sz_msgsCol += 2;
-    if (PS->maxsz_msgCol < 2) 
-      PS->maxsz_msgCol = 2;      
-#endif  
-    return 0;
-  }
-  
-  /* in interLvlInfos, 
-   * obtain from which processors iam receives update information */
-  update_loc = FALSE;
-  nextl = 0; 
-  nextu = snd_LinterLvlSz;
-  if (snd_interLvlSz != 0) {
-    /* copy data to be send */
-    /* find index block from where to send data */
-    ind_blk = VInfo->curblk_loc;
-    while (snd_vtxLvl < VInfo->begEndBlks_loc[ind_blk]) {
-      ind_blk -= 2;
-    }
-    snd_indBlk = ind_blk;
-    vtx_lid = LOCAL_IND( globToLoc[snd_vtxLvl] );
-    for (; ind_blk < VInfo->curblk_loc; ind_blk += 2) {
-      fstVtx_blk = VInfo->begEndBlks_loc[ind_blk];
-      if (ind_blk == snd_indBlk)
-	fstVtx_blk = snd_vtxLvl;
-      lstVtx_blk = VInfo->begEndBlks_loc[ind_blk + 1];
-      for (vtx = fstVtx_blk; vtx < lstVtx_blk; vtx++, vtx_lid ++) {
-	toSendL = FALSE; toSendU = FALSE;
-	if (xlsub[vtx_lid] != xlsub[vtx_lid+1] && 
-	    xusub[vtx_lid] != xusub[vtx_lid+1]) {
-	  k = xlsub[vtx_lid];
-	  prElt_L = lsub[k];
-	  j = xusub[vtx_lid];
-	  prElt_U = usub[j];
-	  if (prElt_L >= fstVtx || prElt_U >= fstVtx) {
-	    if (prElt_L >= fstVtx)
-	      while (lsub[k] <= prElt_L && k < xlsub[vtx_lid + 1]) {
-		vtx_elt = lsub[k];
-		if (vtx_elt >= fstVtx && vtx_elt < lstVtx) {
-		  p = OWNER( globToLoc[vtx_elt] );
-		  if (p != iam) {
-		    /* vtx will be send to another processor */
-		    snd_interLvl[2*p] = TRUE;
-		    toSendL = TRUE;
-		  }
-		  else
-		    update_loc = TRUE;
-		}
-		k++;
-	      }
-	    if (prElt_U >= fstVtx)
-	      while (usub[j] <= prElt_U && j < xusub[vtx_lid + 1]) {
-		vtx_elt = usub[j];
-		if (vtx_elt >= fstVtx && vtx_elt < lstVtx) {
-		  p = OWNER( globToLoc[vtx_elt] );
-		  if (p != iam) {
-		    /* vtx will be send to another processor */
-		    snd_interLvl[2*p+1] = TRUE;
-		    toSendU = TRUE;
-		  }
-		  else
-		    update_loc = TRUE;
-		}
-		j ++;
-	      }
-	    if (toSendL || toSendU) {
-	      /* L(:, vtx) and U(vtx, :) will be send to processors */
-	      CS->snd_buf[nextu + DIAG_IND]  = vtx;
-	      nelts = xusub[vtx_lid+1] - xusub[vtx_lid];
-	      CS->snd_buf[nextu + NELTS_IND] = nelts;
-	      nextu += 2;
-	      for (j = xusub[vtx_lid]; j < xusub[vtx_lid+1]; j++, nextu ++) {
-		CS->snd_buf[nextu] = usub[j]; 
-	      }
-	      CS->snd_buf[nextl + DIAG_IND] = vtx;
-	      nelts = xlsub[vtx_lid+1] - xlsub[vtx_lid];
-	      CS->snd_buf[nextl + NELTS_IND] = nelts; 
-	      nextl += 2;
-	      for (j = xlsub[vtx_lid]; j < xlsub[vtx_lid+1]; j++, nextl ++) {
-		CS->snd_buf[nextl] = lsub[j];
-	      }
-	    }
-	  }
-	}
-      }
-    }
-    lstVtxLvl_loc = vtx;
-    lstVtxLvl_loc_lid = vtx_lid;
-  }
-  
-  if (nextl == 0 || nextu - snd_LinterLvlSz == 0) {
-    for (p = 2*fstP; p < 2*lstP; p++)
-      snd_interLvl[p] = EMPTY;
-  }
-  
-  nprocsToSnd = 0;
-  for (p = 2*fstP; p < 2*lstP; p +=2) {
-    if (snd_interLvl[p] != EMPTY || snd_interLvl[p+1] != EMPTY) {
-      snd_interLvl[p] = nextl;
-      snd_interLvl[p+1] = nextu - snd_LinterLvlSz;
-      nprocsToSnd ++;
-    }
-  }
-  
-  MPI_Alltoall (&(snd_interLvl[2*fstP]), 2, mpi_int_t,
-		&(rcv_interLvl[2*fstP]), 2, mpi_int_t, ndComm);    
-#if ( PRNTlevel>=1 )
-  PS->no_msgsCol += (float) (2 * (int_t) log2( (double) nprocsLvl ));
-  PS->sz_msgsCol += 2 * nprocsLvl;
-  if (PS->maxsz_msgCol < 2 * nprocsLvl) 
-    PS->maxsz_msgCol = 2 * nprocsLvl;      
-#endif    
-
-  max_rcvSz = 0;
-  nprocsToRcv = 0;
-  for (p = 2*fstP; p < 2*lstP; p +=2) {
-    CS->ptr_rcvBuf[p] = max_rcvSz;
-    if (rcv_interLvl[p] != EMPTY) 
-      max_rcvSz += rcv_interLvl[p];
-    CS->ptr_rcvBuf[p+1] = max_rcvSz;
-    if (rcv_interLvl[p+1] != EMPTY) 
-      max_rcvSz += rcv_interLvl[p+1];
-    if (rcv_interLvl[p] != EMPTY || rcv_interLvl[p+1] != EMPTY) 
-      nprocsToRcv ++;
-  }
-
-  /* allocate data for the receive buffer */  
-  if (CS->rcv_bufSz < max_rcvSz) {
-    PS->maxSzBuf += max_rcvSz - CS->rcv_bufSz;
-    if (CS->rcv_bufSz != 0) /* not first time allocate memory */
-      SUPERLU_FREE (CS->rcv_buf);
-    CS->rcv_bufSz = max_rcvSz;
-    if (!(CS->rcv_buf = intMalloc_symbfact (max_rcvSz))) {
-      ABORT("Malloc fails for rcv_buf[].");
-    }
-  }
-  
-  /* allocate memory for status arrays */
-  if (nprocsToSnd)
-    if ( !(request_snd = (MPI_Request*) 
-	   SUPERLU_MALLOC(2 * nprocsToSnd * sizeof(MPI_Request))))
-      ABORT("Not enough memory when allocating MPI_Request");
-  if (nprocsToRcv)
-    if ( !(request_rcv = (MPI_Request*) 
-	   SUPERLU_MALLOC(2 * nprocsToRcv * sizeof(MPI_Request))))
-      ABORT("Not enough memory when allocating MPI_Request");
-  if (nprocsToRcv || nprocsToSnd)
-    if ( !(status = (MPI_Status*) 
-	   SUPERLU_MALLOC(2 * (lstP-fstP) * sizeof(MPI_Status))))
-      ABORT("Not enough memory when allocating MPI_Request");
-  
-  /* determine if we have to send data */
-  i = 0;
-  for (toSend = fstP, p = 2*fstP; p < 2*lstP; toSend++, p+=2) 
-    if (snd_interLvl[p] != EMPTY && toSend != iam) {
-      MPI_Isend (CS->snd_buf, nextl, mpi_int_t, toSend,
-		 tag_interLvl_LData, (*symb_comm), &(request_snd[2*i]));
-      MPI_Isend (&(CS->snd_buf[snd_LinterLvlSz]), 
-		 nextu - snd_LinterLvlSz, mpi_int_t, toSend,
-		 tag_interLvl_UData, (*symb_comm), &(request_snd[2*i+1]));
-      i++;
-#if ( PRNTlevel>=1 )
-      PS->no_msgsSnd += (float) 2;
-      PS->sz_msgsSnd += (float) (nextl + nextu - snd_LinterLvlSz);
-      if (PS->maxsz_msgSnd < nextl) PS->maxsz_msgSnd = nextl;
-      if (PS->maxsz_msgSnd < nextu - snd_LinterLvlSz) 
-	PS->maxsz_msgSnd = nextu - snd_LinterLvlSz;      
-#endif
-    }
-  
-  if (update_loc) {
-    /* use own data to update symbolic factorization */
-    vtx_lid = LOCAL_IND( globToLoc[snd_vtxLvl] );
-    lsub_rcvd    = &(lsub[xlsub[vtx_lid]]);
-    lsub_rcvd_sz = xlsub[lstVtxLvl_loc_lid] - xlsub[vtx_lid];
-    usub_rcvd    = &(usub[xusub[vtx_lid]]);
-    usub_rcvd_sz = xusub[lstVtxLvl_loc_lid] - xusub[vtx_lid];
-    
-    mem_error = 
-      rl_update (0, n, iam, lsub_rcvd, lsub_rcvd_sz,
-		 usub_rcvd, usub_rcvd_sz, snd_vtxLvl, EMPTY, snd_indBlk,
-		 fstVtxLvl_loc, lstVtx, nvtcsLvl_loc,
-		 1, &mark, marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
-
-    lsub_rcvd    = &(Llu_symbfact->lsub[xlsub[vtx_lid]]);
-    lsub_rcvd_sz = xlsub[lstVtxLvl_loc_lid] - xlsub[vtx_lid];
-    usub_rcvd    = &(Llu_symbfact->usub[xusub[vtx_lid]]);
-    usub_rcvd_sz = xusub[lstVtxLvl_loc_lid] - xusub[vtx_lid];
-    lsub = Llu_symbfact->lsub; usub = Llu_symbfact->usub;
-    mem_error = 
-      rl_update (0, n, iam, usub_rcvd, usub_rcvd_sz,
-		 lsub_rcvd, lsub_rcvd_sz, snd_vtxLvl, EMPTY, snd_indBlk,
-		 fstVtxLvl_loc, lstVtx, nvtcsLvl_loc,
-		 0, &mark, marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
-    lsub = Llu_symbfact->lsub; usub = Llu_symbfact->usub;
-  }
-
-  /* post non-blocking receives for all the incoming messages */
-  i = 0;
-  for (rcvdP = fstP, p = 2*fstP; p < 2*lstP; rcvdP++, p += 2) 
-    if (rcv_interLvl[p] != EMPTY) {
-      lsub_rcvd    = &(CS->rcv_buf[CS->ptr_rcvBuf[p]]);
-      MPI_Irecv (lsub_rcvd, rcv_interLvl[p], mpi_int_t, rcvdP,
-		 tag_interLvl_LData, (*symb_comm), &(request_rcv[i]));
-      usub_rcvd    = &(CS->rcv_buf[CS->ptr_rcvBuf[p+1]]);
-      MPI_Irecv (usub_rcvd, rcv_interLvl[p+1], mpi_int_t, rcvdP,
-		 tag_interLvl_UData, (*symb_comm), &(request_rcv[i+1]));
-      i += 2;
-#if ( PRNTlevel>=1 )
-      PS->no_msgsRcvd += (float) 2;
-      PS->sz_msgsRcvd += (float) (rcv_interLvl[p] + rcv_interLvl[p+1]);
-      if (PS->maxsz_msgRcvd < rcv_interLvl[p])
-	PS->maxsz_msgRcvd = rcv_interLvl[p];
-      if (PS->maxsz_msgRcvd < rcv_interLvl[p+1])
-	PS->maxsz_msgRcvd = rcv_interLvl[p+1];
-#endif
-    }
-  
-  /* wait until messages are received and update local data */
-  for (i = 0; i < nprocsToRcv; i++) {
-    MPI_Waitany (2*nprocsToRcv, request_rcv, &ind1, status);
-    ij = 0;
-    for (p = fstP; p < lstP; p++)
-      if (rcv_interLvl[2*p] != EMPTY) {
-	if (ij <= ind1 && ind1 < ij+2) {
-	  rcvdP = p; p = lstP;
-	  if (ind1 == ij) ind2 = ij+1;
-	  else ind2 = ind1 - 1;
-	  ind_l = ij; ind_u = ij+1;
-	}
-	ij += 2;
-      }
-    MPI_Get_count (status, mpi_int_t, &ij);
-    MPI_Wait (&(request_rcv[ind2]), status);
-    MPI_Get_count (status, mpi_int_t, &ik);    
-    if (ind1 == ind_l) {
-      lsub_rcvd_sz = ij;
-      usub_rcvd_sz = ik;
-    } else {
-      lsub_rcvd_sz = ik;
-      usub_rcvd_sz = ij;
-    }
-    lsub_rcvd    = &(CS->rcv_buf[CS->ptr_rcvBuf[2*rcvdP]]);
-    usub_rcvd    = &(CS->rcv_buf[CS->ptr_rcvBuf[2*rcvdP+1]]);
-    
-    /* use received data to update symbolic factorization information */
-    mem_error = 
-      rl_update (1, n, iam, lsub_rcvd, lsub_rcvd_sz,
-		 usub_rcvd, usub_rcvd_sz, EMPTY, EMPTY, EMPTY,
-		 fstVtxLvl_loc, lstVtx, nvtcsLvl_loc,
-		 1, &mark, marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
-    lsub = Llu_symbfact->lsub;
-    mem_error = 
-      rl_update (1, n, iam, usub_rcvd, usub_rcvd_sz,
-		 lsub_rcvd, lsub_rcvd_sz, EMPTY, EMPTY, EMPTY,
-		 fstVtxLvl_loc, lstVtx, nvtcsLvl_loc,
-		 0, &mark, marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
-    usub = Llu_symbfact->usub;      
-  }
-  
-  if (nprocsToSnd)
-    MPI_Waitall (2*nprocsToSnd, request_snd, status);
-
-  *pmark = mark;
-  if (request_snd != NULL) SUPERLU_FREE (request_snd);
-  if (request_rcv != NULL) SUPERLU_FREE (request_rcv);
-  if (status != NULL) SUPERLU_FREE (status);
-}
-
-
-static void
-createComm 
-(
- int   iam,          /* Input -my processor number */
- int   nprocs,       /* Input -number of procs for the symbolic factorization */
- MPI_Comm *commLvls, /* Output -communicators for the nodes in the sep tree */
- MPI_Comm *symb_comm
- )
-{
-  int szSep, i, j, jj, k, *pranks;
-  int np, npNode, fstP, lstP, p, code_err, ind, col, key;
-  MPI_Group mpi_new_group, mpi_base_group;
-  
-  /* Make a list of the processes in the new communicator. */
-  pranks = (int *) SUPERLU_MALLOC( nprocs * sizeof(int) );
-  
-  i = 2 * nprocs - 2;
-  MPI_Comm_dup ((*symb_comm), &(commLvls[i]));
-  szSep = 2;
-  i -= szSep;
-
-  while (i > 0) {
-    /* for each level in the separator tree */
-    npNode = nprocs / szSep; 
-    fstP = 0; 
-    /* for each node in the level */
-    for (j = i; j < i + szSep; j++) {
-      lstP = fstP + npNode;
-      if (fstP <= iam && iam < lstP) {
-	ind = j;
-	key = iam - fstP;
-	col = fstP;
-      }
-      fstP += npNode;
-    }
-    MPI_Comm_split ((*symb_comm), col, key, &(commLvls[ind]) );
-    szSep *= 2;
-    i -= szSep;
-  }
-  
-  SUPERLU_FREE (pranks);
-}
-
-static void
-intraLvl_symbfact 
-(
- SuperMatrix *A, /* Input - original matrix A  */
- int   iam,      /* Input - my processor number */
- int   lvl,      /* Input - current level in the separator tree */
- int   szSep,    /* Input - size of the current separator(node) */
- int   ind_sizes1,
- int   ind_sizes2, 
- int_t *sizes,     /* Input - sizes of each node in the separator tree */
- int_t *fstVtxSep, /* Input - first vertex of each node in the tree */
- int   fstP,     /* Input - first processor assigned to current node */
- int   lstP,     /* Input - last processor assigned to current node */
- int_t fstVtx,   /* Input - first vertex of current node */
- int_t lstVtx,   /* Input - last vertex of current node */
- Pslu_freeable_t *Pslu_freeable,   /* global LU data structures (modified) */
- Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
- vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */
- comm_symbfact_t *CS,
- psymbfact_stat_t *PS,
- int_t *marker,
- int_t *p_mark,    /* marker used to merge elements of vertices */
- int_t *p_nextl,   /* ptr to nextl in lsub structure */
- int_t *p_nextu,   /* ptr to nextu in usub structure */
- int_t *p_neltsZr, /* no of artificial zeros introduced so far */
- int_t *p_neltsTotal, /* no of nonzeros (including artificials) 
-			 computed so far */
- int_t *p_nsuper_loc,
- MPI_Comm ndComm,
- MPI_Comm    *symb_comm /* Input - communicator for symbolic factorization */
- )
-{
-  int nprocsLvl, p, prvP, rcvP;
-  int toSend, rcvd_prvP, index_req[2];
-  int_t fstVtx_loc_lid, fstVtx_loc, vtx, vtxLvl, curblk_loc, denseSep;
-  int_t fstVtx_blk, fstVtx_blk_lid, lstVtx_blk, lstVtx_blk_lid, tag;
-  int_t nvtcs_blk, xusub_end, xlsub_end, prv_fstVtx_blk;
-  int_t n;
-  int_t *rcv_intraLvl, *snd_intraLvl;
-  int_t *lsub_rcvd, lsub_rcvd_sz, *usub_rcvd, usub_rcvd_sz;
-  int_t nmsgsRcvd, nmsgsTRcv, sz_msg;
-  int_t nvtcsLvl_loc, nextl, nextu, ind_blk, snd_vtxLvl, maxNeltsVtx_in;
-  int_t count, vtx_loc, mem_error, lstBlkRcvd;
-  int_t fstVtx_blk_loc, fstBlk, vtx_lid, prElt, nelts, j, nvtcs_toUpd;
-  int_t snd_LinterLvlSz, fstVtx_blk_loc_lid, prElt_ind, maxNmsgsToRcv;
-  int_t *xlsub, *xusub, *lsub, *usub;
-  int_t *globToLoc, maxNvtcsPProc, nblk_loc, upd_myD, r, fstVtx_blkCyc;
-  int_t k, prElt_L, prElt_U, vtx_elt, fstVtx_toUpd;
-  int intSzMsg;
-
-  MPI_Status status[4];
-  MPI_Request request[4];
-  
-  /* Initializations */
-  lsub    = Llu_symbfact->lsub;   xlsub    = Llu_symbfact->xlsub;
-  usub    = Llu_symbfact->usub;   xusub    = Llu_symbfact->xusub;
-  
-  /* max number of msgs this processor can receive during 
-     intraLvl_symbfact routine */
-  maxNmsgsToRcv  = (lstVtx - fstVtx) / VInfo->maxSzBlk + 1;
-  maxNeltsVtx_in = VInfo->maxNeltsVtx;
-  globToLoc      = Pslu_freeable->globToLoc;
-  maxNvtcsPProc  = Pslu_freeable->maxNvtcsPProc;
-  n = A->ncol;
-  nprocsLvl       = lstP - fstP;
-  rcv_intraLvl    = CS->rcv_intraLvl;
-  snd_intraLvl    = CS->snd_intraLvl;
-  nvtcsLvl_loc    = VInfo->nvtcsLvl_loc;
-  nmsgsTRcv       = 0;
-  nmsgsRcvd       = 0;
-  nblk_loc        = 0;
-  nvtcs_toUpd     = nvtcsLvl_loc;
-  fstVtx_blk      = fstVtx;
-  denseSep        = FALSE;
-
-  /* determine first vertex that belongs to fstP */
-  k = fstVtx;
-  fstVtx_blkCyc = n;
-  while (k < lstVtx && fstVtx_blkCyc == n) {
-    p = OWNER( globToLoc[k] );
-    if (p == fstP)
-      fstVtx_blkCyc = k;
-    k += VInfo->maxSzBlk;
-  }
-
-  for (p = fstP; p < lstP; p++)
-    rcv_intraLvl[p] = 0;
-
-  for (r = 0; r < 3; r++) 
-    request[r] = MPI_REQUEST_NULL;
-
-  fstVtx_loc = VInfo->begEndBlks_loc[VInfo->curblk_loc];
-  fstVtx_loc_lid = LOCAL_IND( globToLoc[fstVtx_loc] ); 
-  vtx = fstVtx_loc;
-  if (fstVtx_loc >= fstVtx_blkCyc)
-    nblk_loc = 1;
-  while (VInfo->begEndBlks_loc[VInfo->curblk_loc] < lstVtx && !VInfo->filledSep) {
-    CS->snd_intraSz  = 0;
-    CS->snd_LintraSz = 0;
-
-    lstBlkRcvd     = FALSE;
-    prv_fstVtx_blk = fstVtx_blk;
-    fstVtx_blk     = VInfo->begEndBlks_loc[VInfo->curblk_loc];
-    lstVtx_blk     = VInfo->begEndBlks_loc[VInfo->curblk_loc + 1];
-    fstVtx_toUpd   = VInfo->begEndBlks_loc[VInfo->curblk_loc + 2];
-    fstVtx_blk_lid = LOCAL_IND( globToLoc[fstVtx_blk] );
-    lstVtx_blk_lid = LOCAL_IND( globToLoc[lstVtx_blk - 1] + 1);
-    nvtcs_blk      = lstVtx_blk - fstVtx_blk;
-    nvtcs_toUpd   -= nvtcs_blk;
-    nmsgsTRcv      = n;
-    VInfo->maxNeltsVtx -= fstVtx_blk - prv_fstVtx_blk;
-
-    index_req[0] = EMPTY;
-    for (r = 0; r < 3; r++) 
-      request[r] = MPI_REQUEST_NULL;
-    if (fstVtx_blk != fstVtx) {
-      /* if not the first vertex of the level */
-      prvP           = OWNER( globToLoc[fstVtx_blk - 1] );
-      rcvd_prvP      = FALSE;
-      /* receive info on number messages to receive */
-      tag = tag_intraLvl + nblk_loc;
-      if (iam == fstP)  tag --;
-      
-      MPI_Irecv (&(rcv_intraLvl[fstP]), nprocsLvl, mpi_int_t, prvP,
-		 tag, (*symb_comm), &(request[1]));
-
-      while (!rcvd_prvP || nmsgsRcvd < nmsgsTRcv) {
-	if (index_req[0] != 1) {
-	  MPI_Irecv (&sz_msg, 1, mpi_int_t, 
-		     MPI_ANY_SOURCE, tag_intraLvl_szMsg, 
-		     (*symb_comm), &(request[0]));  
-	  if (sz_msg > INT_MAX)
-	    ABORT("ERROR in intraLvl_symbfact size to send > INT_MAX\n");
-	}
-	MPI_Waitany (2, request, index_req, status);
-	if (index_req[0] == 1) {
-	  /* receive information on no msgs to receive */
-#if ( PRNTlevel>=1 )
-	  PS->no_shmRcvd ++;
-#endif
-	  rcvd_prvP = TRUE;
-	  nmsgsTRcv = rcv_intraLvl[iam];
-	  /* if dense separator was detected by one of the 
-	     previous processors ... */
-	  if (nmsgsTRcv > maxNmsgsToRcv) {
-	    VInfo->filledSep = (int) nmsgsTRcv / maxNmsgsToRcv;
-	    nmsgsTRcv = nmsgsTRcv % maxNmsgsToRcv;
-	  }
-	  
-	  if (nmsgsTRcv == nmsgsRcvd) {
-	    /* MPI_Cancel (&(request[0])); */
-	    MPI_Send (&r, 1, mpi_int_t, iam, 
-		      tag_intraLvl_szMsg, (*symb_comm));
-	    MPI_Wait (&(request[0]), status);	    
-	  }
-	}
-	if (index_req[0] == 0) {
-	  nmsgsRcvd ++;
-	  if (nmsgsTRcv == nmsgsRcvd)  lstBlkRcvd = TRUE; 
-	  rcvP = status->MPI_SOURCE;
-
-	  /* allocate enough space to receive data */
-	  if (CS->rcv_bufSz < sz_msg) {
-	    PS->maxSzBuf += sz_msg - CS->rcv_bufSz;
-	    if (CS->rcv_bufSz != 0)
-	      /* not first time allocate memory */
-	      SUPERLU_FREE (CS->rcv_buf);
-	    CS->rcv_bufSz = sz_msg;
-	    if (!(CS->rcv_buf = intMalloc_symbfact (sz_msg))) {
-	      ABORT("Malloc fails for rcv_buf[].");
-	    }
-	  }
-	  
-	  /* use received data to update symbolic factorization */
-	  lsub_rcvd = CS->rcv_buf;
-	  MPI_Recv (lsub_rcvd, sz_msg, mpi_int_t, 
-		    rcvP, tag_intraLvl_LData, (*symb_comm), status);
-	  MPI_Get_count (status, mpi_int_t, &intSzMsg);
-	  lsub_rcvd_sz = intSzMsg;
-	  usub_rcvd    = &(CS->rcv_buf[lsub_rcvd_sz]);
-	  MPI_Recv (usub_rcvd, sz_msg - lsub_rcvd_sz, 
-		    mpi_int_t, rcvP,
-		    tag_intraLvl_UData, (*symb_comm), status);
-	  MPI_Get_count (status, mpi_int_t, &intSzMsg);
-	  usub_rcvd_sz = intSzMsg;
-#if ( PRNTlevel>=1 )
-	  PS->no_shmRcvd ++;
-	  PS->no_msgsRcvd += (float) 2;
-	  PS->sz_msgsRcvd += (float) sz_msg;
-	  if (PS->maxsz_msgRcvd < lsub_rcvd_sz) PS->maxsz_msgRcvd = lsub_rcvd_sz;
-	  if (PS->maxsz_msgRcvd < usub_rcvd_sz) PS->maxsz_msgRcvd = usub_rcvd_sz;
-#endif
-
-	  if (!lstBlkRcvd) {
-	    mem_error = 
-	      rl_update (1, n, iam, lsub_rcvd, lsub_rcvd_sz,
-			 usub_rcvd, usub_rcvd_sz, EMPTY, EMPTY, EMPTY,
-			 fstVtx_blk, lstVtx, nvtcs_blk + nvtcs_toUpd,
-			 1, p_mark,
-			 marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
-	    lsub = Llu_symbfact->lsub;
-	    mem_error = 
-	      rl_update (1, n, iam, usub_rcvd, usub_rcvd_sz,
-			 lsub_rcvd, lsub_rcvd_sz, EMPTY, EMPTY, EMPTY, 
-			 fstVtx_blk, lstVtx, nvtcs_blk + nvtcs_toUpd,
-			 0, p_mark,
-			 marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
-	    usub = Llu_symbfact->usub;
-	  }
-	}
-      }
-    }
-  
-    if (VInfo->filledSep) {
-      mem_error = 
-	denseSep_symbfact (1, n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep,
-			   szSep, fstP, lstP, fstVtx_blkCyc, nblk_loc,
-			   p_nextl, p_nextu, p_mark, p_nsuper_loc, marker,
-			   ndComm, symb_comm, Llu_symbfact, Pslu_freeable, VInfo, CS, PS);
-    }
-    else {
-      /* compute symbolic factorization for this block */
-      if (!lstBlkRcvd) {
-	lsub_rcvd = NULL; usub_rcvd = NULL;
-      }
-
-      blk_symbfact (A, iam, lvl, 
-		    szSep, ind_sizes1, ind_sizes2, sizes, fstVtxSep,
-		    fstVtx_loc, fstVtx_blk, lstVtx_blk, 
-		    lsub_rcvd, lsub_rcvd_sz, usub_rcvd, usub_rcvd_sz,
-		    Pslu_freeable, Llu_symbfact, VInfo, CS, PS,
-		    marker, p_mark,
-		    p_nextl, p_nextu, p_neltsZr, p_neltsTotal, 
-		    p_nsuper_loc);
-      lsub = Llu_symbfact->lsub;
-      usub = Llu_symbfact->usub; 	 
-      
-      if (lstVtx_blk != lstVtx) {
-	/* if this is not the last block of the level */
-	if (VInfo->filledSep == FILLED_SEPS ||
-	    ( VInfo->filledSep == FILLED_SEP && 
-	      ((lstVtx - lstVtx_blk > VInfo->maxSzBlk * nprocsLvl && nblk_loc > 0) ||
-	       (lstVtx - fstVtx_blkCyc > VInfo->maxSzBlk * nprocsLvl && nblk_loc == 0))))
-	  /* if current separator is dense and this is not the last block, 
-	     then ... */
-	  denseSep = TRUE;
-	else
-	  /* separator dense but not enough uncomputed blocks 
-	     in the separator to take advantage of it */
-	  VInfo->filledSep = FALSE;
-	
-	if (VInfo->filledSep == FILLED_SEPS) {
-	  for (p = fstP; p < lstP; p++)
-	    rcv_intraLvl[p] = maxNmsgsToRcv * VInfo->filledSep + rcv_intraLvl[p];
-	  denseSep_symbfact (0, n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep,
-			     szSep, fstP, lstP, fstVtx_blkCyc, nblk_loc,
-			     p_nextl, p_nextu, p_mark, p_nsuper_loc, marker, ndComm, 
-			     symb_comm, Llu_symbfact, Pslu_freeable, VInfo, CS, PS);
-	}
-	else {
-	  /* send blk to next procs and update the rest of my own blocks */
-	  if (lstBlkRcvd) {
-	    mem_error = 
-	      rl_update (1, n, iam, lsub_rcvd, lsub_rcvd_sz,
-			 usub_rcvd, usub_rcvd_sz, EMPTY, EMPTY, EMPTY,
-			 fstVtx_toUpd, lstVtx, nvtcs_toUpd,
-			 1, p_mark,
-			 marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
-	    lsub = Llu_symbfact->lsub;
-	    mem_error = 
-	      rl_update (1, n, iam, usub_rcvd, usub_rcvd_sz,
-			 lsub_rcvd, lsub_rcvd_sz, EMPTY, EMPTY, EMPTY, 
-			 fstVtx_toUpd, lstVtx, nvtcs_toUpd,
-			 0, p_mark,
-			 marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
-	    usub = Llu_symbfact->usub;
-	  }
-
-	  upd_myD = FALSE;
-	  /* determine processors to which send this block
-	     and copy data to be sent */
-	  for (p = fstP; p < lstP; p++)
-	    snd_intraLvl[p] = FALSE;
-	  nextl = 0; 
-	  nextu = nextl + CS->snd_LintraSz;
-	  
-	  /* allocate enough space to receive data */
-	  if (CS->rcv_bufSz < CS->snd_intraSz) {
-	    PS->maxSzBuf += CS->snd_intraSz - CS->rcv_bufSz;
-	    if (CS->rcv_bufSz != 0)
-	      /* not first time allocate memory */
-	      SUPERLU_FREE (CS->rcv_buf);
-	    CS->rcv_bufSz = CS->snd_intraSz;
-	    if (!(CS->rcv_buf = intMalloc_symbfact (CS->snd_intraSz))) {
-	      ABORT("Malloc fails for rcv_buf[].");
-	    }
-	  }
-
-	  for (vtx = fstVtx_blk, vtx_lid = fstVtx_blk_lid; 
-	       vtx < lstVtx_blk; vtx++, vtx_lid ++) {
-	    toSend = FALSE;
-	    k = xlsub[vtx_lid];
-	    prElt_L = lsub[k];
-	    j = xusub[vtx_lid];
-	    prElt_U = usub[j];
-
-	    if (prElt_L >= lstVtx_blk || prElt_U >= lstVtx_blk) {
-	      if (vtx == lstVtx_blk - 1) {
-		xlsub_end = *p_nextl;
-		xusub_end = *p_nextu;
-	      }
-	      else {
-		xlsub_end = xlsub[vtx_lid + 1];
-		xusub_end = xusub[vtx_lid + 1];
-	      }
-	      if (prElt_L >= lstVtx_blk) {
-		while (lsub[k] <= prElt_L && k < xlsub_end) {
-		  vtx_elt = lsub[k];
-		  if (vtx_elt >= lstVtx_blk && vtx_elt < lstVtx) {
-		    p = OWNER( globToLoc[vtx_elt] );
-		    if (p != iam) {
-		      /* vtx will be send to another processor */
-		      snd_intraLvl[p] = TRUE;
-		      toSend = TRUE;
-		    }
-		    else {
-		      upd_myD = TRUE;
-		    }
-		  }
-		  k++;
-		}
-	      }
-	      if (prElt_U >= lstVtx_blk) {
-		while (usub[j] <= prElt_U && j < xusub_end) {
-		  vtx_elt = usub[j];
-		  if (vtx_elt >= lstVtx_blk && vtx_elt < lstVtx) {
-		    p = OWNER( globToLoc[vtx_elt] );
-		    if (p != iam) {
-		      /* vtx will be send to another processor */
-		      snd_intraLvl[p] = TRUE;
-		      toSend = TRUE;
-		    }
-		    else {
-		      upd_myD = TRUE;
-		    }
-		  }
-		  j ++;
-		}
-	      }
-	      if (toSend) {
-		/* L(:, vtx) and U(vtx, :) will be send to processors */
-		nelts = xusub_end - xusub[vtx_lid];
-		CS->rcv_buf[nextu + DIAG_IND]  = vtx;
-		CS->rcv_buf[nextu + NELTS_IND] = nelts;
-		nextu += 2;
-		for (j = xusub[vtx_lid]; j < xusub_end; j++) {
-		  CS->rcv_buf[nextu] = usub[j]; nextu ++;
-		}
-		
-		nelts = xlsub_end - xlsub[vtx_lid];
-		CS->rcv_buf[nextl + DIAG_IND] = vtx;
-		CS->rcv_buf[nextl + NELTS_IND] = nelts; 
-		nextl += 2;
-		for (j = xlsub[vtx_lid]; j < xlsub_end; j++) {
-		  CS->rcv_buf[nextl] = lsub[j]; nextl ++;
-		}
-	      }
-	    }
-	  }
-	  for (p = fstP; p < lstP; p++) 
-	    if (snd_intraLvl[p])
-	      rcv_intraLvl[p] ++;
-
-	  if (VInfo->filledSep == FILLED_SEP) {
-	    for (p = fstP; p < lstP; p++)
-	      rcv_intraLvl[p] = maxNmsgsToRcv * VInfo->filledSep + 
-		rcv_intraLvl[p];
-	  }
-	  else {
-	    /* send to the owner of the next block info on no of messages */
-	    p = OWNER( globToLoc[lstVtx_blk] );
-	    tag = tag_intraLvl + nblk_loc;
-	    
-	    MPI_Isend (&(rcv_intraLvl[fstP]), nprocsLvl, mpi_int_t, p,
-		       tag, (*symb_comm), request);
-#if ( PRNTlevel>=1 )
-	    PS->no_shmSnd ++;
-#endif
-	  }
-
-	  /* there is data to be send */
-	  sz_msg = nextl + nextu - CS->snd_LintraSz;
-	  for (p = fstP; p < lstP; p++) {
-	    if (p != iam && snd_intraLvl[p]) {
-	      MPI_Isend (&sz_msg, 1, mpi_int_t, p,
-			 tag_intraLvl_szMsg, (*symb_comm), &(request[1]));
-	      MPI_Isend (CS->rcv_buf, nextl, mpi_int_t, p,
-			 tag_intraLvl_LData, (*symb_comm), &(request[2]));
-	      MPI_Isend (&(CS->rcv_buf[CS->snd_LintraSz]), 
-			 nextu - CS->snd_LintraSz, mpi_int_t, p,
-			 tag_intraLvl_UData, (*symb_comm), &(request[3]));
-	      MPI_Waitall(3, &(request[1]), &(status[1]));
-#if ( PRNTlevel>=1 )
-	      PS->no_shmSnd ++;
-	      PS->no_msgsSnd += (float) 2;
-	      PS->sz_msgsSnd += (float) sz_msg;
-	      if (PS->maxsz_msgSnd < nextl) PS->maxsz_msgSnd = nextl;
-	      if (PS->maxsz_msgSnd < nextu - CS->snd_LintraSz) 
-		PS->maxsz_msgSnd = nextu - CS->snd_LintraSz;
-#endif
-	    }
-	  }
-	  if (VInfo->filledSep != FILLED_SEP) {
-	    MPI_Wait (request, status);      
-	  }
-
-	  /* update rest of vertices */
-	  if (upd_myD) {
-	    lsub_rcvd_sz = (*p_nextl) - xlsub[fstVtx_blk_lid];
-	    lsub_rcvd    = &(lsub[xlsub[fstVtx_blk_lid]]);
-	    usub_rcvd_sz = (*p_nextu) - xusub[fstVtx_blk_lid];
-	    usub_rcvd    = &(usub[xusub[fstVtx_blk_lid]]);
-	    
-	    mem_error =
-	      rl_update (0, n, iam, lsub_rcvd, lsub_rcvd_sz,
-			 usub_rcvd, usub_rcvd_sz, fstVtx_blk, lstVtx_blk,
-			 EMPTY,
-			 fstVtx_toUpd, lstVtx, nvtcs_toUpd,
-			 1, p_mark,
-			 marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
-	    lsub = Llu_symbfact->lsub;
-	    lsub_rcvd    = &(lsub[xlsub[fstVtx_blk_lid]]);
-	    mem_error =
-	      rl_update (0, n, iam, usub_rcvd, usub_rcvd_sz,
-			 lsub_rcvd, lsub_rcvd_sz, fstVtx_blk, lstVtx_blk,
-			 EMPTY,
-			 fstVtx_toUpd, lstVtx, nvtcs_toUpd,
-			 0, p_mark,
-			 marker, Pslu_freeable, Llu_symbfact, VInfo, PS);
-	    usub = Llu_symbfact->usub;
-	  }
-	  if (VInfo->filledSep == FILLED_SEP)
-	    denseSep_symbfact (0, n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep,
-			       szSep, fstP, lstP, fstVtx_blkCyc, nblk_loc,
-			       p_nextl, p_nextu, p_mark, p_nsuper_loc, marker, ndComm, 
-			       symb_comm, Llu_symbfact, Pslu_freeable, VInfo, CS, PS);
-	}
-      }
-    }
-    VInfo->curblk_loc += 2;
-    nblk_loc ++;
-  }
-  
-  /* update maxNeltsVtx */
-  VInfo->maxNeltsVtx = maxNeltsVtx_in - lstVtx + fstVtx;
-  
-  /* if current separator dense, then reset value of filledSep */
-  if (VInfo->filledSep == FILLED_SEP)
-    VInfo->filledSep = FALSE;
-}
-
-static void
-symbfact_free 
-(
- int   iam,    /* Input - my processor number */
- int   nprocs, /* Input - number of processors for the symbolic factorization */
- Llu_symbfact_t *Llu_symbfact,  /* Input/Output - local L, U data structures */
- vtcsInfo_symbfact_t *VInfo, /* Input/Output - local info on vertices distribution */
- comm_symbfact_t *CS
- )
-{
-  /* free memory corresponding to prune structure */
-  if (Llu_symbfact->szLsubPr != 0)
-    SUPERLU_FREE( Llu_symbfact->lsubPr );
-#if 0
-  if (Llu_symbfact->szUsubPr != 0)
-    SUPERLU_FREE( Llu_symbfact->usubPr );
-#else /* XSL correction */
-  if (Llu_symbfact->usubPr != NULL)
-    SUPERLU_FREE( Llu_symbfact->usubPr );
-#endif
-  if (Llu_symbfact->xlsubPr != NULL)
-    SUPERLU_FREE( Llu_symbfact->xlsubPr );
-  if (Llu_symbfact->xusubPr != NULL)
-    SUPERLU_FREE( Llu_symbfact->xusubPr );
-  
-  if (Llu_symbfact->xlsub_rcvd != NULL)
-    SUPERLU_FREE( Llu_symbfact->xlsub_rcvd);
-  if (Llu_symbfact->xusub_rcvd != NULL)
-    SUPERLU_FREE( Llu_symbfact->xusub_rcvd);
-  
-  if (Llu_symbfact->cntelt_vtcs != NULL)
-    SUPERLU_FREE( Llu_symbfact->cntelt_vtcs);
-  if (Llu_symbfact->cntelt_vtcsA_lvl != NULL)
-    SUPERLU_FREE( Llu_symbfact->cntelt_vtcsA_lvl);
-  
-  if (CS->rcv_bufSz != 0)
-    SUPERLU_FREE( CS->rcv_buf );
-  if (CS->snd_bufSz != 0)
-    SUPERLU_FREE( CS->snd_buf );
-  
-  SUPERLU_FREE( VInfo->begEndBlks_loc);
-  SUPERLU_FREE( CS->rcv_interLvl);
-  SUPERLU_FREE( CS->snd_interLvl);
-  SUPERLU_FREE( CS->ptr_rcvBuf);
-  SUPERLU_FREE( CS->rcv_intraLvl);
-  SUPERLU_FREE( CS->snd_intraLvl);
-  SUPERLU_FREE( CS->snd_interSz);
-  SUPERLU_FREE( CS->snd_LinterSz);
-  SUPERLU_FREE( CS->snd_vtxinter);  
-}
-
-static void
-estimate_memUsage
-(
- int_t n,  /* Input - order of the matrix */
- int iam,  /* Input - my processor number */
- mem_usage_t *symb_mem_usage,
- float *p_totalMemLU,   /* Output -memory used for symbolic factorization */
- float *p_overestimMem, /* Output -memory allocated during to right looking 
-			   overestimation memory usage */
- Pslu_freeable_t *Pslu_freeable,   /* global LU data structures (modified) */
- Llu_symbfact_t *Llu_symbfact,  /* Input - local L, U data structures */
- vtcsInfo_symbfact_t *VInfo, /* Input - local info on vertices distribution */
- comm_symbfact_t *CS,
- psymbfact_stat_t *PS
- )
-{
-  int_t nvtcs_loc, lword, nsuper_loc;
-  float lu_mem, other_mem, overestimMem;
-  
-  nvtcs_loc = VInfo->nvtcs_loc;
-  nsuper_loc = Pslu_freeable->supno_loc[nvtcs_loc];
-  lword     = sizeof(int_t);
-  
-  /* memory for xlsub, xusub, supno_loc, cntelt_vtcs */
-  lu_mem = 4.0 * (float) nvtcs_loc * (float) lword;
-  /* memory for xlsubPr, xusubPr */
-  lu_mem += 2.0 * (float) VInfo->maxNvtcsNds_loc * (float) lword;
-  
-  if (PS->estimLSz < Llu_symbfact->xlsub[nvtcs_loc])
-    PS->estimLSz = Llu_symbfact->xlsub[nvtcs_loc];
-  if (PS->estimUSz < Llu_symbfact->xusub[nvtcs_loc])
-    PS->estimUSz = Llu_symbfact->xusub[nvtcs_loc];
-  
-  lu_mem += (float) PS->estimLSz * lword;
-  lu_mem += (float) PS->estimUSz * lword;
-  lu_mem += (float) PS->maxSzLPr * lword;
-  lu_mem += (float) PS->maxSzUPr * lword;
-  lu_mem += (float) PS->szDnsSep * lword;
-  /* memory for globToLoc, tempArray */
-  lu_mem += (float) 2* (float) n * lword;
-  lu_mem += (float) PS->maxSzBuf * lword;
-  
-  overestimMem  = (float) (PS->estimLSz - Llu_symbfact->xlsub[nvtcs_loc]) * lword;
-  overestimMem += (float) (PS->estimUSz - Llu_symbfact->xusub[nvtcs_loc]) * lword;
-  
-  *p_totalMemLU = lu_mem;  
-  *p_overestimMem = overestimMem;
-  
-  symb_mem_usage->for_lu = (float) ((3 * nvtcs_loc + 2 * nsuper_loc) * lword);
-  symb_mem_usage->for_lu += (float) (Llu_symbfact->xlsub[nvtcs_loc] * lword); 
-  symb_mem_usage->for_lu += (float) (Llu_symbfact->xusub[nvtcs_loc] * lword);   
-  symb_mem_usage->total = lu_mem;
-}
-
-
-static int_t *
-intMalloc_symbfact(int_t n)
-{
-  int_t *buf;
-  if (n == 0)
-    buf = NULL;
-  else
-    buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t));
-  return buf;
-}
-
-static int_t *
-intCalloc_symbfact(int_t n)
-{
-  int_t *buf;
-  register int_t i;
-
-  if (n == 0)
-    buf = NULL;
-  else
-    buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t));
-  if ( buf )
-    for (i = 0; i < n; i++) buf[i] = 0;
-  return (buf);
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/psymbfact.h hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/psymbfact.h
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/psymbfact.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/psymbfact.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,284 +0,0 @@
-/*
- * -- Distributed SuperLU routine (version 2.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * March 15, 2003
- *
- */
-
-#ifndef __SUPERLU_DIST_PSYMBFACT /* allow multiple inclusions */
-#define __SUPERLU_DIST_PSYMBFACT
-
-/*
- * File name:	psymbfact.h
- * Purpose:     Definitions for parallel symbolic factorization routine
- */
-
-/*
- *-- Structure returned by the symbolic factorization routine
- *
- * Memory is allocated during parallel symbolic factorization
- * symbfact_dist, and freed after dist_symbLU routine.
- *
- * (xlsub,lsub): lsub[*] contains the compressed subscript of
- *	rectangular supernodes; xlsub[j] points to the starting
- *	location of the j-th column in lsub[*]. Note that xlsub 
- *	is indexed by column.
- *	Storage: row subscripts
- *
- * (xusub,usub): lsub[*] contains the compressed subscript of
- *	rectangular supernodes; xusub[j] points to the starting
- *	location of the j-th row in usub[*]. Note that xusub 
- *	is indexed by rows.
- *	Storage: column subscripts
- *
- * (xsup_beg_loc,xsup_end_loc, supno_loc) describes mapping between 
- *      supernode and column, information local to each processor:
- *	xsup_beg_loc[s] is the leading column of the local s-th supernode.
- *	xsup_end_loc[s] is the last column of the local s-th supernode.
- *      supno[i] is the supernode no to which column i belongs;
- *
- */
-
-typedef struct {
-  int_t     *xlsub;  /* pointer to the beginning of each column of L */
-  int_t     *lsub;   /* compressed L subscripts, stored by columns */
-  int_t     szLsub;  /* current max size of lsub */
-  
-  int_t     *xusub;  /* pointer to the beginning of each row of U */
-  int_t     *usub;   /* compressed U subscripts, stored by rows */
-  int_t     szUsub;  /* current max size of usub */
-  
-  int_t     *supno_loc;  
-  int_t     *xsup_beg_loc;
-  int_t     *xsup_end_loc;
-  int_t     nvtcs_loc;       /* number of local vertices */
-  int_t     *globToLoc;      /* global to local indexing */
-  int_t     maxNvtcsPProc;   /* max number of vertices on the processors */
-} Pslu_freeable_t;
-
-
-/*
- *-- The structures are determined by symbfact_dist and not used thereafter.
- *
- * (xlsub,lsub): lsub[*] contains the compressed subscript of L, as described above
- *      for Pslu_freeable_t.  This structure is used internally in symbfact_dist.
- * (xusub,usub): usub[*] contains the compressed subscript of U, as described above
- *      for Pslu_freeable_t.  This structure is used internally in symbfact_dist.
- *
- * (xlsubPr,lsubPr): contains the pruned structure of the graph of
- *      L, stored by rows as a linked list.
- *	xlsubPr[j] points to the starting location of the j-th 
- *      row in lsub[*].
- *	Storage: original row subscripts.
- *      It contains the structure corresponding to one node in the sep_tree.
- *      In each independent domain formed by x vertices, xlsubPr is of size x.
- *      Allocated and freed during domain_symbolic.
- *      For the other nodes in the level tree, formed by a maximum of 
- *      maxNvtcsNds_loc, xlsubPr is of size maxNvtcsNds_loc. 
- *      Allocated after domain_symbolic, freed at the end of symbolic_dist
- *      routine.
- * (xusubPr,usubPr): contains the pruned structure of the graph of
- *      U, stored by columns as a linked list.  Similar to (xlsubPr,lsubPr),
- *      except that it is column oriented. 
- *
- * This is allocated during symbolic factorization symbfact_dist.
- */
-
-typedef struct {
-  int_t     *xlsubPr;  /* pointer to pruned structure of L */
-  int_t     *lsubPr;   /* pruned structure of L */
-  int_t     szLsubPr;  /* size of lsubPr array */
-  int_t     indLsubPr; /* current index in lsubPr */
-  int_t     *xusubPr;  /* pointer to pruned structure of U */
-  int_t     *usubPr;   /* pruned structure of U */
-  int_t     szUsubPr;  /* size of usubPr array */
-  int_t     indUsubPr; /* current index in usubPr */
-
-  int_t     *xlsub_rcvd;
-  int_t     *xlsub;     /* pointer to structure of L, stored by columns */
-  int_t     *lsub;      /* structure of L, stored by columns */
-  int_t     szLsub;     /* current max size of lsub */
-  int_t     nextl;      /* pointer to current computation in lsub */
-  
-  int_t     *xusub_rcvd; /* */
-  int_t     *xusub;      /* pointer to structure of U, stored by rows */
-  int_t     *usub;       /* structure of U, stored by rows */
-  int_t     szUsub;      /* current max size of usub */
-  int_t     nextu;       /* pointer to current computation in usub */
-  
-  int_t     *cntelt_vtcs; /* size of column/row for each vertex */
-  int_t     *cntelt_vtcsA_lvl; /* size of column/row of A for each vertex at the
-				  current level */
-  
-  LU_space_t MemModel; /* 0 - system malloc'd; 1 - user provided */
-  int_t  no_expand;    /* Number of memory expansions */
-  int_t  no_expand_pr; /* Number of memory expansions of the pruned structures */
-  int_t  no_expcp;     /* Number of memory expansions due to the right looking 
-			  overestimation approach */
-} Llu_symbfact_t;
-
-/* Local information on vertices distribution */
-typedef struct {
-  int_t  maxSzBlk;        /* Max no of vertices in a block */
-  int_t  maxNvtcsNds_loc; /* Max number of vertices of a node distributed on one
-			     processor.  The maximum is computed among all the nodes 
-			     of the sep arator tree and among all the processors */
-  int_t  maxNeltsVtx;     /* Max number of elements of a vertex,
-			     that is condisering that the matrix is
-			     dense */
-  int_t  nblks_loc;       /* Number of local blocks */
-  int_t  *begEndBlks_loc; /* Begin and end vertex of each local block.
-			     Array of size 2 * nblks_loc */
-  int_t  curblk_loc;      /* Index of current block in the level under computation */
-  int_t  nvtcs_loc;       /* Number of local vertices distributed on a processor */
-  int_t  nvtcsLvl_loc;    /* Number of local vertices for current
-			     level under computation */
-  int    filledSep;       /* determines if curent or all separators are filled */
-  int_t  nnz_asup_loc;    /* Number of nonzeros in asup not yet consumed.  Used during
-			     symbolic factorization routine to determine how much 
-			     of xusub, usub is still used to store the input matrix AS */
-  int_t  nnz_ainf_loc;    /* Number of nonzeros in ainf.  Similar to nnz_asup_loc. */
-  int_t  xusub_nextLvl;   /* Pointer to usub of the next level */
-  int_t  xlsub_nextLvl;   /* Pointer to lsub of the next level */
-  int_t  fstVtx_nextLvl;  /* First vertex of the next level */
-} vtcsInfo_symbfact_t;
-
-/* Structure used for redistributing A for the symbolic factorization algorithm */
-typedef struct {
-  int_t  *x_ainf;   /* pointers to columns of Ainf */
-  int_t  *ind_ainf; /* column indices of Ainf */
-  int_t  *x_asup;   /* pointers to rows of Asup */
-  int_t  *ind_asup; /* row indices of Asup */
-} matrix_symbfact_t;
-
-typedef struct {
-  int_t  *rcv_interLvl; /* from which processors iam receives data */
-  int_t  *snd_interLvl; /* to which processors iam sends data */
-  int_t  *snd_interSz;  /* size of data to be send */
-  int_t  *snd_LinterSz; /* size of data in L part to be send */
-  int_t  *snd_vtxinter; /* first vertex from where to send data */
-
-  /* inter level data structures */
-  int_t  *snd_intraLvl; /* to which processors iam sends data */
-  int_t  snd_intraSz;   /* size of data to send */
-  int_t  snd_LintraSz;  /* size of data to send */
-  int_t  *rcv_intraLvl; /* from which processors iam receives data */
-  int_t  *rcv_buf;      /* buffer to receive data */
-  int_t  rcv_bufSz;     /* size of the buffer to receive data */
-  int_t  *snd_buf;      /* buffer to send data */
-  int_t  snd_bufSz;     /* size of the buffer to send data */
-  int_t  *ptr_rcvBuf;   /* pointer to rcv_buf, the buffer to receive data */
-} comm_symbfact_t;
-
-/* relaxation parameters used in the algorithms - for future release */
-/* statistics collected during parallel symbolic factorization */
-typedef struct {
-  int_t  fill_par;     /* Estimation of fill.  It corresponds to sp_ienv_dist(6) */
-  float  relax_seps;   /* relaxation parameter -not used in this version */
-  float  relax_curSep; /* relaxation parameter -not used in this version */
-  float  relax_gen;    /* relaxation parameter -not used in this version */
-
-  /* number of operations performed during parallel symbolic factorization */
-  float  nops;
-  
-  /* no of dense current separators per proc */
-  int_t nDnsCurSep;
-  /* no of dense separators up per proc */
-  int_t  nDnsUpSeps;
-  
-  float  no_shmSnd;    /* Number of auxiliary messages for send data */
-  float  no_msgsSnd;   /* Number of messages sending data */
-  int_t  maxsz_msgSnd; /* Max size of messages sending data */
-  float  sz_msgsSnd;   /* Average size of messages sending data */
-  float  no_shmRcvd;   /* Number of auxiliary messages for rcvd data */
-  float  no_msgsRcvd;  /* Number of messages receiving data */
-  int_t  maxsz_msgRcvd;/* Max size of messages receiving data */
-  float  sz_msgsRcvd;  /* Average size of messages receiving data */
-  float  no_msgsCol;   /* Number of messages sent for estimating size
-			  of rows/columns, setup information
-			  interLvl_symbfact,  */
-  int_t  maxsz_msgCol; /* Average size of messages counted in
-			  no_msgsCol */
-  float  sz_msgsCol;   /* Max size of messages counted in no_msgsCol */
-
-  /* statistics on fill-in */
-  float  fill_pelt[6];
-  /* 
-     0 - average fill per elt added during right-looking factorization 
-     1 - max fill per elt added during right-looking factorization 
-     2 - number vertices modified during right-looking factorization 
-     3 - average fill per elt 
-     4 - max fill per elt 
-     5 - number vertices computed in upper levels of separator tree
-  */
-
-  /* Memory usage */
-  int_t  estimLSz; /* size of lsub due to right looking overestimation */
-  int_t  estimUSz; /* size of usub due to right looking overestimation */
-  int_t  maxSzLPr; /* maximum size of pruned L */
-  int_t  maxSzUPr; /* maximum size of pruned U */
-  int_t  maxSzBuf; /* maximum size of the send and receive buffers */
-  int_t  szDnsSep; /* size of memory used when there are dense separators */
-  float  allocMem; /* size of the total memory allocated (in bytes) */
-} psymbfact_stat_t;
-
-/* MACROS */
-
-/* 
-   Macros for comptuting the owner of a vertex and the local index
-   corresponding to a vertex 
-*/
-#define OWNER(x)      ((x) / maxNvtcsPProc)
-#define LOCAL_IND(x)  ((x) % maxNvtcsPProc)
-
-/* Macros for computing the available memory in lsub, usub */
-#define MEM_LSUB(Llu, VInfo) (Llu->szLsub - VInfo->nnz_ainf_loc)
-#define MEM_USUB(Llu, VInfo) (Llu->szUsub - VInfo->nnz_asup_loc)
-
-#define tag_interLvl 2
-#define tag_interLvl_LData 0
-#define tag_interLvl_UData 1
-#define tag_intraLvl_szMsg 1000
-#define tag_intraLvl_LData 1001
-#define tag_intraLvl_UData 1002
-/* tag_intraLvl has to be the last tag number */
-#define tag_intraLvl 1003
-
-/* 
- * Index of diagonal element, no of elements preceding each column/row
- * of L/U send to another processor 
- */
-#define DIAG_IND 0
-#define NELTS_IND 1
-#define RCVD_IND 2
-
-#define SUCCES_RET 0  /* successful return from a routine */
-#define ERROR_RET 1   /* error return code from a routine */
-#define FILLED_SEP 2  /* the current separator is dense */
-#define FILLED_SEPS 3 /* all the separators situated on the path from the current 
-			 separator to the root separator are dense */
-
-/* Code for the type of the memory to expand */
-#define USUB_PR 0
-#define LSUB_PR 1
-#define USUB 0
-#define LSUB 1
-
-/* 
- * Code for the type of computation - right looking (RL_SYMB); left
- * looking (LL_SYMB); symbolic factorization of an independent domain
- * (DOMAIN_SYMB); current separator is dense (DNS_CURSEP); all the
- * separators from the current one to the root of the tree are dense
- * (DNS_UPSEPS).
- */
-#define RL_SYMB 0
-#define DOMAIN_SYMB 1
-#define LL_SYMB 2
-#define DNS_UPSEPS 3
-#define DNS_CURSEP 4
-
-
-#endif /* __SUPERLU_DIST_PSYMBFACT */
-
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/psymbfact_util.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/psymbfact_util.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/psymbfact_util.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/psymbfact_util.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,522 +0,0 @@
-
-/*
- * -- Distributed symbolic factorization auxialiary routine  (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley - July 2003
- * INRIA France - January 2004
- * Laura Grigori
- *
- * November 1, 2007
- */
-
-#include "superlu_ddefs.h"
-#include "psymbfact.h"
-
-static void
-copy_mem_int(int_t howmany, int_t* old, int_t* new)
-{
-  register int_t i;
-  for (i = 0; i < howmany; i++) new[i] = old[i];
-}
-
-
-/*
- * Expand the existing storage to accommodate more fill-ins.
- */
-/************************************************************************/
-static int_t *expand
-/************************************************************************/
-(
- int_t prev_len,    /* length used from previous call */
- int_t min_new_len, /* minimum new length to allocate */
- int_t *prev_mem,    /* pointer to the previous memory */
- int_t *p_new_len,      /* length of the new memory allocated */
- int_t len_tcopy_fbeg,  /* size of the memory to be copied to new store 
-			     starting from the beginning of the memory */
- int_t len_tcopy_fend,  /* size of the memory to be copied to new store,
-			    starting from the end of the memory */
- psymbfact_stat_t *PS
- )
-{
-  float exp = 2.0;
-  float alpha;
-  int_t *new_mem;
-  int_t new_len, tries, lword, extra, bytes_to_copy;
-  
-  alpha = exp;
-  lword = sizeof(int_t);
-  
-  new_len = alpha * prev_len;
-  if (min_new_len > 0 && new_len < min_new_len)
-    new_len = min_new_len;
-  
-  new_mem = (void *) SUPERLU_MALLOC(new_len * lword);
-  PS->allocMem += new_len * lword;
-  
-  if (new_mem) {
-    if (len_tcopy_fbeg != 0)
-      copy_mem_int(len_tcopy_fbeg, prev_mem, new_mem);
-    if (len_tcopy_fend != 0)  
-      copy_mem_int(len_tcopy_fend, &(prev_mem[prev_len-len_tcopy_fend]), 
-		   &(new_mem[new_len-len_tcopy_fend]));
-  }
-  *p_new_len = new_len;
-  return new_mem;
-  
-} /* EXPAND */
-
-
-/*
- * Expand the data structures for L and U during the factorization.
- * Return value:   0 - successful return
- *               > 0 - number of bytes allocated when run out of space
- */
-/************************************************************************/
-int_t psymbfact_LUXpandMem
-/************************************************************************/
-(
- int_t iam,
- int_t n,           /* total number of columns */
- int_t vtxXp,       /* current vertex */
- int_t next,        /* number of elements currently in the factors */
- int_t min_new_len, /* minimum new length to allocate */
- int_t mem_type,    /* which type of memory to expand  */
- int_t rout_type,   /* during which type of factorization */
- int_t free_prev_mem, /* =1 if prev_mem has to be freed */
- Pslu_freeable_t *Pslu_freeable,
- Llu_symbfact_t *Llu_symbfact,  /* modified - global LU data structures */
- vtcsInfo_symbfact_t *VInfo,
- psymbfact_stat_t *PS
- )
-{
-  int_t  *new_mem, *prev_mem, *xsub;
-  /* size of the memory to be copied to new store starting from the 
-     beginning/end of the memory */
-  int_t xsub_nextLvl;  
-  int_t exp, prev_xsub_nextLvl, vtxXp_lid;
-  int_t *globToLoc, maxNvtcsPProc, nvtcs_loc;
-  int_t fstVtx_nextLvl, fstVtx_nextLvl_lid, vtx_lid, i, j;
-  int_t len_tcopy_fbeg, len_tcopy_fend, new_len, prev_len;  
-
-  exp  = 2;
-  globToLoc = Pslu_freeable->globToLoc;
-  nvtcs_loc = VInfo->nvtcs_loc;
-  maxNvtcsPProc  = Pslu_freeable->maxNvtcsPProc;
-  fstVtx_nextLvl = VInfo->fstVtx_nextLvl;
-  vtxXp_lid      = LOCAL_IND( globToLoc[vtxXp] );
-  len_tcopy_fbeg = next;
-  if (fstVtx_nextLvl == n)
-    fstVtx_nextLvl_lid = nvtcs_loc;
-  else
-    fstVtx_nextLvl_lid = LOCAL_IND( globToLoc[fstVtx_nextLvl] );  
-
-  if ( mem_type == LSUB ) {
-    prev_mem = Llu_symbfact->lsub;
-    prev_len = Llu_symbfact->szLsub;
-    xsub = Llu_symbfact->xlsub;
-    if (rout_type == DOMAIN_SYMB)
-      prev_xsub_nextLvl = xsub[vtxXp_lid+1];
-    else
-      prev_xsub_nextLvl = VInfo->xlsub_nextLvl;
-  } else if ( mem_type == USUB ) {
-    prev_mem = Llu_symbfact->usub;
-    prev_len = Llu_symbfact->szUsub;
-    xsub = Llu_symbfact->xusub;
-    if (rout_type == DOMAIN_SYMB)
-      prev_xsub_nextLvl = xsub[vtxXp_lid+1];
-    else
-      prev_xsub_nextLvl = VInfo->xusub_nextLvl;
-  }
-  
-  len_tcopy_fend = prev_len - prev_xsub_nextLvl;  
-  if (rout_type == DNS_UPSEPS || rout_type == DNS_CURSEP) {
-    fstVtx_nextLvl = n;
-    fstVtx_nextLvl_lid = nvtcs_loc;
-    len_tcopy_fend = 0;
-  }
-#ifdef TEST_SYMB
-  printf ("Pe[%d] LUXpand mem_t %d vtxXp %d\n", 
-	  iam, mem_type, vtxXp); 
-#endif
-  new_mem = expand (prev_len, min_new_len, prev_mem,
-		    &new_len, len_tcopy_fbeg, len_tcopy_fend, PS);
-  if ( !new_mem ) {
-    fprintf(stderr, "Pe[%d] Can't exp MemType %d: prv_len %d min_new %d new_l %d\n",
-	   iam, mem_type, prev_len, min_new_len, new_len);
-    return ERROR_RET;
-  }
-  
-  xsub_nextLvl = new_len - len_tcopy_fend;
-  
-  /* reset xsub information pointing to A data */
-  if (fstVtx_nextLvl != n || rout_type == DOMAIN_SYMB) {
-    if (rout_type == DOMAIN_SYMB)
-      vtx_lid = vtxXp_lid + 1;
-    else {
-      vtx_lid = fstVtx_nextLvl_lid +1;
-    }
-    i = xsub_nextLvl + xsub[vtx_lid] - prev_xsub_nextLvl;
-    for (; vtx_lid < nvtcs_loc; vtx_lid ++) {
-      j = xsub[vtx_lid+1] - xsub[vtx_lid];
-      xsub[vtx_lid] = i;
-      i += j;
-    }
-    xsub[vtx_lid] = i;
-  }
-
-  if (free_prev_mem) {
-    SUPERLU_FREE (prev_mem);
-    PS->allocMem -= 0;
-  }
-  
-  if ( mem_type == LSUB ) {
-    Llu_symbfact->lsub   = new_mem;
-    Llu_symbfact->szLsub = new_len;
-    VInfo->xlsub_nextLvl = xsub_nextLvl;
-  } else if ( mem_type == USUB ) {
-    Llu_symbfact->usub   = new_mem;
-    Llu_symbfact->szUsub = new_len;
-    VInfo->xusub_nextLvl = xsub_nextLvl;
-  }
-  
-  Llu_symbfact->no_expand ++;
-  return SUCCES_RET;
-}
-
-/*
- * Expand the data structures for L and U during the factorization.
- * Return value: SUCCES_RET - successful return
- *               ERROR_RET - error due to a memory alocation failure
- */
-/************************************************************************/
-int_t psymbfact_LUXpand
-/************************************************************************/
-(
- int_t iam, 
- int_t n,           /* total number of columns */
- int_t fstVtxLvl_loc, /* first vertex in the level to update */
- int_t vtxXp,         /* current vertex */
- int_t *p_next,        /* number of elements currently in the factors */
- int_t min_new_len, /* minimum new length to allocate */
- int_t mem_type,   /* which type of memory to expand  */
- int_t rout_type,  /* during which type of factorization */
- int_t free_prev_mem, /* =1 if free prev_mem memory */
- Pslu_freeable_t *Pslu_freeable, 
- Llu_symbfact_t *Llu_symbfact,  /* modified - global LU data structures */
- vtcsInfo_symbfact_t *VInfo,
- psymbfact_stat_t *PS
- )
-{
-  int mem_error;
-  int_t  *new_mem, *prev_mem, *xsub, sz_prev_mem;
-  /* size of the memory to be copied to new store starting from the 
-     beginning/end of the memory */
-  int_t exp, prev_xsub_nextLvl, vtxXp_lid, xsub_nextLvl;
-  int_t *globToLoc, nvtcs_loc, maxNvtcsPProc;
-  int_t fstVtx_nextLvl, fstVtx_nextLvl_lid;
-  int_t i, j, k, vtx_lid, len_texp, nelts, nel;
-  int_t fstVtxLvl_loc_lid, prev_len, next;
-  
-  exp  = 2;
-  next = *p_next;
-  globToLoc = Pslu_freeable->globToLoc;
-  nvtcs_loc = VInfo->nvtcs_loc;
-  maxNvtcsPProc  = Pslu_freeable->maxNvtcsPProc;
-  fstVtx_nextLvl = VInfo->fstVtx_nextLvl;
-  
-  vtxXp_lid = LOCAL_IND( globToLoc[vtxXp] );
-  if (fstVtx_nextLvl == n)
-    fstVtx_nextLvl_lid = VInfo->nvtcs_loc;
-  else
-    fstVtx_nextLvl_lid = LOCAL_IND( globToLoc[fstVtx_nextLvl] );  
-  if (rout_type == RL_SYMB)
-    fstVtxLvl_loc_lid = LOCAL_IND( globToLoc[fstVtxLvl_loc] );
-
-  if ( mem_type == LSUB ) {
-    xsub = Llu_symbfact->xlsub;
-    prev_mem = Llu_symbfact->lsub;
-    prev_xsub_nextLvl = VInfo->xlsub_nextLvl;
-    sz_prev_mem = Llu_symbfact->szLsub;
-  } else if ( mem_type == USUB ) {
-    xsub = Llu_symbfact->xusub;
-    prev_mem = Llu_symbfact->usub;
-    prev_xsub_nextLvl = VInfo->xusub_nextLvl;
-    sz_prev_mem = Llu_symbfact->szUsub;
-  }
-#ifdef TEST_SYMB
-  printf ("Pe[%d] Expand LU mem_t %d vtxXp %d\n", 
-	  iam, mem_type, vtxXp); 
-#endif
-  /* Try to expand the size of xsub in the existing memory */
-  if (rout_type == RL_SYMB) {
-    len_texp = 0;
-    for (vtx_lid = fstVtxLvl_loc_lid; vtx_lid < fstVtx_nextLvl_lid; vtx_lid ++) {
-      nelts = xsub[vtx_lid+1] - xsub[vtx_lid];
-      if (nelts == 0) nelts = 1;
-      nelts = 2 * nelts;
-      if (nelts > Llu_symbfact->cntelt_vtcs[vtx_lid])
-	nelts = Llu_symbfact->cntelt_vtcs[vtx_lid];
-      len_texp += nelts;
-    }
-/*     len_texp = 2 * (xsub[fstVtx_nextLvl_lid] - xsub[fstVtxLvl_loc_lid]); */
-    prev_len = xsub[fstVtxLvl_loc_lid];
-    next = prev_len;
-  }
-  else {
-    nelts = xsub[vtxXp_lid+1] - xsub[vtxXp_lid];
-    if (nelts == 0) nelts = 1;
-    len_texp = xsub[fstVtx_nextLvl_lid] - xsub[vtxXp_lid+1] +
-      4 * nelts;
-    prev_len = xsub[vtxXp_lid];
-  }
-  
-  if (prev_len + len_texp >= prev_xsub_nextLvl) {
-    /* not enough memory */
-    min_new_len = prev_len + len_texp + (sz_prev_mem - prev_xsub_nextLvl);
-    if (mem_error = 
-	psymbfact_LUXpandMem (iam, n, vtxXp, next, min_new_len, 
-			      mem_type, rout_type, 0, Pslu_freeable, Llu_symbfact,
-			      VInfo, PS))
-      return (mem_error);
-    if ( mem_type == LSUB ) 
-      new_mem = Llu_symbfact->lsub;
-    else if ( mem_type == USUB ) 
-      new_mem = Llu_symbfact->usub;
-  }
-  else 
-    new_mem = prev_mem;
-
-  if (mem_type == LSUB && PS->estimLSz < (prev_len + len_texp))
-    PS->estimLSz = prev_len + len_texp;
-  if (mem_type == USUB && PS->estimUSz < (prev_len + len_texp))
-    PS->estimUSz = prev_len;
-
-  /* expand the space */
-  if (rout_type == LL_SYMB) {
-    i = xsub[vtxXp_lid] + len_texp;
-    vtx_lid = fstVtx_nextLvl_lid - 1;
-    for (; vtx_lid > vtxXp_lid; vtx_lid --) {
-      j = xsub[vtx_lid];  
-      nel = 0;
-      while (j < xsub[vtx_lid+1] && prev_mem[j] != EMPTY) {
-	nel ++; j ++;
-      }
-      j = xsub[vtx_lid] + nel - 1;  
-      k = i - (xsub[vtx_lid+1] - xsub[vtx_lid]) + nel - 1;
-      if (k+1 < i)  new_mem[k+1] = EMPTY; 
-      while (j >= xsub[vtx_lid]) {
-	new_mem[k] = prev_mem[j]; k--; j--;
-      }
-      k = i;
-      i -= (xsub[vtx_lid+1] - xsub[vtx_lid]);
-      xsub[vtx_lid+1] = k;
-    }
-    xsub[vtx_lid+1] = i;
-    k = *p_next;
-    if (k < xsub[vtx_lid+1])
-      new_mem[k] = EMPTY;
-  }
-
-  if (rout_type == RL_SYMB) {
-    *p_next -= xsub[vtxXp_lid];
-    i = xsub[fstVtxLvl_loc_lid] + len_texp;
-    vtx_lid = fstVtx_nextLvl_lid - 1;
-    for (; vtx_lid >= fstVtxLvl_loc_lid; vtx_lid --) {
-      nelts = 2 * (xsub[vtx_lid+1] - xsub[vtx_lid]);
-      if (nelts == 0) nelts = 2;
-      if (nelts > Llu_symbfact->cntelt_vtcs[vtx_lid])
-	nelts = Llu_symbfact->cntelt_vtcs[vtx_lid];
-      j = xsub[vtx_lid];  
-      nel = 0;
-      while (j < xsub[vtx_lid+1] && prev_mem[j] != EMPTY) {
-	nel ++; j ++;
-      }
-      j = xsub[vtx_lid] + nel - 1;  
-      k = i - nelts + nel - 1;
-      if (k+1 < i) new_mem[k+1] = EMPTY; 
-      while (j >= xsub[vtx_lid]) {
-	new_mem[k] = prev_mem[j]; k--; j--;
-      }
-      k = i;
-      i -= nelts;
-      xsub[vtx_lid+1] = k;
-    }
-    *p_next += xsub[vtxXp_lid];
-  }  
-
-  if (free_prev_mem && new_mem != prev_mem)
-    SUPERLU_FREE (prev_mem);
-  Llu_symbfact->no_expcp ++;
-  
-  return SUCCES_RET;
-}
-
-/*
- * Expand the data structures for L and U during the factorization.
- * Return value:   0 - successful return
- *               > 0 - number of bytes allocated when run out of space
- */
-/************************************************************************/
-int_t psymbfact_LUXpand_RL
-/************************************************************************/
-(
- int_t iam, 
- int_t n,           /* total number of columns */
- int_t vtxXp,       /* current vertex */
- int_t next,        /* number of elements currently in the factors */
- int_t len_texp,    /* length to expand */
- int_t mem_type,    /* which type of memory to expand  */
- Pslu_freeable_t *Pslu_freeable, 
- Llu_symbfact_t *Llu_symbfact,  /* modified - global LU data structures */
- vtcsInfo_symbfact_t *VInfo,
- psymbfact_stat_t *PS
- )
-{
-  int_t  *new_mem, *prev_mem, *xsub, mem_error, sz_prev_mem;
-  /* size of the memory to be copied to new store starting from the 
-     beginning/end of the memory */
-  int_t exp, prev_xsub_nextLvl, vtxXp_lid, xsub_nextLvl;
-  int_t *globToLoc, nvtcs_loc, maxNvtcsPProc;
-  int_t fstVtx_nextLvl, fstVtx_nextLvl_lid;
-  int_t i, j, k, vtx_lid, nel;
-  int_t fstVtxLvl_loc_lid, prev_len, min_new_len;
-
-#ifdef TEST_SYMB
-  printf ("Pe[%d] Expand LU_RL mem_t %d vtxXp %d\n", 
-	  iam, mem_type, vtxXp); 
-#endif
-  globToLoc = Pslu_freeable->globToLoc;
-  nvtcs_loc = VInfo->nvtcs_loc;
-  maxNvtcsPProc  = Pslu_freeable->maxNvtcsPProc;
-  fstVtx_nextLvl = VInfo->fstVtx_nextLvl;
-  
-  vtxXp_lid = LOCAL_IND( globToLoc[vtxXp] );
-  if (fstVtx_nextLvl == n)
-    fstVtx_nextLvl_lid = VInfo->nvtcs_loc;
-  else
-    fstVtx_nextLvl_lid = LOCAL_IND( globToLoc[fstVtx_nextLvl] );  
-
-  if ( mem_type == LSUB ) {
-    xsub = Llu_symbfact->xlsub;
-    prev_mem = Llu_symbfact->lsub;
-    prev_xsub_nextLvl = VInfo->xlsub_nextLvl;
-    sz_prev_mem = Llu_symbfact->szLsub;
-  } else if ( mem_type == USUB ) {
-    xsub = Llu_symbfact->xusub;
-    prev_mem = Llu_symbfact->usub;
-    prev_xsub_nextLvl = VInfo->xusub_nextLvl;
-    sz_prev_mem = Llu_symbfact->szUsub;
-  }
-  else ABORT("Tries to expand nonexisting memory type.\n");
-  
-  /* Try to expand the size of xsub in the existing memory */
-  prev_len = xsub[vtxXp_lid];
-  
-  if (prev_len + len_texp >= prev_xsub_nextLvl) {
-    /* not enough memory */
-    min_new_len = prev_len + len_texp + (sz_prev_mem - prev_xsub_nextLvl);
-    if (mem_error = 
-	psymbfact_LUXpandMem (iam, n, vtxXp, next, min_new_len, 
-			      mem_type, RL_SYMB, 0, Pslu_freeable, Llu_symbfact,
-			      VInfo, PS))
-      return (mem_error);
-    if ( mem_type == LSUB ) 
-      new_mem = Llu_symbfact->lsub;
-    else if ( mem_type == USUB ) 
-      new_mem = Llu_symbfact->usub;
-  }
-  else 
-    new_mem = prev_mem;
-
-  /* expand the space */
-  if (mem_type == LSUB && PS->estimLSz < (prev_len + len_texp))
-    PS->estimLSz = prev_len + len_texp;
-  if (mem_type == USUB && PS->estimUSz < (prev_len + len_texp))
-    PS->estimUSz = prev_len;
-
-  i = xsub[vtxXp_lid] + len_texp;
-  vtx_lid = fstVtx_nextLvl_lid - 1;
-  for (; vtx_lid > vtxXp_lid; vtx_lid --) {
-    j = xsub[vtx_lid];  
-    nel = 0;
-    while (j < xsub[vtx_lid+1] && prev_mem[j] != EMPTY) {
-      nel ++; j++;
-    }
-    j = xsub[vtx_lid] + nel - 1;  
-    k = i - Llu_symbfact->cntelt_vtcs[vtx_lid] + nel - 1;
-    if (k+1 < i) 
-      new_mem[k+1] = EMPTY; 
-    while (j >= xsub[vtx_lid]) {
-      new_mem[k] = prev_mem[j];
-      k--; j--;
-    }
-    k = i;
-    i -= Llu_symbfact->cntelt_vtcs[vtx_lid];
-    xsub[vtx_lid+1] = k;
-  }
-  xsub[vtx_lid+1] = i;
-  k = next;
-  if (k < xsub[vtx_lid+1])
-    new_mem[k] = EMPTY;
-  
-  if (new_mem != prev_mem)
-    SUPERLU_FREE (prev_mem);
-  Llu_symbfact->no_expcp ++;
-  
-  return SUCCES_RET;
-}
-
-/*
- * Expand the data structures for L and U pruned during the factorization.
- * Return value: SUCCES_RET - successful return
- *               ERROR_RET - error when run out of space
- */
-/************************************************************************/
-int_t psymbfact_prLUXpand
-/************************************************************************/
-(
- int_t iam, 
- int_t min_new_len, /* minimum new length to allocate */ 
- MemType mem_type,  /* which type of memory to expand  */
- Llu_symbfact_t *Llu_symbfact, /* modified L/U pruned structures */
- psymbfact_stat_t *PS
- )
-{
-  int_t *prev_mem, *new_mem;
-  int_t prev_len, new_len, len_tcopy_fbeg;
-  
-  if ( mem_type == LSUB_PR ) {
-    prev_len = Llu_symbfact->szLsubPr;
-    prev_mem = Llu_symbfact->lsubPr;
-    len_tcopy_fbeg = Llu_symbfact->indLsubPr;
-  } else if ( mem_type == USUB_PR ) {
-    prev_len = Llu_symbfact->szUsubPr;
-    prev_mem = Llu_symbfact->usubPr;
-    len_tcopy_fbeg = Llu_symbfact->indUsubPr;
-  } else ABORT("Tries to expand nonexisting memory type.\n");
-  
-#ifdef TEST_SYMB
-  printf ("Pe[%d] Expand prmem prev_len %d min_new_l %d len_tfbeg %d\n", 
-	  iam, prev_len, min_new_len, len_tcopy_fbeg);
-#endif
-  
-  new_mem = expand (prev_len, min_new_len, prev_mem, 
-		    &new_len, len_tcopy_fbeg, 0, PS);
-  
-  if ( !new_mem ) {
-    fprintf(stderr, "Can't expand MemType %d: \n", mem_type);
-    return (ERROR_RET);
-  }
-  
-  Llu_symbfact->no_expand_pr ++;
-  if ( mem_type == LSUB_PR ) {
-    Llu_symbfact->lsubPr  = new_mem;
-    Llu_symbfact->szLsubPr = new_len;
-  } else if ( mem_type == USUB_PR ) {
-    Llu_symbfact->usubPr  = new_mem;
-    Llu_symbfact->szUsubPr = new_len;
-  } else ABORT("Tries to expand nonexisting memory type.\n");
-  
-  SUPERLU_FREE (prev_mem);
-
-  return SUCCES_RET;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pxerbla.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pxerbla.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pxerbla.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pxerbla.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,15 +0,0 @@
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-
-#include "superlu_ddefs.h"
-
-void pxerbla(char *srname, gridinfo_t *grid, int_t info)
-{
-    printf("{%4d,%4d}: On entry to %6s, parameter number %2d had an illegal value\n",
-	   MYROW(grid->iam, grid), MYCOL(grid->iam, grid), srname, info);
-
-}
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzdistribute.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzdistribute.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzdistribute.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzdistribute.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,1043 +0,0 @@
-
-#include "superlu_zdefs.h"
-
-int_t
-zReDistribute_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct,
-                Glu_freeable_t *Glu_freeable, int_t *xsup, int_t *supno,
-                gridinfo_t *grid, int_t *colptr[], int_t *rowind[],
-                doublecomplex *a[])
-{
-/*
- * -- Distributed SuperLU routine (version 2.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * March 15, 2003
- *
- * Purpose
- * =======
- *   Re-distribute A on the 2D process mesh.
- * 
- * Arguments
- * =========
- * 
- * A      (input) SuperMatrix*
- *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
- *        A may be overwritten by diag(R)*A*diag(C)*Pc^T.
- *        The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE.
- *
- * ScalePermstruct (input) ScalePermstruct_t*
- *        The data structure to store the scaling and permutation vectors
- *        describing the transformations performed to the original matrix A.
- *
- * Glu_freeable (input) *Glu_freeable_t
- *        The global structure describing the graph of L and U.
- * 
- * grid   (input) gridinfo_t*
- *        The 2D process mesh.
- *
- * colptr (output) int*
- *
- * rowind (output) int*
- *
- * a      (output) doublecomplex*
- *
- * Return value
- * ============
- *
- */
-    NRformat_loc *Astore;
-    int_t  *perm_r; /* row permutation vector */
-    int_t  *perm_c; /* column permutation vector */
-    int_t  i, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize;
-    int_t  nnz_loc;    /* number of local nonzeros */
-    int_t  nnz_remote; /* number of remote nonzeros to be sent */
-    int_t  SendCnt; /* number of remote nonzeros to be sent */
-    int_t  RecvCnt; /* number of remote nonzeros to be sent */
-    int_t  *nnzToSend, *nnzToRecv, maxnnzToRecv;
-    int_t  *ia, *ja, **ia_send, *index, *itemp;
-    int_t  *ptr_to_send;
-    doublecomplex *aij, **aij_send, *nzval, *dtemp;
-    doublecomplex *nzval_a;
-    int    iam, it, p, procs;
-    MPI_Request *send_req;
-    MPI_Status  status;
-    
-
-    /* ------------------------------------------------------------
-       INITIALIZATION.
-       ------------------------------------------------------------*/
-    iam = grid->iam;
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter zReDistribute_A()");
-#endif
-    perm_r = ScalePermstruct->perm_r;
-    perm_c = ScalePermstruct->perm_c;
-    procs = grid->nprow * grid->npcol;
-    Astore = (NRformat_loc *) A->Store;
-    n = A->ncol;
-    m_loc = Astore->m_loc;
-    fst_row = Astore->fst_row;
-    nnzToRecv = intCalloc_dist(2*procs);
-    nnzToSend = nnzToRecv + procs;
-
-
-    /* ------------------------------------------------------------
-       COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
-       THEN ALLOCATE SPACE.
-       THIS ACCOUNTS FOR THE FIRST PASS OF A.
-       ------------------------------------------------------------*/
-    for (i = 0; i < m_loc; ++i) {
-        for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
-  	    irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
-	    jcol = Astore->colind[j];
-	    gbi = BlockNum( irow );
-	    gbj = BlockNum( jcol );
-	    p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
-	    ++nnzToSend[p]; 
-	}
-    }
-
-    /* All-to-all communication */
-    MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t,
-		  grid->comm);
-
-    maxnnzToRecv = 0;
-    nnz_loc = SendCnt = RecvCnt = 0;
-
-    for (p = 0; p < procs; ++p) {
-	if ( p != iam ) {
-	    SendCnt += nnzToSend[p];
-	    RecvCnt += nnzToRecv[p];
-	    maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv );
-	} else {
-	    nnz_loc += nnzToRecv[p];
-	    /*assert(nnzToSend[p] == nnzToRecv[p]);*/
-	}
-    }
-    k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */
-
-    /* Allocate space for storing the triplets after redistribution. */
-    if ( k ) { /* count can be zero. */
-        if ( !(ia = intMalloc_dist(2*k)) )
-            ABORT("Malloc fails for ia[].");
-        if ( !(aij = doublecomplexMalloc_dist(k)) )
-            ABORT("Malloc fails for aij[].");
-    }
-    ja = ia + k;
-
-    /* Allocate temporary storage for sending/receiving the A triplets. */
-    if ( procs > 1 ) {
-      if ( !(send_req = (MPI_Request *)
-	     SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) )
-	ABORT("Malloc fails for send_req[].");
-      if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) )
-        ABORT("Malloc fails for ia_send[].");
-      if ( !(aij_send = (doublecomplex **)SUPERLU_MALLOC(procs*sizeof(doublecomplex*))) )
-        ABORT("Malloc fails for aij_send[].");
-      if ( SendCnt ) { /* count can be zero */
-          if ( !(index = intMalloc_dist(2*SendCnt)) )
-              ABORT("Malloc fails for index[].");
-          if ( !(nzval = doublecomplexMalloc_dist(SendCnt)) )
-              ABORT("Malloc fails for nzval[].");
-      }
-      if ( !(ptr_to_send = intCalloc_dist(procs)) )
-        ABORT("Malloc fails for ptr_to_send[].");
-      if ( maxnnzToRecv ) { /* count can be zero */
-          if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) )
-              ABORT("Malloc fails for itemp[].");
-          if ( !(dtemp = doublecomplexMalloc_dist(maxnnzToRecv)) )
-              ABORT("Malloc fails for dtemp[].");
-      }
-
-      for (i = 0, j = 0, p = 0; p < procs; ++p) {
-          if ( p != iam ) {
-	      ia_send[p] = &index[i];
-	      i += 2 * nnzToSend[p]; /* ia/ja indices alternate */
-	      aij_send[p] = &nzval[j];
-	      j += nnzToSend[p];
-	  }
-      }
-    } /* if procs > 1 */
-      
-    if ( !(*colptr = intCalloc_dist(n+1)) )
-        ABORT("Malloc fails for *colptr[].");
-
-    /* ------------------------------------------------------------
-       LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND.
-       THIS ACCOUNTS FOR THE SECOND PASS OF A.
-       ------------------------------------------------------------*/
-    nnz_loc = 0; /* Reset the local nonzero count. */
-    nzval_a = Astore->nzval;
-    for (i = 0; i < m_loc; ++i) {
-        for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
-  	    irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
-	    jcol = Astore->colind[j];
-	    gbi = BlockNum( irow );
-	    gbj = BlockNum( jcol );
-	    p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
-
-	    if ( p != iam ) { /* remote */
-	        k = ptr_to_send[p];
-	        ia_send[p][k] = irow;
-	        ia_send[p][k + nnzToSend[p]] = jcol;
-		aij_send[p][k] = nzval_a[j];
-		++ptr_to_send[p]; 
-	    } else {          /* local */
-	        ia[nnz_loc] = irow;
-	        ja[nnz_loc] = jcol;
-		aij[nnz_loc] = nzval_a[j];
-		++nnz_loc;
-		++(*colptr)[jcol]; /* Count nonzeros in each column */
-	    }
-	}
-    }
-
-    /* ------------------------------------------------------------
-       PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION.
-       NOTE: Can possibly use MPI_Alltoallv.
-       ------------------------------------------------------------*/
-    for (p = 0; p < procs; ++p) {
-        if ( p != iam ) {
-	    it = 2*nnzToSend[p];
-	    MPI_Isend( ia_send[p], it, mpi_int_t,
-		       p, iam, grid->comm, &send_req[p] );
-	    it = nnzToSend[p];
-	    MPI_Isend( aij_send[p], it, SuperLU_MPI_DOUBLE_COMPLEX,
-	               p, iam+procs, grid->comm, &send_req[procs+p] ); 
-	}
-    }
-
-    for (p = 0; p < procs; ++p) {
-        if ( p != iam ) {
-	    it = 2*nnzToRecv[p];
-	    MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); 
-	    it = nnzToRecv[p];
-            MPI_Recv( dtemp, it, SuperLU_MPI_DOUBLE_COMPLEX, p, p+procs,
-		      grid->comm, &status );
-	    for (i = 0; i < nnzToRecv[p]; ++i) {
-	        ia[nnz_loc] = itemp[i];
-		jcol = itemp[i + nnzToRecv[p]];
-		/*assert(jcol<n);*/
-	        ja[nnz_loc] = jcol;
-		aij[nnz_loc] = dtemp[i];
-		++nnz_loc;
-		++(*colptr)[jcol]; /* Count nonzeros in each column */ 
-	    }
-	}
-    }
-
-    for (p = 0; p < procs; ++p) {
-        if ( p != iam ) {
-	    MPI_Wait( &send_req[p], &status);
-	    MPI_Wait( &send_req[procs+p], &status);
-	}
-    }
-
-    /* ------------------------------------------------------------
-       DEALLOCATE TEMPORARY STORAGE
-       ------------------------------------------------------------*/
-
-    SUPERLU_FREE(nnzToRecv);
-
-    if ( procs > 1 ) {
-	SUPERLU_FREE(send_req);
-	SUPERLU_FREE(ia_send);
-	SUPERLU_FREE(aij_send);
-	if ( SendCnt ) {
-            SUPERLU_FREE(index);
-            SUPERLU_FREE(nzval);
-        }
-	SUPERLU_FREE(ptr_to_send);
-        if ( maxnnzToRecv ) {
-            SUPERLU_FREE(itemp);
-            SUPERLU_FREE(dtemp);
-        }
-    }
-
-    /* ------------------------------------------------------------
-       CONVERT THE TRIPLET FORMAT INTO THE CCS FORMAT.
-       ------------------------------------------------------------*/
-    if ( nnz_loc ) { /* nnz_loc can be zero */
-        if ( !(*rowind = intMalloc_dist(nnz_loc)) )
-            ABORT("Malloc fails for *rowind[].");
-        if ( !(*a = doublecomplexMalloc_dist(nnz_loc)) )
-            ABORT("Malloc fails for *a[].");
-    }
-
-    /* Initialize the array of column pointers */
-    k = 0;
-    jsize = (*colptr)[0];
-    (*colptr)[0] = 0;
-    for (j = 1; j < n; ++j) {
-	k += jsize;
-	jsize = (*colptr)[j];
-	(*colptr)[j] = k;
-    }
-    
-    /* Copy the triplets into the column oriented storage */
-    for (i = 0; i < nnz_loc; ++i) {
-	j = ja[i];
-	k = (*colptr)[j];
-	(*rowind)[k] = ia[i];
-	(*a)[k] = aij[i];
-	++(*colptr)[j];
-    }
-
-    /* Reset the column pointers to the beginning of each column */
-    for (j = n; j > 0; --j) (*colptr)[j] = (*colptr)[j-1];
-    (*colptr)[0] = 0;
-
-    if ( nnz_loc ) {
-        SUPERLU_FREE(ia);
-        SUPERLU_FREE(aij);
-    }
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit zReDistribute_A()");
-#endif
- 
-} /* zReDistribute_A */
-
-int_t
-pzdistribute(fact_t fact, int_t n, SuperMatrix *A,
-	     ScalePermstruct_t *ScalePermstruct,
-	     Glu_freeable_t *Glu_freeable, LUstruct_t *LUstruct,
-	     gridinfo_t *grid)
-/*
- * -- Distributed SuperLU routine (version 2.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * March 15, 2003
- *
- *
- * Purpose
- * =======
- *   Distribute the matrix onto the 2D process mesh.
- * 
- * Arguments
- * =========
- * 
- * fact (input) fact_t
- *        Specifies whether or not the L and U structures will be re-used.
- *        = SamePattern_SameRowPerm: L and U structures are input, and
- *                                   unchanged on exit.
- *        = DOFACT or SamePattern: L and U structures are computed and output.
- *
- * n      (input) int
- *        Dimension of the matrix.
- *
- * A      (input) SuperMatrix*
- *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
- *        A may be overwritten by diag(R)*A*diag(C)*Pc^T. The type of A can be:
- *        Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE.
- *
- * ScalePermstruct (input) ScalePermstruct_t*
- *        The data structure to store the scaling and permutation vectors
- *        describing the transformations performed to the original matrix A.
- *
- * Glu_freeable (input) *Glu_freeable_t
- *        The global structure describing the graph of L and U.
- * 
- * LUstruct (input) LUstruct_t*
- *        Data structures for L and U factors.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh.
- *
- * Return value
- * ============
- *   > 0, working storage required (in bytes).
- *
- */
-{
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-    int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, jb, jj, k, 
-          len, len1, nsupc;
-    int_t ljb;  /* local block column number */
-    int_t nrbl; /* number of L blocks in current block column */
-    int_t nrbu; /* number of U blocks in current block column */
-    int_t gb;   /* global block number; 0 < gb <= nsuper */
-    int_t lb;   /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
-    int iam, jbrow, kcol, mycol, myrow, pc, pr;
-    int_t mybufmax[NBUFFERS];
-    NRformat_loc *Astore;
-    doublecomplex *a;
-    int_t *asub, *xa;
-    int_t *xsup = Glu_persist->xsup;    /* supernode and column mapping */
-    int_t *supno = Glu_persist->supno;   
-    int_t *lsub, *xlsub, *usub, *xusub;
-    int_t nsupers;
-    int_t next_lind;      /* next available position in index[*] */
-    int_t next_lval;      /* next available position in nzval[*] */
-    int_t *index;         /* indices consist of headers and row subscripts */
-    doublecomplex *lusup, *uval; /* nonzero values in L and U */
-    doublecomplex **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
-    int_t  **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
-    doublecomplex **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
-    int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
-
-    /*-- Counts to be used in factorization. --*/
-    int_t  *ToRecv, *ToSendD, **ToSendR;
-
-    /*-- Counts to be used in lower triangular solve. --*/
-    int_t  *fmod;          /* Modification count for L-solve.        */
-    int_t  **fsendx_plist; /* Column process list to send down Xk.   */
-    int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
-    int_t  nfsendx = 0;    /* Number of Xk I will send               */
-    int_t  kseen;
-
-    /*-- Counts to be used in upper triangular solve. --*/
-    int_t  *bmod;          /* Modification count for U-solve.        */
-    int_t  **bsendx_plist; /* Column process list to send down Xk.   */
-    int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
-    int_t  nbsendx = 0;    /* Number of Xk I will send               */
-    int_t  *ilsum;         /* starting position of each supernode in 
-			      the full array (local)                 */
-
-    /*-- Auxiliary arrays; freed on return --*/
-    int_t *rb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
-    int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr)             */
-    int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr)      */
-    int_t *Urb_fstnz;  /* # of fstnz in a block row; size ceil(NSUPERS/Pr)  */
-    int_t *Ucbs;       /* number of column blocks in a block row            */
-    int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr)             */
-    int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr)        */
-    int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr)      */
-    int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr)      */
-    doublecomplex *dense, *dense_col; /* SPA */
-    doublecomplex zero = {0.0, 0.0};
-    int_t ldaspa;     /* LDA of SPA */
-    int_t mem_use = 0, iword, dword;
-
-#if ( PRNTlevel>=1 )
-    int_t nLblocks = 0, nUblocks = 0;
-#endif
-#if ( PROFlevel>=1 ) 
-    double t, t_u, t_l;
-    int_t u_blks;
-#endif
-
-    /* Initialization. */
-    iam = grid->iam;
-    myrow = MYROW( iam, grid );
-    mycol = MYCOL( iam, grid );
-    for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0;
-    nsupers  = supno[n-1] + 1;
-    Astore   = (NRformat_loc *) A->Store;
-
-#if ( PRNTlevel>=1 )
-    iword = sizeof(int_t);
-    dword = sizeof(doublecomplex);
-#endif
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pzdistribute()");
-#endif
-#if ( PROFlevel>=1 )
-    t = SuperLU_timer_();
-#endif
-
-    zReDistribute_A(A, ScalePermstruct, Glu_freeable, xsup, supno,
-		      grid, &xa, &asub, &a);
-
-#if ( PROFlevel>=1 )
-    t = SuperLU_timer_() - t;
-    if ( !iam ) printf("--------\n"
-		       ".. Phase 1 - ReDistribute_A time: %.2f\t\n", t);
-#endif
-
-    if ( fact == SamePattern_SameRowPerm ) {
-
-#if ( PROFlevel>=1 )
-	t_l = t_u = 0; u_blks = 0;
-#endif
-	/* We can propagate the new values of A into the existing
-	   L and U data structures.            */
-	ilsum = Llu->ilsum;
-	ldaspa = Llu->ldalsum;
-	if ( !(dense = doublecomplexCalloc_dist(ldaspa * sp_ienv_dist(3))) )
-	    ABORT("Calloc fails for SPA dense[].");
-	nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */
-	if ( !(Urb_length = intCalloc_dist(nrbu)) )
-	    ABORT("Calloc fails for Urb_length[].");
-	if ( !(Urb_indptr = intMalloc_dist(nrbu)) )
-	    ABORT("Malloc fails for Urb_indptr[].");
-	Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-	Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
-	Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
-	Unzval_br_ptr = Llu->Unzval_br_ptr;
-#if ( PRNTlevel>=1 )
-	mem_use += 2*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword;
-#endif
-#if ( PROFlevel>=1 )
-	t = SuperLU_timer_();
-#endif
-
-	/* Initialize Uval to zero. */
-	for (lb = 0; lb < nrbu; ++lb) {
-	    Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
-	    index = Ufstnz_br_ptr[lb];
-	    if ( index ) {
-		uval = Unzval_br_ptr[lb];
-		len = index[1];
-		for (i = 0; i < len; ++i) uval[i] = zero;
-	    } /* if index != NULL */
-	} /* for lb ... */
-
-	for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */
-	    pc = PCOL( jb, grid );
-	    if ( mycol == pc ) { /* Block column jb in my process column */
-		fsupc = FstBlockC( jb );
-		nsupc = SuperSize( jb );
-
- 		/* Scatter A into SPA (for L), or into U directly. */
-		for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) {
-		    for (i = xa[j]; i < xa[j+1]; ++i) {
-			irow = asub[i];
-			gb = BlockNum( irow );
-			if ( myrow == PROW( gb, grid ) ) {
-			    lb = LBi( gb, grid );
- 			    if ( gb < jb ) { /* in U */
- 				index = Ufstnz_br_ptr[lb];
- 				uval = Unzval_br_ptr[lb];
- 				while (  (k = index[Urb_indptr[lb]]) < jb ) {
- 				    /* Skip nonzero values in this block */
- 				    Urb_length[lb] += index[Urb_indptr[lb]+1];
- 				    /* Move pointer to the next block */
- 				    Urb_indptr[lb] += UB_DESCRIPTOR
- 					+ SuperSize( k );
- 				}
- 				/*assert(k == jb);*/
- 				/* start fstnz */
- 				istart = Urb_indptr[lb] + UB_DESCRIPTOR;
- 				len = Urb_length[lb];
- 				fsupc1 = FstBlockC( gb+1 );
- 				k = j - fsupc;
- 				/* Sum the lengths of the leading columns */
- 				for (jj = 0; jj < k; ++jj)
-				    len += fsupc1 - index[istart++];
-				/*assert(irow>=index[istart]);*/
-				uval[len + irow - index[istart]] = a[i];
-			    } else { /* in L; put in SPA first */
-  				irow = ilsum[lb] + irow - FstBlockC( gb );
-  				dense_col[irow] = a[i];
-  			    }
-  			}
-		    } /* for i ... */
-  		    dense_col += ldaspa;
-		} /* for j ... */
-
-#if ( PROFlevel>=1 )
-		t_u += SuperLU_timer_() - t;
-		t = SuperLU_timer_();
-#endif
-
-		/* Gather the values of A from SPA into Lnzval[]. */
-		ljb = LBj( jb, grid ); /* Local block number */
-		index = Lrowind_bc_ptr[ljb];
-		if ( index ) {
-		    nrbl = index[0];   /* Number of row blocks. */
-		    len = index[1];    /* LDA of lusup[]. */
-		    lusup = Lnzval_bc_ptr[ljb];
-		    next_lind = BC_HEADER;
-		    next_lval = 0;
-		    for (jj = 0; jj < nrbl; ++jj) {
-			gb = index[next_lind++];
-			len1 = index[next_lind++]; /* Rows in the block. */
-			lb = LBi( gb, grid );
-			for (bnnz = 0; bnnz < len1; ++bnnz) {
-			    irow = index[next_lind++]; /* Global index. */
-			    irow = ilsum[lb] + irow - FstBlockC( gb );
-			    k = next_lval++;
-			    for (j = 0, dense_col = dense; j < nsupc; ++j) {
-				lusup[k] = dense_col[irow];
-				dense_col[irow] = zero;
-				k += len;
-				dense_col += ldaspa;
-			    }
-			} /* for bnnz ... */
-		    } /* for jj ... */
-		} /* if index ... */
-#if ( PROFlevel>=1 )
-		t_l += SuperLU_timer_() - t;
-#endif
-	    } /* if mycol == pc */
-	} /* for jb ... */
-
-	SUPERLU_FREE(dense);
-	SUPERLU_FREE(Urb_length);
-	SUPERLU_FREE(Urb_indptr);
-#if ( PROFlevel>=1 )
-	if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n",
-			   t_l, t_u, u_blks, nrbu);
-#endif
-
-    } else {
-        /* ------------------------------------------------------------
-	   FIRST TIME CREATING THE L AND U DATA STRUCTURES.
-	   ------------------------------------------------------------*/
-
-#if ( PROFlevel>=1 )
-	t_l = t_u = 0; u_blks = 0;
-#endif
-	/* We first need to set up the L and U data structures and then
-	 * propagate the values of A into them.
-	 */
-	lsub = Glu_freeable->lsub;    /* compressed L subscripts */
-	xlsub = Glu_freeable->xlsub;
-	usub = Glu_freeable->usub;    /* compressed U subscripts */
-	xusub = Glu_freeable->xusub;
-    
-	if ( !(ToRecv = intCalloc_dist(nsupers)) )
-	    ABORT("Calloc fails for ToRecv[].");
-
-	k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */
-	if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
-	    ABORT("Malloc fails for ToSendR[].");
-	j = k * grid->npcol;
-	if ( !(index = intMalloc_dist(j)) )
-	    ABORT("Malloc fails for index[].");
-#if ( PRNTlevel>=1 )
-	mem_use += k*sizeof(int_t*) + (j + nsupers)*iword;
-#endif
-	for (i = 0; i < j; ++i) index[i] = EMPTY;
-	for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j];
-	k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
-
-	/* Pointers to the beginning of each block row of U. */
-	if ( !(Unzval_br_ptr = 
-              (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) )
-	    ABORT("Malloc fails for Unzval_br_ptr[].");
-	if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
-	    ABORT("Malloc fails for Ufstnz_br_ptr[].");
-	
-	if ( !(ToSendD = intCalloc_dist(k)) )
-	    ABORT("Malloc fails for ToSendD[].");
-	if ( !(ilsum = intMalloc_dist(k+1)) )
-	    ABORT("Malloc fails for ilsum[].");
-
-	/* Auxiliary arrays used to set up U block data structures.
-	   They are freed on return. */
-	if ( !(rb_marker = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for rb_marker[].");
-	if ( !(Urb_length = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for Urb_length[].");
-	if ( !(Urb_indptr = intMalloc_dist(k)) )
-	    ABORT("Malloc fails for Urb_indptr[].");
-	if ( !(Urb_fstnz = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for Urb_fstnz[].");
-	if ( !(Ucbs = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for Ucbs[].");
-#if ( PRNTlevel>=1 )	
-	mem_use += 2*k*sizeof(int_t*) + (7*k+1)*iword;
-#endif
-	/* Compute ldaspa and ilsum[]. */
-	ldaspa = 0;
-	ilsum[0] = 0;
-	for (gb = 0; gb < nsupers; ++gb) {
-	    if ( myrow == PROW( gb, grid ) ) {
-		i = SuperSize( gb );
-		ldaspa += i;
-		lb = LBi( gb, grid );
-		ilsum[lb + 1] = ilsum[lb] + i;
-	    }
-	}
-	
-#if ( PROFlevel>=1 )
-	t = SuperLU_timer_();
-#endif
-	/* ------------------------------------------------------------
-	   COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U.
-	   THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U).
-	   ------------------------------------------------------------*/
-	
-	/* Loop through each supernode column. */
-	for (jb = 0; jb < nsupers; ++jb) {
-	    pc = PCOL( jb, grid );
-	    fsupc = FstBlockC( jb );
-	    nsupc = SuperSize( jb );
-	    /* Loop through each column in the block. */
-	    for (j = fsupc; j < fsupc + nsupc; ++j) {
-		/* usub[*] contains only "first nonzero" in each segment. */
-		for (i = xusub[j]; i < xusub[j+1]; ++i) {
-		    irow = usub[i]; /* First nonzero of the segment. */
-		    gb = BlockNum( irow );
-		    kcol = PCOL( gb, grid );
-		    ljb = LBj( gb, grid );
-		    if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES;
-		    pr = PROW( gb, grid );
-		    lb = LBi( gb, grid );
-		    if ( mycol == pc ) {
-			if  ( myrow == pr ) {
-			    ToSendD[lb] = YES;
-			    /* Count nonzeros in entire block row. */
-			    Urb_length[lb] += FstBlockC( gb+1 ) - irow;
-			    if (rb_marker[lb] <= jb) {/* First see the block */
-				rb_marker[lb] = jb + 1;
-				Urb_fstnz[lb] += nsupc;
-				++Ucbs[lb]; /* Number of column blocks
-					       in block row lb. */
-#if ( PRNTlevel>=1 )
-				++nUblocks;
-#endif
-			    }
-			    ToRecv[gb] = 1;
-			} else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */
-		    }
-		} /* for i ... */
-	    } /* for j ... */
-	} /* for jb ... */
-	
-	/* Set up the initial pointers for each block row in U. */
-	nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-	for (lb = 0; lb < nrbu; ++lb) {
-	    len = Urb_length[lb];
-	    rb_marker[lb] = 0; /* Reset block marker. */
-	    if ( len ) {
-		/* Add room for descriptors */
-		len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR;
-		if ( !(index = intMalloc_dist(len1+1)) )
-		    ABORT("Malloc fails for Uindex[].");
-		Ufstnz_br_ptr[lb] = index;
-		if ( !(Unzval_br_ptr[lb] = doublecomplexMalloc_dist(len)) )
-		    ABORT("Malloc fails for Unzval_br_ptr[*][].");
-		mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
-		mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
-		index[0] = Ucbs[lb]; /* Number of column blocks */
-		index[1] = len;      /* Total length of nzval[] */
-		index[2] = len1;     /* Total length of index[] */
-		index[len1] = -1;    /* End marker */
-	    } else {
-		Ufstnz_br_ptr[lb] = NULL;
-		Unzval_br_ptr[lb] = NULL;
-	    }
-	    Urb_length[lb] = 0; /* Reset block length. */
-	    Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
- 	    Urb_fstnz[lb] = BR_HEADER;
-	} /* for lb ... */
-
-	SUPERLU_FREE(Ucbs);
-
-#if ( PROFlevel>=1 )
-	t = SuperLU_timer_() - t;
-	if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t);
-#endif
-#if ( PRNTlevel>=1 )
-        mem_use -= 2*k * iword;
-#endif
-	/* Auxiliary arrays used to set up L block data structures.
-	   They are freed on return.
-	   k is the number of local row blocks.   */
-	if ( !(Lrb_length = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for Lrb_length[].");
-	if ( !(Lrb_number = intMalloc_dist(k)) )
-	    ABORT("Malloc fails for Lrb_number[].");
-	if ( !(Lrb_indptr = intMalloc_dist(k)) )
-	    ABORT("Malloc fails for Lrb_indptr[].");
-	if ( !(Lrb_valptr = intMalloc_dist(k)) )
-	    ABORT("Malloc fails for Lrb_valptr[].");
-	if ( !(dense = doublecomplexCalloc_dist(ldaspa * sp_ienv_dist(3))) )
-	    ABORT("Calloc fails for SPA dense[].");
-
-	/* These counts will be used for triangular solves. */
-	if ( !(fmod = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for fmod[].");
-	if ( !(bmod = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for bmod[].");
-	/* ------------------------------------------------ */
-#if ( PRNTlevel>=1 )	
-	mem_use += 6*k*iword + ldaspa*sp_ienv_dist(3)*dword;
-#endif
-	k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
-
-	/* Pointers to the beginning of each block column of L. */
-	if ( !(Lnzval_bc_ptr = 
-              (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) )
-	    ABORT("Malloc fails for Lnzval_bc_ptr[].");
-	if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
-	    ABORT("Malloc fails for Lrowind_bc_ptr[].");
-	Lrowind_bc_ptr[k-1] = NULL;
-
-	/* These lists of processes will be used for triangular solves. */
-	if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
-	    ABORT("Malloc fails for fsendx_plist[].");
-	len = k * grid->nprow;
-	if ( !(index = intMalloc_dist(len)) )
-	    ABORT("Malloc fails for fsendx_plist[0]");
-	for (i = 0; i < len; ++i) index[i] = EMPTY;
-	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
-	    fsendx_plist[i] = &index[j];
-	if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
-	    ABORT("Malloc fails for bsendx_plist[].");
-	if ( !(index = intMalloc_dist(len)) )
-	    ABORT("Malloc fails for bsendx_plist[0]");
-	for (i = 0; i < len; ++i) index[i] = EMPTY;
-	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
-	    bsendx_plist[i] = &index[j];
-	/* -------------------------------------------------------------- */
-#if ( PRNTlevel>=1 )
-	mem_use += 4*k*sizeof(int_t*) + 2*len*iword;
-#endif
-
-	/*------------------------------------------------------------
-	  PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
-	  THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
-	  ------------------------------------------------------------*/
-
-	for (jb = 0; jb < nsupers; ++jb) {
-	    pc = PCOL( jb, grid );
-	    if ( mycol == pc ) { /* Block column jb in my process column */
-		fsupc = FstBlockC( jb );
-		nsupc = SuperSize( jb );
-		ljb = LBj( jb, grid ); /* Local block number */
-		
-		/* Scatter A into SPA. */
-		for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) {
-		    for (i = xa[j]; i < xa[j+1]; ++i) {
-			irow = asub[i];
-			gb = BlockNum( irow );
-			if ( myrow == PROW( gb, grid ) ) {
-			    lb = LBi( gb, grid );
-			    irow = ilsum[lb] + irow - FstBlockC( gb );
-			    dense_col[irow] = a[i];
-			}
-		    }
-		    dense_col += ldaspa;
-		} /* for j ... */
-
-		jbrow = PROW( jb, grid );
-
-		/*------------------------------------------------
-		 * SET UP U BLOCKS.
-		 *------------------------------------------------*/
-#if ( PROFlevel>=1 )
-		t = SuperLU_timer_();
-#endif
-		kseen = 0;
-		dense_col = dense;
-		/* Loop through each column in the block column. */
-		for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
-		    istart = xusub[j];
-		    /* NOTE: Only the first nonzero index of the segment
-		       is stored in usub[]. */
-		    for (i = istart; i < xusub[j+1]; ++i) {
-			irow = usub[i]; /* First nonzero in the segment. */
-			gb = BlockNum( irow );
-			pr = PROW( gb, grid );
-			if ( pr != jbrow &&
-			     myrow == jbrow &&  /* diag. proc. owning jb */
-			     bsendx_plist[ljb][pr] == EMPTY ) {
-			    bsendx_plist[ljb][pr] = YES;
-			    ++nbsendx;
-                        }
-			if ( myrow == pr ) {
-			    lb = LBi( gb, grid ); /* Local block number */
-			    index = Ufstnz_br_ptr[lb];
-			    uval = Unzval_br_ptr[lb];
-			    fsupc1 = FstBlockC( gb+1 );
-			    if (rb_marker[lb] <= jb) { /* First time see 
-							  the block       */
-				rb_marker[lb] = jb + 1;
-				Urb_indptr[lb] = Urb_fstnz[lb];;
-				index[Urb_indptr[lb]] = jb; /* Descriptor */
-				Urb_indptr[lb] += UB_DESCRIPTOR;
-				/* Record the first location in index[] of the
-				   next block */
-				Urb_fstnz[lb] = Urb_indptr[lb] + nsupc;
-				len = Urb_indptr[lb];/* Start fstnz in index */
-				index[len-1] = 0;
-				for (k = 0; k < nsupc; ++k)
-				    index[len+k] = fsupc1;
-				if ( gb != jb )/* Exclude diagonal block. */
-				    ++bmod[lb];/* Mod. count for back solve */
-				if ( kseen == 0 && myrow != jbrow ) {
-				    ++nbrecvx;
-				    kseen = 1;
-				}
-			    } else { /* Already saw the block */
-				len = Urb_indptr[lb];/* Start fstnz in index */
-			    }
-			    jj = j - fsupc;
-			    index[len+jj] = irow;
-			    /* Load the numerical values */
-			    k = fsupc1 - irow; /* No. of nonzeros in segment */
-			    index[len-1] += k; /* Increment block length in
-						  Descriptor */
-			    irow = ilsum[lb] + irow - FstBlockC( gb );
-			    for (ii = 0; ii < k; ++ii) {
-				uval[Urb_length[lb]++] = dense_col[irow + ii];
-				dense_col[irow + ii] = zero;
-			    }
-			} /* if myrow == pr ... */
-		    } /* for i ... */
-                    dense_col += ldaspa;
-		} /* for j ... */
-
-#if ( PROFlevel>=1 )
-		t_u += SuperLU_timer_() - t;
-		t = SuperLU_timer_();
-#endif		
-		/*------------------------------------------------
-		 * SET UP L BLOCKS.
-		 *------------------------------------------------*/
-
-		/* Count number of blocks and length of each block. */
-		nrbl = 0;
-		len = 0; /* Number of row subscripts I own. */
-		kseen = 0;
-		istart = xlsub[fsupc];
-		for (i = istart; i < xlsub[fsupc+1]; ++i) {
-		    irow = lsub[i];
-		    gb = BlockNum( irow ); /* Global block number */
-		    pr = PROW( gb, grid ); /* Process row owning this block */
-		    if ( pr != jbrow &&
-			 myrow == jbrow &&  /* diag. proc. owning jb */
-			 fsendx_plist[ljb][pr] == EMPTY /* first time */ ) {
-			fsendx_plist[ljb][pr] = YES;
-			++nfsendx;
-                    }
-		    if ( myrow == pr ) {
-			lb = LBi( gb, grid );  /* Local block number */
-			if (rb_marker[lb] <= jb) { /* First see this block */
-			    rb_marker[lb] = jb + 1;
-			    Lrb_length[lb] = 1;
-			    Lrb_number[nrbl++] = gb;
-			    if ( gb != jb ) /* Exclude diagonal block. */
-				++fmod[lb]; /* Mod. count for forward solve */
-			    if ( kseen == 0 && myrow != jbrow ) {
-				++nfrecvx;
-				kseen = 1;
-			    }
-#if ( PRNTlevel>=1 )
-			    ++nLblocks;
-#endif
-			} else {
-			    ++Lrb_length[lb];
-			}
-			++len;
-		    }
-		} /* for i ... */
-
-		if ( nrbl ) { /* Do not ensure the blocks are sorted! */
-		    /* Set up the initial pointers for each block in 
-		       index[] and nzval[]. */
-		    /* Add room for descriptors */
-		    len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
-		    if ( !(index = intMalloc_dist(len1)) ) 
-			ABORT("Malloc fails for index[]");
-		    Lrowind_bc_ptr[ljb] = index;
-		    if (!(Lnzval_bc_ptr[ljb] = 
-                         doublecomplexMalloc_dist(len*nsupc))) {
-			fprintf(stderr, "col block %d ", jb);
-			ABORT("Malloc fails for Lnzval_bc_ptr[*][]");
-		    }
-		    mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
-		    mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
-		    mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
-		    index[0] = nrbl;  /* Number of row blocks */
-		    index[1] = len;   /* LDA of the nzval[] */
-		    next_lind = BC_HEADER;
-		    next_lval = 0;
-		    for (k = 0; k < nrbl; ++k) {
-			gb = Lrb_number[k];
-			lb = LBi( gb, grid );
-			len = Lrb_length[lb];
-			Lrb_length[lb] = 0;  /* Reset vector of block length */
-			index[next_lind++] = gb; /* Descriptor */
-			index[next_lind++] = len; 
-			Lrb_indptr[lb] = next_lind;
-			Lrb_valptr[lb] = next_lval;
-			next_lind += len;
-			next_lval += len;
-		    }
-		    /* Propagate the compressed row subscripts to Lindex[],
-                       and the initial values of A from SPA into Lnzval[]. */
-		    lusup = Lnzval_bc_ptr[ljb];
-		    len = index[1];  /* LDA of lusup[] */
-		    for (i = istart; i < xlsub[fsupc+1]; ++i) {
-			irow = lsub[i];
-			gb = BlockNum( irow );
-			if ( myrow == PROW( gb, grid ) ) {
-			    lb = LBi( gb, grid );
-			    k = Lrb_indptr[lb]++; /* Random access a block */
-			    index[k] = irow;
-			    k = Lrb_valptr[lb]++;
-			    irow = ilsum[lb] + irow - FstBlockC( gb );
-			    for (j = 0, dense_col = dense; j < nsupc; ++j) {
-				lusup[k] = dense_col[irow];
-				dense_col[irow] = zero;
-				k += len;
-				dense_col += ldaspa;
-			    }
-			}
-		    } /* for i ... */
-		} else {
-		    Lrowind_bc_ptr[ljb] = NULL;
-		    Lnzval_bc_ptr[ljb] = NULL;
-		} /* if nrbl ... */
-#if ( PROFlevel>=1 )
-		t_l += SuperLU_timer_() - t;
-#endif
-	    } /* if mycol == pc */
-
-	} /* for jb ... */
-
-	Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
-	Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
-	Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
-	Llu->Unzval_br_ptr = Unzval_br_ptr;
-	Llu->ToRecv = ToRecv;
-	Llu->ToSendD = ToSendD;
-	Llu->ToSendR = ToSendR;
-	Llu->fmod = fmod;
-	Llu->fsendx_plist = fsendx_plist;
-	Llu->nfrecvx = nfrecvx;
-	Llu->nfsendx = nfsendx;
-	Llu->bmod = bmod;
-	Llu->bsendx_plist = bsendx_plist;
-	Llu->nbrecvx = nbrecvx;
-	Llu->nbsendx = nbsendx;
-	Llu->ilsum = ilsum;
-	Llu->ldalsum = ldaspa;
-	
-#if ( PRNTlevel>=1 )
-	if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n",
-			   nLblocks, nUblocks);
-#endif
-
-	SUPERLU_FREE(rb_marker);
-	SUPERLU_FREE(Urb_fstnz);
-	SUPERLU_FREE(Urb_length);
-	SUPERLU_FREE(Urb_indptr);
-	SUPERLU_FREE(Lrb_length);
-	SUPERLU_FREE(Lrb_number);
-	SUPERLU_FREE(Lrb_indptr);
-	SUPERLU_FREE(Lrb_valptr);
-	SUPERLU_FREE(dense);
-
-	/* Find the maximum buffer size. */
-	MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, 
-		      MPI_MAX, grid->comm);
-
-#if ( PROFlevel>=1 )
-	if ( !iam ) printf(".. 1st distribute time:\n "
-			   "\tL\t%.2f\n\tU\t%.2f\n"
-			   "\tu_blks %d\tnrbu %d\n--------\n",
-  			   t_l, t_u, u_blks, nrbu);
-#endif
-
-    } /* else fact != SamePattern_SameRowPerm */
-
-    if ( xa[A->ncol] > 0 ) { /* may not have any entries on this process. */
-        SUPERLU_FREE(asub);
-        SUPERLU_FREE(a);
-    }
-    SUPERLU_FREE(xa);
-
-#if ( DEBUGlevel>=1 )
-    /* Memory allocated but not freed:
-       ilsum, fmod, fsendx_plist, bmod, bsendx_plist  */
-    CHECK_MALLOC(iam, "Exit pzdistribute()");
-#endif
-    
-    return (mem_use);
-} /* PZDISTRIBUTE */
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsequ.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsequ.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsequ.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsequ.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,228 +0,0 @@
-
-/*
- * File name:	pzgsequ.c
- * History:     Modified from LAPACK routine ZGEEQU
- */
-#include <math.h>
-#include "superlu_zdefs.h"
-
-void
-pzgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd,
-	double *colcnd, double *amax, int_t *info, gridinfo_t *grid)
-{
-/*    
-    Purpose   
-    =======   
-
-    PZGSEQU computes row and column scalings intended to equilibrate an   
-    M-by-N sparse matrix A and reduce its condition number. R returns the row
-    scale factors and C the column scale factors, chosen to try to make   
-    the largest element in each row and column of the matrix B with   
-    elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.   
-
-    R(i) and C(j) are restricted to be between SMLNUM = smallest safe   
-    number and BIGNUM = largest safe number.  Use of these scaling   
-    factors is not guaranteed to reduce the condition number of A but   
-    works well in practice.   
-
-    See supermatrix.h for the definition of 'SuperMatrix' structure.
- 
-    Arguments   
-    =========   
-
-    A       (input) SuperMatrix*
-            The matrix of dimension (A->nrow, A->ncol) whose equilibration
-            factors are to be computed. The type of A can be:
-            Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE.
-	    
-    R       (output) double*, size A->nrow
-            If INFO = 0 or INFO > M, R contains the row scale factors   
-            for A.
-	    
-    C       (output) double*, size A->ncol
-            If INFO = 0,  C contains the column scale factors for A.
-	    
-    ROWCND  (output) double*
-            If INFO = 0 or INFO > M, ROWCND contains the ratio of the   
-            smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and   
-            AMAX is neither too large nor too small, it is not worth   
-            scaling by R.
-	    
-    COLCND  (output) double*
-            If INFO = 0, COLCND contains the ratio of the smallest   
-            C(i) to the largest C(i).  If COLCND >= 0.1, it is not   
-            worth scaling by C.
-	    
-    AMAX    (output) double*
-            Absolute value of largest matrix element.  If AMAX is very   
-            close to overflow or very close to underflow, the matrix   
-            should be scaled.
-	    
-    INFO    (output) int*
-            = 0:  successful exit   
-            < 0:  if INFO = -i, the i-th argument had an illegal value   
-            > 0:  if INFO = i,  and i is   
-                  <= M:  the i-th row of A is exactly zero   
-                  >  M:  the (i-M)-th column of A is exactly zero   
-
-    GRID    (input) gridinof_t*
-            The 2D process mesh.
-    ===================================================================== 
-*/
-
-    /* Local variables */
-    NRformat_loc *Astore;
-    doublecomplex *Aval;
-    int i, j, irow, jcol, m_loc;
-    double rcmin, rcmax;
-    double bignum, smlnum;
-    extern double dlamch_(char *);
-    double tempmax, tempmin;
-    double *loc_max;
-    int *r_sizes, *displs;
-    double *loc_r;
-    int_t  procs;
-    
-    /* Test the input parameters. */
-    *info = 0;
-    if ( A->nrow < 0 || A->ncol < 0 ||
-	 A->Stype != SLU_NR_loc || A->Dtype != SLU_Z || A->Mtype != SLU_GE )
-	*info = -1;
-    if (*info != 0) {
-	i = -(*info);
-	xerbla_("pzgsequ", &i);
-	return;
-    }
-
-    /* Quick return if possible */
-    if ( A->nrow == 0 || A->ncol == 0 ) {
-	*rowcnd = 1.;
-	*colcnd = 1.;
-	*amax = 0.;
-	return;
-    }
-
-    Astore = A->Store;
-    Aval = Astore->nzval;
-    m_loc = Astore->m_loc;
-    
-    /* Get machine constants. */
-    smlnum = dlamch_("S");
-    bignum = 1. / smlnum;
-
-    /* Compute row scale factors. */
-    for (i = 0; i < A->nrow; ++i) r[i] = 0.;
-
-    /* Find the maximum element in each row. */
-    irow = Astore->fst_row;
-    for (i = 0; i < m_loc; ++i) {
-	for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j)
-            r[irow] = SUPERLU_MAX( r[irow], z_abs1(&Aval[j]) );
-	++irow;
-    }
-
-    /* Find the maximum and minimum scale factors. */
-    rcmin = bignum;
-    rcmax = 0.;
-    for (i = Astore->fst_row; i < Astore->fst_row + m_loc; ++i) {
-	rcmax = SUPERLU_MAX(rcmax, r[i]);
-	rcmin = SUPERLU_MIN(rcmin, r[i]);
-    }
-  
-    /* Get the global MAX and MIN for R */
-    tempmax = rcmax;
-    tempmin = rcmin;
-    MPI_Allreduce( &tempmax, &rcmax, 
-		1, MPI_DOUBLE, MPI_MAX, grid->comm);
-    MPI_Allreduce( &tempmin, &rcmin, 
-		1, MPI_DOUBLE, MPI_MIN, grid->comm);
-
-    *amax = rcmax;
-
-    if (rcmin == 0.) {
-	/* Find the first zero scale factor and return an error code. */
-	for (i = 0; i < A->nrow; ++i)
-	    if (r[i] == 0.) {
-		*info = i + 1;
-		return;
-	    }
-    } else {
-	/* Invert the scale factors. */
-	for (i = 0; i < A->nrow; ++i)
-	    r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum );
-	/* Compute ROWCND = min(R(I)) / max(R(I)) */
-	*rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
-    }
-
-    /* Compute column scale factors */
-    for (j = 0; j < A->ncol; ++j) c[j] = 0.;
-
-    /* Find the maximum element in each column, assuming the row
-       scalings computed above. */
-    irow = Astore->fst_row;
-    for (i = 0; i < m_loc; ++i) {
-        for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
-	    jcol = Astore->colind[j];
-	    c[jcol] = SUPERLU_MAX( c[jcol], z_abs1(&Aval[j]) * r[irow] );
-	}
-	++irow;
-    }
-
-    /* Find the global maximum for c[j] */
-    if ( !(loc_max = doubleMalloc_dist(A->ncol)))
-      ABORT("Malloc fails for loc_max[].");
-    for (j = 0; j < A->ncol; ++j) loc_max[j] = c[j];
-    MPI_Allreduce(loc_max, c, A->ncol, MPI_DOUBLE, MPI_MAX, grid->comm);
-    SUPERLU_FREE(loc_max);
-
-    /* Find the maximum and minimum scale factors. */
-    rcmin = bignum;
-    rcmax = 0.;
-    for (j = 0; j < A->ncol; ++j) {
-	rcmax = SUPERLU_MAX(rcmax, c[j]);
-	rcmin = SUPERLU_MIN(rcmin, c[j]);
-    }
-
-    if (rcmin == 0.) {
-	/* Find the first zero scale factor and return an error code. */
-	for (j = 0; j < A->ncol; ++j)
-	    if ( c[j] == 0. ) {
-		*info = A->nrow + j + 1;
-		return;
-	    }
-    } else {
-	/* Invert the scale factors. */
-	for (j = 0; j < A->ncol; ++j)
-	    c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum);
-	/* Compute COLCND = min(C(J)) / max(C(J)) */
-	*colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
-    }
-
-    /* gather R from each process to get the global R.  */
-
-    procs = grid->nprow * grid->npcol;
-    if ( !(r_sizes = SUPERLU_MALLOC(2 * procs * sizeof(int))))
-      ABORT("Malloc fails for r_sizes[].");
-    displs = r_sizes + procs;
-    if ( !(loc_r = doubleMalloc_dist(m_loc)))
-      ABORT("Malloc fails for loc_r[].");
-    j = Astore->fst_row;
-    for (i = 0; i < m_loc; ++i) loc_r[i] = r[j++];
-
-    /* First gather the size of each piece. */
-    MPI_Allgather(&m_loc, 1, MPI_INT, r_sizes, 1, MPI_INT, grid->comm);
-      
-    /* Set up the displacements for allgatherv */
-    displs[0] = 0;
-    for (i = 1; i < procs; ++i) displs[i] = displs[i-1] + r_sizes[i-1];
-
-    /* Now gather the actual data */
-    MPI_Allgatherv(loc_r, m_loc, MPI_DOUBLE, r, r_sizes, displs,
-                MPI_DOUBLE, grid->comm);
-      
-    SUPERLU_FREE(r_sizes);
-    SUPERLU_FREE(loc_r);
-
-    return;
-
-} /* pzgsequ */
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsmv_AXglobal.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsmv_AXglobal.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsmv_AXglobal.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsmv_AXglobal.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,304 +0,0 @@
-
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-
-#include <math.h>
-#include "superlu_zdefs.h"
-
-
-static void zcreate_msr_matrix(SuperMatrix *, int_t [], int_t,
-			      doublecomplex **, int_t **);
-static void zPrintMSRmatrix(int, doublecomplex [], int_t [], gridinfo_t *);
-
-
-int pzgsmv_AXglobal_setup
-(
- SuperMatrix *A,       /* Matrix A permuted by columns (input).
-			  The type of A can be:
-			  Stype = SLU_NCP; Dtype = SLU_Z; Mtype = SLU_GE. */
- Glu_persist_t *Glu_persist, /* input */
- gridinfo_t *grid,     /* input */
- int_t *m,             /* output */
- int_t *update[],      /* output */
- doublecomplex *val[],        /* output */
- int_t *bindx[],       /* output */
- int_t *mv_sup_to_proc /* output */
- )
-{
-    int n;
-    int input_option;
-    int N_update;    /* Number of variables updated on this process (output) */
-    int iam = grid->iam;
-    int nprocs = grid->nprow * grid->npcol;
-    int_t *xsup = Glu_persist->xsup;
-    int_t *supno = Glu_persist->supno;
-    int_t nsupers;
-    int i, nsup, p, t1, t2, t3;
-
-
-    /* Initialize the list of global indices.
-     * NOTE: the list of global indices must be in ascending order.
-     */
-    n = A->nrow;
-    input_option = SUPER_LINEAR;
-    nsupers = supno[n-1] + 1;
-
-#if ( DEBUGlevel>=2 )
-    if ( !iam ) {
-	PrintInt10("xsup", supno[n-1]+1, xsup);
-	PrintInt10("supno", n, supno);
-    }
-#endif
-
-    if ( input_option == SUPER_LINEAR ) { /* Block partitioning based on
-					     individual rows.  */
-	/* Figure out mv_sup_to_proc[] on all processes. */
-	for (p = 0; p < nprocs; ++p) {
-	    t1 = n / nprocs;       /* Number of rows */
-	    t2 = n - t1 * nprocs;  /* left-over, which will be assigned
-				      to the first t2 processes.  */
-	    if ( p >= t2 ) t2 += (p * t1); /* Starting row number */
-	    else { /* First t2 processes will get one more row. */
- 	        ++t1;              /* Number of rows. */
-		t2 = p * t1;       /* Starting row. */
-	    }
-	    /* Make sure the starting and ending rows are at the
-	       supernode boundaries. */
-	    t3 = t2 + t1;      /* Ending row. */
-	    nsup = supno[t2];
-	    if ( t2 > xsup[nsup] ) { /* Round up the starting row. */
-		t1 -= xsup[nsup+1] - t2;
-		t2 = xsup[nsup+1];
-	    }
-	    nsup = supno[t3];
-	    if ( t3 > xsup[nsup] ) /* Round up the ending row. */
-		t1 += xsup[nsup+1] - t3;
-	    t3 = t2 + t1 - 1;
-	    if ( t1 ) {
-		for (i = supno[t2]; i <= supno[t3]; ++i) {
-		    mv_sup_to_proc[i] = p;
-#if ( DEBUGlevel>=3 )
-		    if ( mv_sup_to_proc[i] == p-1 ) {
-			fprintf(stderr, 
-				"mv_sup_to_proc conflicts at supno %d\n", i);
-			exit(-1);
-		    }
-#endif
-		}
-	    }
-	    
-	    if ( iam == p ) {
-		N_update = t1;
-		if ( N_update ) {
-		    if ( !(*update = intMalloc_dist(N_update)) )
-			ABORT("Malloc fails for update[]");
-		}
-		for (i = 0; i < N_update; ++i) (*update)[i] = t2 + i;
-#if ( DEBUGlevel>=3 )
-		printf("(%2d) N_update = %4d\t"
-		       "supers %4d to %4d\trows %4d to %4d\n",
-		       iam, N_update, supno[t2], supno[t3], t2, t3);
-#endif
-	    }
-	} /* for p ... */
-    } else if ( input_option == SUPER_BLOCK ) { /* Block partitioning based on
-						   individual supernodes.  */
-	/* This may cause bad load balance, because the blocks are usually
-	   small in the beginning and large toward the end.   */
-	t1 = nsupers / nprocs;
-	t2 = nsupers - t1 * nprocs; /* left-over */
-	if ( iam >= t2 ) t2 += (iam * t1);
-	else {
-	    ++t1;          /* Number of blocks. */
-	    t2 = iam * t1; /* Starting block. */
-	}
-	N_update = xsup[t2+t1] - xsup[t2];
-	if ( !(*update = intMalloc_dist(N_update)) )
-	    ABORT("Malloc fails for update[]");
-	for (i = 0; i < N_update; ++i) (*update)[i] = xsup[t2] + i;
-    }
-
-
-    /* Create an MSR matrix in val/bindx to be used by pdgsmv(). */
-    zcreate_msr_matrix(A, *update, N_update, val, bindx);
-
-#if ( DEBUGlevel>=2 )
-    PrintInt10("mv_sup_to_proc", nsupers, mv_sup_to_proc);
-    zPrintMSRmatrix(N_update, *val, *bindx, grid);
-#endif
-
-    *m = N_update;
-    return 0;
-} /* PZGSMV_AXglobal_SETUP */
-
-
-/* Create the distributed modified sparse row (MSR) matrix: bindx/val.
- * For a submatrix of size m-by-n, the MSR arrays are as follows:
- *    bindx[0]      = m + 1
- *    bindx[0..m]   = pointer to start of each row
- *    bindx[ks..ke] = column indices of the off-diagonal nonzeros in row k,
- *                    where, ks = bindx[k], ke = bindx[k+1]-1
- *    val[k]        = A(k,k), k < m, diagonal elements
- *    val[m]        = not used
- *    val[ki]       = A(k, bindx[ki]), where ks <= ki <= ke
- * Both arrays are of length nnz + 1.
- */
-static void zcreate_msr_matrix
-(
- SuperMatrix *A,       /* Matrix A permuted by columns (input).
-			  The type of A can be:
-			  Stype = SLU_NCP; Dtype = SLU_Z; Mtype = SLU_GE. */
- int_t update[],       /* input (local) */
- int_t N_update,       /* input (local) */
- doublecomplex **val,         /* output */
- int_t **bindx         /* output */
-)
-{
-    int hi, i, irow, j, k, lo, n, nnz_local, nnz_diag;
-    NCPformat *Astore;
-    doublecomplex *nzval;
-    int_t *rowcnt;
-    doublecomplex zero = {0.0, 0.0};
-    
-    if ( !N_update ) return;
-
-    n = A->ncol;
-    Astore = A->Store;
-    nzval = Astore->nzval;
-
-    /* One pass of original matrix A to count nonzeros of each row. */
-    if ( !(rowcnt = (int_t *) intCalloc_dist(N_update)) )
-	ABORT("Malloc fails for rowcnt[]");
-    lo = update[0];
-    hi = update[N_update-1];
-    nnz_local = 0;
-    nnz_diag = 0;
-    for (j = 0; j < n; ++j) {
-	for (i = Astore->colbeg[j]; i < Astore->colend[j]; ++i) {
-	    irow = Astore->rowind[i];
-	    if ( irow >= lo && irow <= hi ) {
-		if ( irow != j ) /* Exclude diagonal */
-		    ++rowcnt[irow - lo];
-		else ++nnz_diag; /* Count nonzero diagonal entries */
-		++nnz_local;
-	    }
-	}
-    }
-
-    /* Add room for the logical diagonal zeros which are not counted
-       in nnz_local. */
-    nnz_local += (N_update - nnz_diag);
-
-    /* Allocate storage for bindx[] and val[]. */
-    if ( !(*val = (doublecomplex *) doublecomplexMalloc_dist(nnz_local+1)) )
-	ABORT("Malloc fails for val[]");
-    for (i = 0; i < N_update; ++i) (*val)[i] = zero; /* Initialize diagonal */
-    if ( !(*bindx = (int_t *) SUPERLU_MALLOC((nnz_local+1) * sizeof(int_t))) )
-	ABORT("Malloc fails for bindx[]");
-
-    /* Set up row pointers. */
-    (*bindx)[0] = N_update + 1;
-    for (j = 1; j <= N_update; ++j) {
-	(*bindx)[j] = (*bindx)[j-1] + rowcnt[j-1];
-	rowcnt[j-1] = (*bindx)[j-1];
-    }
-
-    /* One pass of original matrix A to fill in matrix entries. */
-    for (j = 0; j < n; ++j) {
-	for (i = Astore->colbeg[j]; i < Astore->colend[j]; ++i) {
-	    irow = Astore->rowind[i];
-	    if ( irow >= lo && irow <= hi ) {
-		if ( irow == j ) /* Diagonal */
-		    (*val)[irow - lo] = nzval[i];
-		else {
-		    irow -= lo;
-		    k = rowcnt[irow];
-		    (*bindx)[k] = j;
-		    (*val)[k] = nzval[i];
-		    ++rowcnt[irow];
-		}
-	    }
-	}
-    }
-
-    SUPERLU_FREE(rowcnt);
-}
-
-/*
- * Performs sparse matrix-vector multiplication.
- *   - val/bindx stores the distributed MSR matrix A
- *   - X is global
- *   - ax product is distributed the same way as A
- */
-int
-pzgsmv_AXglobal(int_t m, int_t update[], doublecomplex val[], int_t bindx[],
-                doublecomplex X[], doublecomplex ax[])
-{
-    int_t i, j, k;
-    doublecomplex zero = {0.0, 0.0};
-    doublecomplex temp;
-
-    if ( m <= 0 ) return; /* number of rows (local) */
-
-    for (i = 0; i < m; ++i) {
-	ax[i] = zero;
-
-	for (k = bindx[i]; k < bindx[i+1]; ++k) {
-	    j = bindx[k];       /* column index */
-	    zz_mult(&temp, &val[k], &X[j]);
-	    z_add(&ax[i], &ax[i], &temp);
-	}
-	zz_mult(&temp, &val[i], &X[update[i]]); /* diagonal */
-	z_add(&ax[i], &ax[i], &temp);
-    }
-} /* PZGSMV_AXglobal */
- 
-/*
- * Performs sparse matrix-vector multiplication.
- *   - val/bindx stores the distributed MSR matrix A
- *   - X is global
- *   - ax product is distributed the same way as A
- */
-int
-pzgsmv_AXglobal_abs(int_t m, int_t update[], doublecomplex val[], int_t bindx[],
-	            doublecomplex X[], double ax[])
-{
-    int_t i, j, k;
-
-    if ( m <= 0 ) return; /* number of rows (local) */
-
-    for (i = 0; i < m; ++i) {
-	ax[i] = 0.0;
-	for (k = bindx[i]; k < bindx[i+1]; ++k) {
-	    j = bindx[k];       /* column index */
-	    ax[i] += z_abs1(&val[k]) * z_abs1(&X[j]);
-	}
-	ax[i] += z_abs1(&val[i]) * z_abs1(&X[update[i]]); /* diagonal */
-    }
-} /* PZGSMV_AXglobal_ABS */
-
-/*
- * Print the local MSR matrix
- */
-static void zPrintMSRmatrix
-(
- int m,       /* Number of rows of the submatrix. */
- doublecomplex val[],
- int_t bindx[],
- gridinfo_t *grid
-)
-{
-    int iam, nnzp1;
-
-    if ( !m ) return;
-
-    iam = grid->iam;
-    nnzp1 = bindx[m];
-    printf("(%2d) MSR submatrix has %d rows -->\n", iam, m);
-    PrintDoublecomplex("val", nnzp1, val);
-    PrintInt10("bindx", nnzp1, bindx);
-}
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsmv.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsmv.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsmv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsmv.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,372 +0,0 @@
-
-/*
- * -- Distributed SuperLU routine (version 2.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * March 15, 2003
- *
- */
-
-#include <math.h>
-#include "superlu_zdefs.h"
-
-void pzgsmv_init
-(
- SuperMatrix *A,       /* Matrix A permuted by columns (input/output).
-			  The type of A can be:
-			  Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE. */
- int_t *row_to_proc,   /* Input. Mapping between rows and processes. */
- gridinfo_t *grid,     /* Input */
- pzgsmv_comm_t *gsmv_comm /* Output. The data structure for communication. */
- )
-{
-    NRformat_loc *Astore;
-    int iam, p, procs;
-    int *SendCounts, *RecvCounts;
-    int_t i, j, k, l, m, m_loc, n, fst_row, jcol;
-    int_t TotalIndSend, TotalValSend;
-    int_t *colind, *rowptr;
-    int_t *ind_tosend = NULL, *ind_torecv = NULL;
-    int_t *ptr_ind_tosend, *ptr_ind_torecv;
-    int_t *extern_start, *spa, *itemp;
-    doublecomplex *nzval, *val_tosend = NULL, *val_torecv = NULL, t;
-    MPI_Request *send_req, *recv_req;
-    MPI_Status status;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(grid->iam, "Enter pzgsmv_init()");
-#endif
-
-    /* ------------------------------------------------------------
-       INITIALIZATION.
-       ------------------------------------------------------------*/
-    iam = grid->iam;
-    procs = grid->nprow * grid->npcol;
-    Astore = (NRformat_loc *) A->Store;
-    m = A->nrow;
-    n = A->ncol;
-    m_loc = Astore->m_loc;
-    fst_row = Astore->fst_row;
-    colind = Astore->colind;
-    rowptr = Astore->rowptr;
-    nzval = Astore->nzval;
-    if ( !(SendCounts = SUPERLU_MALLOC(2*procs * sizeof(int))) )
-        ABORT("Malloc fails for SendCounts[]");
-    /*for (i = 0; i < 2*procs; ++i) SendCounts[i] = 0;*/
-    RecvCounts = SendCounts + procs;
-    if ( !(ptr_ind_tosend = intMalloc_dist(2*(procs+1))) )
-        ABORT("Malloc fails for ptr_ind_tosend[]");
-    ptr_ind_torecv = ptr_ind_tosend + procs + 1;
-    if ( !(extern_start = intMalloc_dist(m_loc)) )
-        ABORT("Malloc fails for extern_start[]");
-    for (i = 0; i < m_loc; ++i) extern_start[i] = rowptr[i];
-
-    /* ------------------------------------------------------------
-       COUNT THE NUMBER OF X ENTRIES TO BE SENT TO EACH PROCESS.
-       THIS IS THE UNION OF THE COLUMN INDICES OF MY ROWS.
-       SWAP TO THE BEGINNING THE PART OF A CORRESPONDING TO THE
-       LOCAL PART OF X.
-       THIS ACCOUNTS FOR THE FIRST PASS OF ACCESSING MATRIX A.
-       ------------------------------------------------------------*/
-    if ( !(spa = intCalloc_dist(n)) ) /* Aid in global to local translation */
-        ABORT("Malloc fails for spa[]");
-    for (p = 0; p < procs; ++p) SendCounts[p] = 0;
-    for (i = 0; i < m_loc; ++i) { /* Loop through each row */
-        k = extern_start[i];
-        for (j = rowptr[i]; j < rowptr[i+1]; ++j) {/* Each nonzero in row i */
-	    jcol = colind[j];
-            p = row_to_proc[jcol];
-	    if ( p != iam ) { /* External */
-	        if ( spa[jcol] == 0 ) { /* First time see this index */
-		    ++SendCounts[p];
-		    spa[jcol] = 1;
-                }
-	    } else { /* Swap to beginning the part of A corresponding
-			to the local part of X */
-		l = colind[k];
-		t = nzval[k];
-		colind[k] = jcol;
-		nzval[k] = nzval[j];
-		colind[j] = l;
-		nzval[j] = t;
-		++k;
-	    }
-	}
-	extern_start[i] = k;
-    }
-
-    /* ------------------------------------------------------------
-       LOAD THE X-INDICES TO BE SENT TO THE OTHER PROCESSES.
-       THIS ACCOUNTS FOR THE SECOND PASS OF ACCESSING MATRIX A.
-       ------------------------------------------------------------*/
-    /* Build pointers to ind_tosend[]. */
-    ptr_ind_tosend[0] = 0;
-    for (p = 0, TotalIndSend = 0; p < procs; ++p) {
-        TotalIndSend += SendCounts[p]; /* Total to send. */
-	ptr_ind_tosend[p+1] = ptr_ind_tosend[p] + SendCounts[p];
-    }
-#if 0
-    ptr_ind_tosend[iam] = 0; /* Local part of X */
-#endif
-    if ( TotalIndSend ) {
-        if ( !(ind_tosend = intMalloc_dist(TotalIndSend)) )
-	    ABORT("Malloc fails for ind_tosend[]"); /* Exclude local part of X */
-    }
-
-    /* Build SPA to aid global to local translation. */
-    for (i = 0; i < n; ++i) spa[i] = EMPTY;
-    for (i = 0; i < m_loc; ++i) { /* Loop through each row of A */
-        for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
-	    jcol = colind[j];
-	    if ( spa[jcol] == EMPTY ) { /* First time see this index */
-	        p = row_to_proc[jcol];
-		if ( p == iam ) { /* Local */
-		  /*assert(jcol>=fst_row);*/
-		  spa[jcol] = jcol - fst_row; /* Relative position in local X */
-		} else {          /* External */
-		  ind_tosend[ptr_ind_tosend[p]] = jcol; /* Still global */
-		  spa[jcol] = ptr_ind_tosend[p]; /* Position in ind_tosend[] */
-		  ++ptr_ind_tosend[p];
-		}
-	    }
-	}
-    }
-    
-    /* ------------------------------------------------------------
-       TRANSFORM THE COLUMN INDICES OF MATRIX A INTO LOCAL INDICES.
-       THIS ACCOUNTS FOR THE THIRD PASS OF ACCESSING MATRIX A.
-       ------------------------------------------------------------*/
-    for (i = 0; i < m_loc; ++i) {
-        for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
-	    jcol = colind[j];
-	    colind[j] = spa[jcol];
-	}
-    }
-
-    /* ------------------------------------------------------------
-       COMMUNICATE THE EXTERNAL INDICES OF X.
-       ------------------------------------------------------------*/
-    MPI_Alltoall(SendCounts, 1, MPI_INT, RecvCounts, 1, MPI_INT,
-		 grid->comm);
-
-    /* Build pointers to ind_torecv[]. */
-    ptr_ind_torecv[0] = 0;
-    for (p = 0, TotalValSend = 0; p < procs; ++p) {
-        TotalValSend += RecvCounts[p]; /* Total to receive. */
-	ptr_ind_torecv[p+1] = ptr_ind_torecv[p] + RecvCounts[p];
-    }
-    if ( TotalValSend ) {
-        if ( !(ind_torecv = intMalloc_dist(TotalValSend)) )
-	    ABORT("Malloc fails for ind_torecv[]");
-    }
-
-    if ( !(send_req = (MPI_Request *)
-	   SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))))
-        ABORT("Malloc fails for recv_req[].");
-    recv_req = send_req + procs;
-    for (p = 0; p < procs; ++p) {
-        ptr_ind_tosend[p] -= SendCounts[p]; /* Reset pointer to beginning */
-        if ( SendCounts[p] ) {
-	    MPI_Isend(&ind_tosend[ptr_ind_tosend[p]], SendCounts[p],
-		      mpi_int_t, p, iam, grid->comm, &send_req[p]);
-	}
-	if ( RecvCounts[p] ) {
-	    MPI_Irecv(&ind_torecv[ptr_ind_torecv[p]], RecvCounts[p],
-		      mpi_int_t, p, p, grid->comm, &recv_req[p]);
-	}
-    }
-    for (p = 0; p < procs; ++p) {
-        if ( SendCounts[p] ) MPI_Wait(&send_req[p], &status);
-	if ( RecvCounts[p] ) MPI_Wait(&recv_req[p], &status);
-    }
-
-    /* Allocate storage for the X values to to transferred. */
-    if ( TotalIndSend &&
-         !(val_torecv = doublecomplexMalloc_dist(TotalIndSend)) )
-        ABORT("Malloc fails for val_torecv[].");
-    if ( TotalValSend &&
-         !(val_tosend = doublecomplexMalloc_dist(TotalValSend)) )
-        ABORT("Malloc fails for val_tosend[].");
-
-    gsmv_comm->extern_start = extern_start;
-    gsmv_comm->ind_tosend = ind_tosend;
-    gsmv_comm->ind_torecv = ind_torecv;
-    gsmv_comm->ptr_ind_tosend = ptr_ind_tosend;
-    gsmv_comm->ptr_ind_torecv = ptr_ind_torecv;
-    gsmv_comm->SendCounts = SendCounts;
-    gsmv_comm->RecvCounts = RecvCounts;
-    gsmv_comm->val_tosend = val_tosend;
-    gsmv_comm->val_torecv = val_torecv;
-    gsmv_comm->TotalIndSend = TotalIndSend;
-    gsmv_comm->TotalValSend = TotalValSend;
-    
-    SUPERLU_FREE(spa);
-    SUPERLU_FREE(send_req);
-
-#if ( DEBUGlevel>=2 )
-    PrintInt10("pzgsmv_init::rowptr", m_loc+1, rowptr);
-    PrintInt10("pzgsmv_init::extern_start", m_loc, extern_start);
-#endif
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pzgsmv_init()");
-#endif
-
-} /* PZGSMV_INIT */
-
-
-/*
- * Performs sparse matrix-vector multiplication.
- */
-void
-pzgsmv
-(
- int_t  abs,               /* Input. Do abs(A)*abs(x). */
- SuperMatrix *A_internal,  /* Input. Matrix A permuted by columns.
-			      The column indices are translated into
-			      the relative positions in the gathered x-vector.
-			      The type of A can be:
-			      Stype = NR_loc; Dtype = SLU_Z; Mtype = GE. */
- gridinfo_t *grid,         /* Input */
- pzgsmv_comm_t *gsmv_comm, /* Input. The data structure for communication. */
- doublecomplex x[],       /* Input. The distributed source vector */
- doublecomplex ax[]       /* Output. The distributed destination vector */
-)
-{
-    NRformat_loc *Astore;
-    int iam, procs;
-    int_t i, j, p, m, m_loc, n, fst_row, jcol;
-    int_t *colind, *rowptr;
-    int   *SendCounts, *RecvCounts;
-    int_t *ind_tosend, *ind_torecv, *ptr_ind_tosend, *ptr_ind_torecv;
-    int_t *extern_start, TotalValSend;
-    doublecomplex *nzval, *val_tosend, *val_torecv;
-    doublecomplex zero = {0.0, 0.0}, temp;
-    double *ax_abs = (double *) ax;
-    MPI_Request *send_req, *recv_req;
-    MPI_Status status;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(grid->iam, "Enter pzgsmv()");
-#endif
-
-    /* ------------------------------------------------------------
-       INITIALIZATION.
-       ------------------------------------------------------------*/
-    iam = grid->iam;
-    procs = grid->nprow * grid->npcol;
-    Astore = (NRformat_loc *) A_internal->Store;
-    m = A_internal->nrow;
-    n = A_internal->ncol;
-    m_loc = Astore->m_loc;
-    fst_row = Astore->fst_row;
-    colind = Astore->colind;
-    rowptr = Astore->rowptr;
-    nzval = (doublecomplex *) Astore->nzval;
-    extern_start = gsmv_comm->extern_start;
-    ind_torecv = gsmv_comm->ind_torecv;
-    ptr_ind_tosend = gsmv_comm->ptr_ind_tosend;
-    ptr_ind_torecv = gsmv_comm->ptr_ind_torecv;
-    SendCounts = gsmv_comm->SendCounts;
-    RecvCounts = gsmv_comm->RecvCounts;
-    val_tosend = (doublecomplex *) gsmv_comm->val_tosend;
-    val_torecv = (doublecomplex *) gsmv_comm->val_torecv;
-    TotalValSend = gsmv_comm->TotalValSend;
-
-    /* ------------------------------------------------------------
-       COPY THE X VALUES INTO THE SEND BUFFER.
-       ------------------------------------------------------------*/
-    for (i = 0; i < TotalValSend; ++i) {
-        j = ind_torecv[i] - fst_row; /* Relative index in x[] */
-	val_tosend[i] = x[j];
-    }
-
-    /* ------------------------------------------------------------
-       COMMUNICATE THE X VALUES.
-       ------------------------------------------------------------*/
-    if ( !(send_req = (MPI_Request *)
-	   SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))))
-        ABORT("Malloc fails for recv_req[].");
-    recv_req = send_req + procs;
-    for (p = 0; p < procs; ++p) {
-        if ( RecvCounts[p] ) {
-	    MPI_Isend(&val_tosend[ptr_ind_torecv[p]], RecvCounts[p],
-                      SuperLU_MPI_DOUBLE_COMPLEX, p, iam,
-                      grid->comm, &send_req[p]);
-	}
-	if ( SendCounts[p] ) {
-	    MPI_Irecv(&val_torecv[ptr_ind_tosend[p]], SendCounts[p],
-                      SuperLU_MPI_DOUBLE_COMPLEX, p, p,
-                      grid->comm, &recv_req[p]);
-	}
-    }
-    
-    /* ------------------------------------------------------------
-       PERFORM THE ACTUAL MULTIPLICATION.
-       ------------------------------------------------------------*/
-    if ( abs ) { /* Perform abs(A)*abs(x) */
-        /* Multiply the local part. */
-        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
-            ax_abs[i] = 0.0;
-	    for (j = rowptr[i]; j < extern_start[i]; ++j) {
-	        jcol = colind[j];
-		ax_abs[i] += z_abs1(&nzval[j]) * z_abs1(&x[jcol]);
-	    }
-        }
-
-        for (p = 0; p < procs; ++p) {
-            if ( RecvCounts[p] ) MPI_Wait(&send_req[p], &status);
-	    if ( SendCounts[p] ) MPI_Wait(&recv_req[p], &status);
-        }
-
-        /* Multiply the external part. */
-        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
-	    for (j = extern_start[i]; j < rowptr[i+1]; ++j) {
-	        jcol = colind[j];
-	        ax_abs[i] += z_abs1(&nzval[j]) * z_abs(&val_torecv[jcol]);
-	    }
-	}
-    } else {
-        /* Multiply the local part. */
-        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
-	    ax[i] = zero;
-	    for (j = rowptr[i]; j < extern_start[i]; ++j) {
-	        jcol = colind[j];
-                zz_mult(&temp, &nzval[j], &x[jcol]);
-                z_add(&ax[i], &ax[i], &temp);
-	    }
-        }
-
-        for (p = 0; p < procs; ++p) {
-            if ( RecvCounts[p] ) MPI_Wait(&send_req[p], &status);
-	    if ( SendCounts[p] ) MPI_Wait(&recv_req[p], &status);
-        }
-
-        /* Multiply the external part. */
-        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
-	    for (j = extern_start[i]; j < rowptr[i+1]; ++j) {
-	        jcol = colind[j];
-                zz_mult(&temp, &nzval[j], &val_torecv[jcol]);
-                z_add(&ax[i], &ax[i], &temp);
-	    }
-	}
-    }
-
-    SUPERLU_FREE(send_req);
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pzgsmv()");
-#endif
-
-} /* PZGSMV */
-
-void pzgsmv_finalize(pzgsmv_comm_t *gsmv_comm)
-{
-    int_t *it;
-    doublecomplex *dt;
-    SUPERLU_FREE(gsmv_comm->extern_start);
-    if ( it = gsmv_comm->ind_tosend ) SUPERLU_FREE(it);
-    if ( it = gsmv_comm->ind_torecv ) SUPERLU_FREE(it);
-    SUPERLU_FREE(gsmv_comm->ptr_ind_tosend);
-    SUPERLU_FREE(gsmv_comm->SendCounts);
-    if ( dt = gsmv_comm->val_tosend ) SUPERLU_FREE(dt);
-    if ( dt = gsmv_comm->val_torecv ) SUPERLU_FREE(dt);
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsrfs_ABXglobal.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsrfs_ABXglobal.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsrfs_ABXglobal.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsrfs_ABXglobal.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,445 +0,0 @@
-
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-
-#include <math.h>
-#include "superlu_zdefs.h"
-
-/*-- Function prototypes --*/
-static void gather_1rhs_diag_to_all(int_t, doublecomplex [], Glu_persist_t *,
-                                    LocalLU_t *, gridinfo_t *, int_t, int_t [],
-				    int_t [], doublecomplex [], doublecomplex []);
-static void redist_all_to_diag(int_t, doublecomplex [], Glu_persist_t *,
-                               LocalLU_t *, gridinfo_t *, int_t [], doublecomplex []);
-
-void
-pzgsrfs_ABXglobal(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct,
-		  gridinfo_t *grid, doublecomplex *B, int_t ldb, doublecomplex *X, int_t ldx,
-		  int nrhs, double *berr, SuperLUStat_t *stat, int *info)
-{
-/* 
- * Purpose
- * =======
- *
- * pzgsrfs_ABXglobal improves the computed solution to a system of linear   
- * equations and provides error bounds and backward error estimates
- * for the solution. 
- *
- * Arguments
- * =========
- *
- * n      (input) int (global)
- *        The order of the system of linear equations.
- *
- * A      (input) SuperMatrix*
- *	  The original matrix A, or the scaled A if equilibration was done.
- *        A is also permuted into the form Pc*Pr*A*Pc', where Pr and Pc
- *        are permutation matrices. The type of A can be:
- *        Stype = SLU_NCP; Dtype = SLU_Z; Mtype = SLU_GE.
- *
- *        NOTE: Currently, A must reside in all processes when calling
- *              this routine.
- *
- * anorm  (input) double
- *        The norm of the original matrix A, or the scaled A if
- *        equilibration was done.
- *
- * LUstruct (input) LUstruct_t*
- *        The distributed data structures storing L and U factors.
- *        The L and U factors are obtained from pzgstrf for
- *        the possibly scaled and permuted matrix A.
- *        See superlu_ddefs.h for the definition of 'LUstruct_t'.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh. It contains the MPI communicator, the number
- *        of process rows (NPROW), the number of process columns (NPCOL),
- *        and my process rank. It is an input argument to all the
- *        parallel routines.
- *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
- *        See superlu_ddefs.h for the definition of 'gridinfo_t'.
- *
- * B      (input) doublecomplex* (global)
- *        The N-by-NRHS right-hand side matrix of the possibly equilibrated
- *        and row permuted system.
- *       
- *        NOTE: Currently, B must reside on all processes when calling
- *              this routine.
- *
- * ldb    (input) int (global)
- *        Leading dimension of matrix B.
- *
- * X      (input/output) doublecomplex* (global)
- *        On entry, the solution matrix X, as computed by PZGSTRS.
- *        On exit, the improved solution matrix X.
- *        If DiagScale = COL or BOTH, X should be premultiplied by diag(C)
- *        in order to obtain the solution to the original system.
- *
- *        NOTE: Currently, X must reside on all processes when calling
- *              this routine.
- *
- * ldx    (input) int (global)
- *        Leading dimension of matrix X.
- *
- * nrhs   (input) int
- *        Number of right-hand sides.
- *
- * berr   (output) double*, dimension (nrhs)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
- *         any element of A or B that makes X(j) an exact solution).
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics about the refinement steps.
- *        See util.h for the definition of SuperLUStat_t.
- *
- * info   (output) int*
- *        = 0: successful exit
- *        < 0: if info = -i, the i-th argument had an illegal value
- *        
- * Internal Parameters   
- * ===================   
- *
- * ITMAX is the maximum number of steps of iterative refinement.   
- *
- */
-
-#define ITMAX 20
-    
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-    /* 
-     * Data structures used by matrix-vector multiply routine.
-     */
-    int_t  N_update; /* Number of variables updated on this process */
-    int_t  *update;  /* vector elements (global index) updated 
-			on this processor.                     */
-    int_t  *bindx;
-    doublecomplex *val;
-    int_t *mv_sup_to_proc;  /* Supernode to process mapping in
-			       matrix-vector multiply.  */
-    /*-- end data structures for matrix-vector multiply --*/
-    doublecomplex *b, *ax, *R, *B_col, *temp, *work, *X_col,
-           *x_trs, *dx_trs;
-    double *rwork;
-    int_t notran;
-    int_t count, ii, j, jj, k, knsupc, lk, lwork,
-          nprow, nsupers, nz, p;
-    int   i, iam, pkk;
-    int_t *ilsum, *xsup;
-    double eps, lstres;
-    double s, safmin, safe1, safe2;
-
-    /* NEW STUFF */
-    int_t num_diag_procs, *diag_procs; /* Record diagonal process numbers. */
-    int_t *diag_len; /* Length of the X vector on diagonal processes. */
-
-    /*-- Function prototypes --*/
-    extern void pzgstrs1(int_t, LUstruct_t *, gridinfo_t *,
-			 doublecomplex *, int, SuperLUStat_t *, int *);
-    extern double dlamch_(char *);
-    
-    /* Test the input parameters. */
-    *info = 0;
-    if ( n < 0 ) *info = -1;
-    else if ( A->nrow != A->ncol || A->nrow < 0 ||
-	      A->Stype != SLU_NCP || A->Dtype != SLU_Z || A->Mtype != SLU_GE )
-	*info = -2;
-    else if ( ldb < SUPERLU_MAX(0, n) ) *info = -10;
-    else if ( ldx < SUPERLU_MAX(0, n) )	*info = -12;
-    else if ( nrhs < 0 ) *info = -13;
-    if (*info != 0) {
-	i = -(*info);
-	xerbla_("pzgsrfs_ABXglobal", &i);
-	return;
-    }
-
-    /* Quick return if possible. */
-    if ( n == 0 || nrhs == 0 ) {
-	return;
-    }
-
-    /* Initialization. */
-    iam = grid->iam;
-    nprow = grid->nprow;
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    ilsum = Llu->ilsum;
-    notran = 1;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pzgsrfs_ABXglobal()");
-#endif
-
-    get_diag_procs(n, Glu_persist, grid, &num_diag_procs,
-		   &diag_procs, &diag_len);
-#if ( PRNTlevel>=1 )
-    if ( !iam ) {
-	printf(".. number of diag processes = %d\n", num_diag_procs);
-	PrintInt10("diag_procs", num_diag_procs, diag_procs);
-	PrintInt10("diag_len", num_diag_procs, diag_len);
-    }
-#endif
-
-    if ( !(mv_sup_to_proc = intCalloc_dist(nsupers)) )
-	ABORT("Calloc fails for mv_sup_to_proc[]");
-
-    pzgsmv_AXglobal_setup(A, Glu_persist, grid, &N_update, &update,
-		          &val, &bindx, mv_sup_to_proc);
-
-    i = CEILING( nsupers, nprow ); /* Number of local block rows */
-    ii = Llu->ldalsum + i * XK_H;
-    k = SUPERLU_MAX(N_update, sp_ienv_dist(3));
-    jj = diag_len[0];
-    for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] );
-    jj = SUPERLU_MAX( jj, N_update );
-    lwork = N_update         /* For ax and R */
-	  + ii               /* For dx_trs */
-	  + ii               /* For x_trs */
-          + k                /* For b */
-	  + jj;              /* for temp */
-    if ( !(work = doublecomplexMalloc_dist(lwork)) )
-	ABORT("Malloc fails for work[]");
-    ax = R = work;
-    dx_trs = work + N_update;
-    x_trs  = dx_trs + ii;
-    b      = x_trs + ii;
-    temp   = b + k;
-    if ( !(rwork = SUPERLU_MALLOC(N_update * sizeof(double))) )
-	ABORT("Malloc fails for rwork[]");
-
-#if ( DEBUGlevel>=2 )
-    {
-	doublecomplex *dwork = doublecomplexMalloc_dist(n);
-	for (i = 0; i < n; ++i) {
-	    if ( i & 1 ) dwork[i].r = 1.;
-	    else dwork[i].r = 2.;
-	    dwork[i].i = 0.;
-        }
-	/* Check correctness of matrix-vector multiply. */
-	pzgsmv_AXglobal(N_update, update, val, bindx, dwork, ax);
-	PrintDouble5("Mult A*x", N_update, ax);
-	SUPERLU_FREE(dwork);
-    }
-#endif
-
-
-    /* NZ = maximum number of nonzero elements in each row of A, plus 1 */
-    nz     = A->ncol + 1;
-    eps    = dlamch_("Epsilon");
-    safmin = dlamch_("Safe minimum");
-
-    /* Set SAFE1 essentially to be the underflow threshold times the
-       number of additions in each row. */
-    safe1  = nz * safmin;
-    safe2  = safe1 / eps;
-
-#if ( DEBUGlevel>=1 )
-    if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n",
-		       eps, anorm, safe1, safe2);
-#endif
-
-    /* Do for each right-hand side ... */
-    for (j = 0; j < nrhs; ++j) {
-	count = 0;
-	lstres = 3.;
-
-	/* Copy X into x on the diagonal processes. */
-	B_col = &B[j*ldb];
-	X_col = &X[j*ldx];
-	for (p = 0; p < num_diag_procs; ++p) {
-	    pkk = diag_procs[p];
-	    if ( iam == pkk ) {
-		for (k = p; k < nsupers; k += num_diag_procs) {
-		    knsupc = SuperSize( k );
-		    lk = LBi( k, grid );
-		    ii = ilsum[lk] + (lk+1)*XK_H;
-		    jj = FstBlockC( k );
-		    for (i = 0; i < knsupc; ++i) x_trs[i+ii] = X_col[i+jj];
-		    dx_trs[ii-XK_H].r = k;/* Block number prepended in header. */
-		}
-	    }
-	}
-	/* Copy B into b distributed the same way as matrix-vector product. */
-        if ( N_update ) ii = update[0];
-	for (i = 0; i < N_update; ++i) b[i] = B_col[i + ii];
-
-	while (1) { /* Loop until stopping criterion is satisfied. */
-
-	    /* Compute residual R = B - op(A) * X,   
-	       where op(A) = A, A**T, or A**H, depending on TRANS. */
-
-	    /* Matrix-vector multiply. */
-	    pzgsmv_AXglobal(N_update, update, val, bindx, X_col, ax);
-	    
-	    /* Compute residual. */
-	    for (i = 0; i < N_update; ++i) z_sub(&R[i], &b[i], &ax[i]);
-
-	    /* Compute abs(op(A))*abs(X) + abs(B). */
-	    pzgsmv_AXglobal_abs(N_update, update, val, bindx, X_col, rwork);
-	    for (i = 0; i < N_update; ++i) rwork[i] += z_abs1(&b[i]);
-	    
-	    s = 0.0;
-	    for (i = 0; i < N_update; ++i) {
-		if ( rwork[i] > safe2 ) {
-		    s = SUPERLU_MAX(s, z_abs1(&R[i]) / rwork[i]);
-		} else if ( rwork[i] != 0.0 ) {
-		    s = SUPERLU_MAX(s, (safe1 + z_abs1(&R[i])) / rwork[i]);
-                }
-                /* If temp[i] is exactly 0.0 (computed by PxGSMV), then
-                   we know the true residual also must be exactly 0.0. */
-	    }
-	    MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm );
-		
-#if ( PRNTlevel>= 1 )
-	    if ( !iam )
-		printf("(%2d) .. Step %2d: berr[j] = %e\n", iam, count, berr[j]);
-#endif
-	    if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) {
-		/* Compute new dx. */
-		redist_all_to_diag(n, R, Glu_persist, Llu, grid,
-				   mv_sup_to_proc, dx_trs);
-		pzgstrs1(n, LUstruct, grid, dx_trs, 1, stat, info);
-
-		/* Update solution. */
-		for (p = 0; p < num_diag_procs; ++p) 
-		    if ( iam == diag_procs[p] )
-			for (k = p; k < nsupers; k += num_diag_procs) {
-			    lk = LBi( k, grid );
-			    ii = ilsum[lk] + (lk+1)*XK_H;
-			    knsupc = SuperSize( k );
-			    for (i = 0; i < knsupc; ++i)
-				z_add(&x_trs[i + ii], &x_trs[i + ii], 
-				      &dx_trs[i + ii]);
-			}
-		lstres = berr[j];
-		++count;
-		/* Transfer x_trs (on diagonal processes) into X
-		   (on all processes). */
-		gather_1rhs_diag_to_all(n, x_trs, Glu_persist, Llu, grid, 
-					num_diag_procs, diag_procs, diag_len,
-					X_col, temp);
-	    } else {
-		break;
-	    }
-	} /* end while */
-
-	stat->RefineSteps = count;
-
-    } /* for j ... */
-
-
-    /* Deallocate storage used by matrix-vector multiplication. */
-    SUPERLU_FREE(diag_procs);
-    SUPERLU_FREE(diag_len);
-    if ( N_update ) {
-	SUPERLU_FREE(update);
-	SUPERLU_FREE(bindx);
-	SUPERLU_FREE(val);
-    }
-    SUPERLU_FREE(mv_sup_to_proc);
-    SUPERLU_FREE(work);
-    SUPERLU_FREE(rwork);
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pzgsrfs_ABXglobal()");
-#endif
-
-} /* PZGSRFS_ABXGLOBAL */
-
-
-/*
- * r[] is the residual vector distributed the same way as
- * matrix-vector product.
- */
-static void
-redist_all_to_diag(int_t n, doublecomplex r[], Glu_persist_t *Glu_persist,
-		   LocalLU_t *Llu, gridinfo_t *grid, int_t mv_sup_to_proc[],
-		   doublecomplex work[])
-{
-    int_t i, ii, k, lk, lr, nsupers;
-    int_t *ilsum, *xsup;
-    int iam, knsupc, psrc, pkk;
-    MPI_Status status;
-    
-    iam = grid->iam;
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    ilsum = Llu->ilsum;
-    lr = 0;
-
-    for (k = 0; k < nsupers; ++k) {
-	pkk = PNUM( PROW( k, grid ), PCOL( k, grid ), grid );
-	psrc = mv_sup_to_proc[k];
-	knsupc = SuperSize( k );
-	lk = LBi( k, grid );
-	ii = ilsum[lk] + (lk+1)*XK_H;
-	if ( iam == psrc ) {
-	    if ( iam != pkk ) { /* Send X component. */
-		MPI_Send( &r[lr], knsupc, SuperLU_MPI_DOUBLE_COMPLEX, pkk, Xk,
-			 grid->comm );
-	    } else { /* Local copy. */
-		for (i = 0; i < knsupc; ++i)
-		    work[i + ii] = r[i + lr];
-	    }
-	    lr += knsupc;
-	} else {
-	    if ( iam == pkk ) { /* Recv X component. */
-		MPI_Recv( &work[ii], knsupc, SuperLU_MPI_DOUBLE_COMPLEX, psrc, Xk,
-			 grid->comm, &status );
-	    }
-	}
-    }
-} /* REDIST_ALL_TO_DIAG */
-
-
-/*
- * Gather the components of x vector on the diagonal processes
- * onto all processes, and combine them into the global vector y.
- */
-static void
-gather_1rhs_diag_to_all(int_t n, doublecomplex x[],
-			Glu_persist_t *Glu_persist, LocalLU_t *Llu,
-			gridinfo_t *grid, int_t num_diag_procs,
-			int_t diag_procs[], int_t diag_len[],
-			doublecomplex y[], doublecomplex work[])
-{
-    int_t i, ii, k, lk, lwork, nsupers, p;
-    int_t *ilsum, *xsup;
-    int iam, knsupc, pkk;
-    
-    iam = grid->iam;
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    ilsum = Llu->ilsum;
-
-    for (p = 0; p < num_diag_procs; ++p) {
-	pkk = diag_procs[p];
-	if ( iam == pkk ) {
-	    /* Copy x vector into a buffer. */
-	    lwork = 0;
-	    for (k = p; k < nsupers; k += num_diag_procs) {
-		knsupc = SuperSize( k );
-		lk = LBi( k, grid );
-		ii = ilsum[lk] + (lk+1)*XK_H;
-		for (i = 0; i < knsupc; ++i) work[i+lwork] = x[i+ii];
-		lwork += knsupc;
-	    }
-	    MPI_Bcast( work, lwork, SuperLU_MPI_DOUBLE_COMPLEX, pkk, grid->comm );
-	} else {
-	    MPI_Bcast( work, diag_len[p], SuperLU_MPI_DOUBLE_COMPLEX, pkk, grid->comm );
-	}
-	/* Scatter work[] into global y vector. */
-	lwork = 0;
-	for (k = p; k < nsupers; k += num_diag_procs) {
-	    knsupc = SuperSize( k );
-	    ii = FstBlockC( k );
-	    for (i = 0; i < knsupc; ++i) y[i+ii] = work[i+lwork];
-	    lwork += knsupc;
-	}
-    }
-} /* GATHER_1RHS_DIAG_TO_ALL */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsrfs_ABXglobal.c.old hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsrfs_ABXglobal.c.old
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsrfs_ABXglobal.c.old	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsrfs_ABXglobal.c.old	1970-01-01 00:00:00.000000000 +0000
@@ -1,439 +0,0 @@
-
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-
-#include <math.h>
-#include "superlu_zdefs.h"
-
-/*-- Function prototypes --*/
-static void gather_1rhs_diag_to_all(int_t, doublecomplex [], Glu_persist_t *,
-                                    LocalLU_t *, gridinfo_t *, int_t, int_t [],
-				    int_t [], doublecomplex [], doublecomplex []);
-static void redist_all_to_diag(int_t, doublecomplex [], Glu_persist_t *,
-                               LocalLU_t *, gridinfo_t *, int_t [], doublecomplex []);
-
-void
-pzgsrfs_ABXglobal(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct,
-		  gridinfo_t *grid, doublecomplex *B, int_t ldb, doublecomplex *X, int_t ldx,
-		  int nrhs, double *berr, SuperLUStat_t *stat, int *info)
-{
-/* 
- * Purpose
- * =======
- *
- * pzgsrfs_ABXglobal improves the computed solution to a system of linear   
- * equations and provides error bounds and backward error estimates
- * for the solution. 
- *
- * Arguments
- * =========
- *
- * n      (input) int (global)
- *        The order of the system of linear equations.
- *
- * A      (input) SuperMatrix*
- *	  The original matrix A, or the scaled A if equilibration was done.
- *        A is also permuted into the form Pc*Pr*A*Pc', where Pr and Pc
- *        are permutation matrices. The type of A can be:
- *        Stype = SLU_NCP; Dtype = SLU_Z; Mtype = SLU_GE.
- *
- *        NOTE: Currently, A must reside in all processes when calling
- *              this routine.
- *
- * anorm  (input) double
- *        The norm of the original matrix A, or the scaled A if
- *        equilibration was done.
- *
- * LUstruct (input) LUstruct_t*
- *        The distributed data structures storing L and U factors.
- *        The L and U factors are obtained from pzgstrf for
- *        the possibly scaled and permuted matrix A.
- *        See superlu_ddefs.h for the definition of 'LUstruct_t'.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh. It contains the MPI communicator, the number
- *        of process rows (NPROW), the number of process columns (NPCOL),
- *        and my process rank. It is an input argument to all the
- *        parallel routines.
- *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
- *        See superlu_ddefs.h for the definition of 'gridinfo_t'.
- *
- * B      (input) doublecomplex* (global)
- *        The N-by-NRHS right-hand side matrix of the possibly equilibrated
- *        and row permuted system.
- *       
- *        NOTE: Currently, B must reside on all processes when calling
- *              this routine.
- *
- * ldb    (input) int (global)
- *        Leading dimension of matrix B.
- *
- * X      (input/output) doublecomplex* (global)
- *        On entry, the solution matrix X, as computed by PZGSTRS.
- *        On exit, the improved solution matrix X.
- *        If DiagScale = COL or BOTH, X should be premultiplied by diag(C)
- *        in order to obtain the solution to the original system.
- *
- *        NOTE: Currently, X must reside on all processes when calling
- *              this routine.
- *
- * ldx    (input) int (global)
- *        Leading dimension of matrix X.
- *
- * nrhs   (input) int
- *        Number of right-hand sides.
- *
- * berr   (output) double*, dimension (nrhs)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
- *         any element of A or B that makes X(j) an exact solution).
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics about the refinement steps.
- *        See util.h for the definition of SuperLUStat_t.
- *
- * info   (output) int*
- *        = 0: successful exit
- *        < 0: if info = -i, the i-th argument had an illegal value
- *        
- * Internal Parameters   
- * ===================   
- *
- * ITMAX is the maximum number of steps of iterative refinement.   
- *
- */
-
-#define ITMAX 20
-    
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-    /* 
-     * Data structures used by matrix-vector multiply routine.
-     */
-    int_t  N_update; /* Number of variables updated on this process */
-    int_t  *update;  /* vector elements (global index) updated 
-			on this processor.                     */
-    int_t  *bindx;
-    doublecomplex *val;
-    int_t *mv_sup_to_proc;  /* Supernode to process mapping in
-			       matrix-vector multiply.  */
-    /*-- end data structures for matrix-vector multiply --*/
-    doublecomplex *b, *ax, *R, *B_col, *temp, *work, *X_col,
-           *x_trs, *dx_trs;
-    double *rwork;
-    int_t notran;
-    int_t count, ii, j, jj, k, knsupc, lk, lwork,
-          nprow, nsupers, nz, p;
-    int   i, iam, pkk;
-    int_t *ilsum, *xsup;
-    double eps, lstres;
-    double s, safmin, safe1, safe2;
-
-    /* NEW STUFF */
-    int_t num_diag_procs, *diag_procs; /* Record diagonal process numbers. */
-    int_t *diag_len; /* Length of the X vector on diagonal processes. */
-
-    /*-- Function prototypes --*/
-    extern void pzgstrs1(int_t, LUstruct_t *, gridinfo_t *,
-			 doublecomplex *, int, SuperLUStat_t *, int *);
-    extern double dlamch_(char *);
-    
-    /* Test the input parameters. */
-    *info = 0;
-    if ( n < 0 ) *info = -1;
-    else if ( A->nrow != A->ncol || A->nrow < 0 ||
-	      A->Stype != SLU_NCP || A->Dtype != SLU_Z || A->Mtype != SLU_GE )
-	*info = -2;
-    else if ( ldb < SUPERLU_MAX(0, n) ) *info = -10;
-    else if ( ldx < SUPERLU_MAX(0, n) )	*info = -12;
-    else if ( nrhs < 0 ) *info = -13;
-    if (*info != 0) {
-	i = -(*info);
-	xerbla_("pzgsrfs_ABXglobal", &i);
-	return;
-    }
-
-    /* Quick return if possible. */
-    if ( n == 0 || nrhs == 0 ) {
-	return;
-    }
-
-    /* Initialization. */
-    iam = grid->iam;
-    nprow = grid->nprow;
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    ilsum = Llu->ilsum;
-    notran = 1;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pzgsrfs_ABXglobal()");
-#endif
-
-    get_diag_procs(n, Glu_persist, grid, &num_diag_procs,
-		   &diag_procs, &diag_len);
-#if ( PRNTlevel>=1 )
-    if ( !iam ) {
-	printf(".. number of diag processes = %d\n", num_diag_procs);
-	PrintInt10("diag_procs", num_diag_procs, diag_procs);
-	PrintInt10("diag_len", num_diag_procs, diag_len);
-    }
-#endif
-
-    if ( !(mv_sup_to_proc = intCalloc_dist(nsupers)) )
-	ABORT("Calloc fails for mv_sup_to_proc[]");
-
-    pzgsmv_AXglobal_setup(A, Glu_persist, grid, &N_update, &update,
-		          &val, &bindx, mv_sup_to_proc);
-
-    i = CEILING( nsupers, nprow ); /* Number of local block rows */
-    ii = Llu->ldalsum + i * XK_H;
-    k = SUPERLU_MAX(N_update, sp_ienv_dist(3));
-    jj = diag_len[0];
-    for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] );
-    jj = SUPERLU_MAX( jj, N_update );
-    lwork = N_update         /* For ax and R */
-	  + ii               /* For dx_trs */
-	  + ii               /* For x_trs */
-          + k                /* For b */
-	  + jj;              /* for temp */
-    if ( !(work = doublecomplexMalloc_dist(lwork)) )
-	ABORT("Malloc fails for work[]");
-    ax = R = work;
-    dx_trs = work + N_update;
-    x_trs  = dx_trs + ii;
-    b      = x_trs + ii;
-    temp   = b + k;
-    if ( !(rwork = SUPERLU_MALLOC(N_update * sizeof(double))) )
-	ABORT("Malloc fails for rwork[]");
-
-#if ( DEBUGlevel>=2 )
-    {
-	doublecomplex *dwork = doublecomplexMalloc_dist(n);
-	for (i = 0; i < n; ++i) {
-	    if ( i & 1 ) dwork[i].r = 1.;
-	    else dwork[i].r = 2.;
-	    dwork[i].i = 0.;
-        }
-	/* Check correctness of matrix-vector multiply. */
-	pzgsmv_AXglobal(N_update, update, val, bindx, dwork, ax);
-	PrintDouble5("Mult A*x", N_update, ax);
-	SUPERLU_FREE(dwork);
-    }
-#endif
-
-
-    /* NZ = maximum number of nonzero elements in each row of A, plus 1 */
-    nz     = A->ncol + 1;
-    eps    = dlamch_("Epsilon");
-    safmin = dlamch_("Safe minimum");
-    safe1  = nz * safmin;
-    safe2  = safe1 / eps;
-
-#if ( DEBUGlevel>=1 )
-    if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n",
-		       eps, anorm, safe1, safe2);
-#endif
-
-    /* Do for each right-hand side ... */
-    for (j = 0; j < nrhs; ++j) {
-	count = 0;
-	lstres = 3.;
-
-	/* Copy X into x on the diagonal processes. */
-	B_col = &B[j*ldb];
-	X_col = &X[j*ldx];
-	for (p = 0; p < num_diag_procs; ++p) {
-	    pkk = diag_procs[p];
-	    if ( iam == pkk ) {
-		for (k = p; k < nsupers; k += num_diag_procs) {
-		    knsupc = SuperSize( k );
-		    lk = LBi( k, grid );
-		    ii = ilsum[lk] + (lk+1)*XK_H;
-		    jj = FstBlockC( k );
-		    for (i = 0; i < knsupc; ++i) x_trs[i+ii] = X_col[i+jj];
-		    dx_trs[ii-XK_H].r = k;/* Block number prepended in header. */
-		}
-	    }
-	}
-	/* Copy B into b distributed the same way as matrix-vector product. */
-        if ( N_update ) ii = update[0];
-	for (i = 0; i < N_update; ++i) b[i] = B_col[i + ii];
-
-	while (1) { /* Loop until stopping criterion is satisfied. */
-
-	    /* Compute residual R = B - op(A) * X,   
-	       where op(A) = A, A**T, or A**H, depending on TRANS. */
-
-	    /* Matrix-vector multiply. */
-	    pzgsmv_AXglobal(N_update, update, val, bindx, X_col, ax);
-	    
-	    /* Compute residual. */
-	    for (i = 0; i < N_update; ++i) z_sub(&R[i], &b[i], &ax[i]);
-
-	    /* Compute abs(op(A))*abs(X) + abs(B). */
-	    pzgsmv_AXglobal_abs(N_update, update, val, bindx, X_col, rwork);
-	    for (i = 0; i < N_update; ++i) rwork[i] += z_abs1(&b[i]);
-	    
-	    s = 0.0;
-	    for (i = 0; i < N_update; ++i) {
-		if ( rwork[i] > safe2 )
-		    s = SUPERLU_MAX(s, z_abs1(&R[i]) / rwork[i]);
-		else
-		    s = SUPERLU_MAX(s, (z_abs1(&R[i])+safe1)/(rwork[i]+safe1));
-	    }
-	    MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm );
-		
-#if ( PRNTlevel>= 1 )
-	    if ( !iam )
-		printf("(%2d) .. Step %2d: berr[j] = %e\n", iam, count, berr[j]);
-#endif
-	    if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) {
-		/* Compute new dx. */
-		redist_all_to_diag(n, R, Glu_persist, Llu, grid,
-				   mv_sup_to_proc, dx_trs);
-		pzgstrs1(n, LUstruct, grid, dx_trs, 1, stat, info);
-
-		/* Update solution. */
-		for (p = 0; p < num_diag_procs; ++p) 
-		    if ( iam == diag_procs[p] )
-			for (k = p; k < nsupers; k += num_diag_procs) {
-			    lk = LBi( k, grid );
-			    ii = ilsum[lk] + (lk+1)*XK_H;
-			    knsupc = SuperSize( k );
-			    for (i = 0; i < knsupc; ++i)
-				z_add(&x_trs[i + ii], &x_trs[i + ii], 
-				      &dx_trs[i + ii]);
-			}
-		lstres = berr[j];
-		++count;
-		/* Transfer x_trs (on diagonal processes) into X
-		   (on all processes). */
-		gather_1rhs_diag_to_all(n, x_trs, Glu_persist, Llu, grid, 
-					num_diag_procs, diag_procs, diag_len,
-					X_col, temp);
-	    } else {
-		break;
-	    }
-	} /* end while */
-
-	stat->RefineSteps = count;
-
-    } /* for j ... */
-
-
-    /* Deallocate storage used by matrix-vector multiplication. */
-    SUPERLU_FREE(diag_procs);
-    SUPERLU_FREE(diag_len);
-    if ( N_update ) {
-	SUPERLU_FREE(update);
-	SUPERLU_FREE(bindx);
-	SUPERLU_FREE(val);
-    }
-    SUPERLU_FREE(mv_sup_to_proc);
-    SUPERLU_FREE(work);
-    SUPERLU_FREE(rwork);
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pzgsrfs_ABXglobal()");
-#endif
-
-} /* PZGSRFS_ABXGLOBAL */
-
-
-/*
- * r[] is the residual vector distributed the same way as
- * matrix-vector product.
- */
-static void
-redist_all_to_diag(int_t n, doublecomplex r[], Glu_persist_t *Glu_persist,
-		   LocalLU_t *Llu, gridinfo_t *grid, int_t mv_sup_to_proc[],
-		   doublecomplex work[])
-{
-    int_t i, ii, k, lk, lr, nsupers;
-    int_t *ilsum, *xsup;
-    int iam, knsupc, psrc, pkk;
-    MPI_Status status;
-    
-    iam = grid->iam;
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    ilsum = Llu->ilsum;
-    lr = 0;
-
-    for (k = 0; k < nsupers; ++k) {
-	pkk = PNUM( PROW( k, grid ), PCOL( k, grid ), grid );
-	psrc = mv_sup_to_proc[k];
-	knsupc = SuperSize( k );
-	lk = LBi( k, grid );
-	ii = ilsum[lk] + (lk+1)*XK_H;
-	if ( iam == psrc ) {
-	    if ( iam != pkk ) { /* Send X component. */
-		MPI_Send( &r[lr], knsupc, SuperLU_MPI_DOUBLE_COMPLEX, pkk, Xk,
-			 grid->comm );
-	    } else { /* Local copy. */
-		for (i = 0; i < knsupc; ++i)
-		    work[i + ii] = r[i + lr];
-	    }
-	    lr += knsupc;
-	} else {
-	    if ( iam == pkk ) { /* Recv X component. */
-		MPI_Recv( &work[ii], knsupc, SuperLU_MPI_DOUBLE_COMPLEX, psrc, Xk,
-			 grid->comm, &status );
-	    }
-	}
-    }
-} /* REDIST_ALL_TO_DIAG */
-
-
-/*
- * Gather the components of x vector on the diagonal processes
- * onto all processes, and combine them into the global vector y.
- */
-static void
-gather_1rhs_diag_to_all(int_t n, doublecomplex x[],
-			Glu_persist_t *Glu_persist, LocalLU_t *Llu,
-			gridinfo_t *grid, int_t num_diag_procs,
-			int_t diag_procs[], int_t diag_len[],
-			doublecomplex y[], doublecomplex work[])
-{
-    int_t i, ii, k, lk, lwork, nsupers, p;
-    int_t *ilsum, *xsup;
-    int iam, knsupc, pkk;
-    
-    iam = grid->iam;
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    ilsum = Llu->ilsum;
-
-    for (p = 0; p < num_diag_procs; ++p) {
-	pkk = diag_procs[p];
-	if ( iam == pkk ) {
-	    /* Copy x vector into a buffer. */
-	    lwork = 0;
-	    for (k = p; k < nsupers; k += num_diag_procs) {
-		knsupc = SuperSize( k );
-		lk = LBi( k, grid );
-		ii = ilsum[lk] + (lk+1)*XK_H;
-		for (i = 0; i < knsupc; ++i) work[i+lwork] = x[i+ii];
-		lwork += knsupc;
-	    }
-	    MPI_Bcast( work, lwork, SuperLU_MPI_DOUBLE_COMPLEX, pkk, grid->comm );
-	} else {
-	    MPI_Bcast( work, diag_len[p], SuperLU_MPI_DOUBLE_COMPLEX, pkk, grid->comm );
-	}
-	/* Scatter work[] into global y vector. */
-	lwork = 0;
-	for (k = p; k < nsupers; k += num_diag_procs) {
-	    knsupc = SuperSize( k );
-	    ii = FstBlockC( k );
-	    for (i = 0; i < knsupc; ++i) y[i+ii] = work[i+lwork];
-	    lwork += knsupc;
-	}
-    }
-} /* GATHER_1RHS_DIAG_TO_ALL */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsrfs.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsrfs.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsrfs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsrfs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,246 +0,0 @@
-
-/*
- * -- Distributed SuperLU routine (version 2.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * March 15, 2003
- *
- */
-
-#include <math.h>
-#include "superlu_zdefs.h"
-
-void
-pzgsrfs(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct,
-	ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid,
-	doublecomplex *B, int_t ldb, doublecomplex *X, int_t ldx, int nrhs, 
-	SOLVEstruct_t *SOLVEstruct,
-	double *berr, SuperLUStat_t *stat, int *info)
-{
-/* 
- * Purpose
- * =======
- *
- * PZGSRFS improves the computed solution to a system of linear   
- * equations and provides error bounds and backward error estimates
- * for the solution. 
- *
- * Arguments
- * =========
- *
- * n      (input) int (global)
- *        The order of the system of linear equations.
- *
- * A      (input) SuperMatrix*
- *	  The original matrix A, or the scaled A if equilibration was done.
- *        A is also permuted into diag(R)*A*diag(C)*Pc'. The type of A can be:
- *        Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE.
- *
- * anorm  (input) double
- *        The norm of the original matrix A, or the scaled A if
- *        equilibration was done.
- *
- * LUstruct (input) LUstruct_t*
- *        The distributed data structures storing L and U factors.
- *        The L and U factors are obtained from pdgstrf for
- *        the possibly scaled and permuted matrix A.
- *        See superlu_zdefs.h for the definition of 'LUstruct_t'.
- *
- * ScalePermstruct (input) ScalePermstruct_t* (global)
- *         The data structure to store the scaling and permutation vectors
- *         describing the transformations performed to the matrix A.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh. It contains the MPI communicator, the number
- *        of process rows (NPROW), the number of process columns (NPCOL),
- *        and my process rank. It is an input argument to all the
- *        parallel routines.
- *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
- *        See superlu_defs.h for the definition of 'gridinfo_t'.
- *
- * B      (input) doublecomplex* (local)
- *        The m_loc-by-NRHS right-hand side matrix of the possibly
- *        equilibrated system. That is, B may be overwritten by diag(R)*B.
- *       
- * ldb    (input) int (local)
- *        Leading dimension of matrix B.
- *
- * X      (input/output) doublecomplex* (local)
- *        On entry, the solution matrix Y, as computed by PDGSTRS, of the
- *            transformed system A1*Y = Pc*Pr*B. where
- *            A1 = Pc*Pr*diag(R)*A*diag(C)*Pc' and Y = Pc*diag(C)^(-1)*X.
- *        On exit, the improved solution matrix Y.
- *
- *        In order to obtain the solution X to the original system,
- *        Y should be permutated by Pc^T, and premultiplied by diag(C)
- *        if DiagScale = COL or BOTH.
- *        This must be done after this routine is called.
- *
- * ldx    (input) int (local)
- *        Leading dimension of matrix X.
- *
- * nrhs   (input) int
- *        Number of right-hand sides.
- *
- * SOLVEstruct (output) SOLVEstruct_t* (global)
- *        Contains the information for the communication during the
- *        solution phase.
- *
- * berr   (output) double*, dimension (nrhs)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
- *         any element of A or B that makes X(j) an exact solution).
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics about the refinement steps.
- *        See util.h for the definition of SuperLUStat_t.
- *
- * info   (output) int*
- *        = 0: successful exit
- *        < 0: if info = -i, the i-th argument had an illegal value
- *        
- * Internal Parameters   
- * ===================   
- *
- * ITMAX is the maximum number of steps of iterative refinement.   
- *
- */
-
-#define ITMAX 20
-    
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-    doublecomplex *ax, *R, *dx, *temp, *work, *B_col, *X_col;
-    double *rtemp;
-    int_t count, i, j, lwork, nz;
-    int   iam;
-    double eps, lstres;
-    double s, safmin, safe1, safe2;
-
-    /* Data structures used by matrix-vector multiply routine. */
-    pzgsmv_comm_t *gsmv_comm = SOLVEstruct->gsmv_comm;
-    NRformat_loc *Astore;
-    int_t        m_loc, fst_row;
-
-
-    /* Initialization. */
-    Astore = (NRformat_loc *) A->Store;
-    m_loc = Astore->m_loc;
-    fst_row = Astore->fst_row;
-    iam = grid->iam;
-
-    /* Test the input parameters. */
-    *info = 0;
-    if ( n < 0 ) *info = -1;
-    else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
-	      || A->Dtype != SLU_Z || A->Mtype != SLU_GE )
-	*info = -2;
-    else if ( ldb < SUPERLU_MAX(0, m_loc) ) *info = -10;
-    else if ( ldx < SUPERLU_MAX(0, m_loc) ) *info = -12;
-    else if ( nrhs < 0 ) *info = -13;
-    if (*info != 0) {
-	i = -(*info);
-	pxerbla("PZGSRFS", grid, i);
-	return;
-    }
-
-    /* Quick return if possible. */
-    if ( n == 0 || nrhs == 0 ) {
-	return;
-    }
-
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pzgsrfs()");
-#endif
-
-    lwork = 2 * m_loc;  /* For ax/R/dx and temp */
-    if ( !(work = doublecomplexMalloc_dist(lwork)) )
-	ABORT("Malloc fails for work[]");
-    ax = R = dx = work;
-    temp = ax + m_loc;
-    rtemp = (double *) temp;
-
-    /* NZ = maximum number of nonzero elements in each row of A, plus 1 */
-    nz     = A->ncol + 1;
-    eps    = dlamch_("Epsilon");
-    safmin = dlamch_("Safe minimum");
-
-    /* Set SAFE1 essentially to be the underflow threshold times the
-       number of additions in each row. */
-    safe1  = nz * safmin;
-    safe2  = safe1 / eps;
-
-#if ( DEBUGlevel>=1 )
-    if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n",
-		       eps, anorm, safe1, safe2);
-#endif
-
-    /* Do for each right-hand side ... */
-    for (j = 0; j < nrhs; ++j) {
-	count = 0;
-	lstres = 3.;
-	B_col = &B[j*ldb];
-	X_col = &X[j*ldx];
-
-	while (1) { /* Loop until stopping criterion is satisfied. */
-
-	    /* Compute residual R = B - op(A) * X,   
-	       where op(A) = A, A**T, or A**H, depending on TRANS. */
-
-	    /* Matrix-vector multiply. */
-	    pzgsmv(0, A, grid, gsmv_comm, X_col, ax);
-	    
-	    /* Compute residual, stored in R[]. */
-	    for (i = 0; i < m_loc; ++i) z_sub(&R[i], &B_col[i], &ax[i]);
-
-	    /* Compute abs(op(A))*abs(X) + abs(B), stored in temp[]. */
-	    pzgsmv(1, A, grid, gsmv_comm, X_col, temp);
-            /* NOTE: rtemp is aliased to temp */
-	    for (i = 0; i < m_loc; ++i) rtemp[i] += z_abs1(&B_col[i]);
-	    
-	    s = 0.0;
-	    for (i = 0; i < m_loc; ++i) {
-		if ( rtemp[i] > safe2 ) {
-		    s = SUPERLU_MAX(s, z_abs1(&R[i]) / rtemp[i]);
-		} else if ( rtemp[i] != 0.0 ) {
-		    s = SUPERLU_MAX(s, (safe1 + z_abs1(&R[i])) / rtemp[i]);
-                }
-                /* If temp[i] is exactly 0.0 (computed by PxGSMV), then
-                   we know the true residual also must be exactly 0.0. */
-	    }
-	    MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm );
-		
-#if ( PRNTlevel>= 1 )
-	    if ( !iam )
-		printf("(%2d) .. Step %2d: berr[j] = %e\n", iam, count, berr[j]);
-#endif
-	    if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) {
-		/* Compute new dx. */
-		pzgstrs(n, LUstruct, ScalePermstruct, grid,
-			dx, m_loc, fst_row, m_loc, 1, 
-			SOLVEstruct, stat, info);
-
-		/* Update solution. */
-		for (i = 0; i < m_loc; ++i)
-                    z_add(&X_col[i], &X_col[i], &dx[i]);
-
-		lstres = berr[j];
-		++count;
-	    } else {
-		break;
-	    }
-	} /* end while */
-
-	stat->RefineSteps = count;
-
-    } /* for j ... */
-
-    /* Deallocate storage. */
-    SUPERLU_FREE(work);
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pzgsrfs()");
-#endif
-
-} /* PZGSRFS */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsrfs.c.old hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsrfs.c.old
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgsrfs.c.old	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgsrfs.c.old	1970-01-01 00:00:00.000000000 +0000
@@ -1,240 +0,0 @@
-
-/*
- * -- Distributed SuperLU routine (version 2.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * March 15, 2003
- *
- */
-
-#include <math.h>
-#include "superlu_zdefs.h"
-
-void
-pzgsrfs(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct,
-	ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid,
-	doublecomplex *B, int_t ldb, doublecomplex *X, int_t ldx, int nrhs, 
-	SOLVEstruct_t *SOLVEstruct,
-	double *berr, SuperLUStat_t *stat, int *info)
-{
-/* 
- * Purpose
- * =======
- *
- * PZGSRFS improves the computed solution to a system of linear   
- * equations and provides error bounds and backward error estimates
- * for the solution. 
- *
- * Arguments
- * =========
- *
- * n      (input) int (global)
- *        The order of the system of linear equations.
- *
- * A      (input) SuperMatrix*
- *	  The original matrix A, or the scaled A if equilibration was done.
- *        A is also permuted into diag(R)*A*diag(C)*Pc'. The type of A can be:
- *        Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE.
- *
- * anorm  (input) double
- *        The norm of the original matrix A, or the scaled A if
- *        equilibration was done.
- *
- * LUstruct (input) LUstruct_t*
- *        The distributed data structures storing L and U factors.
- *        The L and U factors are obtained from pdgstrf for
- *        the possibly scaled and permuted matrix A.
- *        See superlu_zdefs.h for the definition of 'LUstruct_t'.
- *
- * ScalePermstruct (input) ScalePermstruct_t* (global)
- *         The data structure to store the scaling and permutation vectors
- *         describing the transformations performed to the matrix A.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh. It contains the MPI communicator, the number
- *        of process rows (NPROW), the number of process columns (NPCOL),
- *        and my process rank. It is an input argument to all the
- *        parallel routines.
- *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
- *        See superlu_defs.h for the definition of 'gridinfo_t'.
- *
- * B      (input) doublecomplex* (local)
- *        The m_loc-by-NRHS right-hand side matrix of the possibly
- *        equilibrated system. That is, B may be overwritten by diag(R)*B.
- *       
- * ldb    (input) int (local)
- *        Leading dimension of matrix B.
- *
- * X      (input/output) doublecomplex* (local)
- *        On entry, the solution matrix Y, as computed by PDGSTRS, of the
- *            transformed system A1*Y = Pc*Pr*B. where
- *            A1 = Pc*Pr*diag(R)*A*diag(C)*Pc' and Y = Pc*diag(C)^(-1)*X.
- *        On exit, the improved solution matrix Y.
- *
- *        In order to obtain the solution X to the original system,
- *        Y should be permutated by Pc^T, and premultiplied by diag(C)
- *        if DiagScale = COL or BOTH.
- *        This must be done after this routine is called.
- *
- * ldx    (input) int (local)
- *        Leading dimension of matrix X.
- *
- * nrhs   (input) int
- *        Number of right-hand sides.
- *
- * SOLVEstruct (output) SOLVEstruct_t* (global)
- *        Contains the information for the communication during the
- *        solution phase.
- *
- * berr   (output) double*, dimension (nrhs)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
- *         any element of A or B that makes X(j) an exact solution).
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics about the refinement steps.
- *        See util.h for the definition of SuperLUStat_t.
- *
- * info   (output) int*
- *        = 0: successful exit
- *        < 0: if info = -i, the i-th argument had an illegal value
- *        
- * Internal Parameters   
- * ===================   
- *
- * ITMAX is the maximum number of steps of iterative refinement.   
- *
- */
-
-#define ITMAX 20
-    
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-    doublecomplex *ax, *R, *dx, *temp, *work, *B_col, *X_col;
-    double *rtemp;
-    int_t count, i, j, lwork, nz;
-    int   iam;
-    double eps, lstres;
-    double s, safmin, safe1, safe2;
-
-    /* Data structures used by matrix-vector multiply routine. */
-    pzgsmv_comm_t *gsmv_comm = SOLVEstruct->gsmv_comm;
-    NRformat_loc *Astore;
-    int_t        m_loc, fst_row;
-
-
-    /* Initialization. */
-    Astore = (NRformat_loc *) A->Store;
-    m_loc = Astore->m_loc;
-    fst_row = Astore->fst_row;
-    iam = grid->iam;
-
-    /* Test the input parameters. */
-    *info = 0;
-    if ( n < 0 ) *info = -1;
-    else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
-	      || A->Dtype != SLU_Z || A->Mtype != SLU_GE )
-	*info = -2;
-    else if ( ldb < SUPERLU_MAX(0, m_loc) ) *info = -10;
-    else if ( ldx < SUPERLU_MAX(0, m_loc) ) *info = -12;
-    else if ( nrhs < 0 ) *info = -13;
-    if (*info != 0) {
-	i = -(*info);
-	pxerbla("PZGSRFS", grid, i);
-	return;
-    }
-
-    /* Quick return if possible. */
-    if ( n == 0 || nrhs == 0 ) {
-	return;
-    }
-
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pzgsrfs()");
-#endif
-
-    lwork = 2 * m_loc;  /* For ax/R/dx and temp */
-    if ( !(work = doublecomplexMalloc_dist(lwork)) )
-	ABORT("Malloc fails for work[]");
-    ax = R = dx = work;
-    temp = ax + m_loc;
-    rtemp = (double *) temp;
-
-    /* NZ = maximum number of nonzero elements in each row of A, plus 1 */
-    nz     = A->ncol + 1;
-    eps    = dlamch_("Epsilon");
-    safmin = dlamch_("Safe minimum");
-    safe1  = nz * safmin;
-    safe2  = safe1 / eps;
-
-#if ( DEBUGlevel>=1 )
-    if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n",
-		       eps, anorm, safe1, safe2);
-#endif
-
-    /* Do for each right-hand side ... */
-    for (j = 0; j < nrhs; ++j) {
-	count = 0;
-	lstres = 3.;
-	B_col = &B[j*ldb];
-	X_col = &X[j*ldx];
-
-	while (1) { /* Loop until stopping criterion is satisfied. */
-
-	    /* Compute residual R = B - op(A) * X,   
-	       where op(A) = A, A**T, or A**H, depending on TRANS. */
-
-	    /* Matrix-vector multiply. */
-	    pzgsmv(0, A, grid, gsmv_comm, X_col, ax);
-	    
-	    /* Compute residual, stored in R[]. */
-	    for (i = 0; i < m_loc; ++i) z_sub(&R[i], &B_col[i], &ax[i]);
-
-	    /* Compute abs(op(A))*abs(X) + abs(B), stored in temp[]. */
-	    pzgsmv(1, A, grid, gsmv_comm, X_col, temp);
-            /* NOTE: rtemp is aliased to temp */
-	    for (i = 0; i < m_loc; ++i) rtemp[i] += z_abs1(&B_col[i]);
-	    
-	    s = 0.0;
-	    for (i = 0; i < m_loc; ++i) {
-		if ( rtemp[i] > safe2 )
-		    s = SUPERLU_MAX(s, z_abs1(&R[i]) / rtemp[i]);
-		else
-		    s = SUPERLU_MAX(s, (z_abs1(&R[i])+safe1)/(rtemp[i]+safe1));
-	    }
-	    MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm );
-		
-#if ( PRNTlevel>= 1 )
-	    if ( !iam )
-		printf("(%2d) .. Step %2d: berr[j] = %e\n", iam, count, berr[j]);
-#endif
-	    if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) {
-		/* Compute new dx. */
-		pzgstrs(n, LUstruct, ScalePermstruct, grid,
-			dx, m_loc, fst_row, m_loc, 1, 
-			SOLVEstruct, stat, info);
-
-		/* Update solution. */
-		for (i = 0; i < m_loc; ++i)
-                    z_add(&X_col[i], &X_col[i], &dx[i]);
-
-		lstres = berr[j];
-		++count;
-	    } else {
-		break;
-	    }
-	} /* end while */
-
-	stat->RefineSteps = count;
-
-    } /* for j ... */
-
-    /* Deallocate storage. */
-    SUPERLU_FREE(work);
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pzgsrfs()");
-#endif
-
-} /* PZGSRFS */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgssvx_ABglobal.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgssvx_ABglobal.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgssvx_ABglobal.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgssvx_ABglobal.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,1064 +0,0 @@
-
-#include <math.h>
-#include "superlu_zdefs.h"
-
-
-void
-pzgssvx_ABglobal(superlu_options_t *options, SuperMatrix *A, 
-		 ScalePermstruct_t *ScalePermstruct,
-		 doublecomplex B[], int ldb, int nrhs, gridinfo_t *grid,
-		 LUstruct_t *LUstruct, double *berr,
-		 SuperLUStat_t *stat, int *info)
-{
-/* 
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- *
- * Purpose
- * =======
- *
- * pzgssvx_ABglobal solves a system of linear equations A*X=B,
- * by using Gaussian elimination with "static pivoting" to
- * compute the LU factorization of A.
- *
- * Static pivoting is a technique that combines the numerical stability
- * of partial pivoting with the scalability of Cholesky (no pivoting),
- * to run accurately and efficiently on large numbers of processors.
- *
- * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
- * description of the parallel algorithms.
- *
- * Here are the options for using this code:
- *
- *   1. Independent of all the other options specified below, the
- *      user must supply
- *
- *      -  B, the matrix of right hand sides, and its dimensions ldb and nrhs
- *      -  grid, a structure describing the 2D processor mesh
- *      -  options->IterRefine, which determines whether or not to
- *            improve the accuracy of the computed solution using 
- *            iterative refinement
- *
- *      On output, B is overwritten with the solution X.
- *
- *   2. Depending on options->Fact, the user has several options
- *      for solving A*X=B. The standard option is for factoring
- *      A "from scratch". (The other options, described below,
- *      are used when A is sufficiently similar to a previously 
- *      solved problem to save time by reusing part or all of 
- *      the previous factorization.)
- *
- *      -  options->Fact = DOFACT: A is factored "from scratch"
- *
- *      In this case the user must also supply
- *
- *      -  A, the input matrix
- *
- *      as well as the following options, which are described in more 
- *      detail below:
- *
- *      -  options->Equil,   to specify how to scale the rows and columns
- *                           of A to "equilibrate" it (to try to reduce its
- *                           condition number and so improve the
- *                           accuracy of the computed solution)
- *
- *      -  options->RowPerm, to specify how to permute the rows of A
- *                           (typically to control numerical stability)
- *
- *      -  options->ColPerm, to specify how to permute the columns of A
- *                           (typically to control fill-in and enhance
- *                           parallelism during factorization)
- *
- *      -  options->ReplaceTinyPivot, to specify how to deal with tiny
- *                           pivots encountered during factorization
- *                           (to control numerical stability)
- *
- *      The outputs returned include
- *         
- *      -  ScalePermstruct,  modified to describe how the input matrix A
- *                           was equilibrated and permuted:
- *         -  ScalePermstruct->DiagScale, indicates whether the rows and/or
- *                                        columns of A were scaled
- *         -  ScalePermstruct->R, array of row scale factors
- *         -  ScalePermstruct->C, array of column scale factors
- *         -  ScalePermstruct->perm_r, row permutation vector
- *         -  ScalePermstruct->perm_c, column permutation vector
- *
- *            (part of ScalePermstruct may also need to be supplied on input,
- *             depending on options->RowPerm and options->ColPerm as described 
- *             later).
- *
- *      -  A, the input matrix A overwritten by the scaled and permuted matrix
- *                Pc*Pr*diag(R)*A*diag(C)
- *             where 
- *                Pr and Pc are row and columns permutation matrices determined
- *                  by ScalePermstruct->perm_r and ScalePermstruct->perm_c, 
- *                  respectively, and 
- *                diag(R) and diag(C) are diagonal scaling matrices determined
- *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and 
- *                  ScalePermstruct->C
- *
- *      -  LUstruct, which contains the L and U factorization of A1 where
- *
- *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
- *
- *              (Note that A1 = Aout * Pc^T, where Aout is the matrix stored
- *               in A on output.)
- *
- *   3. The second value of options->Fact assumes that a matrix with the same
- *      sparsity pattern as A has already been factored:
- *     
- *      -  options->Fact = SamePattern: A is factored, assuming that it has
- *            the same nonzero pattern as a previously factored matrix. In this
- *            case the algorithm saves time by reusing the previously computed
- *            column permutation vector stored in ScalePermstruct->perm_c
- *            and the "elimination tree" of A stored in LUstruct->etree.
- *
- *      In this case the user must still specify the following options
- *      as before:
- *
- *      -  options->Equil
- *      -  options->RowPerm
- *      -  options->ReplaceTinyPivot
- *
- *      but not options->ColPerm, whose value is ignored. This is because the
- *      previous column permutation from ScalePermstruct->perm_c is used as
- *      input. The user must also supply 
- *
- *      -  A, the input matrix
- *      -  ScalePermstruct->perm_c, the column permutation
- *      -  LUstruct->etree, the elimination tree
- *
- *      The outputs returned include
- *         
- *      -  A, the input matrix A overwritten by the scaled and permuted matrix
- *            as described above
- *      -  ScalePermstruct,  modified to describe how the input matrix A was
- *                           equilibrated and row permuted
- *      -  LUstruct, modified to contain the new L and U factors
- *
- *   4. The third value of options->Fact assumes that a matrix B with the same
- *      sparsity pattern as A has already been factored, and where the
- *      row permutation of B can be reused for A. This is useful when A and B
- *      have similar numerical values, so that the same row permutation
- *      will make both factorizations numerically stable. This lets us reuse
- *      all of the previously computed structure of L and U.
- *
- *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
- *            assuming not only the same nonzero pattern as the previously
- *            factored matrix B, but reusing B's row permutation.
- *
- *      In this case the user must still specify the following options
- *      as before:
- *
- *      -  options->Equil
- *      -  options->ReplaceTinyPivot
- *
- *      but not options->RowPerm or options->ColPerm, whose values are ignored.
- *      This is because the permutations from ScalePermstruct->perm_r and
- *      ScalePermstruct->perm_c are used as input.
- *
- *      The user must also supply 
- *
- *      -  A, the input matrix
- *      -  ScalePermstruct->DiagScale, how the previous matrix was row and/or
- *                                     column scaled
- *      -  ScalePermstruct->R, the row scalings of the previous matrix, if any
- *      -  ScalePermstruct->C, the columns scalings of the previous matrix, 
- *                             if any
- *      -  ScalePermstruct->perm_r, the row permutation of the previous matrix
- *      -  ScalePermstruct->perm_c, the column permutation of the previous 
- *                                  matrix
- *      -  all of LUstruct, the previously computed information about L and U
- *                (the actual numerical values of L and U stored in
- *                 LUstruct->Llu are ignored)
- *
- *      The outputs returned include
- *         
- *      -  A, the input matrix A overwritten by the scaled and permuted matrix
- *            as described above
- *      -  ScalePermstruct,  modified to describe how the input matrix A was
- *                           equilibrated 
- *                  (thus ScalePermstruct->DiagScale, R and C may be modified)
- *      -  LUstruct, modified to contain the new L and U factors
- *
- *   5. The fourth and last value of options->Fact assumes that A is
- *      identical to a matrix that has already been factored on a previous 
- *      call, and reuses its entire LU factorization
- *
- *      -  options->Fact = Factored: A is identical to a previously
- *            factorized matrix, so the entire previous factorization
- *            can be reused.
- *
- *      In this case all the other options mentioned above are ignored
- *      (options->Equil, options->RowPerm, options->ColPerm, 
- *       options->ReplaceTinyPivot)
- *
- *      The user must also supply 
- *
- *      -  A, the unfactored matrix, only in the case that iterative refinment
- *            is to be done (specifically A must be the output A from 
- *            the previous call, so that it has been scaled and permuted)
- *      -  all of ScalePermstruct
- *      -  all of LUstruct, including the actual numerical values of L and U
- *
- *      all of which are unmodified on output.
- *         
- * Arguments
- * =========
- *
- * options (input) superlu_options_t*
- *         The structure defines the input parameters to control
- *         how the LU decomposition will be performed.
- *         The following fields should be defined for this structure:
- *         
- *         o Fact (fact_t)
- *           Specifies whether or not the factored form of the matrix
- *           A is supplied on entry, and if not, how the matrix A should
- *           be factorized based on the previous history.
- *
- *           = DOFACT: The matrix A will be factorized from scratch.
- *                 Inputs:  A
- *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
- *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
- *                              permuted)
- *                          all of ScalePermstruct
- *                          all of LUstruct
- *
- *           = SamePattern: the matrix A will be factorized assuming
- *             that a factorization of a matrix with the same sparsity
- *             pattern was performed prior to this one. Therefore, this
- *             factorization will reuse column permutation vector 
- *             ScalePermstruct->perm_c and the elimination tree
- *             LUstruct->etree
- *                 Inputs:  A
- *                          options->Equil, RowPerm, ReplaceTinyPivot
- *                          ScalePermstruct->perm_c
- *                          LUstruct->etree
- *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
- *                              permuted)
- *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
- *                          rest of LUstruct (GLU_persist, Llu)
- *
- *           = SamePattern_SameRowPerm: the matrix A will be factorized
- *             assuming that a factorization of a matrix with the same
- *             sparsity	pattern and similar numerical values was performed
- *             prior to this one. Therefore, this factorization will reuse
- *             both row and column scaling factors R and C, and the
- *             both row and column permutation vectors perm_r and perm_c,
- *             distributed data structure set up from the previous symbolic
- *             factorization.
- *                 Inputs:  A
- *                          options->Equil, ReplaceTinyPivot
- *                          all of ScalePermstruct
- *                          all of LUstruct
- *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
- *                              permuted)
- *                          modified LUstruct->Llu
- *           = FACTORED: the matrix A is already factored.
- *                 Inputs:  all of ScalePermstruct
- *                          all of LUstruct
- *
- *         o Equil (yes_no_t)
- *           Specifies whether to equilibrate the system.
- *           = NO:  no equilibration.
- *           = YES: scaling factors are computed to equilibrate the system:
- *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
- *                  Whether or not the system will be equilibrated depends
- *                  on the scaling of the matrix A, but if equilibration is
- *                  used, A is overwritten by diag(R)*A*diag(C) and B by
- *                  diag(R)*B.
- *
- *         o RowPerm (rowperm_t)
- *           Specifies how to permute rows of the matrix A.
- *           = NATURAL:   use the natural ordering.
- *           = LargeDiag: use the Duff/Koster algorithm to permute rows of
- *                        the original matrix to make the diagonal large
- *                        relative to the off-diagonal.
- *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
- *                        input by the user.
- *           
- *         o ColPerm (colperm_t)
- *           Specifies what type of column permutation to use to reduce fill.
- *           = NATURAL:       natural ordering.
- *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
- *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
- *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
- *         
- *         o ReplaceTinyPivot (yes_no_t)
- *           = NO:  do not modify pivots
- *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during 
- *                  LU factorization.
- *
- *         o IterRefine (IterRefine_t)
- *           Specifies how to perform iterative refinement.
- *           = NO:     no iterative refinement.
- *           = DOUBLE: accumulate residual in double precision.
- *           = EXTRA:  accumulate residual in extra precision.
- *
- *         NOTE: all options must be indentical on all processes when
- *               calling this routine.
- *
- * A (input/output) SuperMatrix*
- *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
- *         The number of linear equations is A->nrow. The type of A must be:
- *         Stype = SLU_NC; Dtype = SLU_Z; Mtype = SLU_GE. That is, A is stored in
- *         compressed column format (also known as Harwell-Boeing format).
- *         See supermatrix.h for the definition of 'SuperMatrix'.
- *         This routine only handles square A, however, the LU factorization
- *         routine pzgstrf can factorize rectangular matrices.
- *         On exit, A may be overwritten by Pc*Pr*diag(R)*A*diag(C),
- *         depending on ScalePermstruct->DiagScale, options->RowPerm and
- *         options->colpem:
- *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
- *                diag(R)*A*diag(C).
- *             if options->RowPerm != NATURAL, A is further overwritten by
- *                Pr*diag(R)*A*diag(C).
- *             if options->ColPerm != NATURAL, A is further overwritten by
- *                Pc*Pr*diag(R)*A*diag(C).
- *         If all the above condition are true, the LU decomposition is
- *         performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
- *
- *         NOTE: Currently, A must reside in all processes when calling
- *               this routine.
- *
- * ScalePermstruct (input/output) ScalePermstruct_t*
- *         The data structure to store the scaling and permutation vectors
- *         describing the transformations performed to the matrix A.
- *         It contains the following fields:
- *
- *         o DiagScale (DiagScale_t)
- *           Specifies the form of equilibration that was done.
- *           = NOEQUIL: no equilibration.
- *           = ROW:     row equilibration, i.e., A was premultiplied by
- *                      diag(R).
- *           = COL:     Column equilibration, i.e., A was postmultiplied
- *                      by diag(C).
- *           = BOTH:    both row and column equilibration, i.e., A was 
- *                      replaced by diag(R)*A*diag(C).
- *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
- *           DiagScale is an input argument; otherwise it is an output
- *           argument.
- *
- *         o perm_r (int*)
- *           Row permutation vector, which defines the permutation matrix Pr;
- *           perm_r[i] = j means row i of A is in position j in Pr*A.
- *           If options->RowPerm = MY_PERMR, or
- *           options->Fact = SamePattern_SameRowPerm, perm_r is an
- *           input argument; otherwise it is an output argument.
- *
- *         o perm_c (int*)
- *           Column permutation vector, which defines the 
- *           permutation matrix Pc; perm_c[i] = j means column i of A is 
- *           in position j in A*Pc.
- *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
- *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
- *           input argument; otherwise, it is an output argument.
- *           On exit, perm_c may be overwritten by the product of the input
- *           perm_c and a permutation that postorders the elimination tree
- *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
- *           is already in postorder.
- *
- *         o R (double*) dimension (A->nrow)
- *           The row scale factors for A.
- *           If DiagScale = ROW or BOTH, A is multiplied on the left by 
- *                          diag(R).
- *           If DiagScale = NOEQUIL or COL, R is not defined.
- *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
- *           an input argument; otherwise, R is an output argument.
- *
- *         o C (double*) dimension (A->ncol)
- *           The column scale factors for A.
- *           If DiagScale = COL or BOTH, A is multiplied on the right by 
- *                          diag(C).
- *           If DiagScale = NOEQUIL or ROW, C is not defined.
- *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
- *           an input argument; otherwise, C is an output argument.
- *         
- * B       (input/output) doublecomplex*
- *         On entry, the right-hand side matrix of dimension (A->nrow, nrhs).
- *         On exit, the solution matrix if info = 0;
- *
- *         NOTE: Currently, B must reside in all processes when calling
- *               this routine.
- *
- * ldb     (input) int (global)
- *         The leading dimension of matrix B.
- *
- * nrhs    (input) int (global)
- *         The number of right-hand sides.
- *         If nrhs = 0, only LU decomposition is performed, the forward
- *         and back substitutions are skipped.
- *
- * grid    (input) gridinfo_t*
- *         The 2D process mesh. It contains the MPI communicator, the number
- *         of process rows (NPROW), the number of process columns (NPCOL),
- *         and my process rank. It is an input argument to all the
- *         parallel routines.
- *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
- *         See superlu_zdefs.h for the definition of 'gridinfo_t'.
- *
- * LUstruct (input/output) LUstruct_t*
- *         The data structures to store the distributed L and U factors.
- *         It contains the following fields:
- *
- *         o etree (int*) dimension (A->ncol)
- *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc', dimension A->ncol.
- *           It is computed in sp_colorder() during the first factorization,
- *           and is reused in the subsequent factorizations of the matrices
- *           with the same nonzero pattern.
- *           On exit of sp_colorder(), the columns of A are permuted so that
- *           the etree is in a certain postorder. This postorder is reflected
- *           in ScalePermstruct->perm_c.
- *           NOTE:
- *           Etree is a vector of parent pointers for a forest whose vertices
- *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
- *
- *         o Glu_persist (Glu_persist_t*)
- *           Global data structure (xsup, supno) replicated on all processes,
- *           describing the supernode partition in the factored matrices
- *           L and U:
- *	       xsup[s] is the leading column of the s-th supernode,
- *             supno[i] is the supernode number to which column i belongs.
- *
- *         o Llu (LocalLU_t*)
- *           The distributed data structures to store L and U factors.
- *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
- *
- * berr    (output) double*, dimension (nrhs)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
- *         any element of A or B that makes X(j) an exact solution).
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics on runtime and floating-point operation count.
- *        See util.h for the definition of 'SuperLUStat_t'.
- *
- * info    (output) int*
- *         = 0: successful exit
- *         > 0: if info = i, and i is
- *             <= A->ncol: U(i,i) is exactly zero. The factorization has
- *                been completed, but the factor U is exactly singular,
- *                so the solution could not be computed.
- *             > A->ncol: number of bytes allocated when memory allocation
- *                failure occurred, plus A->ncol.
- *
- *
- * See superlu_zdefs.h for the definitions of various data types.
- *
- */
-    SuperMatrix AC;
-    NCformat *Astore;
-    NCPformat *ACstore;
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    Glu_freeable_t *Glu_freeable;
-            /* The nonzero structures of L and U factors, which are
-	       replicated on all processrs.
-	           (lsub, xlsub) contains the compressed subscript of
-		                 supernodes in L.
-          	   (usub, xusub) contains the compressed subscript of
-		                 nonzero segments in U.
-	      If options->Fact != SamePattern_SameRowPerm, they are 
-	      computed by SYMBFACT routine, and then used by DDISTRIBUTE
-	      routine. They will be freed after DDISTRIBUTE routine.
-	      If options->Fact == SamePattern_SameRowPerm, these
-	      structures are not used.                                  */
-    fact_t   Fact;
-    doublecomplex   *a;
-    int_t    *perm_r; /* row permutations from partial pivoting */
-    int_t    *perm_c; /* column permutation vector */
-    int_t    *etree;  /* elimination tree */
-    int_t    *colptr, *rowind;
-    int_t    colequ, Equil, factored, job, notran, rowequ;
-    int_t    i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use;
-    int      iam;
-    int      ldx;  /* LDA for matrix X (global). */
-    char     equed[1], norm[1];
-    double   *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
-    doublecomplex   *X, *b_col, *b_work, *x_col;
-    double   t;
-    static mem_usage_t num_mem_usage, symb_mem_usage;
-#if ( PRNTlevel>= 2 )
-    double   dmin, dsum, dprod;
-#endif
-
-    /* Test input parameters. */
-    *info = 0;
-    Fact = options->Fact;
-    if ( Fact < 0 || Fact > FACTORED )
-	*info = -1;
-    else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR )
-	*info = -1;
-    else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC )
-	*info = -1;
-    else if ( options->IterRefine < 0 || options->IterRefine > EXTRA )
-	*info = -1;
-    else if ( options->IterRefine == EXTRA ) {
-	*info = -1;
-	fprintf(stderr, "Extra precise iterative refinement yet to support.");
-    } else if ( A->nrow != A->ncol || A->nrow < 0 ||
-         A->Stype != SLU_NC || A->Dtype != SLU_Z || A->Mtype != SLU_GE )
-	*info = -2;
-    else if ( ldb < A->nrow )
-	*info = -5;
-    else if ( nrhs < 0 )
-	*info = -6;
-    if ( *info ) {
-	i = -(*info);
-	pxerbla("pzgssvx_ABglobal", grid, -*info);
-	return;
-    }
-
-    /* Initialization */
-    factored = (Fact == FACTORED);
-    Equil = (!factored && options->Equil == YES);
-    notran = (options->Trans == NOTRANS);
-    iam = grid->iam;
-    job = 5;
-    m = A->nrow;
-    n = A->ncol;
-    Astore = A->Store;
-    nnz = Astore->nnz;
-    a = Astore->nzval;
-    colptr = Astore->colptr;
-    rowind = Astore->rowind;
-    if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) {
-	rowequ = (ScalePermstruct->DiagScale == ROW) ||
-	         (ScalePermstruct->DiagScale == BOTH);
-	colequ = (ScalePermstruct->DiagScale == COL) ||
-	         (ScalePermstruct->DiagScale == BOTH);
-    } else rowequ = colequ = FALSE;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pzgssvx_ABglobal()");
-#endif
-
-    perm_r = ScalePermstruct->perm_r;
-    perm_c = ScalePermstruct->perm_c;
-    etree = LUstruct->etree;
-    R = ScalePermstruct->R;
-    C = ScalePermstruct->C;
-    if ( Equil && Fact != SamePattern_SameRowPerm ) {
-	/* Allocate storage if not done so before. */
-	switch ( ScalePermstruct->DiagScale ) {
-	    case NOEQUIL:
-		if ( !(R = (double *) doubleMalloc_dist(m)) )
-		    ABORT("Malloc fails for R[].");
-	        if ( !(C = (double *) doubleMalloc_dist(n)) )
-		    ABORT("Malloc fails for C[].");
-		ScalePermstruct->R = R;
-		ScalePermstruct->C = C;
-		break;
-	    case ROW: 
-	        if ( !(C = (double *) doubleMalloc_dist(n)) )
-		    ABORT("Malloc fails for C[].");
-		ScalePermstruct->C = C;
-		break;
-	    case COL: 
-		if ( !(R = (double *) doubleMalloc_dist(m)) )
-		    ABORT("Malloc fails for R[].");
-		ScalePermstruct->R = R;
-		break;
-	}
-    }
-
-    /* ------------------------------------------------------------
-       Diagonal scaling to equilibrate the matrix.
-       ------------------------------------------------------------*/
-    if ( Equil ) {
-#if ( DEBUGlevel>=1 )
-	CHECK_MALLOC(iam, "Enter equil");
-#endif
-	t = SuperLU_timer_();
-
-	if ( Fact == SamePattern_SameRowPerm ) {
-	    /* Reuse R and C. */
-	    switch ( ScalePermstruct->DiagScale ) {
-	      case NOEQUIL:
-		break;
-	      case ROW:
-		for (j = 0; j < n; ++j) {
-		    for (i = colptr[j]; i < colptr[j+1]; ++i) {
-			irow = rowind[i];
-			zd_mult(&a[i], &a[i], R[i]); /* Scale rows. */
-		    }
-		}
-		break;
-	      case COL:
-		for (j = 0; j < n; ++j)
-		    for (i = colptr[j]; i < colptr[j+1]; ++i)
-			zd_mult(&a[i], &a[i], C[j]); /* Scale columns. */
-		break;
-	      case BOTH: 
-		for (j = 0; j < n; ++j) {
-		    for (i = colptr[j]; i < colptr[j+1]; ++i) {
-			irow = rowind[i];
-			zd_mult(&a[i], &a[i], R[irow]); /* Scale rows. */
-			zd_mult(&a[i], &a[i], C[j]); /* Scale columns. */
-		    }
-		}
-	        break;
-	    }
-	} else {
-	    if ( !iam ) {
-		/* Compute row and column scalings to equilibrate matrix A. */
-		zgsequ_dist(A, R, C, &rowcnd, &colcnd, &amax, &iinfo);
-	    
-		MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
-		if ( iinfo == 0 ) {
-		    MPI_Bcast( R,       m, MPI_DOUBLE, 0, grid->comm );
-		    MPI_Bcast( C,       n, MPI_DOUBLE, 0, grid->comm );
-		    MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm );
-		    MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm );
-		    MPI_Bcast( &amax,   1, MPI_DOUBLE, 0, grid->comm );
-		} else {
-		    if ( iinfo > 0 ) {
-			if ( iinfo <= m )
-			    fprintf(stderr, "The %d-th row of A is exactly zero\n", 
-				    iinfo);
-			else fprintf(stderr, "The %d-th column of A is exactly zero\n", 
-				     iinfo-n);
-			exit(-1);
-		    }
-		}
-	    } else {
-		MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
-		if ( iinfo == 0 ) {
-		    MPI_Bcast( R,       m, MPI_DOUBLE, 0, grid->comm );
-		    MPI_Bcast( C,       n, MPI_DOUBLE, 0, grid->comm );
-		    MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm );
-		    MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm );
-		    MPI_Bcast( &amax,   1, MPI_DOUBLE, 0, grid->comm );
-		} else {
-		    ABORT("ZGSEQU failed\n");
-		}
-	    }
-	
-	    /* Equilibrate matrix A. */
-	    zlaqgs_dist(A, R, C, rowcnd, colcnd, amax, equed);
-	    if ( lsame_(equed, "R") ) {
-		ScalePermstruct->DiagScale = rowequ = ROW;
-	    } else if ( lsame_(equed, "C") ) {
-		ScalePermstruct->DiagScale = colequ = COL;
-	    } else if ( lsame_(equed, "B") ) {
-		ScalePermstruct->DiagScale = BOTH;
-		rowequ = ROW;
-		colequ = COL;
-	    } else ScalePermstruct->DiagScale = NOEQUIL;
-
-#if ( PRNTlevel>=1 )
-	    if ( !iam ) {
-		printf(".. equilibrated? *equed = %c\n", *equed);
-		/*fflush(stdout);*/
-	    }
-#endif
-	} /* if Fact ... */
-
-	stat->utime[EQUIL] = SuperLU_timer_() - t;
-#if ( DEBUGlevel>=1 )
-	CHECK_MALLOC(iam, "Exit equil");
-#endif
-    } /* end if Equil ... */
-    
-    /* ------------------------------------------------------------
-       Permute rows of A. 
-       ------------------------------------------------------------*/
-    if ( options->RowPerm != NO ) {
-	t = SuperLU_timer_();
-
-	if ( Fact == SamePattern_SameRowPerm /* Reuse perm_r. */
-	    || options->RowPerm == MY_PERMR ) { /* Use my perm_r. */
-	    for (j = 0; j < n; ++j) {
-		for (i = colptr[j]; i < colptr[j+1]; ++i) {
-		    irow = rowind[i];
-		    rowind[i] = perm_r[irow];
-		}
-	    }
-	} else if ( !factored ) {
-	    if ( job == 5 ) {
-		/* Allocate storage for scaling factors. */
-		if ( !(R1 = (double *) SUPERLU_MALLOC(m * sizeof(double))) ) 
-		    ABORT("SUPERLU_MALLOC fails for R1[]");
-		if ( !(C1 = (double *) SUPERLU_MALLOC(n * sizeof(double))) )
-		    ABORT("SUPERLU_MALLOC fails for C1[]");
-	    }
-
-	    if ( !iam ) {
-		/* Process 0 finds a row permutation for large diagonal. */
-		zldperm(job, m, nnz, colptr, rowind, a, perm_r, R1, C1);
-		
-		MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
-		if ( job == 5 && Equil ) {
-		    MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
-		    MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
-		}
-	    } else {
-		MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
-		if ( job == 5 && Equil ) {
-		    MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
-		    MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
-		}
-	    }
-
-#if ( PRNTlevel>=2 )
-	    dmin = dlamch_("Overflow");
-	    dsum = 0.0;
-	    dprod = 1.0;
-#endif
-	    if ( job == 5 ) {
-		if ( Equil ) {
-		    for (i = 0; i < n; ++i) {
-			R1[i] = exp(R1[i]);
-			C1[i] = exp(C1[i]);
-		    }
-		    for (j = 0; j < n; ++j) {
-			for (i = colptr[j]; i < colptr[j+1]; ++i) {
-			    irow = rowind[i];
-			    zd_mult(&a[i], &a[i], R1[irow]); /* Scale rows. */
-			    zd_mult(&a[i], &a[i], C1[j]); /* Scale columns. */
-			    rowind[i] = perm_r[irow];
-#if ( PRNTlevel>=2 )
-			    if ( rowind[i] == j ) /* New diagonal */
-				dprod *= z_abs1(&a[i]);
-#endif
-			}
-		    }
-
-		    /* Multiply together the scaling factors. */
-		    if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i];
-		    else for (i = 0; i < m; ++i) R[i] = R1[i];
-		    if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i];
-		    else for (i = 0; i < n; ++i) C[i] = C1[i];
-		    
-		    ScalePermstruct->DiagScale = BOTH;
-		    rowequ = colequ = 1;
-		} else { /* No equilibration. */
-		    for (j = 0; j < n; ++j) {
-			for (i = colptr[j]; i < colptr[j+1]; ++i) {
-			    irow = rowind[i];
-			    rowind[i] = perm_r[irow];
-			}
-		    }
-		}
-		SUPERLU_FREE (R1);
-		SUPERLU_FREE (C1);
-	    } else { /* job = 2,3,4 */
-		for (j = 0; j < n; ++j) {
-		    for (i = colptr[j]; i < colptr[j+1]; ++i) {
-			irow = rowind[i];
-			rowind[i] = perm_r[irow];
-#if ( PRNTlevel>=2 )
-			if ( rowind[i] == j ) { /* New diagonal */
-			    if ( job == 2 || job == 3 )
-				dmin = SUPERLU_MIN(dmin, z_abs1(&a[i]));
-			    else if ( job == 4 )
-				dsum += z_abs1(&a[i]);
-			    else if ( job == 5 )
-				dprod *= z_abs1(&a[i]);
-			}
-#endif
-		    }
-		}
-	    }
-
-#if ( PRNTlevel>=2 )
-	    if ( job == 2 || job == 3 ) {
-		if ( !iam ) printf("\tsmallest diagonal %e\n", dmin);
-	    } else if ( job == 4 ) {
-		if ( !iam ) printf("\tsum of diagonal %e\n", dsum);
-	    } else if ( job == 5 ) {
-		if ( !iam ) printf("\t product of diagonal %e\n", dprod);
-	    }
-#endif
-	    
-        } /* else !factored */
-
-	t = SuperLU_timer_() - t;
-	stat->utime[ROWPERM] = t;
-    
-    } else { /* options->RowPerm == NOROWPERM */
-        for (i = 0; i < m; ++i) perm_r[i] = i;
-    }
-
-    if ( !factored || options->IterRefine ) {
-	/* Compute norm(A), which will be used to adjust small diagonal. */
-	if ( notran ) *(unsigned char *)norm = '1';
-	else *(unsigned char *)norm = 'I';
-	anorm = zlangs_dist(norm, A);
-    }
-
-    /* ------------------------------------------------------------
-       Perform the LU factorization.
-       ------------------------------------------------------------*/
-    if ( !factored ) {
-	t = SuperLU_timer_();
-	/*
-	 * Get column permutation vector perm_c[], according to permc_spec:
-	 *   permc_spec = NATURAL:  natural ordering 
-	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
-	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
-	 *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
-	 */
-	permc_spec = options->ColPerm;
-	if ( permc_spec != MY_PERMC && Fact == DOFACT )
-	    /* Use an ordering provided by SuperLU */
-	    get_perm_c_dist(iam, permc_spec, A, perm_c);
-
-	/* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'
-	   (a.k.a. column etree), depending on the choice of ColPerm.
-	   Adjust perm_c[] to be consistent with a postorder of etree.
-	   Permute columns of A to form A*Pc'. */
-	sp_colorder(options, A, perm_c, etree, &AC);
-
-	/* Form Pc*A*Pc' to preserve the diagonal of the matrix Pr*A. */
-	ACstore = AC.Store;
-	for (j = 0; j < n; ++j) 
-	    for (i = ACstore->colbeg[j]; i < ACstore->colend[j]; ++i) {
-		irow = ACstore->rowind[i];
-		ACstore->rowind[i] = perm_c[irow];
-	    }
-	stat->utime[COLPERM] = SuperLU_timer_() - t;
-
-	/* Perform a symbolic factorization on matrix A and set up the
-	   nonzero data structures which are suitable for supernodal GENP. */
-	if ( Fact != SamePattern_SameRowPerm ) {
-#if ( PRNTlevel>=1 ) 
-	    if ( !iam ) 
-		printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n",
-		       sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
-#endif
-	    t = SuperLU_timer_();
-	    if ( !(Glu_freeable = (Glu_freeable_t *)
-		   SUPERLU_MALLOC(sizeof(Glu_freeable_t))) )
-		ABORT("Malloc fails for Glu_freeable.");
-
-	    iinfo = symbfact(options, iam, &AC, perm_c, etree, 
-			     Glu_persist, Glu_freeable);
-
-	    stat->utime[SYMBFAC] = SuperLU_timer_() - t;
-
-	    if ( iinfo < 0 ) {
-		QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage);
-#if ( PRNTlevel>=1 ) 
-		if ( !iam ) {
-		    printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1);
-		    printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]);
-		    printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]);
-		    printf("\tint %d, short %d, float %d, double %d\n", 
-			   sizeof(int_t), sizeof(short), sizeof(float),
-			   sizeof(double));
-		    printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n",
-			   symb_mem_usage.for_lu*1e-6, 
-			   symb_mem_usage.total*1e-6,
-			   symb_mem_usage.expansions);
-		}
-#endif
-	    } else {
-		if ( !iam ) {
-		    fprintf(stderr, "symbfact() error returns %d\n", iinfo);
-		    exit(-1);
-		}
-	    }
-	}
-
-	/* Distribute the L and U factors onto the process grid. */
-	t = SuperLU_timer_();
-	dist_mem_use = zdistribute(Fact, n, &AC, Glu_freeable, LUstruct, grid);
-	stat->utime[DIST] = SuperLU_timer_() - t;
-
-	/* Deallocate storage used in symbolic factor. */
-	if ( Fact != SamePattern_SameRowPerm ) {
-	    iinfo = symbfact_SubFree(Glu_freeable);
-	    SUPERLU_FREE(Glu_freeable);
-	}
-
-	/* Perform numerical factorization in parallel. */
-	t = SuperLU_timer_();
-	pzgstrf(options, m, n, anorm, LUstruct, grid, stat, info);
-	stat->utime[FACT] = SuperLU_timer_() - t;
-
-#if ( PRNTlevel>=1 )
-	{
-	    int_t TinyPivots;
-	    float for_lu, total, max, avg, temp;
-	    zQuerySpace_dist(n, LUstruct, grid, &num_mem_usage);
-	    MPI_Reduce( &num_mem_usage.for_lu, &for_lu,
-		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-	    MPI_Reduce( &num_mem_usage.total, &total,
-		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-	    temp = SUPERLU_MAX(symb_mem_usage.total,
-			       symb_mem_usage.for_lu +
-			       (float)dist_mem_use + num_mem_usage.for_lu);
-	    temp = SUPERLU_MAX(temp, num_mem_usage.total);
-	    MPI_Reduce( &temp, &max,
-		       1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
-	    MPI_Reduce( &temp, &avg,
-		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-	    MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t,
-			  MPI_SUM, grid->comm );
-	    stat->TinyPivots = TinyPivots;
-	    if ( !iam ) {
-		printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n",
-		       for_lu*1e-6, total*1e-6);
-		printf("\tAll space (MB):"
-		       "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n",
-		       avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6);
-		printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots);
-	    }
-	}
-#endif
-    
-#if ( PRNTlevel>=2 )
-	if ( !iam ) printf(".. pzgstrf INFO = %d\n", *info);
-#endif
-
-    } else if ( options->IterRefine ) { /* options->Fact==FACTORED */
-	/* Permute columns of A to form A*Pc' using the existing perm_c.
-	 * NOTE: rows of A were previously permuted to Pc*A.
-	 */
-	sp_colorder(options, A, perm_c, NULL, &AC);
-    } /* if !factored ... */
-	
-    /* ------------------------------------------------------------
-       Compute the solution matrix X.
-       ------------------------------------------------------------*/
-    if ( nrhs ) {
-
-	if ( !(b_work = doublecomplexMalloc_dist(n)) )
-	    ABORT("Malloc fails for b_work[]");
-
-	/* ------------------------------------------------------------
-	   Scale the right-hand side if equilibration was performed. 
-	   ------------------------------------------------------------*/
-	if ( notran ) {
-	    if ( rowequ ) {
-		b_col = B;
-		for (j = 0; j < nrhs; ++j) {
-		    for (i = 0; i < m; ++i) zd_mult(&b_col[i], &b_col[i], R[i]);
-		    b_col += ldb;
-		}
-	    }
-	} else if ( colequ ) {
-	    b_col = B;
-	    for (j = 0; j < nrhs; ++j) {
-		for (i = 0; i < m; ++i) zd_mult(&b_col[i], &b_col[i], C[i]);
-		b_col += ldb;
-	    }
-	}
-
-	/* ------------------------------------------------------------
-	   Permute the right-hand side to form Pr*B.
-	   ------------------------------------------------------------*/
-	if ( options->RowPerm != NO ) {
-	    if ( notran ) {
-		b_col = B;
-		for (j = 0; j < nrhs; ++j) {
-		    for (i = 0; i < m; ++i) b_work[perm_r[i]] = b_col[i];
-		    for (i = 0; i < m; ++i) b_col[i] = b_work[i];
-		    b_col += ldb;
-		}
-	    }
-	}
-
-
-	/* ------------------------------------------------------------
-	   Permute the right-hand side to form Pc*B.
-	   ------------------------------------------------------------*/
-	if ( notran ) {
-	    b_col = B;
-	    for (j = 0; j < nrhs; ++j) {
-		for (i = 0; i < m; ++i) b_work[perm_c[i]] = b_col[i];
-		for (i = 0; i < m; ++i) b_col[i] = b_work[i];
-		b_col += ldb;
-	    }
-	}
-
-	/* Save a copy of the right-hand side. */
-	ldx = ldb;
-	if ( !(X = doublecomplexMalloc_dist(((size_t)ldx) * nrhs)) )
-	    ABORT("Malloc fails for X[]");
-	x_col = X;  b_col = B;
-	for (j = 0; j < nrhs; ++j) {
-	    for (i = 0; i < ldb; ++i) x_col[i] = b_col[i];
-	    x_col += ldx;  b_col += ldb;
-	}
-
-	/* ------------------------------------------------------------
-	   Solve the linear system.
-	   ------------------------------------------------------------*/
-	pzgstrs_Bglobal(n, LUstruct, grid, X, ldb, nrhs, stat, info);
-
-	/* ------------------------------------------------------------
-	   Use iterative refinement to improve the computed solution and
-	   compute error bounds and backward error estimates for it.
-	   ------------------------------------------------------------*/
-	if ( options->IterRefine ) {
-	    /* Improve the solution by iterative refinement. */
-	    t = SuperLU_timer_();
-	    pzgsrfs_ABXglobal(n, &AC, anorm, LUstruct, grid, B, ldb,
-			      X, ldx, nrhs, berr, stat, info);
-	    stat->utime[REFINE] = SuperLU_timer_() - t;
-	}
-
-	/* Permute the solution matrix X <= Pc'*X. */
-	for (j = 0; j < nrhs; j++) {
-	    b_col = &B[j*ldb];
-	    x_col = &X[j*ldx];
-	    for (i = 0; i < n; ++i) b_col[i] = x_col[perm_c[i]];
-	}
-	
-	/* Transform the solution matrix X to a solution of the original system
-	   before the equilibration. */
-	if ( notran ) {
-	    if ( colequ ) {
-		b_col = B;
-		for (j = 0; j < nrhs; ++j) {
-		    for (i = 0; i < n; ++i) zd_mult(&b_col[i], &b_col[i], C[i]);
-		    b_col += ldb;
-		}
-	    }
-	} else if ( rowequ ) {
-	    b_col = B;
-	    for (j = 0; j < nrhs; ++j) {
-		for (i = 0; i < n; ++i) zd_mult(&b_col[i], &b_col[i], R[i]);
-		b_col += ldb;
-	    }
-	}
-
-	SUPERLU_FREE(b_work);
-	SUPERLU_FREE(X);
-
-    } /* end if nrhs != 0 */
-
-#if ( PRNTlevel>=1 )
-    if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale);
-#endif
-
-    /* Deallocate R and/or C if it is not used. */
-    if ( Equil && Fact != SamePattern_SameRowPerm ) {
-	switch ( ScalePermstruct->DiagScale ) {
-	    case NOEQUIL:
-	        SUPERLU_FREE(R);
-		SUPERLU_FREE(C);
-		break;
-	    case ROW: 
-		SUPERLU_FREE(C);
-		break;
-	    case COL: 
-		SUPERLU_FREE(R);
-		break;
-	}
-    }
-    if ( !factored || (factored && options->IterRefine) )
-	Destroy_CompCol_Permuted_dist(&AC);
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pzgssvx_ABglobal()");
-#endif
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgssvx.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgssvx.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgssvx.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgssvx.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,1280 +0,0 @@
-
-#include <math.h>
-#include "superlu_zdefs.h"
-
-void
-pzgssvx(superlu_options_t *options, SuperMatrix *A, 
-	ScalePermstruct_t *ScalePermstruct,
-	doublecomplex B[], int ldb, int nrhs, gridinfo_t *grid,
-	LUstruct_t *LUstruct, SOLVEstruct_t *SOLVEstruct, double *berr,
-	SuperLUStat_t *stat, int *info)
-{
-/* 
- * -- Distributed SuperLU routine (version 2.2) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * November 1, 2007
- * Feburary 20, 2008
- *
- *
- * Purpose
- * =======
- *
- * PZGSSVX solves a system of linear equations A*X=B,
- * by using Gaussian elimination with "static pivoting" to
- * compute the LU factorization of A.
- *
- * Static pivoting is a technique that combines the numerical stability
- * of partial pivoting with the scalability of Cholesky (no pivoting),
- * to run accurately and efficiently on large numbers of processors.
- * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
- * description of the parallel algorithms.
- *
- * The input matrices A and B are distributed by block rows.
- * Here is a graphical illustration (0-based indexing):
- *
- *                        A                B
- *               0 ---------------       ------
- *                   |           |        |  |
- *                   |           |   P0   |  |
- *                   |           |        |  |
- *                 ---------------       ------
- *        - fst_row->|           |        |  |
- *        |          |           |        |  |
- *       m_loc       |           |   P1   |  |
- *        |          |           |        |  |
- *        -          |           |        |  |
- *                 ---------------       ------
- *                   |    .      |        |. |
- *                   |    .      |        |. |
- *                   |    .      |        |. |
- *                 ---------------       ------
- * 
- * where, fst_row is the row number of the first row,
- *        m_loc is the number of rows local to this processor
- * These are defined in the 'SuperMatrix' structure, see supermatrix.h.
- *
- *
- * Here are the options for using this code:
- *
- *   1. Independent of all the other options specified below, the
- *      user must supply
- *
- *      -  B, the matrix of right-hand sides, distributed by block rows,
- *            and its dimensions ldb (local) and nrhs (global)
- *      -  grid, a structure describing the 2D processor mesh
- *      -  options->IterRefine, which determines whether or not to
- *            improve the accuracy of the computed solution using 
- *            iterative refinement
- *
- *      On output, B is overwritten with the solution X.
- *
- *   2. Depending on options->Fact, the user has four options
- *      for solving A*X=B. The standard option is for factoring
- *      A "from scratch". (The other options, described below,
- *      are used when A is sufficiently similar to a previously 
- *      solved problem to save time by reusing part or all of 
- *      the previous factorization.)
- *
- *      -  options->Fact = DOFACT: A is factored "from scratch"
- *
- *      In this case the user must also supply
- *
- *        o  A, the input matrix
- *
- *        as well as the following options to determine what matrix to
- *        factorize.
- *
- *        o  options->Equil,   to specify how to scale the rows and columns
- *                             of A to "equilibrate" it (to try to reduce its
- *                             condition number and so improve the
- *                             accuracy of the computed solution)
- *
- *        o  options->RowPerm, to specify how to permute the rows of A
- *                             (typically to control numerical stability)
- *
- *        o  options->ColPerm, to specify how to permute the columns of A
- *                             (typically to control fill-in and enhance
- *                             parallelism during factorization)
- *
- *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
- *                             pivots encountered during factorization
- *                             (to control numerical stability)
- *
- *      The outputs returned include
- *         
- *        o  ScalePermstruct,  modified to describe how the input matrix A
- *                             was equilibrated and permuted:
- *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
- *                                         columns of A were scaled
- *          .  ScalePermstruct->R, array of row scale factors
- *          .  ScalePermstruct->C, array of column scale factors
- *          .  ScalePermstruct->perm_r, row permutation vector
- *          .  ScalePermstruct->perm_c, column permutation vector
- *
- *          (part of ScalePermstruct may also need to be supplied on input,
- *           depending on options->RowPerm and options->ColPerm as described 
- *           later).
- *
- *        o  A, the input matrix A overwritten by the scaled and permuted
- *              matrix diag(R)*A*diag(C)*Pc^T, where 
- *              Pc is the row permutation matrix determined by
- *                  ScalePermstruct->perm_c
- *              diag(R) and diag(C) are diagonal scaling matrices determined
- *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and 
- *                  ScalePermstruct->C
- *
- *        o  LUstruct, which contains the L and U factorization of A1 where
- *
- *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
- *
- *               (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
- *                in A on output.)
- *
- *   3. The second value of options->Fact assumes that a matrix with the same
- *      sparsity pattern as A has already been factored:
- *     
- *      -  options->Fact = SamePattern: A is factored, assuming that it has
- *            the same nonzero pattern as a previously factored matrix. In
- *            this case the algorithm saves time by reusing the previously
- *            computed column permutation vector stored in
- *            ScalePermstruct->perm_c and the "elimination tree" of A
- *            stored in LUstruct->etree
- *
- *      In this case the user must still specify the following options
- *      as before:
- *
- *        o  options->Equil
- *        o  options->RowPerm
- *        o  options->ReplaceTinyPivot
- *
- *      but not options->ColPerm, whose value is ignored. This is because the
- *      previous column permutation from ScalePermstruct->perm_c is used as
- *      input. The user must also supply 
- *
- *        o  A, the input matrix
- *        o  ScalePermstruct->perm_c, the column permutation
- *        o  LUstruct->etree, the elimination tree
- *
- *      The outputs returned include
- *         
- *        o  A, the input matrix A overwritten by the scaled and permuted
- *              matrix as described above
- *        o  ScalePermstruct, modified to describe how the input matrix A was
- *                            equilibrated and row permuted
- *        o  LUstruct, modified to contain the new L and U factors
- *
- *   4. The third value of options->Fact assumes that a matrix B with the same
- *      sparsity pattern as A has already been factored, and where the
- *      row permutation of B can be reused for A. This is useful when A and B
- *      have similar numerical values, so that the same row permutation
- *      will make both factorizations numerically stable. This lets us reuse
- *      all of the previously computed structure of L and U.
- *
- *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
- *            assuming not only the same nonzero pattern as the previously
- *            factored matrix B, but reusing B's row permutation.
- *
- *      In this case the user must still specify the following options
- *      as before:
- *
- *        o  options->Equil
- *        o  options->ReplaceTinyPivot
- *
- *      but not options->RowPerm or options->ColPerm, whose values are
- *      ignored. This is because the permutations from ScalePermstruct->perm_r
- *      and ScalePermstruct->perm_c are used as input.
- *
- *      The user must also supply 
- *
- *        o  A, the input matrix
- *        o  ScalePermstruct->DiagScale, how the previous matrix was row
- *                                       and/or column scaled
- *        o  ScalePermstruct->R, the row scalings of the previous matrix,
- *                               if any
- *        o  ScalePermstruct->C, the columns scalings of the previous matrix, 
- *                               if any
- *        o  ScalePermstruct->perm_r, the row permutation of the previous
- *                                    matrix
- *        o  ScalePermstruct->perm_c, the column permutation of the previous 
- *                                    matrix
- *        o  all of LUstruct, the previously computed information about
- *                            L and U (the actual numerical values of L and U
- *                            stored in LUstruct->Llu are ignored)
- *
- *      The outputs returned include
- *         
- *        o  A, the input matrix A overwritten by the scaled and permuted
- *              matrix as described above
- *        o  ScalePermstruct,  modified to describe how the input matrix A was
- *                             equilibrated (thus ScalePermstruct->DiagScale,
- *                             R and C may be modified)
- *        o  LUstruct, modified to contain the new L and U factors
- *
- *   5. The fourth and last value of options->Fact assumes that A is
- *      identical to a matrix that has already been factored on a previous 
- *      call, and reuses its entire LU factorization
- *
- *      -  options->Fact = Factored: A is identical to a previously
- *            factorized matrix, so the entire previous factorization
- *            can be reused.
- *
- *      In this case all the other options mentioned above are ignored
- *      (options->Equil, options->RowPerm, options->ColPerm, 
- *       options->ReplaceTinyPivot)
- *
- *      The user must also supply 
- *
- *        o  A, the unfactored matrix, only in the case that iterative
- *              refinment is to be done (specifically A must be the output
- *              A from the previous call, so that it has been scaled and permuted)
- *        o  all of ScalePermstruct
- *        o  all of LUstruct, including the actual numerical values of
- *           L and U
- *
- *      all of which are unmodified on output.
- *         
- * Arguments
- * =========
- *
- * options (input) superlu_options_t* (global)
- *         The structure defines the input parameters to control
- *         how the LU decomposition will be performed.
- *         The following fields should be defined for this structure:
- *         
- *         o Fact (fact_t)
- *           Specifies whether or not the factored form of the matrix
- *           A is supplied on entry, and if not, how the matrix A should
- *           be factorized based on the previous history.
- *
- *           = DOFACT: The matrix A will be factorized from scratch.
- *                 Inputs:  A
- *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
- *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
- *                              permuted)
- *                          all of ScalePermstruct
- *                          all of LUstruct
- *
- *           = SamePattern: the matrix A will be factorized assuming
- *             that a factorization of a matrix with the same sparsity
- *             pattern was performed prior to this one. Therefore, this
- *             factorization will reuse column permutation vector 
- *             ScalePermstruct->perm_c and the elimination tree
- *             LUstruct->etree
- *                 Inputs:  A
- *                          options->Equil, RowPerm, ReplaceTinyPivot
- *                          ScalePermstruct->perm_c
- *                          LUstruct->etree
- *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
- *                              permuted)
- *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
- *                          rest of LUstruct (GLU_persist, Llu)
- *
- *           = SamePattern_SameRowPerm: the matrix A will be factorized
- *             assuming that a factorization of a matrix with the same
- *             sparsity	pattern and similar numerical values was performed
- *             prior to this one. Therefore, this factorization will reuse
- *             both row and column scaling factors R and C, and the
- *             both row and column permutation vectors perm_r and perm_c,
- *             distributed data structure set up from the previous symbolic
- *             factorization.
- *                 Inputs:  A
- *                          options->Equil, ReplaceTinyPivot
- *                          all of ScalePermstruct
- *                          all of LUstruct
- *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
- *                              permuted)
- *                          modified LUstruct->Llu
- *           = FACTORED: the matrix A is already factored.
- *                 Inputs:  all of ScalePermstruct
- *                          all of LUstruct
- *
- *         o Equil (yes_no_t)
- *           Specifies whether to equilibrate the system.
- *           = NO:  no equilibration.
- *           = YES: scaling factors are computed to equilibrate the system:
- *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
- *                  Whether or not the system will be equilibrated depends
- *                  on the scaling of the matrix A, but if equilibration is
- *                  used, A is overwritten by diag(R)*A*diag(C) and B by
- *                  diag(R)*B.
- *
- *         o RowPerm (rowperm_t)
- *           Specifies how to permute rows of the matrix A.
- *           = NATURAL:   use the natural ordering.
- *           = LargeDiag: use the Duff/Koster algorithm to permute rows of
- *                        the original matrix to make the diagonal large
- *                        relative to the off-diagonal.
- *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
- *                        input by the user.
- *           
- *         o ColPerm (colperm_t)
- *           Specifies what type of column permutation to use to reduce fill.
- *           = NATURAL:       natural ordering.
- *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
- *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
- *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
- *         
- *         o ReplaceTinyPivot (yes_no_t)
- *           = NO:  do not modify pivots
- *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during 
- *                  LU factorization.
- *
- *         o IterRefine (IterRefine_t)
- *           Specifies how to perform iterative refinement.
- *           = NO:     no iterative refinement.
- *           = DOUBLE: accumulate residual in double precision.
- *           = EXTRA:  accumulate residual in extra precision.
- *
- *         NOTE: all options must be indentical on all processes when
- *               calling this routine.
- *
- * A (input/output) SuperMatrix* (local)
- *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
- *           The number of linear equations is A->nrow. The type of A must be:
- *           Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
- *           That is, A is stored in distributed compressed row format.
- *           See supermatrix.h for the definition of 'SuperMatrix'.
- *           This routine only handles square A, however, the LU factorization
- *           routine PDGSTRF can factorize rectangular matrices.
- *         On exit, A may be overwtirren by diag(R)*A*diag(C)*Pc^T,
- *           depending on ScalePermstruct->DiagScale and options->ColPerm:
- *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
- *                diag(R)*A*diag(C).
- *             if options->ColPerm != NATURAL, A is further overwritten by
- *                diag(R)*A*diag(C)*Pc^T.
- *           If all the above condition are true, the LU decomposition is
- *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
- *
- * ScalePermstruct (input/output) ScalePermstruct_t* (global)
- *         The data structure to store the scaling and permutation vectors
- *         describing the transformations performed to the matrix A.
- *         It contains the following fields:
- *
- *         o DiagScale (DiagScale_t)
- *           Specifies the form of equilibration that was done.
- *           = NOEQUIL: no equilibration.
- *           = ROW:     row equilibration, i.e., A was premultiplied by
- *                      diag(R).
- *           = COL:     Column equilibration, i.e., A was postmultiplied
- *                      by diag(C).
- *           = BOTH:    both row and column equilibration, i.e., A was 
- *                      replaced by diag(R)*A*diag(C).
- *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
- *           DiagScale is an input argument; otherwise it is an output
- *           argument.
- *
- *         o perm_r (int*)
- *           Row permutation vector, which defines the permutation matrix Pr;
- *           perm_r[i] = j means row i of A is in position j in Pr*A.
- *           If options->RowPerm = MY_PERMR, or
- *           options->Fact = SamePattern_SameRowPerm, perm_r is an
- *           input argument; otherwise it is an output argument.
- *
- *         o perm_c (int*)
- *           Column permutation vector, which defines the 
- *           permutation matrix Pc; perm_c[i] = j means column i of A is 
- *           in position j in A*Pc.
- *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
- *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
- *           input argument; otherwise, it is an output argument.
- *           On exit, perm_c may be overwritten by the product of the input
- *           perm_c and a permutation that postorders the elimination tree
- *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
- *           is already in postorder.
- *
- *         o R (double*) dimension (A->nrow)
- *           The row scale factors for A.
- *           If DiagScale = ROW or BOTH, A is multiplied on the left by 
- *                          diag(R).
- *           If DiagScale = NOEQUIL or COL, R is not defined.
- *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
- *           an input argument; otherwise, R is an output argument.
- *
- *         o C (double*) dimension (A->ncol)
- *           The column scale factors for A.
- *           If DiagScale = COL or BOTH, A is multiplied on the right by 
- *                          diag(C).
- *           If DiagScale = NOEQUIL or ROW, C is not defined.
- *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
- *           an input argument; otherwise, C is an output argument.
- *         
- * B       (input/output) doublecomplex* (local)
- *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
- *           where, m_loc is the number of rows stored locally on my
- *           process and is defined in the data structure of matrix A.
- *         On exit, the solution matrix if info = 0;
- *
- * ldb     (input) int (local)
- *         The leading dimension of matrix B.
- *
- * nrhs    (input) int (global)
- *         The number of right-hand sides.
- *         If nrhs = 0, only LU decomposition is performed, the forward
- *         and back substitutions are skipped.
- *
- * grid    (input) gridinfo_t* (global)
- *         The 2D process mesh. It contains the MPI communicator, the number
- *         of process rows (NPROW), the number of process columns (NPCOL),
- *         and my process rank. It is an input argument to all the
- *         parallel routines.
- *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
- *         See superlu_zdefs.h for the definition of 'gridinfo_t'.
- *
- * LUstruct (input/output) LUstruct_t*
- *         The data structures to store the distributed L and U factors.
- *         It contains the following fields:
- *
- *         o etree (int*) dimension (A->ncol) (global)
- *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
- *           It is computed in sp_colorder() during the first factorization,
- *           and is reused in the subsequent factorizations of the matrices
- *           with the same nonzero pattern.
- *           On exit of sp_colorder(), the columns of A are permuted so that
- *           the etree is in a certain postorder. This postorder is reflected
- *           in ScalePermstruct->perm_c.
- *           NOTE:
- *           Etree is a vector of parent pointers for a forest whose vertices
- *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
- *
- *         o Glu_persist (Glu_persist_t*) (global)
- *           Global data structure (xsup, supno) replicated on all processes,
- *           describing the supernode partition in the factored matrices
- *           L and U:
- *	       xsup[s] is the leading column of the s-th supernode,
- *             supno[i] is the supernode number to which column i belongs.
- *
- *         o Llu (LocalLU_t*) (local)
- *           The distributed data structures to store L and U factors.
- *           See superlu_zdefs.h for the definition of 'LocalLU_t'.
- *
- * SOLVEstruct (input/output) SOLVEstruct_t*
- *         The data structure to hold the communication pattern used
- *         in the phases of triangular solution and iterative refinement.
- *         This pattern should be intialized only once for repeated solutions.
- *         If options->SolveInitialized = YES, it is an input argument.
- *         If options->SolveInitialized = NO and nrhs != 0, it is an output
- *         argument. See superlu_zdefs.h for the definition of 'SOLVEstruct_t'.
- *
- * berr    (output) double*, dimension (nrhs) (global)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
- *         any element of A or B that makes X(j) an exact solution).
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics on runtime and floating-point operation count.
- *        See util.h for the definition of 'SuperLUStat_t'.
- *
- * info    (output) int*
- *         = 0: successful exit
- *         > 0: if info = i, and i is
- *             <= A->ncol: U(i,i) is exactly zero. The factorization has
- *                been completed, but the factor U is exactly singular,
- *                so the solution could not be computed.
- *             > A->ncol: number of bytes allocated when memory allocation
- *                failure occurred, plus A->ncol.
- *
- * See superlu_zdefs.h for the definitions of varioous data types.
- *
- */
-    NRformat_loc *Astore;
-    SuperMatrix GA;      /* Global A in NC format */
-    NCformat *GAstore;
-    doublecomplex   *a_GA;
-    SuperMatrix GAC;      /* Global A in NCP format (add n end pointers) */
-    NCPformat *GACstore;
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    Glu_freeable_t *Glu_freeable;
-            /* The nonzero structures of L and U factors, which are
-	       replicated on all processrs.
-	           (lsub, xlsub) contains the compressed subscript of
-		                 supernodes in L.
-          	   (usub, xusub) contains the compressed subscript of
-		                 nonzero segments in U.
-	      If options->Fact != SamePattern_SameRowPerm, they are 
-	      computed by SYMBFACT routine, and then used by PDDISTRIBUTE
-	      routine. They will be freed after PDDISTRIBUTE routine.
-	      If options->Fact == SamePattern_SameRowPerm, these
-	      structures are not used.                                  */
-    fact_t   Fact;
-    doublecomplex   *a;
-    int_t    *colptr, *rowind;
-    int_t    *perm_r; /* row permutations from partial pivoting */
-    int_t    *perm_c; /* column permutation vector */
-    int_t    *etree;  /* elimination tree */
-    int_t    *rowptr, *colind;  /* Local A in NR*/
-    int_t    *rowind_loc, *colptr_loc;
-    int_t    colequ, Equil, factored, job, notran, rowequ, need_value;
-    int_t    i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use;
-    int_t    nnz_loc, m_loc, fst_row, icol;
-    int      iam;
-    int      ldx;  /* LDA for matrix X (local). */
-    char     equed[1], norm[1];
-    double   *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
-    doublecomplex   *X, *b_col, *b_work, *x_col;
-    double   t;
-    static mem_usage_t num_mem_usage, symb_mem_usage;
-#if ( PRNTlevel>= 2 )
-    double   dmin, dsum, dprod;
-#endif
-    int_t procs;
-
-    /* Structures needed for parallel symbolic factorization */
-    int_t *sizes, *fstVtxSep, parSymbFact;
-    int   noDomains, nprocs_num;
-    MPI_Comm symb_comm; /* communicator for symbolic factorization */
-    int   col, key; /* parameters for creating a new communicator */
-    Pslu_freeable_t Pslu_freeable;
-    float  flinfo;
-
-    /* Initialization. */
-    m       = A->nrow;
-    n       = A->ncol;
-    Astore  = (NRformat_loc *) A->Store;
-    nnz_loc = Astore->nnz_loc;
-    m_loc   = Astore->m_loc;
-    fst_row = Astore->fst_row;
-    a       = (doublecomplex *) Astore->nzval;
-    rowptr  = Astore->rowptr;
-    colind  = Astore->colind;
-    sizes   = NULL;
-    fstVtxSep = NULL;
-    symb_comm = MPI_COMM_NULL;
-
-    /* Test the input parameters. */
-    *info = 0;
-    Fact = options->Fact;
-    if ( Fact < 0 || Fact > FACTORED )
-	*info = -1;
-    else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR )
-	*info = -1;
-    else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC )
-	*info = -1;
-    else if ( options->IterRefine < 0 || options->IterRefine > EXTRA )
-	*info = -1;
-    else if ( options->IterRefine == EXTRA ) {
-	*info = -1;
-	fprintf(stderr, "Extra precise iterative refinement yet to support.");
-    } else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
-		|| A->Dtype != SLU_Z || A->Mtype != SLU_GE )
-	*info = -2;
-    else if ( ldb < m_loc )
-	*info = -5;
-    else if ( nrhs < 0 )
-	*info = -6;
-    if ( *info ) {
-	i = -(*info);
-	pxerbla("pzgssvx", grid, -*info);
-	return;
-    }
-
-    factored = (Fact == FACTORED);
-    Equil = (!factored && options->Equil == YES);
-    notran = (options->Trans == NOTRANS);
-    iam = grid->iam;
-    job = 5;
-    if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) {
-	rowequ = (ScalePermstruct->DiagScale == ROW) ||
-	         (ScalePermstruct->DiagScale == BOTH);
-	colequ = (ScalePermstruct->DiagScale == COL) ||
-	         (ScalePermstruct->DiagScale == BOTH);
-    } else rowequ = colequ = FALSE;
-
-    /* The following arrays are replicated on all processes. */
-    perm_r = ScalePermstruct->perm_r;
-    perm_c = ScalePermstruct->perm_c;
-    etree = LUstruct->etree;
-    R = ScalePermstruct->R;
-    C = ScalePermstruct->C;
-    /********/
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pzgssvx()");
-#endif
-
-    /* Not factored & ask for equilibration */
-    if ( Equil && Fact != SamePattern_SameRowPerm ) { 
-	/* Allocate storage if not done so before. */
-	switch ( ScalePermstruct->DiagScale ) {
-	    case NOEQUIL:
-		if ( !(R = (double *) doubleMalloc_dist(m)) )
-		    ABORT("Malloc fails for R[].");
-	        if ( !(C = (double *) doubleMalloc_dist(n)) )
-		    ABORT("Malloc fails for C[].");
-		ScalePermstruct->R = R;
-		ScalePermstruct->C = C;
-		break;
-	    case ROW: 
-	        if ( !(C = (double *) doubleMalloc_dist(n)) )
-		    ABORT("Malloc fails for C[].");
-		ScalePermstruct->C = C;
-		break;
-	    case COL: 
-		if ( !(R = (double *) doubleMalloc_dist(m)) )
-		    ABORT("Malloc fails for R[].");
-		ScalePermstruct->R = R;
-		break;
-	}
-    }
-
-    /* ------------------------------------------------------------
-       Diagonal scaling to equilibrate the matrix.
-       ------------------------------------------------------------*/
-    if ( Equil ) {
-#if ( DEBUGlevel>=1 )
-	CHECK_MALLOC(iam, "Enter equil");
-#endif
-	t = SuperLU_timer_();
-
-	if ( Fact == SamePattern_SameRowPerm ) {
-	    /* Reuse R and C. */
-	    switch ( ScalePermstruct->DiagScale ) {
-	      case NOEQUIL:
-		break;
-	      case ROW:
-		irow = fst_row;
-		for (j = 0; j < m_loc; ++j) {
-		    for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
-                        zd_mult(&a[i], &a[i], R[irow]); /* Scale rows */
-		    }
-		    ++irow;
-		}
-		break;
-	      case COL:
-		for (j = 0; j < m_loc; ++j)
-		    for (i = rowptr[j]; i < rowptr[j+1]; ++i){
-		        icol = colind[i];
-                        zd_mult(&a[i], &a[i], C[icol]); /* Scale columns */
-		    }
-		break;
-	      case BOTH:
-		irow = fst_row;
-		for (j = 0; j < m_loc; ++j) {
-		    for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
-			icol = colind[i];
-                        zd_mult(&a[i], &a[i], R[irow]); /* Scale rows */
-                        zd_mult(&a[i], &a[i], C[icol]); /* Scale columns */
-		    }
-		    ++irow;
-		}
-	        break;
-	    }
-	} else { /* Compute R & C from scratch */
-            /* Compute the row and column scalings. */
-	    pzgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid);
-
-	    /* Equilibrate matrix A if it is badly-scaled. */
-	    pzlaqgs(A, R, C, rowcnd, colcnd, amax, equed);
-
-	    if ( lsame_(equed, "R") ) {
-		ScalePermstruct->DiagScale = rowequ = ROW;
-	    } else if ( lsame_(equed, "C") ) {
-		ScalePermstruct->DiagScale = colequ = COL;
-	    } else if ( lsame_(equed, "B") ) {
-		ScalePermstruct->DiagScale = BOTH;
-		rowequ = ROW;
-		colequ = COL;
-	    } else ScalePermstruct->DiagScale = NOEQUIL;
-
-#if ( PRNTlevel>=1 )
-	    if ( !iam ) {
-		printf(".. equilibrated? *equed = %c\n", *equed);
-		/*fflush(stdout);*/
-	    }
-#endif
-	} /* if Fact ... */
-
-	stat->utime[EQUIL] = SuperLU_timer_() - t;
-#if ( DEBUGlevel>=1 )
-	CHECK_MALLOC(iam, "Exit equil");
-#endif
-    } /* if Equil ... */
-
-    if ( !factored ) { /* Skip this if already factored. */
-        /*
-         * Gather A from the distributed compressed row format to
-         * global A in compressed column format.
-         * Numerical values are gathered only when a row permutation
-         * for large diagonal is sought after.
-         */
-	if ( Fact != SamePattern_SameRowPerm ) {
-            need_value = (options->RowPerm == LargeDiag);
-            pzCompRow_loc_to_CompCol_global(need_value, A, grid, &GA);
-            GAstore = (NCformat *) GA.Store;
-            colptr = GAstore->colptr;
-            rowind = GAstore->rowind;
-            nnz = GAstore->nnz;
-            if ( need_value ) a_GA = (doublecomplex *) GAstore->nzval;
-            else assert(GAstore->nzval == NULL);
-	}
-
-        /* ------------------------------------------------------------
-           Find the row permutation for A.
-           ------------------------------------------------------------*/
-        if ( options->RowPerm != NO ) {
-	    t = SuperLU_timer_();
-	    if ( Fact != SamePattern_SameRowPerm ) {
-	        if ( options->RowPerm == MY_PERMR ) { /* Use user's perm_r. */
-	            /* Permute the global matrix GA for symbfact() */
-	            for (i = 0; i < colptr[n]; ++i) {
-	            	irow = rowind[i]; 
-		    	rowind[i] = perm_r[irow];
-	            }
-	        } else { /* options->RowPerm == LargeDiag */
-	            /* Get a new perm_r[] */
-	            if ( job == 5 ) {
-		        /* Allocate storage for scaling factors. */
-		        if ( !(R1 = doubleMalloc_dist(m)) )
-		            ABORT("SUPERLU_MALLOC fails for R1[]");
-		    	if ( !(C1 = doubleMalloc_dist(n)) )
-		            ABORT("SUPERLU_MALLOC fails for C1[]");
-	            }
-
-	            if ( !iam ) {
-		        /* Process 0 finds a row permutation */
-		        zldperm(job, m, nnz, colptr, rowind, a_GA,
-		                perm_r, R1, C1);
-		
-		        MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
-		        if ( job == 5 && Equil ) {
-		            MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
-		            MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
-		        }
-	            } else {
-		        MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
-		        if ( job == 5 && Equil ) {
-		            MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
-		            MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
-		        }
-	            }
-
-#if ( PRNTlevel>=2 )
-	            dmin = dlamch_("Overflow");
-	            dsum = 0.0;
-	            dprod = 1.0;
-#endif
-	            if ( job == 5 ) {
-		        if ( Equil ) {
-		            for (i = 0; i < n; ++i) {
-			        R1[i] = exp(R1[i]);
-			        C1[i] = exp(C1[i]);
-		            }
-
-		            /* Scale the distributed matrix */
-		            irow = fst_row;
-		            for (j = 0; j < m_loc; ++j) {
-			        for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
-			            icol = colind[i];
-                                    zd_mult(&a[i], &a[i], R1[irow]);
-                                    zd_mult(&a[i], &a[i], C1[icol]);
-#if ( PRNTlevel>=2 )
-			            if ( perm_r[irow] == icol ) { /* New diagonal */
-			              if ( job == 2 || job == 3 )
-			                dmin = SUPERLU_MIN(dmin, z_abs1(&a[i]));
-			              else if ( job == 4 )
-				        dsum += z_abs1(&a[i]);
-			              else if ( job == 5 )
-				        dprod *= z_abs1(&a[i]);
-			            }
-#endif
-			        }
-			        ++irow;
-		            }
-
-		            /* Multiply together the scaling factors. */
-		            if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i];
-		            else for (i = 0; i < m; ++i) R[i] = R1[i];
-		            if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i];
-		            else for (i = 0; i < n; ++i) C[i] = C1[i];
-		    
-		            ScalePermstruct->DiagScale = BOTH;
-		            rowequ = colequ = 1;
-
-		        } /* end Equil */
-
-                        /* Now permute global A to prepare for symbfact() */
-                        for (j = 0; j < n; ++j) {
-		            for (i = colptr[j]; i < colptr[j+1]; ++i) {
-	                        irow = rowind[i];
-		                rowind[i] = perm_r[irow];
-		            }
-		        }
-		        SUPERLU_FREE (R1);
-		        SUPERLU_FREE (C1);
-	            } else { /* job = 2,3,4 */
-		        for (j = 0; j < n; ++j) {
-		            for (i = colptr[j]; i < colptr[j+1]; ++i) {
-			        irow = rowind[i];
-			        rowind[i] = perm_r[irow];
-		            } /* end for i ... */
-		        } /* end for j ... */
-	            } /* end else job ... */
-
-#if ( PRNTlevel>=2 )
-	            if ( job == 2 || job == 3 ) {
-		        if ( !iam ) printf("\tsmallest diagonal %e\n", dmin);
-	            } else if ( job == 4 ) {
-		        if ( !iam ) printf("\tsum of diagonal %e\n", dsum);
-	            } else if ( job == 5 ) {
-		        if ( !iam ) printf("\t product of diagonal %e\n", dprod);
-	            }
-#endif
-	    
-                } /* end if options->RowPerm ... */
-
-	        t = SuperLU_timer_() - t;
-	        stat->utime[ROWPERM] = t;
-#if ( PRNTlevel>=1 )
-	        if ( !iam ) printf(".. LDPERM job %d\t time: %.2f\n", job, t);
-#endif
-            } /* end if Fact ... */
-        } else { /* options->RowPerm == NOROWPERM */
-            for (i = 0; i < m; ++i) perm_r[i] = i;
-        }
-
-#if ( DEBUGlevel>=2 )
-        if ( !iam ) PrintInt10("perm_r",  m, perm_r);
-#endif
-    } /* end if (!factored) */
-
-    if ( !factored || options->IterRefine ) {
-	/* Compute norm(A), which will be used to adjust small diagonal. */
-	if ( notran ) *(unsigned char *)norm = '1';
-	else *(unsigned char *)norm = 'I';
-	anorm = pzlangs(norm, A, grid);
-#if ( PRNTlevel>=1 )
-	if ( !iam ) printf(".. anorm %e\n", anorm);
-#endif
-    }
-
-    /* ------------------------------------------------------------
-       Perform the LU factorization.
-       ------------------------------------------------------------*/
-    if ( !factored ) {
-	t = SuperLU_timer_();
-	/*
-	 * Get column permutation vector perm_c[], according to permc_spec:
-	 *   permc_spec = NATURAL:  natural ordering 
-	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
-	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
-	 *   permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A
-	 *   permc_spec = PARMETIS: parallel METIS on structure of A'+A
-	 *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
-	 */
-	permc_spec = options->ColPerm;
-	parSymbFact = options->ParSymbFact;
-
-#if ( PRNTlevel>=1 )
-	if ( parSymbFact && permc_spec != PARMETIS )
-	    if ( !iam ) printf(".. Parallel symbolic factorization"
-			       " only works wth ParMetis!\n");
-#endif
-
-	if ( parSymbFact == YES || permc_spec == PARMETIS ) {	
-	    nprocs_num = grid->nprow * grid->npcol;
-  	    noDomains = (int) ( pow(2, ((int) LOG2( nprocs_num ))));
-
-	    /* create a new communicator for the first noDomains processors in
-	       grid->comm */
-	    key = iam;
-    	    if (iam < noDomains) col = 0;
-	    else col = MPI_UNDEFINED;
-	    MPI_Comm_split (grid->comm, col, key, &symb_comm );
-
-	    permc_spec = PARMETIS; /* only works with PARMETIS */
-        }
-
-	if ( permc_spec != MY_PERMC && Fact == DOFACT ) {
-	  if ( permc_spec == PARMETIS ) {
-	      /* Get column permutation vector in perm_c.                    *
-	       * This routine takes as input the distributed input matrix A  *
-	       * and does not modify it.  It also allocates memory for       *
-	       * sizes[] and fstVtxSep[] arrays, that contain information    *
-	       * on the separator tree computed by ParMETIS.                 */
-	      flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num,
-                                  	   noDomains, &sizes, &fstVtxSep,
-                                           grid, &symb_comm);
-	      if (flinfo > 0)
-	          ABORT("ERROR in get perm_c parmetis.");
-	  } else {
-	      get_perm_c_dist(iam, permc_spec, &GA, perm_c);
-          }
-        }
-
-	stat->utime[COLPERM] = SuperLU_timer_() - t;
-
-	/* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'
-	   (a.k.a. column etree), depending on the choice of ColPerm.
-	   Adjust perm_c[] to be consistent with a postorder of etree.
-	   Permute columns of A to form A*Pc'. */
-	if ( Fact != SamePattern_SameRowPerm ) {
-	    if ( parSymbFact == NO ) {
-	        int_t *GACcolbeg, *GACcolend, *GACrowind;
-
-	        sp_colorder(options, &GA, perm_c, etree, &GAC); 
-
-	        /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */
-	        GACstore = (NCPformat *) GAC.Store;
-	        GACcolbeg = GACstore->colbeg;
-	        GACcolend = GACstore->colend;
-	        GACrowind = GACstore->rowind;
-	        for (j = 0; j < n; ++j) {
-	            for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) {
-		        irow = GACrowind[i];
-		        GACrowind[i] = perm_c[irow];
-	            }
-	        }
-
-	        /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up
-                   the nonzero data structures for L & U. */
-#if ( PRNTlevel>=1 ) 
-                if ( !iam )
-		  printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n",
-		          sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
-#endif
-  	        t = SuperLU_timer_();
-	        if ( !(Glu_freeable = (Glu_freeable_t *)
-		      SUPERLU_MALLOC(sizeof(Glu_freeable_t))) )
-		    ABORT("Malloc fails for Glu_freeable.");
-
-	    	/* Every process does this. */
-	    	iinfo = symbfact(options, iam, &GAC, perm_c, etree, 
-			     	 Glu_persist, Glu_freeable);
-
-	    	stat->utime[SYMBFAC] = SuperLU_timer_() - t;
-	    	if ( iinfo < 0 ) { /* Successful return */
-		    QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage);
-#if ( PRNTlevel>=1 )
-		    if ( !iam ) {
-		    	printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1);
-		    	printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]);
-		    	printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]);
-		    	printf("\tint %d, short %d, float %d, double %d\n", 
-			       sizeof(int_t), sizeof(short), sizeof(float),
-			       sizeof(double));
-		    	printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n",
-			   	symb_mem_usage.for_lu*1e-6, 
-			   	symb_mem_usage.total*1e-6,
-			   	symb_mem_usage.expansions);
-		    }
-#endif
-	    	} else {
-		    if ( !iam ) {
-		        fprintf(stderr,"symbfact() error returns %d\n",iinfo);
-		    	exit(-1);
-		    }
-	        }
-	    } /* end if serial symbolic factorization */
-	    else {  /* parallel symbolic factorization */
-	    	t = SuperLU_timer_();
-	    	flinfo = symbfact_dist(nprocs_num, noDomains, A, perm_c, perm_r,
-				       sizes, fstVtxSep, &Pslu_freeable, 
-				       &(grid->comm), &symb_comm,
-				       &symb_mem_usage); 
-	    	stat->utime[SYMBFAC] = SuperLU_timer_() - t;
-	    	if (flinfo > 0) 
-	      	    ABORT("Insufficient memory for parallel symbolic factorization.");
-	    }
-	} /* end if Fact ... */
-
-	if (!iam) printf("\tSYMBfact time: %.2f\n", stat->utime[SYMBFAC]);
-        if (sizes) SUPERLU_FREE (sizes);
-        if (fstVtxSep) SUPERLU_FREE (fstVtxSep);
-	if (symb_comm != MPI_COMM_NULL)
-	  MPI_Comm_free (&symb_comm); 
-
-	if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) {
-  	    /* Apply column permutation to the original distributed A */
-	    for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]];
-
-	    /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. 
-	       NOTE: the row permutation Pc*Pr is applied internally in the
-  	       distribution routine. */
-	    t = SuperLU_timer_();
-	    dist_mem_use = pzdistribute(Fact, n, A, ScalePermstruct,
-                                      Glu_freeable, LUstruct, grid);
-	    stat->utime[DIST] = SuperLU_timer_() - t;
-
-  	    /* Deallocate storage used in symbolic factorization. */
-	    if ( Fact != SamePattern_SameRowPerm ) {
-	        iinfo = symbfact_SubFree(Glu_freeable);
-	        SUPERLU_FREE(Glu_freeable);
-	    }
-	} else {
-	    /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. 
-	       NOTE: the row permutation Pc*Pr is applied internally in the
-	       distribution routine. */
-	    /* Apply column permutation to the original distributed A */
-	    for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]];
-
-    	    t = SuperLU_timer_();
-	    dist_mem_use = zdist_psymbtonum(Fact, n, A, ScalePermstruct,
-		  			   &Pslu_freeable, LUstruct, grid);
-	    if (dist_mem_use > 0)
-	        ABORT ("Not enough memory available for dist_psymbtonum\n");
-	    stat->utime[DIST] = SuperLU_timer_() - t;
-	}
-
-	if (!iam) printf ("\tDISTRIBUTE time    %8.2f\n", stat->utime[DIST]);
-
-	/* Perform numerical factorization in parallel. */
-	t = SuperLU_timer_();
-	pzgstrf(options, m, n, anorm, LUstruct, grid, stat, info);
-	stat->utime[FACT] = SuperLU_timer_() - t;
-
-#if ( PRNTlevel>=1 )
-	{
-	    int_t TinyPivots;
-	    float for_lu, total, max, avg, temp;
-	    zQuerySpace_dist(n, LUstruct, grid, &num_mem_usage);
-	    MPI_Reduce( &num_mem_usage.for_lu, &for_lu,
-		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-	    MPI_Reduce( &num_mem_usage.total, &total,
-		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-	    temp = SUPERLU_MAX(symb_mem_usage.total,
-			       symb_mem_usage.for_lu +
-			       (float)dist_mem_use + num_mem_usage.for_lu);
-	    if (parSymbFact == TRUE)
-	      /* The memory used in the redistribution routine
-		 includes the memory used for storing the symbolic
-		 structure and the memory allocated for numerical
-		 factorization */
-	      temp = SUPERLU_MAX(symb_mem_usage.total,
-				 (float)dist_mem_use);
-	    temp = SUPERLU_MAX(temp, num_mem_usage.total);
-	    MPI_Reduce( &temp, &max,
-		       1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
-	    MPI_Reduce( &temp, &avg,
-		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-	    MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t,
-			  MPI_SUM, grid->comm );
-	    stat->TinyPivots = TinyPivots;
-	    if ( !iam ) {
-		printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n",
-		       for_lu*1e-6, total*1e-6);
-		printf("\tAll space (MB):"
-		       "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n",
-		       avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6);
-		printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots);
-	    }
-	}
-#endif
-    
-        /* Destroy GA */
-        if ( Fact != SamePattern_SameRowPerm )
-            Destroy_CompCol_Matrix_dist(&GA);
-    } /* end if (!factored) */
-	
-    /* ------------------------------------------------------------
-       Compute the solution matrix X.
-       ------------------------------------------------------------*/
-    if ( nrhs ) {
-
-	if ( !(b_work = doublecomplexMalloc_dist(n)) )
-	    ABORT("Malloc fails for b_work[]");
-
-	/* ------------------------------------------------------------
-	   Scale the right-hand side if equilibration was performed. 
-	   ------------------------------------------------------------*/
-	if ( notran ) {
-	    if ( rowequ ) {
-		b_col = B;
-		for (j = 0; j < nrhs; ++j) {
-		    irow = fst_row;
-		    for (i = 0; i < m_loc; ++i) {
-                        zd_mult(&b_col[i], &b_col[i], R[irow]);
-		        ++irow;
-		    }
-		    b_col += ldb;
-		}
-	    }
-	} else if ( colequ ) {
-	    b_col = B;
-	    for (j = 0; j < nrhs; ++j) {
-	        irow = fst_row;
-		for (i = 0; i < m_loc; ++i) {
-		    zd_mult(&b_col[i], &b_col[i], C[irow]);
-		    ++irow;
-		}
-		b_col += ldb;
-	    }
-	}
-
-	/* Save a copy of the right-hand side. */
-	ldx = ldb;
-	if ( !(X = doublecomplexMalloc_dist(((size_t)ldx) * nrhs)) )
-	    ABORT("Malloc fails for X[]");
-	x_col = X;  b_col = B;
-	for (j = 0; j < nrhs; ++j) {
-	    for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i];
-	    x_col += ldx;  b_col += ldb;
-	}
-
-	/* ------------------------------------------------------------
-	   Solve the linear system.
-	   ------------------------------------------------------------*/
-	if ( options->SolveInitialized == NO ) {
-	    zSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, grid,
-		       SOLVEstruct);
-	}
-
-	pzgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, 
-		fst_row, ldb, nrhs, SOLVEstruct, stat, info);
-
-	/* ------------------------------------------------------------
-	   Use iterative refinement to improve the computed solution and
-	   compute error bounds and backward error estimates for it.
-	   ------------------------------------------------------------*/
-	if ( options->IterRefine ) {
-	    /* Improve the solution by iterative refinement. */
-	    int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv;
-	    SOLVEstruct_t *SOLVEstruct1;  /* Used by refinement. */
-
-	    t = SuperLU_timer_();
-	    if ( options->RefineInitialized == NO || Fact == DOFACT ) {
-	        /* All these cases need to re-initialize gsmv structure */
-	        if ( options->RefineInitialized )
-		    pzgsmv_finalize(SOLVEstruct->gsmv_comm);
-	        pzgsmv_init(A, SOLVEstruct->row_to_proc, grid,
-			    SOLVEstruct->gsmv_comm);
-	       
-                /* Save a copy of the transformed local col indices
-		   in colind_gsmv[]. */
-	        if ( colind_gsmv ) SUPERLU_FREE(colind_gsmv);
-	        if ( !(it = intMalloc_dist(nnz_loc)) )
-		    ABORT("Malloc fails for colind_gsmv[]");
-	        colind_gsmv = SOLVEstruct->A_colind_gsmv = it;
-	        for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i];
-	        options->RefineInitialized = YES;
-	    } else if ( Fact == SamePattern ||
-			Fact == SamePattern_SameRowPerm ) {
-	        doublecomplex at;
-	        int_t k, jcol, p;
-	        /* Swap to beginning the part of A corresponding to the
-		   local part of X, as was done in pzgsmv_init() */
-	        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
-		    k = rowptr[i];
-		    for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
-		        jcol = colind[j];
-		        p = SOLVEstruct->row_to_proc[jcol];
-		        if ( p == iam ) { /* Local */
-		            at = a[k]; a[k] = a[j]; a[j] = at;
-		            ++k;
-		        }
-		    }
-	        }
-	      
-	        /* Re-use the local col indices of A obtained from the
-		   previous call to pzgsmv_init() */
-	        for (i = 0; i < nnz_loc; ++i) colind[i] = colind_gsmv[i];
-	    }
-
-	    if ( nrhs == 1 ) { /* Use the existing solve structure */
-	        SOLVEstruct1 = SOLVEstruct;
-	    } else { /* For nrhs > 1, since refinement is performed for RHS
-			one at a time, the communication structure for pdgstrs
-			is different than the solve with nrhs RHS. 
-			So we use SOLVEstruct1 for the refinement step.
-		     */
-	        if ( !(SOLVEstruct1 = (SOLVEstruct_t *) 
-		                       SUPERLU_MALLOC(sizeof(SOLVEstruct_t))) )
-		    ABORT("Malloc fails for SOLVEstruct1");
-	        /* Copy the same stuff */
-	        SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc;
-	        SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c;
-	        SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs;
-	        SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs;
-	        SOLVEstruct1->diag_len = SOLVEstruct->diag_len;
-	        SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm;
-	        SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv;
-		
-		/* Initialize the *gstrs_comm for 1 RHS. */
-		if ( !(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *)
-		       SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) )
-		    ABORT("Malloc fails for gstrs_comm[]");
-		pxgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid, 
-			     Glu_persist, SOLVEstruct1);
-	    }
-
-	    pzgsrfs(n, A, anorm, LUstruct, ScalePermstruct, grid,
-		    B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info);
-
-            /* Deallocate the storage associated with SOLVEstruct1 */
-	    if ( nrhs > 1 ) {
-	        pxgstrs_finalize(SOLVEstruct1->gstrs_comm);
-	        SUPERLU_FREE(SOLVEstruct1);
-	    }
-
-	    stat->utime[REFINE] = SuperLU_timer_() - t;
-	}
-
-	/* Permute the solution matrix B <= Pc'*X. */
-	pzPermute_Dense_Matrix(fst_row, m_loc, SOLVEstruct->row_to_proc,
-			       SOLVEstruct->inv_perm_c,
-			       X, ldx, B, ldb, nrhs, grid);
-#if ( DEBUGlevel>=2 )
-	printf("\n (%d) .. After pzPermute_Dense_Matrix(): b =\n", iam);
-	for (i = 0; i < m_loc; ++i)
-	  printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, B[i]);
-#endif
-	
-	/* Transform the solution matrix X to a solution of the original
-	   system before the equilibration. */
-	if ( notran ) {
-	    if ( colequ ) {
-		b_col = B;
-		for (j = 0; j < nrhs; ++j) {
-		    irow = fst_row;
-		    for (i = 0; i < m_loc; ++i) {
-                        zd_mult(&b_col[i], &b_col[i], C[irow]);
-		        ++irow;
-		    }
-		    b_col += ldb;
-		}
-	    }
-	} else if ( rowequ ) {
-	    b_col = B;
-	    for (j = 0; j < nrhs; ++j) {
-	        irow = fst_row;
-		for (i = 0; i < m_loc; ++i) {
-		    zd_mult(&b_col[i], &b_col[i], R[irow]);
-		    ++irow;
-		}
-		b_col += ldb;
-	    }
-	}
-
-	SUPERLU_FREE(b_work);
-	SUPERLU_FREE(X);
-
-    } /* end if nrhs != 0 */
-
-#if ( PRNTlevel>=1 )
-    if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale);
-#endif
-
-    /* Deallocate R and/or C if it was not used. */
-    if ( Equil && Fact != SamePattern_SameRowPerm ) {
-	switch ( ScalePermstruct->DiagScale ) {
-	    case NOEQUIL:
-	        SUPERLU_FREE(R);
-		SUPERLU_FREE(C);
-		break;
-	    case ROW: 
-		SUPERLU_FREE(C);
-		break;
-	    case COL: 
-		SUPERLU_FREE(R);
-		break;
-	}
-    }
-    if ( !factored && Fact != SamePattern_SameRowPerm && !parSymbFact)
- 	Destroy_CompCol_Permuted_dist(&GAC);
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pzgssvx()");
-#endif
-
-}
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgssvx.c.old hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgssvx.c.old
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgssvx.c.old	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgssvx.c.old	1970-01-01 00:00:00.000000000 +0000
@@ -1,1281 +0,0 @@
-
-#include <math.h>
-#include "superlu_zdefs.h"
-
-void
-pzgssvx(superlu_options_t *options, SuperMatrix *A, 
-	ScalePermstruct_t *ScalePermstruct,
-	doublecomplex B[], int ldb, int nrhs, gridinfo_t *grid,
-	LUstruct_t *LUstruct, SOLVEstruct_t *SOLVEstruct, double *berr,
-	SuperLUStat_t *stat, int *info)
-{
-/* 
- * -- Distributed SuperLU routine (version 2.1) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * November 1, 2007
- *
- * Last update: 2/18/2008
- *
- *
- * Purpose
- * =======
- *
- * PZGSSVX solves a system of linear equations A*X=B,
- * by using Gaussian elimination with "static pivoting" to
- * compute the LU factorization of A.
- *
- * Static pivoting is a technique that combines the numerical stability
- * of partial pivoting with the scalability of Cholesky (no pivoting),
- * to run accurately and efficiently on large numbers of processors.
- * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
- * description of the parallel algorithms.
- *
- * The input matrices A and B are distributed by block rows.
- * Here is a graphical illustration (0-based indexing):
- *
- *                        A                B
- *               0 ---------------       ------
- *                   |           |        |  |
- *                   |           |   P0   |  |
- *                   |           |        |  |
- *                 ---------------       ------
- *        - fst_row->|           |        |  |
- *        |          |           |        |  |
- *       m_loc       |           |   P1   |  |
- *        |          |           |        |  |
- *        -          |           |        |  |
- *                 ---------------       ------
- *                   |    .      |        |. |
- *                   |    .      |        |. |
- *                   |    .      |        |. |
- *                 ---------------       ------
- * 
- * where, fst_row is the row number of the first row,
- *        m_loc is the number of rows local to this processor
- * These are defined in the 'SuperMatrix' structure, see supermatrix.h.
- *
- *
- * Here are the options for using this code:
- *
- *   1. Independent of all the other options specified below, the
- *      user must supply
- *
- *      -  B, the matrix of right-hand sides, distributed by block rows,
- *            and its dimensions ldb (local) and nrhs (global)
- *      -  grid, a structure describing the 2D processor mesh
- *      -  options->IterRefine, which determines whether or not to
- *            improve the accuracy of the computed solution using 
- *            iterative refinement
- *
- *      On output, B is overwritten with the solution X.
- *
- *   2. Depending on options->Fact, the user has four options
- *      for solving A*X=B. The standard option is for factoring
- *      A "from scratch". (The other options, described below,
- *      are used when A is sufficiently similar to a previously 
- *      solved problem to save time by reusing part or all of 
- *      the previous factorization.)
- *
- *      -  options->Fact = DOFACT: A is factored "from scratch"
- *
- *      In this case the user must also supply
- *
- *        o  A, the input matrix
- *
- *        as well as the following options to determine what matrix to
- *        factorize.
- *
- *        o  options->Equil,   to specify how to scale the rows and columns
- *                             of A to "equilibrate" it (to try to reduce its
- *                             condition number and so improve the
- *                             accuracy of the computed solution)
- *
- *        o  options->RowPerm, to specify how to permute the rows of A
- *                             (typically to control numerical stability)
- *
- *        o  options->ColPerm, to specify how to permute the columns of A
- *                             (typically to control fill-in and enhance
- *                             parallelism during factorization)
- *
- *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
- *                             pivots encountered during factorization
- *                             (to control numerical stability)
- *
- *      The outputs returned include
- *         
- *        o  ScalePermstruct,  modified to describe how the input matrix A
- *                             was equilibrated and permuted:
- *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
- *                                         columns of A were scaled
- *          .  ScalePermstruct->R, array of row scale factors
- *          .  ScalePermstruct->C, array of column scale factors
- *          .  ScalePermstruct->perm_r, row permutation vector
- *          .  ScalePermstruct->perm_c, column permutation vector
- *
- *          (part of ScalePermstruct may also need to be supplied on input,
- *           depending on options->RowPerm and options->ColPerm as described 
- *           later).
- *
- *        o  A, the input matrix A overwritten by the scaled and permuted
- *              matrix diag(R)*A*diag(C)*Pc^T, where 
- *              Pc is the row permutation matrix determined by
- *                  ScalePermstruct->perm_c
- *              diag(R) and diag(C) are diagonal scaling matrices determined
- *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and 
- *                  ScalePermstruct->C
- *
- *        o  LUstruct, which contains the L and U factorization of A1 where
- *
- *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
- *
- *               (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
- *                in A on output.)
- *
- *   3. The second value of options->Fact assumes that a matrix with the same
- *      sparsity pattern as A has already been factored:
- *     
- *      -  options->Fact = SamePattern: A is factored, assuming that it has
- *            the same nonzero pattern as a previously factored matrix. In
- *            this case the algorithm saves time by reusing the previously
- *            computed column permutation vector stored in
- *            ScalePermstruct->perm_c and the "elimination tree" of A
- *            stored in LUstruct->etree
- *
- *      In this case the user must still specify the following options
- *      as before:
- *
- *        o  options->Equil
- *        o  options->RowPerm
- *        o  options->ReplaceTinyPivot
- *
- *      but not options->ColPerm, whose value is ignored. This is because the
- *      previous column permutation from ScalePermstruct->perm_c is used as
- *      input. The user must also supply 
- *
- *        o  A, the input matrix
- *        o  ScalePermstruct->perm_c, the column permutation
- *        o  LUstruct->etree, the elimination tree
- *
- *      The outputs returned include
- *         
- *        o  A, the input matrix A overwritten by the scaled and permuted
- *              matrix as described above
- *        o  ScalePermstruct, modified to describe how the input matrix A was
- *                            equilibrated and row permuted
- *        o  LUstruct, modified to contain the new L and U factors
- *
- *   4. The third value of options->Fact assumes that a matrix B with the same
- *      sparsity pattern as A has already been factored, and where the
- *      row permutation of B can be reused for A. This is useful when A and B
- *      have similar numerical values, so that the same row permutation
- *      will make both factorizations numerically stable. This lets us reuse
- *      all of the previously computed structure of L and U.
- *
- *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
- *            assuming not only the same nonzero pattern as the previously
- *            factored matrix B, but reusing B's row permutation.
- *
- *      In this case the user must still specify the following options
- *      as before:
- *
- *        o  options->Equil
- *        o  options->ReplaceTinyPivot
- *
- *      but not options->RowPerm or options->ColPerm, whose values are
- *      ignored. This is because the permutations from ScalePermstruct->perm_r
- *      and ScalePermstruct->perm_c are used as input.
- *
- *      The user must also supply 
- *
- *        o  A, the input matrix
- *        o  ScalePermstruct->DiagScale, how the previous matrix was row
- *                                       and/or column scaled
- *        o  ScalePermstruct->R, the row scalings of the previous matrix,
- *                               if any
- *        o  ScalePermstruct->C, the columns scalings of the previous matrix, 
- *                               if any
- *        o  ScalePermstruct->perm_r, the row permutation of the previous
- *                                    matrix
- *        o  ScalePermstruct->perm_c, the column permutation of the previous 
- *                                    matrix
- *        o  all of LUstruct, the previously computed information about
- *                            L and U (the actual numerical values of L and U
- *                            stored in LUstruct->Llu are ignored)
- *
- *      The outputs returned include
- *         
- *        o  A, the input matrix A overwritten by the scaled and permuted
- *              matrix as described above
- *        o  ScalePermstruct,  modified to describe how the input matrix A was
- *                             equilibrated (thus ScalePermstruct->DiagScale,
- *                             R and C may be modified)
- *        o  LUstruct, modified to contain the new L and U factors
- *
- *   5. The fourth and last value of options->Fact assumes that A is
- *      identical to a matrix that has already been factored on a previous 
- *      call, and reuses its entire LU factorization
- *
- *      -  options->Fact = Factored: A is identical to a previously
- *            factorized matrix, so the entire previous factorization
- *            can be reused.
- *
- *      In this case all the other options mentioned above are ignored
- *      (options->Equil, options->RowPerm, options->ColPerm, 
- *       options->ReplaceTinyPivot)
- *
- *      The user must also supply 
- *
- *        o  A, the unfactored matrix, only in the case that iterative
- *              refinment is to be done (specifically A must be the output
- *              A from the previous call, so that it has been scaled and permuted)
- *        o  all of ScalePermstruct
- *        o  all of LUstruct, including the actual numerical values of
- *           L and U
- *
- *      all of which are unmodified on output.
- *         
- * Arguments
- * =========
- *
- * options (input) superlu_options_t* (global)
- *         The structure defines the input parameters to control
- *         how the LU decomposition will be performed.
- *         The following fields should be defined for this structure:
- *         
- *         o Fact (fact_t)
- *           Specifies whether or not the factored form of the matrix
- *           A is supplied on entry, and if not, how the matrix A should
- *           be factorized based on the previous history.
- *
- *           = DOFACT: The matrix A will be factorized from scratch.
- *                 Inputs:  A
- *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
- *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
- *                              permuted)
- *                          all of ScalePermstruct
- *                          all of LUstruct
- *
- *           = SamePattern: the matrix A will be factorized assuming
- *             that a factorization of a matrix with the same sparsity
- *             pattern was performed prior to this one. Therefore, this
- *             factorization will reuse column permutation vector 
- *             ScalePermstruct->perm_c and the elimination tree
- *             LUstruct->etree
- *                 Inputs:  A
- *                          options->Equil, RowPerm, ReplaceTinyPivot
- *                          ScalePermstruct->perm_c
- *                          LUstruct->etree
- *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
- *                              permuted)
- *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
- *                          rest of LUstruct (GLU_persist, Llu)
- *
- *           = SamePattern_SameRowPerm: the matrix A will be factorized
- *             assuming that a factorization of a matrix with the same
- *             sparsity	pattern and similar numerical values was performed
- *             prior to this one. Therefore, this factorization will reuse
- *             both row and column scaling factors R and C, and the
- *             both row and column permutation vectors perm_r and perm_c,
- *             distributed data structure set up from the previous symbolic
- *             factorization.
- *                 Inputs:  A
- *                          options->Equil, ReplaceTinyPivot
- *                          all of ScalePermstruct
- *                          all of LUstruct
- *                 Outputs: modified A
- *                             (possibly row and/or column scaled and/or 
- *                              permuted)
- *                          modified LUstruct->Llu
- *           = FACTORED: the matrix A is already factored.
- *                 Inputs:  all of ScalePermstruct
- *                          all of LUstruct
- *
- *         o Equil (yes_no_t)
- *           Specifies whether to equilibrate the system.
- *           = NO:  no equilibration.
- *           = YES: scaling factors are computed to equilibrate the system:
- *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
- *                  Whether or not the system will be equilibrated depends
- *                  on the scaling of the matrix A, but if equilibration is
- *                  used, A is overwritten by diag(R)*A*diag(C) and B by
- *                  diag(R)*B.
- *
- *         o RowPerm (rowperm_t)
- *           Specifies how to permute rows of the matrix A.
- *           = NATURAL:   use the natural ordering.
- *           = LargeDiag: use the Duff/Koster algorithm to permute rows of
- *                        the original matrix to make the diagonal large
- *                        relative to the off-diagonal.
- *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
- *                        input by the user.
- *           
- *         o ColPerm (colperm_t)
- *           Specifies what type of column permutation to use to reduce fill.
- *           = NATURAL:       natural ordering.
- *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
- *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
- *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
- *         
- *         o ReplaceTinyPivot (yes_no_t)
- *           = NO:  do not modify pivots
- *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during 
- *                  LU factorization.
- *
- *         o IterRefine (IterRefine_t)
- *           Specifies how to perform iterative refinement.
- *           = NO:     no iterative refinement.
- *           = DOUBLE: accumulate residual in double precision.
- *           = EXTRA:  accumulate residual in extra precision.
- *
- *         NOTE: all options must be indentical on all processes when
- *               calling this routine.
- *
- * A (input/output) SuperMatrix* (local)
- *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
- *           The number of linear equations is A->nrow. The type of A must be:
- *           Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
- *           That is, A is stored in distributed compressed row format.
- *           See supermatrix.h for the definition of 'SuperMatrix'.
- *           This routine only handles square A, however, the LU factorization
- *           routine PDGSTRF can factorize rectangular matrices.
- *         On exit, A may be overwtirren by diag(R)*A*diag(C)*Pc^T,
- *           depending on ScalePermstruct->DiagScale and options->ColPerm:
- *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
- *                diag(R)*A*diag(C).
- *             if options->ColPerm != NATURAL, A is further overwritten by
- *                diag(R)*A*diag(C)*Pc^T.
- *           If all the above condition are true, the LU decomposition is
- *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
- *
- * ScalePermstruct (input/output) ScalePermstruct_t* (global)
- *         The data structure to store the scaling and permutation vectors
- *         describing the transformations performed to the matrix A.
- *         It contains the following fields:
- *
- *         o DiagScale (DiagScale_t)
- *           Specifies the form of equilibration that was done.
- *           = NOEQUIL: no equilibration.
- *           = ROW:     row equilibration, i.e., A was premultiplied by
- *                      diag(R).
- *           = COL:     Column equilibration, i.e., A was postmultiplied
- *                      by diag(C).
- *           = BOTH:    both row and column equilibration, i.e., A was 
- *                      replaced by diag(R)*A*diag(C).
- *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
- *           DiagScale is an input argument; otherwise it is an output
- *           argument.
- *
- *         o perm_r (int*)
- *           Row permutation vector, which defines the permutation matrix Pr;
- *           perm_r[i] = j means row i of A is in position j in Pr*A.
- *           If options->RowPerm = MY_PERMR, or
- *           options->Fact = SamePattern_SameRowPerm, perm_r is an
- *           input argument; otherwise it is an output argument.
- *
- *         o perm_c (int*)
- *           Column permutation vector, which defines the 
- *           permutation matrix Pc; perm_c[i] = j means column i of A is 
- *           in position j in A*Pc.
- *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
- *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
- *           input argument; otherwise, it is an output argument.
- *           On exit, perm_c may be overwritten by the product of the input
- *           perm_c and a permutation that postorders the elimination tree
- *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
- *           is already in postorder.
- *
- *         o R (double*) dimension (A->nrow)
- *           The row scale factors for A.
- *           If DiagScale = ROW or BOTH, A is multiplied on the left by 
- *                          diag(R).
- *           If DiagScale = NOEQUIL or COL, R is not defined.
- *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
- *           an input argument; otherwise, R is an output argument.
- *
- *         o C (double*) dimension (A->ncol)
- *           The column scale factors for A.
- *           If DiagScale = COL or BOTH, A is multiplied on the right by 
- *                          diag(C).
- *           If DiagScale = NOEQUIL or ROW, C is not defined.
- *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
- *           an input argument; otherwise, C is an output argument.
- *         
- * B       (input/output) doublecomplex* (local)
- *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
- *           where, m_loc is the number of rows stored locally on my
- *           process and is defined in the data structure of matrix A.
- *         On exit, the solution matrix if info = 0;
- *
- * ldb     (input) int (local)
- *         The leading dimension of matrix B.
- *
- * nrhs    (input) int (global)
- *         The number of right-hand sides.
- *         If nrhs = 0, only LU decomposition is performed, the forward
- *         and back substitutions are skipped.
- *
- * grid    (input) gridinfo_t* (global)
- *         The 2D process mesh. It contains the MPI communicator, the number
- *         of process rows (NPROW), the number of process columns (NPCOL),
- *         and my process rank. It is an input argument to all the
- *         parallel routines.
- *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
- *         See superlu_zdefs.h for the definition of 'gridinfo_t'.
- *
- * LUstruct (input/output) LUstruct_t*
- *         The data structures to store the distributed L and U factors.
- *         It contains the following fields:
- *
- *         o etree (int*) dimension (A->ncol) (global)
- *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
- *           It is computed in sp_colorder() during the first factorization,
- *           and is reused in the subsequent factorizations of the matrices
- *           with the same nonzero pattern.
- *           On exit of sp_colorder(), the columns of A are permuted so that
- *           the etree is in a certain postorder. This postorder is reflected
- *           in ScalePermstruct->perm_c.
- *           NOTE:
- *           Etree is a vector of parent pointers for a forest whose vertices
- *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
- *
- *         o Glu_persist (Glu_persist_t*) (global)
- *           Global data structure (xsup, supno) replicated on all processes,
- *           describing the supernode partition in the factored matrices
- *           L and U:
- *	       xsup[s] is the leading column of the s-th supernode,
- *             supno[i] is the supernode number to which column i belongs.
- *
- *         o Llu (LocalLU_t*) (local)
- *           The distributed data structures to store L and U factors.
- *           See superlu_zdefs.h for the definition of 'LocalLU_t'.
- *
- * SOLVEstruct (input/output) SOLVEstruct_t*
- *         The data structure to hold the communication pattern used
- *         in the phases of triangular solution and iterative refinement.
- *         This pattern should be intialized only once for repeated solutions.
- *         If options->SolveInitialized = YES, it is an input argument.
- *         If options->SolveInitialized = NO and nrhs != 0, it is an output
- *         argument. See superlu_zdefs.h for the definition of 'SOLVEstruct_t'.
- *
- * berr    (output) double*, dimension (nrhs) (global)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
- *         any element of A or B that makes X(j) an exact solution).
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics on runtime and floating-point operation count.
- *        See util.h for the definition of 'SuperLUStat_t'.
- *
- * info    (output) int*
- *         = 0: successful exit
- *         > 0: if info = i, and i is
- *             <= A->ncol: U(i,i) is exactly zero. The factorization has
- *                been completed, but the factor U is exactly singular,
- *                so the solution could not be computed.
- *             > A->ncol: number of bytes allocated when memory allocation
- *                failure occurred, plus A->ncol.
- *
- * See superlu_zdefs.h for the definitions of varioous data types.
- *
- */
-    NRformat_loc *Astore;
-    SuperMatrix GA;      /* Global A in NC format */
-    NCformat *GAstore;
-    doublecomplex   *a_GA;
-    SuperMatrix GAC;      /* Global A in NCP format (add n end pointers) */
-    NCPformat *GACstore;
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    Glu_freeable_t *Glu_freeable;
-            /* The nonzero structures of L and U factors, which are
-	       replicated on all processrs.
-	           (lsub, xlsub) contains the compressed subscript of
-		                 supernodes in L.
-          	   (usub, xusub) contains the compressed subscript of
-		                 nonzero segments in U.
-	      If options->Fact != SamePattern_SameRowPerm, they are 
-	      computed by SYMBFACT routine, and then used by PDDISTRIBUTE
-	      routine. They will be freed after PDDISTRIBUTE routine.
-	      If options->Fact == SamePattern_SameRowPerm, these
-	      structures are not used.                                  */
-    fact_t   Fact;
-    doublecomplex   *a;
-    int_t    *colptr, *rowind;
-    int_t    *perm_r; /* row permutations from partial pivoting */
-    int_t    *perm_c; /* column permutation vector */
-    int_t    *etree;  /* elimination tree */
-    int_t    *rowptr, *colind;  /* Local A in NR*/
-    int_t    *rowind_loc, *colptr_loc;
-    int_t    colequ, Equil, factored, job, notran, rowequ, need_value;
-    int_t    i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use;
-    int_t    nnz_loc, m_loc, fst_row, icol;
-    int      iam;
-    int      ldx;  /* LDA for matrix X (local). */
-    char     equed[1], norm[1];
-    double   *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
-    doublecomplex   *X, *b_col, *b_work, *x_col;
-    double   t;
-    static mem_usage_t num_mem_usage, symb_mem_usage;
-#if ( PRNTlevel>= 2 )
-    double   dmin, dsum, dprod;
-#endif
-    int_t procs;
-
-    /* Structures needed for parallel symbolic factorization */
-    int_t *sizes, *fstVtxSep, parSymbFact;
-    int   noDomains, nprocs_num;
-    MPI_Comm symb_comm; /* communicator for symbolic factorization */
-    int   col, key; /* parameters for creating a new communicator */
-    Pslu_freeable_t Pslu_freeable;
-    float  flinfo;
-
-    /* Initialization. */
-    m       = A->nrow;
-    n       = A->ncol;
-    Astore  = (NRformat_loc *) A->Store;
-    nnz_loc = Astore->nnz_loc;
-    m_loc   = Astore->m_loc;
-    fst_row = Astore->fst_row;
-    a       = (doublecomplex *) Astore->nzval;
-    rowptr  = Astore->rowptr;
-    colind  = Astore->colind;
-    sizes   = NULL;
-    fstVtxSep = NULL;
-    symb_comm = MPI_COMM_NULL;
-
-    /* Test the input parameters. */
-    *info = 0;
-    Fact = options->Fact;
-    if ( Fact < 0 || Fact > FACTORED )
-	*info = -1;
-    else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR )
-	*info = -1;
-    else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC )
-	*info = -1;
-    else if ( options->IterRefine < 0 || options->IterRefine > EXTRA )
-	*info = -1;
-    else if ( options->IterRefine == EXTRA ) {
-	*info = -1;
-	fprintf(stderr, "Extra precise iterative refinement yet to support.");
-    } else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
-		|| A->Dtype != SLU_Z || A->Mtype != SLU_GE )
-	*info = -2;
-    else if ( ldb < m_loc )
-	*info = -5;
-    else if ( nrhs < 0 )
-	*info = -6;
-    if ( *info ) {
-	i = -(*info);
-	pxerbla("pzgssvx", grid, -*info);
-	return;
-    }
-
-    factored = (Fact == FACTORED);
-    Equil = (!factored && options->Equil == YES);
-    notran = (options->Trans == NOTRANS);
-    iam = grid->iam;
-    job = 5;
-    if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) {
-	rowequ = (ScalePermstruct->DiagScale == ROW) ||
-	         (ScalePermstruct->DiagScale == BOTH);
-	colequ = (ScalePermstruct->DiagScale == COL) ||
-	         (ScalePermstruct->DiagScale == BOTH);
-    } else rowequ = colequ = FALSE;
-
-    /* The following arrays are replicated on all processes. */
-    perm_r = ScalePermstruct->perm_r;
-    perm_c = ScalePermstruct->perm_c;
-    etree = LUstruct->etree;
-    R = ScalePermstruct->R;
-    C = ScalePermstruct->C;
-    /********/
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pzgssvx()");
-#endif
-
-    /* Not factored & ask for equilibration */
-    if ( Equil && Fact != SamePattern_SameRowPerm ) { 
-	/* Allocate storage if not done so before. */
-	switch ( ScalePermstruct->DiagScale ) {
-	    case NOEQUIL:
-		if ( !(R = (double *) doubleMalloc_dist(m)) )
-		    ABORT("Malloc fails for R[].");
-	        if ( !(C = (double *) doubleMalloc_dist(n)) )
-		    ABORT("Malloc fails for C[].");
-		ScalePermstruct->R = R;
-		ScalePermstruct->C = C;
-		break;
-	    case ROW: 
-	        if ( !(C = (double *) doubleMalloc_dist(n)) )
-		    ABORT("Malloc fails for C[].");
-		ScalePermstruct->C = C;
-		break;
-	    case COL: 
-		if ( !(R = (double *) doubleMalloc_dist(m)) )
-		    ABORT("Malloc fails for R[].");
-		ScalePermstruct->R = R;
-		break;
-	}
-    }
-
-    /* ------------------------------------------------------------
-       Diagonal scaling to equilibrate the matrix.
-       ------------------------------------------------------------*/
-    if ( Equil ) {
-#if ( DEBUGlevel>=1 )
-	CHECK_MALLOC(iam, "Enter equil");
-#endif
-	t = SuperLU_timer_();
-
-	if ( Fact == SamePattern_SameRowPerm ) {
-	    /* Reuse R and C. */
-	    switch ( ScalePermstruct->DiagScale ) {
-	      case NOEQUIL:
-		break;
-	      case ROW:
-		irow = fst_row;
-		for (j = 0; j < m_loc; ++j) {
-		    for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
-                        zd_mult(&a[i], &a[i], R[irow]); /* Scale rows */
-		    }
-		    ++irow;
-		}
-		break;
-	      case COL:
-		for (j = 0; j < m_loc; ++j)
-		    for (i = rowptr[j]; i < rowptr[j+1]; ++i){
-		        icol = colind[i];
-                        zd_mult(&a[i], &a[i], C[icol]); /* Scale columns */
-		    }
-		break;
-	      case BOTH:
-		irow = fst_row;
-		for (j = 0; j < m_loc; ++j) {
-		    for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
-			icol = colind[i];
-                        zd_mult(&a[i], &a[i], R[irow]); /* Scale rows */
-                        zd_mult(&a[i], &a[i], C[icol]); /* Scale columns */
-		    }
-		    ++irow;
-		}
-	        break;
-	    }
-	} else { /* Compute R & C from scratch */
-            /* Compute the row and column scalings. */
-	    pzgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid);
-
-	    /* Equilibrate matrix A if it is badly-scaled. */
-	    pzlaqgs(A, R, C, rowcnd, colcnd, amax, equed);
-
-	    if ( lsame_(equed, "R") ) {
-		ScalePermstruct->DiagScale = rowequ = ROW;
-	    } else if ( lsame_(equed, "C") ) {
-		ScalePermstruct->DiagScale = colequ = COL;
-	    } else if ( lsame_(equed, "B") ) {
-		ScalePermstruct->DiagScale = BOTH;
-		rowequ = ROW;
-		colequ = COL;
-	    } else ScalePermstruct->DiagScale = NOEQUIL;
-
-#if ( PRNTlevel>=1 )
-	    if ( !iam ) {
-		printf(".. equilibrated? *equed = %c\n", *equed);
-		/*fflush(stdout);*/
-	    }
-#endif
-	} /* if Fact ... */
-
-	stat->utime[EQUIL] = SuperLU_timer_() - t;
-#if ( DEBUGlevel>=1 )
-	CHECK_MALLOC(iam, "Exit equil");
-#endif
-    } /* if Equil ... */
-
-    if ( !factored ) { /* Skip this if already factored. */
-        /*
-         * Gather A from the distributed compressed row format to
-         * global A in compressed column format.
-         * Numerical values are gathered only when a row permutation
-         * for large diagonal is sought after.
-         */
-	if ( Fact != SamePattern_SameRowPerm ) {
-            need_value = (options->RowPerm == LargeDiag);
-            pzCompRow_loc_to_CompCol_global(need_value, A, grid, &GA);
-            GAstore = (NCformat *) GA.Store;
-            colptr = GAstore->colptr;
-            rowind = GAstore->rowind;
-            nnz = GAstore->nnz;
-            if ( need_value ) a_GA = (doublecomplex *) GAstore->nzval;
-            else assert(GAstore->nzval == NULL);
-	}
-
-        /* ------------------------------------------------------------
-           Find the row permutation for A.
-           ------------------------------------------------------------*/
-        if ( options->RowPerm != NO ) {
-	    t = SuperLU_timer_();
-	    if ( Fact != SamePattern_SameRowPerm ) {
-	        if ( options->RowPerm == MY_PERMR ) { /* Use user's perm_r. */
-	            /* Permute the global matrix GA for symbfact() */
-	            for (i = 0; i < colptr[n]; ++i) {
-	            	irow = rowind[i]; 
-		    	rowind[i] = perm_r[irow];
-	            }
-	        } else { /* options->RowPerm == LargeDiag */
-	            /* Get a new perm_r[] */
-	            if ( job == 5 ) {
-		        /* Allocate storage for scaling factors. */
-		        if ( !(R1 = doubleMalloc_dist(m)) )
-		            ABORT("SUPERLU_MALLOC fails for R1[]");
-		    	if ( !(C1 = doubleMalloc_dist(n)) )
-		            ABORT("SUPERLU_MALLOC fails for C1[]");
-	            }
-
-	            if ( !iam ) {
-		        /* Process 0 finds a row permutation */
-		        zldperm(job, m, nnz, colptr, rowind, a_GA,
-		                perm_r, R1, C1);
-		
-		        MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
-		        if ( job == 5 && Equil ) {
-		            MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
-		            MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
-		        }
-	            } else {
-		        MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
-		        if ( job == 5 && Equil ) {
-		            MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
-		            MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
-		        }
-	            }
-
-#if ( PRNTlevel>=2 )
-	            dmin = dlamch_("Overflow");
-	            dsum = 0.0;
-	            dprod = 1.0;
-#endif
-	            if ( job == 5 ) {
-		        if ( Equil ) {
-		            for (i = 0; i < n; ++i) {
-			        R1[i] = exp(R1[i]);
-			        C1[i] = exp(C1[i]);
-		            }
-
-		            /* Scale the distributed matrix */
-		            irow = fst_row;
-		            for (j = 0; j < m_loc; ++j) {
-			        for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
-			            icol = colind[i];
-                                    zd_mult(&a[i], &a[i], R1[irow]);
-                                    zd_mult(&a[i], &a[i], C1[icol]);
-#if ( PRNTlevel>=2 )
-			            if ( perm_r[irow] == icol ) { /* New diagonal */
-			              if ( job == 2 || job == 3 )
-			                dmin = SUPERLU_MIN(dmin, z_abs1(&a[i]));
-			              else if ( job == 4 )
-				        dsum += z_abs1(&a[i]);
-			              else if ( job == 5 )
-				        dprod *= z_abs1(&a[i]);
-			            }
-#endif
-			        }
-			        ++irow;
-		            }
-
-		            /* Multiply together the scaling factors. */
-		            if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i];
-		            else for (i = 0; i < m; ++i) R[i] = R1[i];
-		            if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i];
-		            else for (i = 0; i < n; ++i) C[i] = C1[i];
-		    
-		            ScalePermstruct->DiagScale = BOTH;
-		            rowequ = colequ = 1;
-
-		        } /* end Equil */
-
-                        /* Now permute global A to prepare for symbfact() */
-                        for (j = 0; j < n; ++j) {
-		            for (i = colptr[j]; i < colptr[j+1]; ++i) {
-	                        irow = rowind[i];
-		                rowind[i] = perm_r[irow];
-		            }
-		        }
-		        SUPERLU_FREE (R1);
-		        SUPERLU_FREE (C1);
-	            } else { /* job = 2,3,4 */
-		        for (j = 0; j < n; ++j) {
-		            for (i = colptr[j]; i < colptr[j+1]; ++i) {
-			        irow = rowind[i];
-			        rowind[i] = perm_r[irow];
-		            } /* end for i ... */
-		        } /* end for j ... */
-	            } /* end else job ... */
-
-#if ( PRNTlevel>=2 )
-	            if ( job == 2 || job == 3 ) {
-		        if ( !iam ) printf("\tsmallest diagonal %e\n", dmin);
-	            } else if ( job == 4 ) {
-		        if ( !iam ) printf("\tsum of diagonal %e\n", dsum);
-	            } else if ( job == 5 ) {
-		        if ( !iam ) printf("\t product of diagonal %e\n", dprod);
-	            }
-#endif
-	    
-                } /* end if options->RowPerm ... */
-
-	        t = SuperLU_timer_() - t;
-	        stat->utime[ROWPERM] = t;
-#if ( PRNTlevel>=1 )
-	        if ( !iam ) printf(".. LDPERM job %d\t time: %.2f\n", job, t);
-#endif
-            } /* end if Fact ... */
-        } else { /* options->RowPerm == NOROWPERM */
-            for (i = 0; i < m; ++i) perm_r[i] = i;
-        }
-
-#if ( DEBUGlevel>=2 )
-        if ( !iam ) PrintInt10("perm_r",  m, perm_r);
-#endif
-    } /* end if (!factored) */
-
-    if ( !factored || options->IterRefine ) {
-	/* Compute norm(A), which will be used to adjust small diagonal. */
-	if ( notran ) *(unsigned char *)norm = '1';
-	else *(unsigned char *)norm = 'I';
-	anorm = pzlangs(norm, A, grid);
-#if ( PRNTlevel>=1 )
-	if ( !iam ) printf(".. anorm %e\n", anorm);
-#endif
-    }
-
-    /* ------------------------------------------------------------
-       Perform the LU factorization.
-       ------------------------------------------------------------*/
-    if ( !factored ) {
-	t = SuperLU_timer_();
-	/*
-	 * Get column permutation vector perm_c[], according to permc_spec:
-	 *   permc_spec = NATURAL:  natural ordering 
-	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
-	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
-	 *   permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A
-	 *   permc_spec = PARMETIS: parallel METIS on structure of A'+A
-	 *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
-	 */
-	permc_spec = options->ColPerm;
-	parSymbFact = options->ParSymbFact;
-
-#if ( PRNTlevel>=1 )
-	if ( parSymbFact && permc_spec != PARMETIS )
-	    if ( !iam ) printf(".. Parallel symbolic factorization"
-			       " only works wth ParMetis!\n");
-#endif
-
-	if ( parSymbFact == YES || permc_spec == PARMETIS ) {	
-	    nprocs_num = grid->nprow * grid->npcol;
-  	    noDomains = (int) ( pow(2, ((int) log2( (double)nprocs_num ))));
-
-	    /* create a new communicator for the first noDomains processors in
-	       grid->comm */
-	    key = iam;
-    	    if (iam < noDomains) col = 0;
-	    else col = MPI_UNDEFINED;
-	    MPI_Comm_split (grid->comm, col, key, &symb_comm );
-
-	    permc_spec = PARMETIS; /* only works with PARMETIS */
-        }
-
-	if ( permc_spec != MY_PERMC && Fact == DOFACT ) {
-	  if ( permc_spec == PARMETIS ) {
-	      /* Get column permutation vector in perm_c.                    *
-	       * This routine takes as input the distributed input matrix A  *
-	       * and does not modify it.  It also allocates memory for       *
-	       * sizes[] and fstVtxSep[] arrays, that contain information    *
-	       * on the separator tree computed by ParMETIS.                 */
-	      flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num,
-                                  	   noDomains, &sizes, &fstVtxSep,
-                                           grid, &symb_comm);
-	      if (flinfo > 0)
-	          ABORT("ERROR in get perm_c parmetis.");
-	  } else {
-	      get_perm_c_dist(iam, permc_spec, &GA, perm_c);
-          }
-        }
-
-	stat->utime[COLPERM] = SuperLU_timer_() - t;
-
-	/* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'
-	   (a.k.a. column etree), depending on the choice of ColPerm.
-	   Adjust perm_c[] to be consistent with a postorder of etree.
-	   Permute columns of A to form A*Pc'. */
-	if ( Fact != SamePattern_SameRowPerm ) {
-	    if ( parSymbFact == NO ) {
-	        int_t *GACcolbeg, *GACcolend, *GACrowind;
-
-	        sp_colorder(options, &GA, perm_c, etree, &GAC); 
-
-	        /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */
-	        GACstore = (NCPformat *) GAC.Store;
-	        GACcolbeg = GACstore->colbeg;
-	        GACcolend = GACstore->colend;
-	        GACrowind = GACstore->rowind;
-	        for (j = 0; j < n; ++j) {
-	            for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) {
-		        irow = GACrowind[i];
-		        GACrowind[i] = perm_c[irow];
-	            }
-	        }
-
-	        /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up
-                   the nonzero data structures for L & U. */
-#if ( PRNTlevel>=1 ) 
-                if ( !iam )
-		  printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n",
-		          sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
-#endif
-  	        t = SuperLU_timer_();
-	        if ( !(Glu_freeable = (Glu_freeable_t *)
-		      SUPERLU_MALLOC(sizeof(Glu_freeable_t))) )
-		    ABORT("Malloc fails for Glu_freeable.");
-
-	    	/* Every process does this. */
-	    	iinfo = symbfact(options, iam, &GAC, perm_c, etree, 
-			     	 Glu_persist, Glu_freeable);
-
-	    	stat->utime[SYMBFAC] = SuperLU_timer_() - t;
-	    	if ( iinfo < 0 ) { /* Successful return */
-		    QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage);
-#if ( PRNTlevel>=1 )
-		    if ( !iam ) {
-		    	printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1);
-		    	printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]);
-		    	printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]);
-		    	printf("\tint %d, short %d, float %d, double %d\n", 
-			       sizeof(int_t), sizeof(short), sizeof(float),
-			       sizeof(double));
-		    	printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n",
-			   	symb_mem_usage.for_lu*1e-6, 
-			   	symb_mem_usage.total*1e-6,
-			   	symb_mem_usage.expansions);
-		    }
-#endif
-	    	} else {
-		    if ( !iam ) {
-		        fprintf(stderr,"symbfact() error returns %d\n",iinfo);
-		    	exit(-1);
-		    }
-	        }
-	    } /* end if serial symbolic factorization */
-	    else {  /* parallel symbolic factorization */
-	    	t = SuperLU_timer_();
-	    	flinfo = symbfact_dist(nprocs_num, noDomains, A, perm_c, perm_r,
-				       sizes, fstVtxSep, &Pslu_freeable, 
-				       &(grid->comm), &symb_comm,
-				       &symb_mem_usage); 
-	    	stat->utime[SYMBFAC] = SuperLU_timer_() - t;
-	    	if (flinfo > 0) 
-	      	    ABORT("Insufficient memory for parallel symbolic factorization.");
-	    }
-	} /* end if Fact ... */
-
-	if (!iam) printf("\tSYMBfact time: %.2f\n", stat->utime[SYMBFAC]);
-        if (sizes) SUPERLU_FREE (sizes);
-        if (fstVtxSep) SUPERLU_FREE (fstVtxSep);
-	if (symb_comm != MPI_COMM_NULL)
-	  MPI_Comm_free (&symb_comm); 
-
-	if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) {
-  	    /* Apply column permutation to the original distributed A */
-	    for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]];
-
-	    /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. 
-	       NOTE: the row permutation Pc*Pr is applied internally in the
-  	       distribution routine. */
-	    t = SuperLU_timer_();
-	    dist_mem_use = pzdistribute(Fact, n, A, ScalePermstruct,
-                                      Glu_freeable, LUstruct, grid);
-	    stat->utime[DIST] = SuperLU_timer_() - t;
-
-  	    /* Deallocate storage used in symbolic factorization. */
-	    if ( Fact != SamePattern_SameRowPerm ) {
-	        iinfo = symbfact_SubFree(Glu_freeable);
-	        SUPERLU_FREE(Glu_freeable);
-	    }
-	} else {
-	    /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. 
-	       NOTE: the row permutation Pc*Pr is applied internally in the
-	       distribution routine. */
-	    /* Apply column permutation to the original distributed A */
-	    for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]];
-
-    	    t = SuperLU_timer_();
-	    dist_mem_use = zdist_psymbtonum(Fact, n, A, ScalePermstruct,
-		  			   &Pslu_freeable, LUstruct, grid);
-	    if (dist_mem_use > 0)
-	        ABORT ("Not enough memory available for dist_psymbtonum\n");
-	    stat->utime[DIST] = SuperLU_timer_() - t;
-	}
-
-	if (!iam) printf ("\tDISTRIBUTE time    %8.2f\n", stat->utime[DIST]);
-
-	/* Perform numerical factorization in parallel. */
-	t = SuperLU_timer_();
-	pzgstrf(options, m, n, anorm, LUstruct, grid, stat, info);
-	stat->utime[FACT] = SuperLU_timer_() - t;
-
-#if ( PRNTlevel>=1 )
-	{
-	    int_t TinyPivots;
-	    float for_lu, total, max, avg, temp;
-	    zQuerySpace_dist(n, LUstruct, grid, &num_mem_usage);
-	    MPI_Reduce( &num_mem_usage.for_lu, &for_lu,
-		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-	    MPI_Reduce( &num_mem_usage.total, &total,
-		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-	    temp = SUPERLU_MAX(symb_mem_usage.total,
-			       symb_mem_usage.for_lu +
-			       (float)dist_mem_use + num_mem_usage.for_lu);
-	    if (parSymbFact == TRUE)
-	      /* The memory used in the redistribution routine
-		 includes the memory used for storing the symbolic
-		 structure and the memory allocated for numerical
-		 factorization */
-	      temp = SUPERLU_MAX(symb_mem_usage.total,
-				 (float)dist_mem_use);
-	    temp = SUPERLU_MAX(temp, num_mem_usage.total);
-	    MPI_Reduce( &temp, &max,
-		       1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
-	    MPI_Reduce( &temp, &avg,
-		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-	    MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t,
-			  MPI_SUM, grid->comm );
-	    stat->TinyPivots = TinyPivots;
-	    if ( !iam ) {
-		printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n",
-		       for_lu*1e-6, total*1e-6);
-		printf("\tAll space (MB):"
-		       "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n",
-		       avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6);
-		printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots);
-	    }
-	}
-#endif
-    
-        /* Destroy GA */
-        if ( Fact != SamePattern_SameRowPerm )
-            Destroy_CompCol_Matrix_dist(&GA);
-    } /* end if (!factored) */
-	
-    /* ------------------------------------------------------------
-       Compute the solution matrix X.
-       ------------------------------------------------------------*/
-    if ( nrhs ) {
-
-	if ( !(b_work = doublecomplexMalloc_dist(n)) )
-	    ABORT("Malloc fails for b_work[]");
-
-	/* ------------------------------------------------------------
-	   Scale the right-hand side if equilibration was performed. 
-	   ------------------------------------------------------------*/
-	if ( notran ) {
-	    if ( rowequ ) {
-		b_col = B;
-		for (j = 0; j < nrhs; ++j) {
-		    irow = fst_row;
-		    for (i = 0; i < m_loc; ++i) {
-                        zd_mult(&b_col[i], &b_col[i], R[irow]);
-		        ++irow;
-		    }
-		    b_col += ldb;
-		}
-	    }
-	} else if ( colequ ) {
-	    b_col = B;
-	    for (j = 0; j < nrhs; ++j) {
-	        irow = fst_row;
-		for (i = 0; i < m_loc; ++i) {
-		    zd_mult(&b_col[i], &b_col[i], C[irow]);
-		    ++irow;
-		}
-		b_col += ldb;
-	    }
-	}
-
-	/* Save a copy of the right-hand side. */
-	ldx = ldb;
-	if ( !(X = doublecomplexMalloc_dist(((size_t)ldx) * nrhs)) )
-	    ABORT("Malloc fails for X[]");
-	x_col = X;  b_col = B;
-	for (j = 0; j < nrhs; ++j) {
-	    for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i];
-	    x_col += ldx;  b_col += ldb;
-	}
-
-	/* ------------------------------------------------------------
-	   Solve the linear system.
-	   ------------------------------------------------------------*/
-	if ( options->SolveInitialized == NO ) {
-	    zSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, grid,
-		       SOLVEstruct);
-	}
-
-	pzgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, 
-		fst_row, ldb, nrhs, SOLVEstruct, stat, info);
-
-	/* ------------------------------------------------------------
-	   Use iterative refinement to improve the computed solution and
-	   compute error bounds and backward error estimates for it.
-	   ------------------------------------------------------------*/
-	if ( options->IterRefine ) {
-	    /* Improve the solution by iterative refinement. */
-	    int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv;
-	    SOLVEstruct_t *SOLVEstruct1;  /* Used by refinement. */
-
-	    t = SuperLU_timer_();
-	    if ( options->RefineInitialized == NO || Fact == DOFACT ) {
-	        /* All these cases need to re-initialize gsmv structure */
-	        if ( options->RefineInitialized )
-		    pzgsmv_finalize(SOLVEstruct->gsmv_comm);
-	        pzgsmv_init(A, SOLVEstruct->row_to_proc, grid,
-			    SOLVEstruct->gsmv_comm);
-	       
-                /* Save a copy of the transformed local col indices
-		   in colind_gsmv[]. */
-	        if ( colind_gsmv ) SUPERLU_FREE(colind_gsmv);
-	        if ( !(it = intMalloc_dist(nnz_loc)) )
-		    ABORT("Malloc fails for colind_gsmv[]");
-	        colind_gsmv = SOLVEstruct->A_colind_gsmv = it;
-	        for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i];
-	        options->RefineInitialized = YES;
-	    } else if ( Fact == SamePattern ||
-			Fact == SamePattern_SameRowPerm ) {
-	        doublecomplex at;
-	        int_t k, jcol, p;
-	        /* Swap to beginning the part of A corresponding to the
-		   local part of X, as was done in pzgsmv_init() */
-	        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
-		    k = rowptr[i];
-		    for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
-		        jcol = colind[j];
-		        p = SOLVEstruct->row_to_proc[jcol];
-		        if ( p == iam ) { /* Local */
-		            at = a[k]; a[k] = a[j]; a[j] = at;
-		            ++k;
-		        }
-		    }
-	        }
-	      
-	        /* Re-use the local col indices of A obtained from the
-		   previous call to pzgsmv_init() */
-	        for (i = 0; i < nnz_loc; ++i) colind[i] = colind_gsmv[i];
-	    }
-
-	    if ( nrhs == 1 ) { /* Use the existing solve structure */
-	        SOLVEstruct1 = SOLVEstruct;
-	    } else { /* For nrhs > 1, since refinement is performed for RHS
-			one at a time, the communication structure for pdgstrs
-			is different than the solve with nrhs RHS. 
-			So we use SOLVEstruct1 for the refinement step.
-		     */
-	        if ( !(SOLVEstruct1 = (SOLVEstruct_t *) 
-		                       SUPERLU_MALLOC(sizeof(SOLVEstruct_t))) )
-		    ABORT("Malloc fails for SOLVEstruct1");
-	        /* Copy the same stuff */
-	        SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc;
-	        SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c;
-	        SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs;
-	        SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs;
-	        SOLVEstruct1->diag_len = SOLVEstruct->diag_len;
-	        SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm;
-	        SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv;
-		
-		/* Initialize the *gstrs_comm for 1 RHS. */
-		if ( !(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *)
-		       SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) )
-		    ABORT("Malloc fails for gstrs_comm[]");
-		pxgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid, 
-			     Glu_persist, SOLVEstruct1);
-	    }
-
-	    pzgsrfs(n, A, anorm, LUstruct, ScalePermstruct, grid,
-		    B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info);
-
-            /* Deallocate the storage associated with SOLVEstruct1 */
-	    if ( nrhs > 1 ) {
-	        pxgstrs_finalize(SOLVEstruct1->gstrs_comm);
-	        SUPERLU_FREE(SOLVEstruct1);
-	    }
-
-	    stat->utime[REFINE] = SuperLU_timer_() - t;
-	}
-
-	/* Permute the solution matrix B <= Pc'*X. */
-	pzPermute_Dense_Matrix(fst_row, m_loc, SOLVEstruct->row_to_proc,
-			       SOLVEstruct->inv_perm_c,
-			       X, ldx, B, ldb, nrhs, grid);
-#if ( DEBUGlevel>=2 )
-	printf("\n (%d) .. After pzPermute_Dense_Matrix(): b =\n", iam);
-	for (i = 0; i < m_loc; ++i)
-	  printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, B[i]);
-#endif
-	
-	/* Transform the solution matrix X to a solution of the original
-	   system before the equilibration. */
-	if ( notran ) {
-	    if ( colequ ) {
-		b_col = B;
-		for (j = 0; j < nrhs; ++j) {
-		    irow = fst_row;
-		    for (i = 0; i < m_loc; ++i) {
-                        zd_mult(&b_col[i], &b_col[i], C[irow]);
-		        ++irow;
-		    }
-		    b_col += ldb;
-		}
-	    }
-	} else if ( rowequ ) {
-	    b_col = B;
-	    for (j = 0; j < nrhs; ++j) {
-	        irow = fst_row;
-		for (i = 0; i < m_loc; ++i) {
-		    zd_mult(&b_col[i], &b_col[i], R[irow]);
-		    ++irow;
-		}
-		b_col += ldb;
-	    }
-	}
-
-	SUPERLU_FREE(b_work);
-	SUPERLU_FREE(X);
-
-    } /* end if nrhs != 0 */
-
-#if ( PRNTlevel>=1 )
-    if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale);
-#endif
-
-    /* Deallocate R and/or C if it was not used. */
-    if ( Equil && Fact != SamePattern_SameRowPerm ) {
-	switch ( ScalePermstruct->DiagScale ) {
-	    case NOEQUIL:
-	        SUPERLU_FREE(R);
-		SUPERLU_FREE(C);
-		break;
-	    case ROW: 
-		SUPERLU_FREE(C);
-		break;
-	    case COL: 
-		SUPERLU_FREE(R);
-		break;
-	}
-    }
-    if ( !factored && Fact != SamePattern_SameRowPerm && !parSymbFact)
- 	Destroy_CompCol_Permuted_dist(&GAC);
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pzgssvx()");
-#endif
-
-}
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgstrf_irecv.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgstrf_irecv.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgstrf_irecv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgstrf_irecv.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,1278 +0,0 @@
-
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- * Modified:
- *     Feburary 7, 2001    use MPI_Isend/MPI_Irecv
- */
-
-#include <math.h>
-#include "superlu_zdefs.h"
-
-/*
- * Internal prototypes
- */
-static void pzgstrf2(superlu_options_t *, int_t, double, Glu_persist_t *,
-		     gridinfo_t *, LocalLU_t *, SuperLUStat_t *, int *);
-#ifdef _CRAY
-static void pzgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *,
-		     LocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd);
-#else
-static void pzgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *,
-		     LocalLU_t *, SuperLUStat_t *);
-#endif
-
-/* 
- * Sketch of the algorithm
- * =======================
- *
- * The following relations hold:
- *     * A_kk = L_kk * U_kk
- *     * L_ik = Aik * U_kk^(-1)
- *     * U_kj = L_kk^(-1) * A_kj
- *
- *              ----------------------------------
- *              |   |                            |
- *              ----|-----------------------------
- *              |   | \ U_kk|                    |
- *              |   |   \   |        U_kj        |
- *              |   |L_kk \ |         ||         |
- *              ----|-------|---------||----------
- *              |   |       |         \/         |
- *              |   |       |                    |
- *              |   |       |                    |
- *              |   |       |                    |
- *              |   | L_ik ==>       A_ij        |
- *              |   |       |                    |
- *              |   |       |                    |
- *              |   |       |                    |
- *              ----------------------------------
- *
- * Handle the first block of columns separately.
- *     * Factor diagonal and subdiagonal blocks and test for exact
- *       singularity. ( pzgstrf2(0), one column at a time )
- *     * Compute block row of U
- *     * Update trailing matrix
- * 
- * Loop over the remaining blocks of columns.
- *   mycol = MYCOL( iam, grid );
- *   myrow = MYROW( iam, grid );
- *   N = nsupers;
- *   For (k = 1; k < N; ++k) {
- *       krow = PROW( k, grid );
- *       kcol = PCOL( k, grid );
- *       Pkk = PNUM( krow, kcol, grid );
- *
- *     * Factor diagonal and subdiagonal blocks and test for exact
- *       singularity.
- *       if ( mycol == kcol ) {
- *           pzgstrf2(k), one column at a time 
- *       }
- *
- *     * Parallel triangular solve
- *       if ( iam == Pkk ) multicast L_k,k to this process row;
- *       if ( myrow == krow && mycol != kcol ) {
- *          Recv L_k,k from process Pkk;
- *          for (j = k+1; j < N; ++j) 
- *              if ( PCOL( j, grid ) == mycol && A_k,j != 0 )
- *                 U_k,j = L_k,k \ A_k,j;
- *       }
- *
- *     * Parallel rank-k update
- *       if ( myrow == krow ) multicast U_k,k+1:N to this process column;
- *       if ( mycol == kcol ) multicast L_k+1:N,k to this process row;
- *       if ( myrow != krow ) {
- *          Pkj = PNUM( krow, mycol, grid );
- *          Recv U_k,k+1:N from process Pkj;
- *       }
- *       if ( mycol != kcol ) {
- *          Pik = PNUM( myrow, kcol, grid );
- *          Recv L_k+1:N,k from process Pik;
- *       }
- *       for (j = k+1; k < N; ++k) {
- *          for (i = k+1; i < N; ++i) 
- *              if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
- *                   && L_i,k != 0 && U_k,j != 0 )
- *                 A_i,j = A_i,j - L_i,k * U_k,j;
- *       }
- *  }
- *
- *
- * Remaining issues
- *   (1) Use local indices for L subscripts and SPA.  [DONE]
- *
- */
-/************************************************************************/
-int_t pzgstrf
-/************************************************************************/
-(
- superlu_options_t *options, int m, int n, double anorm,
- LUstruct_t *LUstruct, gridinfo_t *grid, SuperLUStat_t *stat, int *info
- )
-/* 
- * Purpose
- * =======
- *
- *  PZGSTRF performs the LU factorization in parallel.
- *
- * Arguments
- * =========
- * 
- * options (input) superlu_options_t*
- *         The structure defines the input parameters to control
- *         how the LU decomposition will be performed.
- *         The following field should be defined:
- *         o ReplaceTinyPivot (yes_no_t)
- *           Specifies whether to replace the tiny diagonals by
- *           sqrt(epsilon)*norm(A) during LU factorization.
- *
- * m      (input) int
- *        Number of rows in the matrix.
- *
- * n      (input) int
- *        Number of columns in the matrix.
- *
- * anorm  (input) double
- *        The norm of the original matrix A, or the scaled A if
- *        equilibration was done.
- *
- * LUstruct (input/output) LUstruct_t*
- *         The data structures to store the distributed L and U factors.
- *         The following fields should be defined:
- *
- *         o Glu_persist (input) Glu_persist_t*
- *           Global data structure (xsup, supno) replicated on all processes,
- *           describing the supernode partition in the factored matrices
- *           L and U:
- *	       xsup[s] is the leading column of the s-th supernode,
- *             supno[i] is the supernode number to which column i belongs.
- *
- *         o Llu (input/output) LocalLU_t*
- *           The distributed data structures to store L and U factors.
- *           See superlu_zdefs.h for the definition of 'LocalLU_t'.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh. It contains the MPI communicator, the number
- *        of process rows (NPROW), the number of process columns (NPCOL),
- *        and my process rank. It is an input argument to all the
- *        parallel routines.
- *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
- *        See superlu_zdefs.h for the definition of 'gridinfo_t'.
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics on runtime and floating-point operation count.
- *        See util.h for the definition of 'SuperLUStat_t'.
- *
- * info   (output) int*
- *        = 0: successful exit
- *        < 0: if info = -i, the i-th argument had an illegal value
- *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
- *             been completed, but the factor U is exactly singular,
- *             and division by zero will occur if it is used to solve a
- *             system of equations.
- *
- */
-{
-#ifdef _CRAY
-    _fcd ftcs = _cptofcd("N", strlen("N"));
-    _fcd ftcs1 = _cptofcd("L", strlen("L"));
-    _fcd ftcs2 = _cptofcd("N", strlen("N"));
-    _fcd ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-    doublecomplex zero = {0.0, 0.0};
-    doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0};
-    int_t *xsup;
-    int_t *lsub, *lsub1, *usub, *Usub_buf,
-          *Lsub_buf_2[2];  /* Need 2 buffers to implement Irecv. */
-    doublecomplex *lusup, *lusup1, *uval, *Uval_buf,
-           *Lval_buf_2[2]; /* Need 2 buffers to implement Irecv. */
-    int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc,
-          lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj,
-          nlb, nub, nsupc, rel, rukp;
-    int_t Pc, Pr;
-    int   iam, kcol, krow, mycol, myrow, pi, pj;
-    int   j, k, lk, nsupers;
-    int   nsupr, nbrow, segsize;
-    int   msgcnt[4]; /* Count the size of the message xfer'd in each buffer:
-		      *     0 : transferred in Lsub_buf[]
-		      *     1 : transferred in Lval_buf[]
-		      *     2 : transferred in Usub_buf[] 
-		      *     3 : transferred in Uval_buf[]
-		      */
-    int_t  msg0, msg2;
-    int_t  **Ufstnz_br_ptr, **Lrowind_bc_ptr;
-    doublecomplex **Unzval_br_ptr, **Lnzval_bc_ptr;
-    int_t  *index;
-    doublecomplex *nzval;
-    int_t  *iuip, *ruip;/* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */
-    doublecomplex *ucol;
-    int_t  *indirect;
-    doublecomplex *tempv, *tempv2d;
-    int_t iinfo;
-    int_t *ToRecv, *ToSendD, **ToSendR;
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-    superlu_scope_t *scp;
-    float s_eps;
-    double thresh;
-    doublecomplex *tempU2d, *tempu;
-    int    full, ldt, ldu, lead_zero, ncols;
-    MPI_Request recv_req[4], *send_req;
-    MPI_Status status;
-#if ( DEBUGlevel>=2 ) 
-    int_t num_copy=0, num_update=0;
-#endif
-#if ( PRNTlevel==3 )
-    int_t  zero_msg = 0, total_msg = 0;
-#endif
-#if ( PROFlevel>=1 )
-    double t1, t2;
-    float msg_vol = 0, msg_cnt = 0;
-    int_t iword = sizeof(int_t), zword = sizeof(doublecomplex);
-#endif
-
-    /* Test the input parameters. */
-    *info = 0;
-    if ( m < 0 ) *info = -2;
-    else if ( n < 0 ) *info = -3;
-    if ( *info ) {
-	pxerbla("pzgstrf", grid, -*info);
-	return (-1);
-    }
-
-    /* Quick return if possible. */
-    if ( m == 0 || n == 0 ) return 0;
-
-    /*
-     * Initialization.
-     */
-    iam = grid->iam;
-    Pc = grid->npcol;
-    Pr = grid->nprow;
-    myrow = MYROW( iam, grid );
-    mycol = MYCOL( iam, grid );
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    s_eps = slamch_("Epsilon");
-    thresh = s_eps * anorm;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pzgstrf()");
-#endif
-
-    stat->ops[FACT] = 0.0;
-
-    if ( Pr*Pc > 1 ) {
-	i = Llu->bufmax[0];
-	if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist(2 * ((size_t)i))) )
-	    ABORT("Malloc fails for Lsub_buf.");
-	Llu->Lsub_buf_2[1] = Llu->Lsub_buf_2[0] + i;
-	i = Llu->bufmax[1];
-	if ( !(Llu->Lval_buf_2[0] = doublecomplexMalloc_dist(2 * ((size_t)i))) )
-	    ABORT("Malloc fails for Lval_buf[].");
-	Llu->Lval_buf_2[1] = Llu->Lval_buf_2[0] + i;
-	if ( Llu->bufmax[2] != 0 ) 
-	    if ( !(Llu->Usub_buf = intMalloc_dist(Llu->bufmax[2])) )
-		ABORT("Malloc fails for Usub_buf[].");
-	if ( Llu->bufmax[3] != 0 ) 
-	    if ( !(Llu->Uval_buf = doublecomplexMalloc_dist(Llu->bufmax[3])) )
-		ABORT("Malloc fails for Uval_buf[].");
-	if ( !(send_req =
-	       (MPI_Request *) SUPERLU_MALLOC(2*Pc*sizeof(MPI_Request))))
-	    ABORT("Malloc fails for send_req[].");
-    }
-    if ( !(Llu->ujrow = doublecomplexMalloc_dist(sp_ienv_dist(3))) )
-	ABORT("Malloc fails for ujrow[].");
-
-#if ( PRNTlevel>=1 )
-    if ( !iam ) {
-	printf(".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, thresh);
-	printf(".. Buffer size: Lsub %d\tLval %d\tUsub %d\tUval %d\tLDA %d\n",
-	       Llu->bufmax[0], Llu->bufmax[1], 
-	       Llu->bufmax[2], Llu->bufmax[3], Llu->bufmax[4]);
-    }
-#endif
-
-    Lsub_buf_2[0] = Llu->Lsub_buf_2[0];
-    Lsub_buf_2[1] = Llu->Lsub_buf_2[1];
-    Lval_buf_2[0] = Llu->Lval_buf_2[0];
-    Lval_buf_2[1] = Llu->Lval_buf_2[1];
-    Usub_buf = Llu->Usub_buf;
-    Uval_buf = Llu->Uval_buf;
-    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
-    Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
-    Unzval_br_ptr = Llu->Unzval_br_ptr;
-    ToRecv = Llu->ToRecv;
-    ToSendD = Llu->ToSendD;
-    ToSendR = Llu->ToSendR;
-
-    ldt = sp_ienv_dist(3); /* Size of maximum supernode */
-    if ( !(tempv2d = doublecomplexCalloc_dist(2*((size_t)ldt)*ldt)) )
-	ABORT("Calloc fails for tempv2d[].");
-    tempU2d = tempv2d + ldt*ldt;
-    if ( !(indirect = intMalloc_dist(ldt)) )
-	ABORT("Malloc fails for indirect[].");
-    k = CEILING( nsupers, Pr ); /* Number of local block rows */
-    if ( !(iuip = intMalloc_dist(k)) )
-	ABORT("Malloc fails for iuip[].");
-    if ( !(ruip = intMalloc_dist(k)) )
-	ABORT("Malloc fails for ruip[].");
-
-
-    /* ---------------------------------------------------------------
-       Handle the first block column separately to start the pipeline.
-       --------------------------------------------------------------- */
-    if ( mycol == 0 ) {
-	pzgstrf2(options, 0, thresh, Glu_persist, grid, Llu, stat, info);
-
-	scp = &grid->rscp; /* The scope of process row. */
-
-	/* Process column *kcol* multicasts numeric values of L(:,k) 
-	   to process rows. */
-	lsub = Lrowind_bc_ptr[0];
-	lusup = Lnzval_bc_ptr[0];
-	if ( lsub ) {
-	    msgcnt[0] = lsub[1] + BC_HEADER + lsub[0]*LB_DESCRIPTOR;
-	    msgcnt[1] = lsub[1] * SuperSize( 0 );
-	} else {
-	    msgcnt[0] = msgcnt[1] = 0;
-	}
-	
-	for (pj = 0; pj < Pc; ++pj) {
-	    if ( ToSendR[0][pj] != EMPTY ) {
-#if ( PROFlevel>=1 )
-		TIC(t1);
-#endif
-		MPI_Isend( lsub, msgcnt[0], mpi_int_t, pj, 0, scp->comm,
-			  &send_req[pj] );
-		MPI_Isend( lusup, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj, 1, scp->comm,
-			  &send_req[pj+Pc] );
-#if ( DEBUGlevel>=2 )
-		printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
-		       iam, 0, msgcnt[0], msgcnt[1], pj);
-#endif
-#if ( PROFlevel>=1 )
-		TOC(t2, t1);
-		stat->utime[COMM] += t2;
-		msg_cnt += 2;
-		msg_vol += msgcnt[0]*iword + msgcnt[1]*zword;
-#endif
-	    }
-	} /* for pj ... */
-    } else { /* Post immediate receives. */
-	if ( ToRecv[0] >= 1 ) { /* Recv block column L(:,0). */
-	    scp = &grid->rscp; /* The scope of process row. */
-	    MPI_Irecv( Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, 0,
-		      0, scp->comm, &recv_req[0] );
-	    MPI_Irecv( Lval_buf_2[0], Llu->bufmax[1], SuperLU_MPI_DOUBLE_COMPLEX, 0,
-		      1, scp->comm, &recv_req[1] );
-#if ( DEBUGlevel>=2 )
-	    printf("(%d) Post Irecv L(:,%4d)\n", iam, 0);
-#endif
-	}
-    } /* if mycol == 0 */
-
-    /* ------------------------------------------
-       MAIN LOOP: Loop through all block columns.
-       ------------------------------------------ */
-    for (k = 0; k < nsupers; ++k) {
-
-	knsupc = SuperSize( k );
-	krow = PROW( k, grid );
-	kcol = PCOL( k, grid );
-
-	if ( mycol == kcol ) {
-	    lk = LBj( k, grid ); /* Local block number. */
-
-	    for (pj = 0; pj < Pc; ++pj) {
-                /* Wait for Isend to complete before using lsub/lusup. */
-		if ( ToSendR[lk][pj] != EMPTY ) {
-		    MPI_Wait( &send_req[pj], &status );
-		    MPI_Wait( &send_req[pj+Pc], &status );
-		}
-	    }
-	    lsub = Lrowind_bc_ptr[lk];
-	    lusup = Lnzval_bc_ptr[lk];
-	} else {
-	    if ( ToRecv[k] >= 1 ) { /* Recv block column L(:,k). */
-		scp = &grid->rscp; /* The scope of process row. */
-#if ( PROFlevel>=1 )
-		TIC(t1);
-#endif
-		/*probe_recv(iam, kcol, (4*k)%NTAGS, mpi_int_t, scp->comm, 
-		  Llu->bufmax[0]);*/
-		/*MPI_Recv( Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol, 
-			 (4*k)%NTAGS, scp->comm, &status );*/
-		MPI_Wait( &recv_req[0], &status );
-		MPI_Get_count( &status, mpi_int_t, &msgcnt[0] );
-		/*probe_recv(iam, kcol, (4*k+1)%NTAGS, SuperLU_MPI_DOUBLE_COMPLEX, scp->comm, 
-		  Llu->bufmax[1]);*/
-		/*MPI_Recv( Lval_buf, Llu->bufmax[1], SuperLU_MPI_DOUBLE_COMPLEX, kcol, 
-			 (4*k+1)%NTAGS, scp->comm, &status );*/
-		MPI_Wait( &recv_req[1], &status );
-		MPI_Get_count( &status, SuperLU_MPI_DOUBLE_COMPLEX, &msgcnt[1] );
-#if ( PROFlevel>=1 )
-		TOC(t2, t1);
-		stat->utime[COMM] += t2;
-#endif
-#if ( DEBUGlevel>=2 )
-		printf("(%d) Recv L(:,%4d): lsub %4d, lusup %4d from Pc %2d\n",
-		       iam, k, msgcnt[0], msgcnt[1], kcol);
-		fflush(stdout);
-#endif
-		lsub = Lsub_buf_2[k%2];
-		lusup = Lval_buf_2[k%2];
-#if ( PRNTlevel==3 )
-		++total_msg;
-		if ( !msgcnt[0] ) ++zero_msg;
-#endif
-	    } else msgcnt[0] = 0;
-	} /* if mycol = Pc(k) */
-
-	scp = &grid->cscp; /* The scope of process column. */
-
-	if ( myrow == krow ) {
-	    /* Parallel triangular solve across process row *krow* --
-	       U(k,j) = L(k,k) \ A(k,j).  */
-#ifdef _CRAY
-	    pzgstrs2(n, k, Glu_persist, grid, Llu, stat, ftcs1, ftcs2, ftcs3);
-#else
-	    pzgstrs2(n, k, Glu_persist, grid, Llu, stat);
-#endif
-
-	    /* Multicasts U(k,:) to process columns. */
-	    lk = LBi( k, grid );
-	    usub = Ufstnz_br_ptr[lk];
-	    uval = Unzval_br_ptr[lk];
-	    if ( usub )	{
-		msgcnt[2] = usub[2];
-		msgcnt[3] = usub[1];
-	    } else {
-		msgcnt[2] = msgcnt[3] = 0;
-	    }
-
-	    if ( ToSendD[lk] == YES ) {
-		for (pi = 0; pi < Pr; ++pi) {
-		    if ( pi != myrow ) {
-#if ( PROFlevel>=1 )
-			TIC(t1);
-#endif
-			MPI_Send( usub, msgcnt[2], mpi_int_t, pi,
-				 (4*k+2)%NTAGS, scp->comm);
-			MPI_Send( uval, msgcnt[3], SuperLU_MPI_DOUBLE_COMPLEX, pi,
-				 (4*k+3)%NTAGS, scp->comm);
-#if ( PROFlevel>=1 )
-			TOC(t2, t1);
-			stat->utime[COMM] += t2;
-			msg_cnt += 2;
-			msg_vol += msgcnt[2]*iword + msgcnt[3]*zword;
-#endif
-#if ( DEBUGlevel>=2 )
-			printf("(%d) Send U(%4d,:) to Pr %2d\n", iam, k, pi);
-#endif
-		    } /* if pi ... */
-		} /* for pi ... */
-	    } /* if ToSendD ... */
-	} else { /* myrow != krow */
-	    if ( ToRecv[k] == 2 ) { /* Recv block row U(k,:). */
-#if ( PROFlevel>=1 )
-		TIC(t1);
-#endif
-		/*probe_recv(iam, krow, (4*k+2)%NTAGS, mpi_int_t, scp->comm, 
-		  Llu->bufmax[2]);*/
-		MPI_Recv( Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
-			 (4*k+2)%NTAGS, scp->comm, &status );
-		MPI_Get_count( &status, mpi_int_t, &msgcnt[2] );
-		/*probe_recv(iam, krow, (4*k+3)%NTAGS, SuperLU_MPI_DOUBLE_COMPLEX, scp->comm, 
-		  Llu->bufmax[3]);*/
-		MPI_Recv( Uval_buf, Llu->bufmax[3], SuperLU_MPI_DOUBLE_COMPLEX, krow, 
-			 (4*k+3)%NTAGS, scp->comm, &status );
-		MPI_Get_count( &status, SuperLU_MPI_DOUBLE_COMPLEX, &msgcnt[3] );
-#if ( PROFlevel>=1 )
-		TOC(t2, t1);
-		stat->utime[COMM] += t2;
-#endif
-		usub = Usub_buf;
-		uval = Uval_buf;
-#if ( DEBUGlevel>=2 )
-		printf("(%d) Recv U(%4d,:) from Pr %2d\n", iam, k, krow);
-#endif
-#if ( PRNTlevel==3 )
-		++total_msg;
-		if ( !msgcnt[2] ) ++zero_msg;
-#endif
-	    } else msgcnt[2] = 0;
-	} /* if myrow == Pr(k) */
-	  
-	/* 
-	 * Parallel rank-k update; pair up blocks L(i,k) and U(k,j).
-	 *  for (j = k+1; k < N; ++k) {
-	 *     for (i = k+1; i < N; ++i) 
-	 *         if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
-	 *              && L(i,k) != 0 && U(k,j) != 0 )
-	 *             A(i,j) = A(i,j) - L(i,k) * U(k,j);
-	 */
-	msg0 = msgcnt[0];
-	msg2 = msgcnt[2];
-	if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
-	    nsupr = lsub[1]; /* LDA of lusup. */
-	    if ( myrow == krow ) { /* Skip diagonal block L(k,k). */
-		lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER+1];
-		luptr0 = knsupc;
-		nlb = lsub[0] - 1;
-	    } else {
-		lptr0 = BC_HEADER;
-		luptr0 = 0;
-		nlb = lsub[0];
-	    }
-	    lptr = lptr0;
-	    for (lb = 0; lb < nlb; ++lb) { /* Initialize block row pointers. */
-		ib = lsub[lptr];
-		lib = LBi( ib, grid );
-		iuip[lib] = BR_HEADER;
-		ruip[lib] = 0;
-		lptr += LB_DESCRIPTOR + lsub[lptr+1];
-	    }
-	    nub = usub[0];    /* Number of blocks in the block row U(k,:) */
-	    iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */
-	    rukp = 0;         /* Pointer to nzval[] of U(k,:) */
-	    klst = FstBlockC( k+1 );
-	    
-	    /* ---------------------------------------------------
-	       Update the first block column A(:,k+1).
-	       --------------------------------------------------- */
-	    jb = usub[iukp];   /* Global block number of block U(k,j). */
-	    if ( jb == k+1 ) { /* First update (k+1)-th block. */
-		--nub;
-		lptr = lptr0;
-		luptr = luptr0;
-		ljb = LBj( jb, grid ); /* Local block number of U(k,j). */
-		nsupc = SuperSize( jb );
-		iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
-
-		/* Prepare to call DGEMM. */
-		jj = iukp;
-		while ( usub[jj] == klst ) ++jj;
-		ldu = klst - usub[jj++];
-		ncols = 1;
-		full = 1;
-		for (; jj < iukp+nsupc; ++jj) {
-		    segsize = klst - usub[jj];
-		    if ( segsize ) {
-		        ++ncols;
-			if ( segsize != ldu ) full = 0;
-		        if ( segsize > ldu ) ldu = segsize;
-		    }
-		}
-#if ( DEBUGlevel>=3 )
-		++num_update;
-#endif
-		if ( full ) {
-		    tempu = &uval[rukp];
-		} else { /* Copy block U(k,j) into tempU2d. */
-#if ( DEBUGlevel>=3 )
-		  printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
-			 iam, full, k, jb, ldu, ncols, nsupc);
-		  ++num_copy;
-#endif
-		    tempu = tempU2d;
-		    for (jj = iukp; jj < iukp+nsupc; ++jj) {
-		        segsize = klst - usub[jj];
-			if ( segsize ) {
-			    lead_zero = ldu - segsize;
-			    for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
-			    tempu += lead_zero;
-			    for (i = 0; i < segsize; ++i)
-				tempu[i] = uval[rukp+i];
-			    rukp += segsize;
-			    tempu += segsize;
-			}
-		    }
-		    tempu = tempU2d;
-		    rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
-		} /* if full ... */
-
-		for (lb = 0; lb < nlb; ++lb) { 
-		    ib = lsub[lptr]; /* Row block L(i,k). */
-		    nbrow = lsub[lptr+1];  /* Number of full rows. */
-		    lptr += LB_DESCRIPTOR; /* Skip descriptor. */
-		    tempv = tempv2d;
-#ifdef _CRAY
-		    CGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, 
-			  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
-			  tempu, &ldu, &beta, tempv, &ldt);
-#elif defined (USE_VENDOR_BLAS)
-		    zgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, 
-			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
-			   tempu, &ldu, &beta, tempv, &ldt, 1, 1);
-#else
-		    zgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, 
-			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
-			   tempu, &ldu, &beta, tempv, &ldt);
-#endif
-		    stat->ops[FACT] += 8 * nbrow * ldu * ncols;
-
-		    /* Now gather the result into the destination block. */
-		    if ( ib < jb ) { /* A(i,j) is in U. */
-			ilst = FstBlockC( ib+1 );
-			lib = LBi( ib, grid );
-			index = Ufstnz_br_ptr[lib];
-			ijb = index[iuip[lib]];
-			while ( ijb < jb ) { /* Search for dest block. */
-			    ruip[lib] += index[iuip[lib]+1];
-			    iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb );
-			    ijb = index[iuip[lib]];
-			}
-			iuip[lib] += UB_DESCRIPTOR; /* Skip descriptor. */
-
-			tempv = tempv2d;
-			for (jj = 0; jj < nsupc; ++jj) {
-			    segsize = klst - usub[iukp + jj];
-			    fnz = index[iuip[lib]++];
-			    if ( segsize ) { /* Nonzero segment in U(k.j). */
-				ucol = &Unzval_br_ptr[lib][ruip[lib]];
-				for (i = 0, it = 0; i < nbrow; ++i) {
-				    rel = lsub[lptr + i] - fnz;
-				    z_sub(&ucol[rel], &ucol[rel], &tempv[it]);
-				    ++it;
-				}
-				tempv += ldt;
-			    }
-			    ruip[lib] += ilst - fnz;
-			}
-		    } else { /* A(i,j) is in L. */
-			index = Lrowind_bc_ptr[ljb];
-			ldv = index[1];   /* LDA of the dest lusup. */
-			lptrj = BC_HEADER;
-			luptrj = 0;
-			ijb = index[lptrj];
-			while ( ijb != ib ) { /* Search for dest block -- 
-						 blocks are not ordered! */
-			    luptrj += index[lptrj+1];
-			    lptrj += LB_DESCRIPTOR + index[lptrj+1];
-			    ijb = index[lptrj];
-			}
-			/*
-			 * Build indirect table. This is needed because the
-			 * indices are not sorted.
-			 */
-			fnz = FstBlockC( ib );
-			lptrj += LB_DESCRIPTOR;
-			for (i = 0; i < index[lptrj-1]; ++i) {
-			    rel = index[lptrj + i] - fnz;
-			    indirect[rel] = i;
-			}
-			nzval = Lnzval_bc_ptr[ljb] + luptrj;
-			tempv = tempv2d;
-			for (jj = 0; jj < nsupc; ++jj) {
-			    segsize = klst - usub[iukp + jj];
-			    if ( segsize ) {
-/*#pragma _CRI cache_bypass nzval,tempv*/
-				for (it = 0, i = 0; i < nbrow; ++i) {
-				    rel = lsub[lptr + i] - fnz;
-				    z_sub(&nzval[indirect[rel]],
-					  &nzval[indirect[rel]],
-					  &tempv[it]);
-				    ++it;
-				}
-				tempv += ldt;
-			    }
-			    nzval += ldv;
-			}
-		    } /* if ib < jb ... */
-		    lptr += nbrow;
-		    luptr += nbrow;
-		} /* for lb ... */
-		rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */
-		iukp += nsupc;
-	    }  /* if jb == k+1 */
-	} /* if L(:,k) and U(k,:) not empty */
-
-
-	if ( k+1 < nsupers ) {
-	  kcol = PCOL( k+1, grid );
-	  if ( mycol == kcol ) {
-	    /* Factor diagonal and subdiagonal blocks and test for exact
-	       singularity.  */
-	    pzgstrf2(options, k+1, thresh, Glu_persist, grid, Llu, stat, info);
-
-	    /* Process column *kcol+1* multicasts numeric values of L(:,k+1) 
-	       to process rows. */
-	    lk = LBj( k+1, grid ); /* Local block number. */
-	    lsub1 = Lrowind_bc_ptr[lk];
- 	    if ( lsub1 ) {
-		msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0]*LB_DESCRIPTOR;
-		msgcnt[1] = lsub1[1] * SuperSize( k+1 );
-	    } else {
-		msgcnt[0] = 0;
-		msgcnt[1] = 0;
-	    }
-	    scp = &grid->rscp; /* The scope of process row. */
-	    for (pj = 0; pj < Pc; ++pj) {
-		if ( ToSendR[lk][pj] != EMPTY ) {
-		    lusup1 = Lnzval_bc_ptr[lk];
-#if ( PROFlevel>=1 )
-		    TIC(t1);
-#endif
-		    MPI_Isend( lsub1, msgcnt[0], mpi_int_t, pj,
-			      (4*(k+1))%NTAGS, scp->comm, &send_req[pj] );
-		    MPI_Isend( lusup1, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj,
-			     (4*(k+1)+1)%NTAGS, scp->comm, &send_req[pj+Pc] );
-#if ( PROFlevel>=1 )
-		    TOC(t2, t1);
-		    stat->utime[COMM] += t2;
-		    msg_cnt += 2;
-		    msg_vol += msgcnt[0]*iword + msgcnt[1]*zword;
-#endif
-#if ( DEBUGlevel>=2 )
-		    printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
-			   iam, k+1, msgcnt[0], msgcnt[1], pj);
-#endif
-		}
-	    } /* for pj ... */
-	  } else { /* Post Recv of block column L(:,k+1). */
-	    if ( ToRecv[k+1] >= 1 ) {
-		scp = &grid->rscp; /* The scope of process row. */
-		MPI_Irecv(Lsub_buf_2[(k+1)%2], Llu->bufmax[0], mpi_int_t, kcol,
-			  (4*(k+1))%NTAGS, scp->comm, &recv_req[0]);
-		MPI_Irecv(Lval_buf_2[(k+1)%2], Llu->bufmax[1], SuperLU_MPI_DOUBLE_COMPLEX, kcol, 
-			  (4*(k+1)+1)%NTAGS, scp->comm, &recv_req[1]);
-#if ( DEBUGlevel>=2 )
-		printf("(%d) Post Irecv L(:,%4d)\n", iam, k+1);
-#endif
-	    }
-	  } /* if mycol == Pc(k+1) */
-        } /* if k+1 < nsupers */
-
-	if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
-	    /* ---------------------------------------------------
-	       Update all other blocks using block row U(k,:)
-	       --------------------------------------------------- */
-	    for (j = 0; j < nub; ++j) { 
-		lptr = lptr0;
-		luptr = luptr0;
-		jb = usub[iukp];  /* Global block number of block U(k,j). */
-		ljb = LBj( jb, grid ); /* Local block number of U(k,j). */
-		nsupc = SuperSize( jb );
-		iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
-
-		/* Prepare to call DGEMM. */
-		jj = iukp;
-		while ( usub[jj] == klst ) ++jj;
-		ldu = klst - usub[jj++];
-		ncols = 1;
-		full = 1;
-		for (; jj < iukp+nsupc; ++jj) {
-		    segsize = klst - usub[jj];
-		    if ( segsize ) {
-		        ++ncols;
-			if ( segsize != ldu ) full = 0;
-		        if ( segsize > ldu ) ldu = segsize;
-		    }
-		}
-#if ( DEBUGlevel>=3 )
-		printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
-		       iam, full, k, jb, ldu, ncols, nsupc);
-		++num_update;
-#endif
-		if ( full ) {
-		    tempu = &uval[rukp];
-		} else { /* Copy block U(k,j) into tempU2d. */
-#if ( DEBUGlevel>=3 )
-		    ++num_copy;
-#endif
-		    tempu = tempU2d;
-		    for (jj = iukp; jj < iukp+nsupc; ++jj) {
-		        segsize = klst - usub[jj];
-			if ( segsize ) {
-			    lead_zero = ldu - segsize;
-			    for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
-			    tempu += lead_zero;
-			    for (i = 0; i < segsize; ++i)
-			        tempu[i] = uval[rukp+i];
-			    rukp += segsize;
-			    tempu += segsize;
-			}
-		    }
-		    tempu = tempU2d;
-		    rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
-		} /* if full ... */
-
-		for (lb = 0; lb < nlb; ++lb) { 
-		    ib = lsub[lptr];       /* Row block L(i,k). */
-		    nbrow = lsub[lptr+1];  /* Number of full rows. */
-		    lptr += LB_DESCRIPTOR; /* Skip descriptor. */
-		    tempv = tempv2d;
-#ifdef _CRAY
-		    CGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, 
-			  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
-			  tempu, &ldu, &beta, tempv, &ldt);
-#elif defined (USE_VENDOR_BLAS)
-		    zgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, 
-			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
-			   tempu, &ldu, &beta, tempv, &ldt, 1, 1);
-#else
-		    zgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, 
-			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
-			   tempu, &ldu, &beta, tempv, &ldt);
-#endif
-		    stat->ops[FACT] += 8 * nbrow * ldu * ncols;
-
-		    /* Now gather the result into the destination block. */
-		    if ( ib < jb ) { /* A(i,j) is in U. */
-			ilst = FstBlockC( ib+1 );
-			lib = LBi( ib, grid );
-			index = Ufstnz_br_ptr[lib];
-			ijb = index[iuip[lib]];
-			while ( ijb < jb ) { /* Search for dest block. */
-			    ruip[lib] += index[iuip[lib]+1];
-			    iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb );
-			    ijb = index[iuip[lib]];
-			}
-			/* Skip descriptor.  Now point to fstnz index of 
-			   block U(i,j). */
-			iuip[lib] += UB_DESCRIPTOR;
-
-			tempv = tempv2d;
-			for (jj = 0; jj < nsupc; ++jj) {
-			    segsize = klst - usub[iukp + jj];
-			    fnz = index[iuip[lib]++];
-			    if ( segsize ) { /* Nonzero segment in U(k.j). */
-				ucol = &Unzval_br_ptr[lib][ruip[lib]];
-				for (i = 0 ; i < nbrow; ++i) {
-				    rel = lsub[lptr + i] - fnz;
-				    z_sub(&ucol[rel], &ucol[rel], &tempv[i]);
-				}
-				tempv += ldt;
-			    }
-			    ruip[lib] += ilst - fnz;
-			}
-		    } else { /* A(i,j) is in L. */
-			index = Lrowind_bc_ptr[ljb];
-			ldv = index[1];   /* LDA of the dest lusup. */
-			lptrj = BC_HEADER;
-			luptrj = 0;
-			ijb = index[lptrj];
-			while ( ijb != ib ) { /* Search for dest block -- 
-						 blocks are not ordered! */
-			    luptrj += index[lptrj+1];
-			    lptrj += LB_DESCRIPTOR + index[lptrj+1];
-			    ijb = index[lptrj];
-			}
-			/*
-			 * Build indirect table. This is needed because the
-			 * indices are not sorted for the L blocks.
-			 */
-			fnz = FstBlockC( ib );
-			lptrj += LB_DESCRIPTOR;
-			for (i = 0; i < index[lptrj-1]; ++i) {
-			    rel = index[lptrj + i] - fnz;
-			    indirect[rel] = i;
-			}
-			nzval = Lnzval_bc_ptr[ljb] + luptrj;
-			tempv = tempv2d;
-			for (jj = 0; jj < nsupc; ++jj) {
-			    segsize = klst - usub[iukp + jj];
-			    if ( segsize ) {
-/*#pragma _CRI cache_bypass nzval,tempv*/
-				for (i = 0; i < nbrow; ++i) {
-				    rel = lsub[lptr + i] - fnz;
-				    z_sub(&nzval[indirect[rel]], 
-					  &nzval[indirect[rel]],
-					  &tempv[i]);
-				}
-				tempv += ldt;
-			    }
-			    nzval += ldv;
-			}
-		    } /* if ib < jb ... */
-		    lptr += nbrow;
-		    luptr += nbrow;
-		} /* for lb ... */
-		rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */
-		iukp += nsupc;
-	    } /* for j ... */
-	} /* if  k L(:,k) and U(k,:) are not empty */
-
-    } 
-    /* ------------------------------------------
-       END MAIN LOOP: for k = ...
-       ------------------------------------------ */
-
-
-    if ( Pr*Pc > 1 ) {
-	SUPERLU_FREE(Lsub_buf_2[0]); /* also free Lsub_buf_2[1] */
-	SUPERLU_FREE(Lval_buf_2[0]); /* also free Lval_buf_2[1] */
-	if ( Llu->bufmax[2] != 0 ) SUPERLU_FREE(Usub_buf);
-	if ( Llu->bufmax[3] != 0 ) SUPERLU_FREE(Uval_buf);
-	SUPERLU_FREE(send_req);
-    }
-
-    SUPERLU_FREE(Llu->ujrow);
-    SUPERLU_FREE(tempv2d);
-    SUPERLU_FREE(indirect);
-    SUPERLU_FREE(iuip);
-    SUPERLU_FREE(ruip);
-
-    /* Prepare error message. */
-    if ( *info == 0 ) *info = n + 1;
-#if ( PROFlevel>=1 )
-    TIC(t1);
-#endif
-    MPI_Allreduce( info, &iinfo, 1, mpi_int_t, MPI_MIN, grid->comm );
-#if ( PROFlevel>=1 )
-    TOC(t2, t1);
-    stat->utime[COMM] += t2;
-    {
-	float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum;
-	
-	MPI_Reduce( &msg_cnt, &msg_cnt_sum,
-		   1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-	MPI_Reduce( &msg_cnt, &msg_cnt_max,
-		   1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
-	MPI_Reduce( &msg_vol, &msg_vol_sum,
-		   1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-	MPI_Reduce( &msg_vol, &msg_vol_max,
-		   1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
-	if ( !iam ) {
-	    printf("\tPZGSTRF comm stat:"
-		   "\tAvg\tMax\t\tAvg\tMax\n"
-		   "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n",
-		   msg_cnt_sum/Pr/Pc, msg_cnt_max,
-		   msg_vol_sum/Pr/Pc*1e-6, msg_vol_max*1e-6);
-	}
-    }
-#endif
-    if ( iinfo == n + 1 ) *info = 0;
-    else *info = iinfo;
-
-
-#if ( PRNTlevel==3 )
-    MPI_Allreduce( &zero_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm );
-    if ( !iam ) printf(".. # msg of zero size\t%d\n", iinfo);
-    MPI_Allreduce( &total_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm );
-    if ( !iam ) printf(".. # total msg\t%d\n", iinfo);
-#endif
-
-#if ( DEBUGlevel>=2 )
-    for (i = 0; i < Pr * Pc; ++i) {
-	if ( iam == i ) {
-	    zPrintLblocks(iam, nsupers, grid, Glu_persist, Llu);
-	    zPrintUblocks(iam, nsupers, grid, Glu_persist, Llu);
-	    printf("(%d)\n", iam);
-	    PrintInt10("Recv", nsupers, Llu->ToRecv);
-	}
-	MPI_Barrier( grid->comm );
-    }
-#endif
-
-#if ( DEBUGlevel>=3 )
-    printf("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update);
-#endif
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pzgstrf()");
-#endif
-} /* PZGSTRF */
-
-
-/************************************************************************/
-static void pzgstrf2
-/************************************************************************/
-(
- superlu_options_t *options,
- int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid,
- LocalLU_t *Llu, SuperLUStat_t *stat, int* info
- )
-/* 
- * Purpose
- * =======
- *   Factor diagonal and subdiagonal blocks and test for exact singularity.
- *   Only the process column that owns block column *k* participates
- *   in the work.
- * 
- * Arguments
- * =========
- *
- * k      (input) int (global)
- *        The column number of the block column to be factorized.
- *
- * thresh (input) double (global)
- *        The threshold value = s_eps * anorm.
- *
- * Glu_persist (input) Glu_persist_t*
- *        Global data structures (xsup, supno) replicated on all processes.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh.
- *
- * Llu    (input/output) LocalLU_t*
- *        Local data structures to store distributed L and U matrices.
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics about the factorization.
- *        See SuperLUStat_t structure defined in util.h.
- *
- * info   (output) int*
- *        = 0: successful exit
- *        < 0: if info = -i, the i-th argument had an illegal value
- *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
- *             been completed, but the factor U is exactly singular,
- *             and division by zero will occur if it is used to solve a
- *             system of equations.
- *
- */
-{
-    int    c, iam, l, pkk;
-    int    incx = 1, incy = 1;
-    int    nsupr; /* number of rows in the block (LDA) */
-    int    luptr;
-    int_t  i, krow, j, jfst, jlst;
-    int_t  nsupc; /* number of columns in the block */
-    int_t  *xsup = Glu_persist->xsup;
-    doublecomplex *lusup, temp;
-    doublecomplex *ujrow;
-    doublecomplex one = {1.0, 0.0}, alpha = {-1.0, 0.0};
-    *info = 0;
-
-    /* Quick return. */
-
-    /* Initialization. */
-    iam   = grid->iam;
-    krow  = PROW( k, grid );
-    pkk   = PNUM( PROW(k, grid), PCOL(k, grid), grid );
-    j     = LBj( k, grid ); /* Local block number */
-    jfst  = FstBlockC( k );
-    jlst  = FstBlockC( k+1 );
-    lusup = Llu->Lnzval_bc_ptr[j];
-    nsupc = SuperSize( k );
-    if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1];
-    ujrow = Llu->ujrow;
-
-    luptr = 0; /* Point to the diagonal entries. */
-    c = nsupc;
-    for (j = 0; j < jlst - jfst; ++j) {
-	/* Broadcast the j-th row (nsupc - j) elements to
-	   the process column. */
-	if ( iam == pkk ) { /* Diagonal process. */
-	    i = luptr;
-	    if ( options->ReplaceTinyPivot == YES ) {
-		if ( z_abs1(&lusup[i]) < thresh ) { /* Diagonal */
-#if ( PRNTlevel>=2 )
-		    printf("(%d) .. col %d, tiny pivot %e  ",
-			   iam, jfst+j, lusup[i]);
-#endif
-		    /* Keep the replaced diagonal with the same sign. */
-		    if ( lusup[i].r < 0 ) lusup[i].r = -thresh;
-		    else lusup[i].r = thresh;
-		    lusup[i].i = 0.0;
-#if ( PRNTlevel>=2 )
-		    printf("replaced by %e\n", lusup[i]);
-#endif
-		    ++(stat->TinyPivots);
-		}
-	    }
-	    for (l = 0; l < c; ++l, i += nsupr)	ujrow[l] = lusup[i];
-	}
-#if 0
-	dbcast_col(ujrow, c, pkk, UjROW, grid, &c);
-#else
-	MPI_Bcast(ujrow, c, SuperLU_MPI_DOUBLE_COMPLEX, krow, (grid->cscp).comm);
-	/*bcast_tree(ujrow, c, SuperLU_MPI_DOUBLE_COMPLEX, krow, (24*k+j)%NTAGS,
-		   grid, COMM_COLUMN, &c);*/
-#endif
-
-#if ( DEBUGlevel>=2 )
-if ( k == 3329 && j == 2 ) {
-	if ( iam == pkk ) {
-	    printf("..(%d) k %d, j %d: Send ujrow[0] %e\n",iam,k,j,ujrow[0]);
-	} else {
-	    printf("..(%d) k %d, j %d: Recv ujrow[0] %e\n",iam,k,j,ujrow[0]);
-	}
-}
-#endif
-
-	if ( !lusup ) { /* Empty block column. */
-	    --c;
-	    if ( ujrow[0].r == 0.0 && ujrow[0].i == 0.0 ) *info = j+jfst+1;
-	    continue;
-	}
-
-	/* Test for singularity. */
-	if ( ujrow[0].r == 0.0 && ujrow[0].i == 0.0 ) {
-	    *info = j+jfst+1;
-	} else {
-	    /* Scale the j-th column of the matrix. */
-	    z_div(&temp, &one, &ujrow[0]);
-	    if ( iam == pkk ) {
-		for (i = luptr+1; i < luptr-j+nsupr; ++i)
-		    zz_mult(&lusup[i], &lusup[i], &temp);
-		stat->ops[FACT] += 6*(nsupr-j-1) + 10;
-	    } else {
-		for (i = luptr; i < luptr+nsupr; ++i)
-		    zz_mult(&lusup[i], &lusup[i], &temp);
-		stat->ops[FACT] += 6*nsupr + 10;
-	    }
-	}
-	    
-	/* Rank-1 update of the trailing submatrix. */
-	if ( --c ) {
-	    if ( iam == pkk ) {
-		l = nsupr - j - 1;
-#ifdef _CRAY
-		CGERU(&l, &c, &alpha, &lusup[luptr+1], &incx,
-		      &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
-#else
-		zgeru_(&l, &c, &alpha, &lusup[luptr+1], &incx,
-		      &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
-#endif
-		stat->ops[FACT] += 8 * l * c;
-	    } else {
-#ifdef _CRAY
-		CGERU(&nsupr, &c, &alpha, &lusup[luptr], &incx, 
-		      &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
-#else
-		zgeru_(&nsupr, &c, &alpha, &lusup[luptr], &incx, 
-		      &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
-#endif
-		stat->ops[FACT] += 8 * nsupr * c;
-	    }
-	}
-	
-	/* Move to the next column. */
-	if ( iam == pkk ) luptr += nsupr + 1;
-	else luptr += nsupr;
-
-    } /* for j ... */
-
-} /* PZGSTRF2 */
-
-
-/************************************************************************/
-static void pzgstrs2
-/************************************************************************/
-#ifdef _CRAY
-(
- int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid,
- LocalLU_t *Llu, SuperLUStat_t *stat, _fcd ftcs1, _fcd ftcs2, _fcd ftcs3
- )
-#else
-(
- int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid,
- LocalLU_t *Llu, SuperLUStat_t *stat
- )
-#endif
-/* 
- * Purpose
- * =======
- *   Perform parallel triangular solves
- *           U(k,:) := A(k,:) \ L(k,k). 
- *   Only the process row that owns block row *k* participates
- *   in the work.
- * 
- * Arguments
- * =========
- *
- * m      (input) int (global)
- *        Number of rows in the matrix.
- *
- * k      (input) int (global)
- *        The row number of the block row to be factorized.
- *
- * Glu_persist (input) Glu_persist_t*
- *        Global data structures (xsup, supno) replicated on all processes.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh.
- *
- * Llu    (input/output) LocalLU_t*
- *        Local data structures to store distributed L and U matrices.
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics about the factorization; 
- *        See SuperLUStat_t structure defined in util.h.
- *
- */
-{
-    int    iam, pkk;
-    int    incx = 1;
-    int    nsupr; /* number of rows in the block L(:,k) (LDA) */
-    int    segsize;
-    int_t  nsupc; /* number of columns in the block */
-    int_t  luptr, iukp, rukp;
-    int_t  b, gb, j, klst, knsupc, lk, nb;
-    int_t  *xsup = Glu_persist->xsup;
-    int_t  *usub;
-    doublecomplex *lusup, *uval;
-
-    /* Quick return. */
-    lk = LBi( k, grid ); /* Local block number */
-    if ( !Llu->Unzval_br_ptr[lk] ) return;
-
-    /* Initialization. */
-    iam  = grid->iam;
-    pkk  = PNUM( PROW(k, grid), PCOL(k, grid), grid );
-    klst = FstBlockC( k+1 );
-    knsupc = SuperSize( k );
-    usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */
-    uval = Llu->Unzval_br_ptr[lk];
-    nb = usub[0];
-    iukp = BR_HEADER;
-    rukp = 0;
-    if ( iam == pkk ) {
-	lk = LBj( k, grid );
-	nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */
-	lusup = Llu->Lnzval_bc_ptr[lk];
-    } else {
-	nsupr = Llu->Lsub_buf_2[k%2][1]; /* LDA of lusup[] */
-	lusup = Llu->Lval_buf_2[k%2];
-    }
-
-    /* Loop through all the row blocks. */
-    for (b = 0; b < nb; ++b) {
-	gb = usub[iukp];
-	nsupc = SuperSize( gb );
-	iukp += UB_DESCRIPTOR;
-
-	/* Loop through all the segments in the block. */
-	for (j = 0; j < nsupc; ++j) {
-	    segsize = klst - usub[iukp++]; 
-	    if ( segsize ) { /* Nonzero segment. */
-		luptr = (knsupc - segsize) * (nsupr + 1);
-#ifdef _CRAY
-		CTRSV(ftcs1, ftcs2, ftcs3, &segsize, &lusup[luptr], &nsupr, 
-		      &uval[rukp], &incx);
-#elif defined (USE_VENDOR_BLAS)
-		ztrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, 
-		       &uval[rukp], &incx, 1, 1, 1);
-#else
-		ztrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, 
-		       &uval[rukp], &incx);
-#endif
-		stat->ops[FACT] += 4 * segsize * (segsize + 1)
-		    + 10 * segsize;  /* complex division */
-		rukp += segsize;
-	    }
-	}
-    } /* for b ... */
-
-} /* PZGSTRS2 */
-
-static int
-probe_recv(int iam, int source, int tag, MPI_Datatype datatype, MPI_Comm comm,
-	   int buf_size)
-{
-    MPI_Status status;
-    int count; 
-
-    MPI_Probe( source, tag, comm, &status );
-    MPI_Get_count( &status, datatype, &count );
-    if ( count > buf_size ) {
-        printf("(%d) Recv'ed count %d > buffer size $d\n",
-	       iam, count, buf_size);
-	exit(-1);
-    }
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgstrs1.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgstrs1.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgstrs1.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgstrs1.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,822 +0,0 @@
-
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- * Modified:
- *     Feburary 7, 2001    use MPI_Isend/MPI_Irecv
- *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
- */
-
-#include "superlu_zdefs.h"
-
-#define ISEND_IRECV
-
-/*
- * Function prototypes
- */
-#ifdef _CRAY
-fortran void CTRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, doublecomplex*,
-		   doublecomplex*, int*, doublecomplex*, int*);
-fortran void SGEMM(_fcd, _fcd, int*, int*, int*, doublecomplex*, doublecomplex*, 
-		   int*, doublecomplex*, int*, doublecomplex*, doublecomplex*, int*);
-_fcd ftcs1;
-_fcd ftcs2;
-_fcd ftcs3;
-#endif
-
-
-void pzgstrs1(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid,
-	      doublecomplex *x, int nrhs, SuperLUStat_t *stat, int *info)
-{
-/*
- * Purpose
- * =======
- *
- * PZGSTRS1 solves a system of distributed linear equations
- *
- *                   op( sub(A) ) * X = sub( B )
- *
- * with a general N-by-N distributed matrix sub( A ) using the LU
- * factorization computed by PZGSTRF.
- * 
- * Arguments
- * =========
- *
- * n      (input) int (global)
- *        The order of the system of linear equations.
- *
- * LUstruct (input) LUstruct_t*
- *        The distributed data structures to store L and U factors,
- *        and the permutation vectors.
- *        See superlu_ddefs.h for the definition of 'LUstruct_t' structure.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh.
- *
- * x      (input/output) doublecomplex*
- *        On entry, the right hand side matrix.
- *        On exit, the solution matrix if info = 0;
- *
- *        NOTE: the right-hand side matrix is already distributed on
- *              the diagonal processes.
- *
- * nrhs   (input) int (global)
- *        Number of right-hand sides.
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics about the triangular solves; 
- *        See SuperLUStat_t structure defined in util.h.
- *
- * info   (output) int*
- * 	   = 0: successful exit
- *	   < 0: if info = -i, the i-th argument had an illegal value
- *        
- */
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-    doublecomplex alpha = {1.0, 0.0};
-    doublecomplex zero = {0.0, 0.0};
-    doublecomplex *lsum;  /* Local running sum of the updates to B-components */
-    doublecomplex *lusup, *dest;
-    doublecomplex *recvbuf, *tempv;
-    doublecomplex *rtemp; /* Result of full matrix-vector multiply. */
-    int_t  **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
-    int_t  *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */
-    Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
-    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
-    int_t  iam, kcol, krow, mycol, myrow;
-    int_t  i, ii, il, j, k, lb, ljb, lk, lptr, luptr;
-    int_t  nb, nlb, nub, nsupers;
-    int_t  *xsup, *lsub, *usub;
-    int_t  *ilsum;    /* Starting position of each supernode in lsum (LOCAL)*/
-    int_t  Pc, Pr;
-    int    knsupc, nsupr;
-    int    ldalsum;   /* Number of lsum entries locally owned. */
-    int    maxrecvsz, p, pi;
-    int_t  **Lrowind_bc_ptr;
-    doublecomplex **Lnzval_bc_ptr;
-    MPI_Status status;
-#ifdef ISEND_IRECV
-    MPI_Request *send_req, recv_req;
-#endif
-
-    /*-- Counts used for L-solve --*/
-    int_t  *fmod;         /* Modification count for L-solve. */
-    int_t  **fsendx_plist = Llu->fsendx_plist;
-    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
-    int_t  *frecv;        /* Count of modifications to be recv'd from
-			     processes in this row. */
-    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
-    int_t  nleaf = 0, nroot = 0;
-
-    /*-- Counts used for U-solve --*/
-    int_t  *bmod;         /* Modification count for L-solve. */
-    int_t  **bsendx_plist = Llu->bsendx_plist;
-    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
-    int_t  *brecv;        /* Count of modifications to be recv'd from
-			     processes in this row. */
-    int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
-    double t;
-#if ( DEBUGlevel>=2 )
-    int_t Ublocks = 0;
-#endif
-
-    t = SuperLU_timer_();
-
-    /* Test input parameters. */
-    *info = 0;
-    if ( n < 0 ) *info = -1;
-    else if ( nrhs < 0 ) *info = -8;
-    if ( *info ) {
-	pxerbla("PZGSTRS1", grid, -*info);
-	return;
-    }
-	
-    /*
-     * Initialization.
-     */
-    iam = grid->iam;
-    Pc = grid->npcol;
-    Pr = grid->nprow;
-    myrow = MYROW( iam, grid );
-    mycol = MYCOL( iam, grid );
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
-    nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
-    Llu->SolveMsgSent = 0;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pzgstrs1()");
-#endif
-
-    /* Save the count to be altered so it can be used by
-       subsequent call to PZGSTRS1. */
-    if ( !(fmod = intMalloc_dist(nlb)) )
-	ABORT("Calloc fails for fmod[].");
-    for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
-    if ( !(frecv = intMalloc_dist(nlb)) )
-	ABORT("Malloc fails for frecv[].");
-    Llu->frecv = frecv;
-
-#ifdef ISEND_IRECV
-    k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb;
-    if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) )
-	ABORT("Malloc fails for send_req[].");
-#endif
-
-#ifdef _CRAY
-    ftcs1 = _cptofcd("L", strlen("L"));
-    ftcs2 = _cptofcd("N", strlen("N"));
-    ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-
-
-    /* Compute ilsum[] and ldalsum for process column 0. */
-    ilsum = Llu->ilsum;
-    ldalsum = Llu->ldalsum;
-
-    /* Allocate working storage. */
-    knsupc = sp_ienv_dist(3);
-    if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum) * nrhs 
-        + nlb * LSUM_H)) )
-	ABORT("Calloc fails for lsum[].");
-    maxrecvsz = knsupc * nrhs + SUPERLU_MAX(XK_H, LSUM_H);
-    if ( !(recvbuf = doublecomplexMalloc_dist(maxrecvsz)) )
-	ABORT("Malloc fails for recvbuf[].");
-    if ( !(rtemp = doublecomplexCalloc_dist(maxrecvsz)) )
-	ABORT("Malloc fails for rtemp[].");
-
-    
-    /*---------------------------------------------------
-     * Forward solve Ly = b.
-     *---------------------------------------------------*/
-
-    /*
-     * Prepended the block number in the header for lsum[].
-     */
-    for (k = 0; k < nsupers; ++k) {
-	knsupc = SuperSize( k );
-	krow = PROW( k, grid );
-	if ( myrow == krow ) {
-	    lk = LBi( k, grid );   /* Local block number. */
-	    il = LSUM_BLK( lk );
-	    lsum[il - LSUM_H].r = k; 
-	    lsum[il - LSUM_H].i = 0; 
-	}
-    }
-
-    /*
-     * Compute frecv[] and nfrecvmod counts on the diagonal processes.
-     */
-    {
-	superlu_scope_t *scp = &grid->rscp;
-
-	for (k = 0; k < nsupers; ++k) {
-	    krow = PROW( k, grid );
-	    if ( myrow == krow ) {
-		lk = LBi( k, grid );    /* Local block number. */
-		kcol = PCOL( k, grid ); /* Root process in this row scope. */
-		if ( mycol != kcol && fmod[lk] )
-		    i = 1;  /* Contribution from non-diagonal process. */
-		else i = 0;
-		MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t,
-			   MPI_SUM, kcol, scp->comm );
-		if ( mycol == kcol ) { /* Diagonal process. */
-		    nfrecvmod += frecv[lk];
-		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) frecv[%4d]  %2d\n", iam, k, frecv[lk]);
-		    assert( frecv[lk] < Pc );
-#endif
-		}
-	    }
-	}
-    }
-
-    /* ---------------------------------------------------------
-       Solve the leaf nodes first by all the diagonal processes.
-       --------------------------------------------------------- */
-#if ( DEBUGlevel>=2 )
-    printf("(%2d) nleaf %4d\n", iam, nleaf);
-#endif
-    for (k = 0; k < nsupers && nleaf; ++k) {
-	krow = PROW( k, grid );
-	kcol = PCOL( k, grid );
-	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid );
-	    if ( !frecv[lk] && !fmod[lk] ) {
-		fmod[lk] = -1;  /* Do not solve X[k] in the future. */
-		ii = X_BLK( lk );
-		lk = LBj( k, grid ); /* Local block number, column-wise. */
-		lsub = Lrowind_bc_ptr[lk];
-		lusup = Lnzval_bc_ptr[lk];
-		nsupr = lsub[1];
-#ifdef _CRAY
-		CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
-		      lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#else
-		ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		/*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/
-		--nleaf;
-#if ( DEBUGlevel>=2 )
-		printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		
-		/*
-		 * Send Xk to process column Pc[k].
-		 */
-		for (p = 0; p < Pr; ++p)
-		    if ( fsendx_plist[lk][p] != EMPTY ) {
-			pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
-                                   &send_req[Llu->SolveMsgSent++]);
-#else
-			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				 SuperLU_MPI_DOUBLE_COMPLEX, 
-                                 pi, Xk, grid->comm );
-#endif
-#if ( DEBUGlevel>=2 )
-			printf("(%2d) Sent X[%2.0f] to P %2d\n",
-			       iam, x[ii-XK_H], pi);
-#endif
-		    }
-		
-		/*
-		 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
-		 */
-		nb = lsub[0] - 1;
-		lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
-		luptr = knsupc; /* Skip diagonal block L(k,k). */
-		
-		zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
-			   fmod, nb, lptr, luptr, xsup, grid, Llu,
-			   send_req, stat);
-	    }
-	} /* if diagonal process ... */
-    } /* for k ... */
-
-    /*
-     * Compute the internal nodes asynchronously by all processes.
-     */
-#if ( DEBUGlevel>=2 )
-    printf("(%2d) nfrecvx %4d,  nfrecvmod %4d,  nleaf %4d\n",
-	   iam, nfrecvx, nfrecvmod, nleaf);
-#endif
-
-    while ( nfrecvx || nfrecvmod ) { /* While not finished. */
-
-	/* Receive a message. */
-#ifdef ISEND_IRECV
-	/* -MPI- FATAL: Remote protocol queue full */
-	MPI_Irecv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE,
-		 MPI_ANY_TAG, grid->comm, &recv_req );
-	MPI_Wait( &recv_req, &status );
-#else
-	MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE,
-		 MPI_ANY_TAG, grid->comm, &status );
-#endif
-
-	k = (*recvbuf).r;
-
-#if ( DEBUGlevel>=2 )
-	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
-#endif
-	
-	switch ( status.MPI_TAG ) {
-	  case Xk:
-	      --nfrecvx;
-	      lk = LBj( k, grid ); /* Local block number, column-wise. */
-	      lsub = Lrowind_bc_ptr[lk];
-	      lusup = Lnzval_bc_ptr[lk];
-	      if ( lsub ) {
-		  nb   = lsub[0];
-		  lptr = BC_HEADER;
-		  luptr = 0;
-		  knsupc = SuperSize( k );
-
-		  /*
-		   * Perform local block modifications: lsum[i] -= L_i,k * X[k]
-		   */
-		  zlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k,
-			     fmod, nb, lptr, luptr, xsup, grid, Llu,
-			     send_req, stat);
-	      } /* if lsub */
-
-	      break;
-
-	  case LSUM:
-	      --nfrecvmod;
-	      lk = LBi( k, grid ); /* Local block number, row-wise. */
-	      ii = X_BLK( lk );
-	      knsupc = SuperSize( k );
-	      tempv = &recvbuf[LSUM_H];
-	      RHS_ITERATE(j)
-		  for (i = 0; i < knsupc; ++i)
-		      z_add(&x[i + ii + j*knsupc], &x[i + ii + j*knsupc],
-			    &tempv[i + j*knsupc]);
-
-	      if ( (--frecv[lk])==0 && fmod[lk]==0 ) {
-		  fmod[lk] = -1; /* Do not solve X[k] in the future. */
-		  lk = LBj( k, grid ); /* Local block number, column-wise. */
-		  lsub = Lrowind_bc_ptr[lk];
-		  lusup = Lnzval_bc_ptr[lk];
-		  nsupr = lsub[1];
-#ifdef _CRAY
-		  CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
-			lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		  ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-			 lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#else
-		  ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-			 lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		  /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/
-#if ( DEBUGlevel>=2 )
-		  printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		
-		  /*
-		   * Send Xk to process column Pc[k].
-		   */
-		  kcol = PCOL( k, grid );
-		  for (p = 0; p < Pr; ++p)
-		      if ( fsendx_plist[lk][p] != EMPTY ) {
-			  pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-			  MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                     SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
-				     &send_req[Llu->SolveMsgSent++] );
-#else
-			  MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
-#endif
-#if ( DEBUGlevel>=2 )
-			  printf("(%2d) Sent X[%2.0f] to P %2d\n",
-				 iam, x[ii-XK_H], pi);
-#endif
-		      }
-
-		  /*
-		   * Perform local block modifications.
-		   */
-		  nb = lsub[0] - 1;
-		  lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
-		  luptr = knsupc; /* Skip diagonal block L(k,k). */
-
-		  zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
-			     fmod, nb, lptr, luptr, xsup, grid, Llu,
-			     send_req, stat);
-	      } /* if */
-
-	      break;
-
-#if ( DEBUGlevel>=2 )
-	    default:
-	      printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG);
-	      break;
-#endif
-	  } /* switch */
-
-    } /* while not finished ... */
-
-
-#if ( PRNTlevel>=2 )
-    t = SuperLU_timer_() - t;
-    if ( !iam ) printf(".. L-solve time\t%8.2f\n", t);
-    t = SuperLU_timer_();
-#endif
-
-#if ( DEBUGlevel>=2 )
-    if ( !iam ) printf("\n.. After L-solve: y =\n");
-    for (i = 0, k = 0; k < nsupers; ++k) {
-	krow = PROW( k, grid );
-	kcol = PCOL( k, grid );
-	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid );
-	    ii = X_BLK( lk );
-	    for (j = 0; j < knsupc; ++j)
-		printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]);
-	}
-	MPI_Barrier( grid->comm );
-    }
-#endif
-
-    SUPERLU_FREE(fmod);
-    SUPERLU_FREE(frecv);
-    SUPERLU_FREE(rtemp);
-
-#ifdef ISEND_IRECV
-    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
-    Llu->SolveMsgSent = 0;
-#endif
-
-
-    /*---------------------------------------------------
-     * Back solve Ux = y.
-     *
-     * The Y components from the forward solve is already
-     * on the diagonal processes.
-     *---------------------------------------------------*/
-
-    /* Save the count to be altered so it can be used by
-       subsequent call to PZGSTRS1. */
-    if ( !(bmod = intMalloc_dist(nlb)) )
-	ABORT("Calloc fails for bmod[].");
-    for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
-    if ( !(brecv = intMalloc_dist(nlb)) )
-	ABORT("Malloc fails for brecv[].");
-    Llu->brecv = brecv;
-
-    /*
-     * Compute brecv[] and nbrecvmod counts on the diagonal processes.
-     */
-    {
-	superlu_scope_t *scp = &grid->rscp;
-
-	for (k = 0; k < nsupers; ++k) {
-	    krow = PROW( k, grid );
-	    if ( myrow == krow ) {
-		lk = LBi( k, grid );    /* Local block number. */
-		kcol = PCOL( k, grid ); /* Root process in this row scope. */
-		if ( mycol != kcol && bmod[lk] )
-		    i = 1;  /* Contribution from non-diagonal process. */
-		else i = 0;
-		MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t,
-			   MPI_SUM, kcol, scp->comm );
-		if ( mycol == kcol ) { /* Diagonal process. */
-		    nbrecvmod += brecv[lk];
-		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) brecv[%4d]  %2d\n", iam, k, brecv[lk]);
-		    assert( brecv[lk] < Pc );
-#endif
-		}
-	    }
-	}
-    }
-
-    /* Re-initialize lsum to zero. Each block header is already in place. */
-    for (k = 0; k < nsupers; ++k) {
-	krow = PROW( k, grid );
-	if ( myrow == krow ) {
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid );
-	    il = LSUM_BLK( lk );
-	    dest = &lsum[il];
-	    RHS_ITERATE(j)
-		for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero;
-	}
-    }
-
-    /* Set up additional pointers for the index and value arrays of U.
-       nlb is the number of local block rows. */
-    nub = CEILING( nsupers, Pc ); /* Number of local block columns. */
-    if ( !(Urbs = (int_t *) intCalloc_dist(2*((size_t)nub))) )
-	ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero
-					     blocks in a block column. */
-    Urbs1 = Urbs + nub;
-    if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) )
-        ABORT("Malloc fails for Ucb_indptr[]");
-    if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
-        ABORT("Malloc fails for Ucb_valptr[]");
-
-    /* Count number of row blocks in a block column. 
-       One pass of the skeleton graph of U. */
-    for (lk = 0; lk < nlb; ++lk) {
-	usub = Ufstnz_br_ptr[lk];
-	if ( usub ) { /* Not an empty block row. */
-	    /* usub[0] -- number of column blocks in this block row. */
-#if ( DEBUGlevel>=2 )
-	    Ublocks += usub[0];
-#endif
-	    i = BR_HEADER; /* Pointer in index array. */
-	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
-		k = usub[i];            /* Global block number */
-		++Urbs[LBj(k,grid)];
-		i += UB_DESCRIPTOR + SuperSize( k );
-	    }
-	}
-    }
-
-    /* Set up the vertical linked lists for the row blocks.
-       One pass of the skeleton graph of U. */
-    for (lb = 0; lb < nub; ++lb)
-	if ( Urbs[lb] ) { /* Not an empty block column. */
-	    if ( !(Ucb_indptr[lb]
-		   = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
-		ABORT("Malloc fails for Ucb_indptr[lb][]");
-	    if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
-		ABORT("Malloc fails for Ucb_valptr[lb][]");
-	}
-    for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
-	usub = Ufstnz_br_ptr[lk];
-	if ( usub ) { /* Not an empty block row. */
-	    i = BR_HEADER; /* Pointer in index array. */
-	    j = 0;         /* Pointer in nzval array. */
-	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
-		k = usub[i];          /* Global block number, column-wise. */
-		ljb = LBj( k, grid ); /* Local block number, column-wise. */
-		Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk;
-		Ucb_indptr[ljb][Urbs1[ljb]].indpos = i;
-		Ucb_valptr[ljb][Urbs1[ljb]] = j;
-		++Urbs1[ljb];
-		j += usub[i+1];
-		i += UB_DESCRIPTOR + SuperSize( k );
-	    }
-	}
-    }
-
-#if ( DEBUGlevel>=2 )
-    for (p = 0; p < Pr*Pc; ++p) {
-	if (iam == p) {
-	    printf("(%2d) .. Ublocks %d\n", iam, Ublocks);
-	    for (lb = 0; lb < nub; ++lb) {
-		printf("(%2d) Local col %2d: # row blocks %2d\n",
-		       iam, lb, Urbs[lb]);
-		if ( Urbs[lb] ) {
-		    for (i = 0; i < Urbs[lb]; ++i)
-			printf("(%2d) .. row blk %2d:\
-                               lbnum %d, indpos %d, valpos %d\n",
-			       iam, i, 
-			       Ucb_indptr[lb][i].lbnum,
-			       Ucb_indptr[lb][i].indpos,
-			       Ucb_valptr[lb][i]);
-		}
-	    }
-	}
-	MPI_Barrier( grid->comm );
-    }
-    for (p = 0; p < Pr*Pc; ++p) {
-	if ( iam == p ) {
-	    printf("\n(%d) bsendx_plist[][]", iam);
-	    for (lb = 0; lb < nub; ++lb) {
-		printf("\n(%d) .. local col %2d: ", iam, lb);
-		for (i = 0; i < Pr; ++i)
-		    printf("%4d", bsendx_plist[lb][i]);
-	    }
-	    printf("\n");
-	}
-	MPI_Barrier( grid->comm );
-    }
-#endif /* DEBUGlevel */
-
-
-#if ( PRNTlevel>=2 )
-    t = SuperLU_timer_() - t;
-    if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t);
-    t = SuperLU_timer_();
-#endif
-
-    /*
-     * Solve the roots first by all the diagonal processes.
-     */
-#if ( DEBUGlevel>=2 )
-    printf("(%2d) nroot %4d\n", iam, nroot);
-#endif
-    for (k = nsupers-1; k >= 0 && nroot; --k) {
-	krow = PROW( k, grid );
-	kcol = PCOL( k, grid );
-	if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid ); /* Local block number, row-wise. */
-	    if ( !brecv[lk] && !bmod[lk] ) {
-		bmod[lk] = -1;       /* Do not solve X[k] in the future. */
-		ii = X_BLK( lk );
-		lk = LBj( k, grid ); /* Local block number, column-wise */
-		lsub = Lrowind_bc_ptr[lk];
-		lusup = Lnzval_bc_ptr[lk];
-		nsupr = lsub[1];
-#ifdef _CRAY
-		CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
-		      lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#else
-		ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		/*stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;*/
-		--nroot;
-#if ( DEBUGlevel>=2 )
-		printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		/*
-		 * Send Xk to process column Pc[k].
-		 */
-		for (p = 0; p < Pr; ++p)
-		    if ( bsendx_plist[lk][p] != EMPTY ) {
-			pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
-				   &send_req[Llu->SolveMsgSent++] );
-#else
-			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				 SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
-#endif
-#if ( DEBUGlevel>=2 )
-			printf("(%2d) Sent X[%2.0f] to P %2d\n",
-			       iam, x[ii-XK_H], pi);
-#endif
-		    }
-		
-		/*
-		 * Perform local block modifications: lsum[i] -= U_i,k * X[k]
-		 */
-		if ( Urbs[lk] ) 
-		    zlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
-			       Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-			       send_req, stat);
-	    } /* if root ... */
-	} /* if diagonal process ... */
-    } /* for k ... */
-
-
-    /*
-     * Compute the internal nodes asychronously by all processes.
-     */
-    while ( nbrecvx || nbrecvmod ) { /* While not finished. */
-
-	/* Receive a message. */
-	MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, 
-                 MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
-	k = (*recvbuf).r;
-
-#if ( DEBUGlevel>=2 )
-	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
-#endif
-
-	switch ( status.MPI_TAG ) {
-	    case Xk:
-	        --nbrecvx;
-		lk = LBj( k, grid ); /* Local block number, column-wise. */
-		/*
-		 * Perform local block modifications:
-		 *         lsum[i] -= U_i,k * X[k]
-		 */
-		zlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs,
-			   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-			   send_req, stat);
-
-	        break;
-
-	    case LSUM:
-		--nbrecvmod;
-		lk = LBi( k, grid ); /* Local block number, row-wise. */
-		ii = X_BLK( lk );
-		knsupc = SuperSize( k );
-		tempv = &recvbuf[LSUM_H];
-		RHS_ITERATE(j)
-		    for (i = 0; i < knsupc; ++i)
-			z_add(&x[i + ii + j*knsupc], &x[i + ii + j*knsupc],
-			      &tempv[i + j*knsupc]);
-
-		if ( !(--brecv[lk]) && !bmod[lk] ) {
-		    bmod[lk] = -1; /* Do not solve X[k] in the future. */
-		    lk = LBj( k, grid ); /* Local block number, column-wise. */
-		    lsub = Lrowind_bc_ptr[lk];
-		    lusup = Lnzval_bc_ptr[lk];
-		    nsupr = lsub[1];
-#ifdef _CRAY
-		    CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
-			  lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		    ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-			   lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#else
-		    ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-			   lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		    /*stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;*/
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		    /*
-		     * Send Xk to process column Pc[k].
-		     */
-		    kcol = PCOL( k, grid );
-		    for (p = 0; p < Pr; ++p)
-			if ( bsendx_plist[lk][p] != EMPTY ) {
-			    pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-			    MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                       SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
-				       &send_req[Llu->SolveMsgSent++] );
-#else
-			    MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				     SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
-#endif
-#if ( DEBUGlevel>=2 )
-			    printf("(%2d) Sent X[%2.0f] to P %2d\n",
-				   iam, x[ii - XK_H], pi);
-#endif
-			}
-		
-		    /*
-		     * Perform local block modifications: 
-		     *         lsum[i] -= U_i,k * X[k]
-		     */
-		    if ( Urbs[lk] )
-			zlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
-				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-				   send_req, stat);
-		} /* if becomes solvable */
-		
-		break;
-
-#if ( DEBUGlevel>=2 )
-	      default:
-		printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG);
-		break;
-#endif		
-
-	} /* switch */
-
-    } /* while not finished ... */
-
-#if ( PRNTlevel>=2 )
-    t = SuperLU_timer_() - t;
-    if ( !iam ) printf(".. U-solve time\t%8.2f\n", t);
-#endif
-
-    stat->utime[SOLVE] = SuperLU_timer_() - t;
-
-    /* Deallocate storage. */
-
-    SUPERLU_FREE(lsum);
-    SUPERLU_FREE(recvbuf);
-    for (i = 0; i < nub; ++i)
-	if ( Urbs[i] ) {
-	    SUPERLU_FREE(Ucb_indptr[i]);
-	    SUPERLU_FREE(Ucb_valptr[i]);
-	}
-    SUPERLU_FREE(Ucb_indptr);
-    SUPERLU_FREE(Ucb_valptr);
-    SUPERLU_FREE(Urbs);
-    SUPERLU_FREE(bmod);
-    SUPERLU_FREE(brecv);
-#ifdef ISEND_IRECV
-    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
-    SUPERLU_FREE(send_req);
-#endif
-    
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pzgstrs1()");
-#endif
-
-} /* PZGSTRS1 */
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgstrs_Bglobal.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgstrs_Bglobal.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgstrs_Bglobal.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgstrs_Bglobal.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,965 +0,0 @@
-
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- * Modified:
- *     Feburary 7, 2001    use MPI_Isend/MPI_Irecv
- *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
- */
-
-#include "superlu_zdefs.h"
-
-#define ISEND_IRECV
-
-/*
- * Function prototypes
- */
-#ifdef _CRAY
-fortran void CTRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, doublecomplex*,
-		   doublecomplex*, int*, doublecomplex*, int*);
-fortran void CGEMM(_fcd, _fcd, int*, int*, int*, doublecomplex*, doublecomplex*, 
-		   int*, doublecomplex*, int*, doublecomplex*, doublecomplex*, int*);
-_fcd ftcs1;
-_fcd ftcs2;
-_fcd ftcs3;
-#endif
-static void gather_diag_to_all(int_t, int_t, doublecomplex [], Glu_persist_t *,
-                               LocalLU_t *, gridinfo_t *, int_t, int_t [],
-                               int_t [], doublecomplex [], int_t, doublecomplex []);
-
-
-void
-pzgstrs_Bglobal(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, 
-                doublecomplex *B, int_t ldb, int nrhs, 
-                SuperLUStat_t *stat, int *info)
-{
-/*
- * Purpose
- * =======
- *
- * pzgstrs_Bglobal solves a system of distributed linear equations
- * A*X = B with a general N-by-N matrix A using the LU factorization
- * computed by pzgstrf.
- * 
- * Arguments
- * =========
- *
- * n      (input) int (global)
- *        The order of the system of linear equations.
- *
- * LUstruct (input) LUstruct_t*
- *        The distributed data structures storing L and U factors.
- *        The L and U factors are obtained from pzgstrf for
- *        the possibly scaled and permuted matrix A.
- *        See superlu_ddefs.h for the definition of 'LUstruct_t'.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh. It contains the MPI communicator, the number
- *        of process rows (NPROW), the number of process columns (NPCOL),
- *        and my process rank. It is an input argument to all the
- *        parallel routines.
- *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
- *        See superlu_ddefs.h for the definition of 'gridinfo_t'.
- *
- * B      (input/output) doublecomplex*
- *        On entry, the right-hand side matrix of the possibly equilibrated
- *        and row permuted system.
- *        On exit, the solution matrix of the possibly equilibrated
- *        and row permuted system if info = 0;
- *
- *        NOTE: Currently, the N-by-NRHS  matrix B must reside on all 
- *              processes when calling this routine.
- *
- * ldb    (input) int (global)
- *        Leading dimension of matrix B.
- *
- * nrhs   (input) int (global)
- *        Number of right-hand sides.
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics about the triangular solves.
- *        See util.h for the definition of 'SuperLUStat_t'.
- *
- * info   (output) int*
- * 	   = 0: successful exit
- *	   < 0: if info = -i, the i-th argument had an illegal value
- *        
- */
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-    doublecomplex alpha = {1.0, 0.0};
-    doublecomplex zero = {0.0, 0.0};
-    doublecomplex *lsum;  /* Local running sum of the updates to B-components */
-    doublecomplex *x;     /* X component at step k. */
-    doublecomplex *lusup, *dest;
-    doublecomplex *recvbuf, *tempv;
-    doublecomplex *rtemp; /* Result of full matrix-vector multiply. */
-    int_t  **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
-    int_t  *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */
-    Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
-    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
-    int_t  iam, kcol, krow, mycol, myrow;
-    int_t  i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr;
-    int_t  nb, nlb, nub, nsupers;
-    int_t  *xsup, *lsub, *usub;
-    int_t  *ilsum;    /* Starting position of each supernode in lsum (LOCAL)*/
-    int_t  Pc, Pr;
-    int    knsupc, nsupr;
-    int    ldalsum;   /* Number of lsum entries locally owned. */
-    int    maxrecvsz, p, pi;
-    int_t  **Lrowind_bc_ptr;
-    doublecomplex **Lnzval_bc_ptr;
-    MPI_Status status;
-#if defined (ISEND_IRECV) || defined (BSEND)
-    MPI_Request *send_req, recv_req;
-#endif
-
-    /*-- Counts used for L-solve --*/
-    int_t  *fmod;         /* Modification count for L-solve. */
-    int_t  **fsendx_plist = Llu->fsendx_plist;
-    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
-    int_t  *frecv;        /* Count of modifications to be recv'd from
-			     processes in this row. */
-    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
-    int_t  nleaf = 0, nroot = 0;
-
-    /*-- Counts used for U-solve --*/
-    int_t  *bmod;         /* Modification count for L-solve. */
-    int_t  **bsendx_plist = Llu->bsendx_plist;
-    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
-    int_t  *brecv;        /* Count of modifications to be recv'd from
-			     processes in this row. */
-    int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
-    double t;
-#if ( DEBUGlevel>=2 )
-    int_t Ublocks = 0;
-#endif
-
-    t = SuperLU_timer_();
-
-    /* Test input parameters. */
-    *info = 0;
-    if ( n < 0 ) *info = -1;
-    else if ( nrhs < 0 ) *info = -9;
-    if ( *info ) {
-	pxerbla("PDGSTRS_BGLOBAL", grid, -*info);
-	return;
-    }
-	
-    /*
-     * Initialization.
-     */
-    iam = grid->iam;
-    Pc = grid->npcol;
-    Pr = grid->nprow;
-    myrow = MYROW( iam, grid );
-    mycol = MYCOL( iam, grid );
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
-    nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
-    stat->ops[SOLVE] = 0.0;
-    Llu->SolveMsgSent = 0;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pzgstrs_Bglobal()");
-#endif
-
-    /* Save the count to be altered so it can be used by
-       subsequent call to PDGSTRS_BGLOBAL. */
-    if ( !(fmod = intMalloc_dist(nlb)) )
-	ABORT("Calloc fails for fmod[].");
-    for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
-    if ( !(frecv = intMalloc_dist(nlb)) )
-	ABORT("Malloc fails for frecv[].");
-    Llu->frecv = frecv;
-
-#if defined (ISEND_IRECV) || defined (BSEND)
-    k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb;
-    if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) )
-	ABORT("Malloc fails for send_req[].");
-#endif
-
-#ifdef _CRAY
-    ftcs1 = _cptofcd("L", strlen("L"));
-    ftcs2 = _cptofcd("N", strlen("N"));
-    ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-
-
-    /* Obtain ilsum[] and ldalsum for process column 0. */
-    ilsum = Llu->ilsum;
-    ldalsum = Llu->ldalsum;
-
-    /* Allocate working storage. */
-    knsupc = sp_ienv_dist(3);
-    maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H );
-    if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum) * nrhs 
-        + nlb * LSUM_H)) )
-	ABORT("Calloc fails for lsum[].");
-    if ( !(x = doublecomplexMalloc_dist(((size_t)ldalsum) * nrhs 
-        + nlb * XK_H)) )
-	ABORT("Malloc fails for x[].");
-    if ( !(recvbuf = doublecomplexMalloc_dist(maxrecvsz)) )
-	ABORT("Malloc fails for recvbuf[].");
-    if ( !(rtemp = doublecomplexCalloc_dist(maxrecvsz)) )
-	ABORT("Malloc fails for rtemp[].");
-
-    
-    /*---------------------------------------------------
-     * Forward solve Ly = b.
-     *---------------------------------------------------*/
-
-    /*
-     * Copy B into X on the diagonal processes.
-     */
-    ii = 0;
-    for (k = 0; k < nsupers; ++k) {
-	knsupc = SuperSize( k );
-	krow = PROW( k, grid );
-	if ( myrow == krow ) {
-	    lk = LBi( k, grid );   /* Local block number. */
-	    il = LSUM_BLK( lk );
-	    lsum[il - LSUM_H].r = k;/* Block number prepended in the header. */
-	    lsum[il - LSUM_H].i = 0;
-	    kcol = PCOL( k, grid );
-	    if ( mycol == kcol ) { /* Diagonal process. */
-		jj = X_BLK( lk );
-		x[jj - XK_H].r = k; /* Block number prepended in the header. */
-		x[jj - XK_H].i = 0;
-		RHS_ITERATE(j)
-		    for (i = 0; i < knsupc; ++i) /* X is stored in blocks. */
-			x[i + jj + j*knsupc] = B[i + ii + j*ldb];
-	    }
-	}
-	ii += knsupc;
-    }
-
-    /*
-     * Compute frecv[] and nfrecvmod counts on the diagonal processes.
-     */
-    {
-	superlu_scope_t *scp = &grid->rscp;
-
-	for (k = 0; k < nsupers; ++k) {
-	    krow = PROW( k, grid );
-	    if ( myrow == krow ) {
-		lk = LBi( k, grid );    /* Local block number. */
-		kcol = PCOL( k, grid ); /* Root process in this row scope. */
-		if ( mycol != kcol && fmod[lk] )
-		    i = 1;  /* Contribution from non-diagonal process. */
-		else i = 0;
-		MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t,
-			   MPI_SUM, kcol, scp->comm );
-		if ( mycol == kcol ) { /* Diagonal process. */
-		    nfrecvmod += frecv[lk];
-		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) frecv[%4d]  %2d\n", iam, k, frecv[lk]);
-		    assert( frecv[lk] < Pc );
-#endif
-		}
-	    }
-	}
-    }
-
-    /* ---------------------------------------------------------
-       Solve the leaf nodes first by all the diagonal processes.
-       --------------------------------------------------------- */
-#if ( DEBUGlevel>=2 )
-    printf("(%2d) nleaf %4d\n", iam, nleaf);
-#endif
-    for (k = 0; k < nsupers && nleaf; ++k) {
-	krow = PROW( k, grid );
-	kcol = PCOL( k, grid );
-	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid );
-	    if ( frecv[lk]==0 && fmod[lk]==0 ) {
-		fmod[lk] = -1;  /* Do not solve X[k] in the future. */
-		ii = X_BLK( lk );
-		lk = LBj( k, grid ); /* Local block number, column-wise. */
-		lsub = Lrowind_bc_ptr[lk];
-		lusup = Lnzval_bc_ptr[lk];
-		nsupr = lsub[1];
-#ifdef _CRAY
-		CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
-		      lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#else
-		ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs
-		    + 10 * knsupc * nrhs; /* complex division */
-		--nleaf;
-#if ( DEBUGlevel>=2 )
-		printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		
-		/*
-		 * Send Xk to process column Pc[k].
-		 */
-		for (p = 0; p < Pr; ++p) {
-		    if ( fsendx_plist[lk][p] != EMPTY ) {
-			pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
-                                   &send_req[Llu->SolveMsgSent++]);
-#else
-#ifdef BSEND
-			MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
-#else
-
-			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				  SuperLU_MPI_DOUBLE_COMPLEX, 
-                                  pi, Xk, grid->comm );
-#endif
-#endif
-#if ( DEBUGlevel>=2 )
-			printf("(%2d) Sent X[%2.0f] to P %2d\n",
-			       iam, x[ii-XK_H], pi);
-#endif
-		    }
-		}
-		/*
-		 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
-		 */
-		nb = lsub[0] - 1;
-		lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
-		luptr = knsupc; /* Skip diagonal block L(k,k). */
-		
-		zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
-			   fmod, nb, lptr, luptr, xsup, grid, Llu, 
-			   send_req,stat);
-	    }
-	} /* if diagonal process ... */
-    } /* for k ... */
-
-    /* -----------------------------------------------------------
-       Compute the internal nodes asynchronously by all processes.
-       ----------------------------------------------------------- */
-#if ( DEBUGlevel>=2 )
-    printf("(%2d) nfrecvx %4d,  nfrecvmod %4d,  nleaf %4d\n",
-	   iam, nfrecvx, nfrecvmod, nleaf);
-#endif
-
-    while ( nfrecvx || nfrecvmod ) { /* While not finished. */
-
-	/* Receive a message. */
-#if 1
-	MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX,
-		 MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
-#else
-	/* -MPI- FATAL: Remote protocol queue full */
-	MPI_Irecv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX,
-		  MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &request );
-	MPI_Wait( &request, &status );
-#endif
-
-	k = (*recvbuf).r;
-
-
-
-#if ( DEBUGlevel>=2 )
-	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
-#endif
-	
-	switch ( status.MPI_TAG ) {
-	  case Xk:
-	      --nfrecvx;
-	      lk = LBj( k, grid ); /* Local block number, column-wise. */
-	      lsub = Lrowind_bc_ptr[lk];
-	      lusup = Lnzval_bc_ptr[lk];
-	      if ( lsub ) {
-		  nb   = lsub[0];
-		  lptr = BC_HEADER;
-		  luptr = 0;
-		  knsupc = SuperSize( k );
-
-		  /*
-		   * Perform local block modifications: lsum[i] -= L_i,k * X[k]
-		   */
-		  zlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k,
-			     fmod, nb, lptr, luptr, xsup, grid, Llu, 
-			     send_req, stat);
-	      } /* if lsub */
-
-	      break;
-
-	  case LSUM: /* Receiver must be a diagonal process */
-	      --nfrecvmod;
-	      lk = LBi( k, grid ); /* Local block number, row-wise. */
-	      ii = X_BLK( lk );
-	      knsupc = SuperSize( k );
-	      tempv = &recvbuf[LSUM_H];
-	      RHS_ITERATE(j)
-		  for (i = 0; i < knsupc; ++i)
-		      z_add(&x[i + ii + j*knsupc],
-			    &x[i + ii + j*knsupc],
-			    &tempv[i + j*knsupc]);
-
-	      if ( (--frecv[lk])==0 && fmod[lk]==0 ) {
-		  fmod[lk] = -1; /* Do not solve X[k] in the future. */
-		  lk = LBj( k, grid ); /* Local block number, column-wise. */
-		  lsub = Lrowind_bc_ptr[lk];
-		  lusup = Lnzval_bc_ptr[lk];
-		  nsupr = lsub[1];
-#ifdef _CRAY
-		  CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
-			lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		  ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-			 lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#else
-		  ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-			 lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		  stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs
-		      + 10 * knsupc * nrhs; /* complex division */
-
-#if ( DEBUGlevel>=2 )
-		  printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		
-		  /*
-		   * Send Xk to process column Pc[k].
-		   */
-		  kcol = PCOL( k, grid );
-		  for (p = 0; p < Pr; ++p) {
-		      if ( fsendx_plist[lk][p] != EMPTY ) {
-			  pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-			  MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				     SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, 
-				     &send_req[Llu->SolveMsgSent++]);
-#else
-#ifdef BSEND
-			  MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                     SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
-#else
-			  MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
-#endif
-#endif
-#if ( DEBUGlevel>=2 )
-			  printf("(%2d) Sent X[%2.0f] to P %2d\n",
-				 iam, x[ii-XK_H], pi);
-#endif
-		      }
-                  }
-		  /*
-		   * Perform local block modifications.
-		   */
-		  nb = lsub[0] - 1;
-		  lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
-		  luptr = knsupc; /* Skip diagonal block L(k,k). */
-
-		  zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
-			     fmod, nb, lptr, luptr, xsup, grid, Llu,
-			     send_req, stat);
-	      } /* if */
-
-	      break;
-
-#if ( DEBUGlevel>=2 )	      
-	    default:
-	      printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG);
-	      break;
-#endif
-	  } /* switch */
-
-    } /* while not finished ... */
-
-
-#if ( PRNTlevel>=2 )
-    t = SuperLU_timer_() - t;
-    if ( !iam ) printf(".. L-solve time\t%8.2f\n", t);
-    t = SuperLU_timer_();
-#endif
-
-#if ( DEBUGlevel>=2 )
-    printf("\n(%d) .. After L-solve: y =\n", iam);
-    for (i = 0, k = 0; k < nsupers; ++k) {
-	krow = PROW( k, grid );
-	kcol = PCOL( k, grid );
-	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid );
-	    ii = X_BLK( lk );
-	    for (j = 0; j < knsupc; ++j)
-		printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]);
-	}
-	MPI_Barrier( grid->comm );
-    }
-#endif
-
-    SUPERLU_FREE(fmod);
-    SUPERLU_FREE(frecv);
-    SUPERLU_FREE(rtemp);
-
-#ifdef ISEND_IRECV
-    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
-    Llu->SolveMsgSent = 0;
-#endif
-
-
-    /*---------------------------------------------------
-     * Back solve Ux = y.
-     *
-     * The Y components from the forward solve is already
-     * on the diagonal processes.
-     *---------------------------------------------------*/
-
-    /* Save the count to be altered so it can be used by
-       subsequent call to PDGSTRS_BGLOBAL. */
-    if ( !(bmod = intMalloc_dist(nlb)) )
-	ABORT("Calloc fails for bmod[].");
-    for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
-    if ( !(brecv = intMalloc_dist(nlb)) )
-	ABORT("Malloc fails for brecv[].");
-    Llu->brecv = brecv;
-
-    /*
-     * Compute brecv[] and nbrecvmod counts on the diagonal processes.
-     */
-    {
-	superlu_scope_t *scp = &grid->rscp;
-
-	for (k = 0; k < nsupers; ++k) {
-	    krow = PROW( k, grid );
-	    if ( myrow == krow ) {
-		lk = LBi( k, grid );    /* Local block number. */
-		kcol = PCOL( k, grid ); /* Root process in this row scope. */
-		if ( mycol != kcol && bmod[lk] )
-		    i = 1;  /* Contribution from non-diagonal process. */
-		else i = 0;
-		MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t,
-			   MPI_SUM, kcol, scp->comm );
-		if ( mycol == kcol ) { /* Diagonal process. */
-		    nbrecvmod += brecv[lk];
-		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) brecv[%4d]  %2d\n", iam, k, brecv[lk]);
-		    assert( brecv[lk] < Pc );
-#endif
-		}
-	    }
-	}
-    }
-
-    /* Re-initialize lsum to zero. Each block header is already in place. */
-    for (k = 0; k < nsupers; ++k) {
-	krow = PROW( k, grid );
-	if ( myrow == krow ) {
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid );
-	    il = LSUM_BLK( lk );
-	    dest = &lsum[il];
-	    RHS_ITERATE(j)
-		for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero;
-	}
-    }
-
-    /* Set up additional pointers for the index and value arrays of U.
-       nub is the number of local block columns. */
-    nub = CEILING( nsupers, Pc ); /* Number of local block columns. */
-    if ( !(Urbs = (int_t *) intCalloc_dist(2*((size_t)nub))) )
-	ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero
-					     blocks in a block column. */
-    Urbs1 = Urbs + nub;
-    if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) )
-        ABORT("Malloc fails for Ucb_indptr[]");
-    if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
-        ABORT("Malloc fails for Ucb_valptr[]");
-
-    /* Count number of row blocks in a block column. 
-       One pass of the skeleton graph of U. */
-    for (lk = 0; lk < nlb; ++lk) {
-	usub = Ufstnz_br_ptr[lk];
-	if ( usub ) { /* Not an empty block row. */
-	    /* usub[0] -- number of column blocks in this block row. */
-#if ( DEBUGlevel>=2 )
-	    Ublocks += usub[0];
-#endif
-	    i = BR_HEADER; /* Pointer in index array. */
-	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
-		k = usub[i];            /* Global block number */
-		++Urbs[LBj(k,grid)];
-		i += UB_DESCRIPTOR + SuperSize( k );
-	    }
-	}
-    }
-
-    /* Set up the vertical linked lists for the row blocks.
-       One pass of the skeleton graph of U. */
-    for (lb = 0; lb < nub; ++lb) {
-	if ( Urbs[lb] ) { /* Not an empty block column. */
-	    if ( !(Ucb_indptr[lb]
-		   = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
-		ABORT("Malloc fails for Ucb_indptr[lb][]");
-	    if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
-		ABORT("Malloc fails for Ucb_valptr[lb][]");
-	}
-    }
-    for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
-	usub = Ufstnz_br_ptr[lk];
-	if ( usub ) { /* Not an empty block row. */
-	    i = BR_HEADER; /* Pointer in index array. */
-	    j = 0;         /* Pointer in nzval array. */
-	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
-		k = usub[i];          /* Global block number, column-wise. */
-		ljb = LBj( k, grid ); /* Local block number, column-wise. */
-		Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk;
-		Ucb_indptr[ljb][Urbs1[ljb]].indpos = i;
-		Ucb_valptr[ljb][Urbs1[ljb]] = j;
-		++Urbs1[ljb];
-		j += usub[i+1];
-		i += UB_DESCRIPTOR + SuperSize( k );
-	    }
-	}
-    }
-
-#if ( DEBUGlevel>=2 )
-    for (p = 0; p < Pr*Pc; ++p) {
-	if (iam == p) {
-	    printf("(%2d) .. Ublocks %d\n", iam, Ublocks);
-	    for (lb = 0; lb < nub; ++lb) {
-		printf("(%2d) Local col %2d: # row blocks %2d\n",
-		       iam, lb, Urbs[lb]);
-		if ( Urbs[lb] ) {
-		    for (i = 0; i < Urbs[lb]; ++i)
-			printf("(%2d) .. row blk %2d:\
-                               lbnum %d, indpos %d, valpos %d\n",
-			       iam, i, 
-			       Ucb_indptr[lb][i].lbnum,
-			       Ucb_indptr[lb][i].indpos,
-			       Ucb_valptr[lb][i]);
-		}
-	    }
-	}
-	MPI_Barrier( grid->comm );
-    }
-    for (p = 0; p < Pr*Pc; ++p) {
-	if ( iam == p ) {
-	    printf("\n(%d) bsendx_plist[][]", iam);
-	    for (lb = 0; lb < nub; ++lb) {
-		printf("\n(%d) .. local col %2d: ", iam, lb);
-		for (i = 0; i < Pr; ++i)
-		    printf("%4d", bsendx_plist[lb][i]);
-	    }
-	    printf("\n");
-	}
-	MPI_Barrier( grid->comm );
-    }
-#endif /* DEBUGlevel */
-
-
-#if ( PRNTlevel>=2 )
-    t = SuperLU_timer_() - t;
-    if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t);
-    t = SuperLU_timer_();
-#endif
-
-    /*
-     * Solve the roots first by all the diagonal processes.
-     */
-#if ( DEBUGlevel>=2 )
-    printf("(%2d) nroot %4d\n", iam, nroot);
-#endif
-    for (k = nsupers-1; k >= 0 && nroot; --k) {
-	krow = PROW( k, grid );
-	kcol = PCOL( k, grid );
-	if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid ); /* Local block number, row-wise. */
-	    if ( brecv[lk]==0 && bmod[lk]==0 ) {
-		bmod[lk] = -1;       /* Do not solve X[k] in the future. */
-		ii = X_BLK( lk );
-		lk = LBj( k, grid ); /* Local block number, column-wise */
-		lsub = Lrowind_bc_ptr[lk];
-		lusup = Lnzval_bc_ptr[lk];
-		nsupr = lsub[1];
-#ifdef _CRAY
-		CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
-		      lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#else
-		ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		stat->ops[SOLVE] += 4 * knsupc * (knsupc + 1) * nrhs
-		    + 10 * knsupc * nrhs; /* complex division */
-		--nroot;
-#if ( DEBUGlevel>=2 )
-		printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		/*
-		 * Send Xk to process column Pc[k].
-		 */
-		for (p = 0; p < Pr; ++p) {
-		    if ( bsendx_plist[lk][p] != EMPTY ) {
-			pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
-                                   &send_req[Llu->SolveMsgSent++]);
-#else
-#ifdef BSEND
-			MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
-#else
-			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                  SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
-#endif
-#endif
-#if ( DEBUGlevel>=2 )
-			printf("(%2d) Sent X[%2.0f] to P %2d\n",
-			       iam, x[ii-XK_H], pi);
-#endif
-		    }
-		}
-		/*
-		 * Perform local block modifications: lsum[i] -= U_i,k * X[k]
-		 */
-		if ( Urbs[lk] ) 
-		    zlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
-			       Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-			       send_req, stat);
-	    } /* if root ... */
-	} /* if diagonal process ... */
-    } /* for k ... */
-
-
-    /*
-     * Compute the internal nodes asychronously by all processes.
-     */
-    while ( nbrecvx || nbrecvmod ) { /* While not finished. */
-
-	/* Receive a message. */
-	MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE,
-		 MPI_ANY_TAG, grid->comm, &status );
-	
-	k = (*recvbuf).r;
-
-#if ( DEBUGlevel>=2 )
-	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
-#endif
-
-	switch ( status.MPI_TAG ) {
-	    case Xk:
-	        --nbrecvx;
-		lk = LBj( k, grid ); /* Local block number, column-wise. */
-		/*
-		 * Perform local block modifications:
-		 *         lsum[i] -= U_i,k * X[k]
-		 */
-		zlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs,
-			   Ucb_indptr, Ucb_valptr, xsup, grid, Llu, 
-			   send_req, stat);
-
-	        break;
-
-	    case LSUM: /* Receiver must be a diagonal process */
-		--nbrecvmod;
-		lk = LBi( k, grid ); /* Local block number, row-wise. */
-		ii = X_BLK( lk );
-		knsupc = SuperSize( k );
-		tempv = &recvbuf[LSUM_H];
-		RHS_ITERATE(j)
-		    for (i = 0; i < knsupc; ++i)
-			z_add(&x[i + ii + j*knsupc],
-			      &x[i + ii + j*knsupc],
-			      &tempv[i + j*knsupc]);
-
-		if ( (--brecv[lk])==0 && bmod[lk]==0 ) {
-		    bmod[lk] = -1; /* Do not solve X[k] in the future. */
-		    lk = LBj( k, grid ); /* Local block number, column-wise. */
-		    lsub = Lrowind_bc_ptr[lk];
-		    lusup = Lnzval_bc_ptr[lk];
-		    nsupr = lsub[1];
-#ifdef _CRAY
-		    CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
-			  lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		    ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-			   lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#else
-		    ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-			   lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		    stat->ops[SOLVE] += 4 * knsupc * (knsupc + 1) * nrhs
-			+ 10 * knsupc * nrhs; /* complex division */
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		    /*
-		     * Send Xk to process column Pc[k].
-		     */
-		    kcol = PCOL( k, grid );
-		    for (p = 0; p < Pr; ++p) {
-			if ( bsendx_plist[lk][p] != EMPTY ) {
-			    pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-			    MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                       SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
-				       &send_req[Llu->SolveMsgSent++] );
-#else
-#ifdef BSEND
-			    MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                       SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
-#else
-			    MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                      SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
-#endif
-#endif
-#if ( DEBUGlevel>=2 )
-			    printf("(%2d) Sent X[%2.0f] to P %2d\n",
-				   iam, x[ii - XK_H], pi);
-#endif
-			}
-		    }
-		    /*
-		     * Perform local block modifications: 
-		     *         lsum[i] -= U_i,k * X[k]
-		     */
-		    if ( Urbs[lk] )
-			zlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
-				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-				   send_req, stat);
-		} /* if becomes solvable */
-		
-		break;
-
-#if ( DEBUGlevel>=2 )
-	      default:
-		printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG);
-		break;
-#endif		
-
-	} /* switch */
-
-    } /* while not finished ... */
-
-#if ( PRNTlevel>=2 )
-    t = SuperLU_timer_() - t;
-    if ( !iam ) printf(".. U-solve time\t%8.2f\n", t);
-#endif
-
-
-    /* Copy the solution X into B (on all processes). */
-    {
-	int_t num_diag_procs, *diag_procs, *diag_len;
-	doublecomplex *work;
-
-	get_diag_procs(n, Glu_persist, grid, &num_diag_procs,
-		       &diag_procs, &diag_len);
-	jj = diag_len[0];
-	for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX(jj, diag_len[j]);
-	if ( !(work = doublecomplexMalloc_dist(((size_t)jj)*nrhs)) )
-	    ABORT("Malloc fails for work[]");
-	gather_diag_to_all(n, nrhs, x, Glu_persist, Llu,
-			   grid, num_diag_procs, diag_procs, diag_len,
-			   B, ldb, work);
-	SUPERLU_FREE(diag_procs);
-	SUPERLU_FREE(diag_len);
-	SUPERLU_FREE(work);
-    }
-
-    /* Deallocate storage. */
-
-    SUPERLU_FREE(lsum);
-    SUPERLU_FREE(x);
-    SUPERLU_FREE(recvbuf);
-    for (i = 0; i < nub; ++i)
-	if ( Urbs[i] ) {
-	    SUPERLU_FREE(Ucb_indptr[i]);
-	    SUPERLU_FREE(Ucb_valptr[i]);
-	}
-    SUPERLU_FREE(Ucb_indptr);
-    SUPERLU_FREE(Ucb_valptr);
-    SUPERLU_FREE(Urbs);
-    SUPERLU_FREE(bmod);
-    SUPERLU_FREE(brecv);
-#ifdef ISEND_IRECV
-    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
-    SUPERLU_FREE(send_req);
-#endif
-#ifdef BSEND
-    SUPERLU_FREE(send_req);
-#endif
-
-    stat->utime[SOLVE] = SuperLU_timer_() - t;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pzgstrs_Bglobal()");
-#endif
-
-} /* PZGSTRS_BGLOBAL */
-
-
-/*
- * Gather the components of x vector on the diagonal processes
- * onto all processes, and combine them into the global vector y.
- */
-static void
-gather_diag_to_all(int_t n, int_t nrhs, doublecomplex x[],
-		   Glu_persist_t *Glu_persist, LocalLU_t *Llu,
-		   gridinfo_t *grid, int_t num_diag_procs,
-		   int_t diag_procs[], int_t diag_len[],
-		   doublecomplex y[], int_t ldy, doublecomplex work[])
-{
-    int_t i, ii, j, k, lk, lwork, nsupers, p;
-    int_t *ilsum, *xsup;
-    int iam, knsupc, pkk;
-    doublecomplex *x_col, *y_col;
-    
-    iam = grid->iam;
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    ilsum = Llu->ilsum;
-
-    for (p = 0; p < num_diag_procs; ++p) {
-	pkk = diag_procs[p];
-	if ( iam == pkk ) {
-	    /* Copy x vector into a buffer. */
-	    lwork = 0;
-	    for (k = p; k < nsupers; k += num_diag_procs) {
-		knsupc = SuperSize( k );
-		lk = LBi( k, grid );
-		ii = X_BLK( lk ); /*ilsum[lk] + (lk+1)*XK_H;*/
-		x_col = &x[ii];
-		for (j = 0; j < nrhs; ++j) {
-		    for (i = 0; i < knsupc; ++i) work[i+lwork] = x_col[i];
-		    lwork += knsupc;
-		    x_col += knsupc;
-		}
-	    }
-	    MPI_Bcast( work, lwork, SuperLU_MPI_DOUBLE_COMPLEX, pkk, grid->comm );
-	} else {
-	    MPI_Bcast( work, diag_len[p]*nrhs, SuperLU_MPI_DOUBLE_COMPLEX, pkk, grid->comm );
-	}
-	/* Scatter work[] into global y vector. */
-	lwork = 0;
-	for (k = p; k < nsupers; k += num_diag_procs) {
-	    knsupc = SuperSize( k );
-	    ii = FstBlockC( k );
-	    y_col = &y[ii];
-	    for (j = 0; j < nrhs; ++j) {
-		for (i = 0; i < knsupc; ++i) y_col[i] = work[i+lwork];
-		lwork += knsupc;
-		y_col += ldy;
-	    }
-	}
-    }
-} /* GATHER_DIAG_TO_ALL */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgstrs.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgstrs.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgstrs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgstrs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,1201 +0,0 @@
-
-/*
- * -- Distributed SuperLU routine (version 2.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * March 15, 2003
- *
- */
-
-#include "superlu_zdefs.h"
-
-#define ISEND_IRECV
-
-/*
- * Function prototypes
- */
-#ifdef _CRAY
-fortran void CTRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, doublecomplex*,
-		   doublecomplex*, int*, doublecomplex*, int*);
-_fcd ftcs1;
-_fcd ftcs2;
-_fcd ftcs3;
-#endif
-
-
-int_t
-pzReDistribute_B_to_X(doublecomplex *B, int_t m_loc, int nrhs, int_t ldb,
-                      int_t fst_row, int_t *ilsum, doublecomplex *x,
-		      ScalePermstruct_t *ScalePermstruct,
-		      Glu_persist_t *Glu_persist,
-		      gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct)
-{
-/*
- * Purpose
- * =======
- *   Re-distribute B on the diagonal processes of the 2D process mesh.
- * 
- * Note
- * ====
- *   This routine can only be called after the routine pxgstrs_init(),
- *   in which the structures of the send and receive buffers are set up.
- *
- * Arguments
- * =========
- * 
- * B      (input) doublecomplex*
- *        The distributed right-hand side matrix of the possibly
- *        equilibrated system.
- *
- * m_loc  (input) int (local)
- *        The local row dimension of matrix B.
- *
- * nrhs   (input) int (global)
- *        Number of right-hand sides.
- *
- * ldb    (input) int (local)
- *        Leading dimension of matrix B.
- *
- * fst_row (input) int (global)
- *        The row number of B's first row in the global matrix.
- *
- * ilsum  (input) int* (global)
- *        Starting position of each supernode in a full array.
- *
- * x      (output) doublecomplex*
- *        The solution vector. It is valid only on the diagonal processes.
- *
- * ScalePermstruct (input) ScalePermstruct_t*
- *        The data structure to store the scaling and permutation vectors
- *        describing the transformations performed to the original matrix A.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh.
- *
- * SOLVEstruct (input) SOLVEstruct_t*
- *        Contains the information for the communication during the
- *        solution phase.
- *
- * Return value
- * ============
- *
- */
-    int  *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs;
-    int  *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs;
-    int  *ptr_to_ibuf, *ptr_to_dbuf;
-    int_t  *perm_r, *perm_c; /* row and column permutation vectors */
-    int_t  *send_ibuf, *recv_ibuf;
-    doublecomplex *send_dbuf, *recv_dbuf;
-    int_t  *xsup, *supno;
-    int_t  i, ii, irow, gbi, j, jj, k, knsupc, l, lk;
-    int    p, procs;
-    pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(grid->iam, "Enter pzReDistribute_B_to_X()");
-#endif
-
-    /* ------------------------------------------------------------
-       INITIALIZATION.
-       ------------------------------------------------------------*/
-    perm_r = ScalePermstruct->perm_r;
-    perm_c = ScalePermstruct->perm_c;
-    procs = grid->nprow * grid->npcol;
-    xsup = Glu_persist->xsup;
-    supno = Glu_persist->supno;
-    SendCnt      = gstrs_comm->B_to_X_SendCnt;
-    SendCnt_nrhs = gstrs_comm->B_to_X_SendCnt +   procs;
-    RecvCnt      = gstrs_comm->B_to_X_SendCnt + 2*procs;
-    RecvCnt_nrhs = gstrs_comm->B_to_X_SendCnt + 3*procs;
-    sdispls      = gstrs_comm->B_to_X_SendCnt + 4*procs;
-    sdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 5*procs;
-    rdispls      = gstrs_comm->B_to_X_SendCnt + 6*procs;
-    rdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 7*procs;
-    ptr_to_ibuf  = gstrs_comm->ptr_to_ibuf;
-    ptr_to_dbuf  = gstrs_comm->ptr_to_dbuf;
-
-    /* ------------------------------------------------------------
-       NOW COMMUNICATE THE ACTUAL DATA.
-       ------------------------------------------------------------*/
-    k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */
-    l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */
-    if ( !(send_ibuf = intMalloc_dist(k + l)) )
-        ABORT("Malloc fails for send_ibuf[].");
-    recv_ibuf = send_ibuf + k;
-    if ( !(send_dbuf = doublecomplexMalloc_dist((k + l)* (size_t)nrhs)) )
-        ABORT("Malloc fails for send_dbuf[].");
-    recv_dbuf = send_dbuf + k * nrhs;
-    
-    for (p = 0; p < procs; ++p) {
-        ptr_to_ibuf[p] = sdispls[p];
-        ptr_to_dbuf[p] = sdispls[p] * nrhs;
-    }
-
-    /* Copy the row indices and values to the send buffer. */
-    for (i = 0, l = fst_row; i < m_loc; ++i, ++l) {
-        irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */
-	gbi = BlockNum( irow );
-	p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */
-	k = ptr_to_ibuf[p];
-	send_ibuf[k] = irow;
-	k = ptr_to_dbuf[p];
-	RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */
-	    send_dbuf[k++] = B[i + j*ldb];
-	}
-	++ptr_to_ibuf[p];
-	ptr_to_dbuf[p] += nrhs;
-    }
-
-    /* Communicate the (permuted) row indices. */
-    MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t,
-		  recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm);
-
-    /* Communicate the numerical values. */
-    MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX,
-		  recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX,
-		  grid->comm);
-    
-    /* ------------------------------------------------------------
-       Copy buffer into X on the diagonal processes.
-       ------------------------------------------------------------*/
-    ii = 0;
-    for (p = 0; p < procs; ++p) {
-        jj = rdispls_nrhs[p];
-        for (i = 0; i < RecvCnt[p]; ++i) {
-	    /* Only the diagonal processes do this; the off-diagonal processes
-	       have 0 RecvCnt. */
-	    irow = recv_ibuf[ii]; /* The permuted row index. */
-	    k = BlockNum( irow );
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid );  /* Local block number. */
-	    l = X_BLK( lk );
-            x[l - XK_H].r = k; /* Block number prepended in the header. */
-            x[l - XK_H].i = 0;
-	    irow = irow - FstBlockC(k); /* Relative row number in X-block */
-	    RHS_ITERATE(j) {
-	        x[l + irow + j*knsupc] = recv_dbuf[jj++];
-	    }
-	    ++ii;
-	}
-    }
-
-    SUPERLU_FREE(send_ibuf);
-    SUPERLU_FREE(send_dbuf);
-    
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(grid->iam, "Exit pzReDistribute_B_to_X()");
-#endif
-    return 0;
-} /* pzReDistribute_B_to_X */
-
-int_t
-pzReDistribute_X_to_B(int_t n, doublecomplex *B, int_t m_loc, int_t ldb, int_t fst_row,
-		      int_t nrhs, doublecomplex *x, int_t *ilsum,
-		      ScalePermstruct_t *ScalePermstruct,
-		      Glu_persist_t *Glu_persist, gridinfo_t *grid,
-		      SOLVEstruct_t *SOLVEstruct)
-{
-/*
- * Purpose
- * =======
- *   Re-distribute X on the diagonal processes to B distributed on all
- *   the processes.
- *
- * Note
- * ====
- *   This routine can only be called after the routine pxgstrs_init(),
- *   in which the structures of the send and receive buffers are set up.
- *
- */
-    int_t  i, ii, irow, j, jj, k, knsupc, nsupers, l, lk;
-    int_t  *xsup, *supno;
-    int  *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs;
-    int  *sdispls, *rdispls, *sdispls_nrhs, *rdispls_nrhs;
-    int  *ptr_to_ibuf, *ptr_to_dbuf;
-    int_t  *send_ibuf, *recv_ibuf;
-    doublecomplex *send_dbuf, *recv_dbuf;
-    int_t  *row_to_proc = SOLVEstruct->row_to_proc; /* row-process mapping */
-    pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm;
-    int  iam, p, q, pkk, procs;
-    int_t  num_diag_procs, *diag_procs;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(grid->iam, "Enter pzReDistribute_X_to_B()");
-#endif
-
-    /* ------------------------------------------------------------
-       INITIALIZATION.
-       ------------------------------------------------------------*/
-    xsup = Glu_persist->xsup;
-    supno = Glu_persist->supno;
-    nsupers = Glu_persist->supno[n-1] + 1;
-    iam = grid->iam;
-    procs = grid->nprow * grid->npcol;
- 
-    SendCnt      = gstrs_comm->X_to_B_SendCnt;
-    SendCnt_nrhs = gstrs_comm->X_to_B_SendCnt +   procs;
-    RecvCnt      = gstrs_comm->X_to_B_SendCnt + 2*procs;
-    RecvCnt_nrhs = gstrs_comm->X_to_B_SendCnt + 3*procs;
-    sdispls      = gstrs_comm->X_to_B_SendCnt + 4*procs;
-    sdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 5*procs;
-    rdispls      = gstrs_comm->X_to_B_SendCnt + 6*procs;
-    rdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 7*procs;
-    ptr_to_ibuf  = gstrs_comm->ptr_to_ibuf;
-    ptr_to_dbuf  = gstrs_comm->ptr_to_dbuf;
-
-    k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */
-    l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */
-    if ( !(send_ibuf = intMalloc_dist(k + l)) )
-        ABORT("Malloc fails for send_ibuf[].");
-    recv_ibuf = send_ibuf + k;
-    if ( !(send_dbuf = doublecomplexMalloc_dist((k + l)*nrhs)) )
-        ABORT("Malloc fails for send_dbuf[].");
-    recv_dbuf = send_dbuf + k * nrhs;
-    for (p = 0; p < procs; ++p) {
-        ptr_to_ibuf[p] = sdispls[p];
-        ptr_to_dbuf[p] = sdispls_nrhs[p];
-    }
-    num_diag_procs = SOLVEstruct->num_diag_procs;
-    diag_procs = SOLVEstruct->diag_procs;
-
-    for (p = 0; p < num_diag_procs; ++p) {  /* For all diagonal processes. */
-	pkk = diag_procs[p];
-	if ( iam == pkk ) {
-	    for (k = p; k < nsupers; k += num_diag_procs) {
-		knsupc = SuperSize( k );
-		lk = LBi( k, grid ); /* Local block number */
-		irow = FstBlockC( k );
-		l = X_BLK( lk );
-		for (i = 0; i < knsupc; ++i) {
-#if 0
-		    ii = inv_perm_c[irow]; /* Apply X <== Pc'*Y */
-#else
-		    ii = irow;
-#endif
-		    q = row_to_proc[ii];
-		    jj = ptr_to_ibuf[q];
-		    send_ibuf[jj] = ii;
-		    jj = ptr_to_dbuf[q];
-		    RHS_ITERATE(j) { /* RHS stored in row major in buffer. */
-		        send_dbuf[jj++] = x[l + i + j*knsupc];
-		    }
-		    ++ptr_to_ibuf[q];
-		    ptr_to_dbuf[q] += nrhs;
-		    ++irow;
-		}
-	    }
-	}
-    }
-    
-    /* ------------------------------------------------------------
-        COMMUNICATE THE (PERMUTED) ROW INDICES AND NUMERICAL VALUES.
-       ------------------------------------------------------------*/
-    MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t,
-		  recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm);
-    MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, 
-		  recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX,
-		  grid->comm);
-
-    /* ------------------------------------------------------------
-       COPY THE BUFFER INTO B.
-       ------------------------------------------------------------*/
-    for (i = 0, k = 0; i < m_loc; ++i) {
-	irow = recv_ibuf[i];
-	irow -= fst_row; /* Relative row number */
-	RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */
-	    B[irow + j*ldb] = recv_dbuf[k++];
-	}
-    }
-
-    SUPERLU_FREE(send_ibuf);
-    SUPERLU_FREE(send_dbuf);
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(grid->iam, "Exit pzReDistribute_X_to_B()");
-#endif
-    return 0;
-
-} /* pzReDistribute_X_to_B */
-
-
-void
-pzgstrs(int_t n, LUstruct_t *LUstruct, 
-	ScalePermstruct_t *ScalePermstruct,
-	gridinfo_t *grid, doublecomplex *B,
-	int_t m_loc, int_t fst_row, int_t ldb, int nrhs,
-	SOLVEstruct_t *SOLVEstruct,
-	SuperLUStat_t *stat, int *info)
-{
-/*
- * Purpose
- * =======
- *
- * PZGSTRS solves a system of distributed linear equations
- * A*X = B with a general N-by-N matrix A using the LU factorization
- * computed by PZGSTRF.
- * If the equilibration, and row and column permutations were performed,
- * the LU factorization was performed for A1 where
- *     A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
- * and the linear system solved is
- *     A1 * Y = Pc*Pr*B1, where B was overwritten by B1 = diag(R)*B, and
- * the permutation to B1 by Pc*Pr is applied internally in this routine.
- * 
- * Arguments
- * =========
- *
- * n      (input) int (global)
- *        The order of the system of linear equations.
- *
- * LUstruct (input) LUstruct_t*
- *        The distributed data structures storing L and U factors.
- *        The L and U factors are obtained from PZGSTRF for
- *        the possibly scaled and permuted matrix A.
- *        See superlu_zdefs.h for the definition of 'LUstruct_t'.
- *        A may be scaled and permuted into A1, so that
- *        A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh. It contains the MPI communicator, the number
- *        of process rows (NPROW), the number of process columns (NPCOL),
- *        and my process rank. It is an input argument to all the
- *        parallel routines.
- *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
- *        See superlu_defs.h for the definition of 'gridinfo_t'.
- *
- * B      (input/output) doublecomplex*
- *        On entry, the distributed right-hand side matrix of the possibly
- *        equilibrated system. That is, B may be overwritten by diag(R)*B.
- *        On exit, the distributed solution matrix Y of the possibly
- *        equilibrated system if info = 0, where Y = Pc*diag(C)^(-1)*X,
- *        and X is the solution of the original system.
- *
- * m_loc  (input) int (local)
- *        The local row dimension of matrix B.
- *
- * fst_row (input) int (global)
- *        The row number of B's first row in the global matrix.
- *
- * ldb    (input) int (local)
- *        The leading dimension of matrix B.
- *
- * nrhs   (input) int (global)
- *        Number of right-hand sides.
- * 
- * SOLVEstruct (output) SOLVEstruct_t* (global)
- *        Contains the information for the communication during the
- *        solution phase.
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics about the triangular solves.
- *        See util.h for the definition of 'SuperLUStat_t'.
- *
- * info   (output) int*
- * 	   = 0: successful exit
- *	   < 0: if info = -i, the i-th argument had an illegal value
- *        
- */
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-    doublecomplex alpha = {1.0, 0.0};
-    doublecomplex zero = {0.0, 0.0};
-    doublecomplex *lsum;  /* Local running sum of the updates to B-components */
-    doublecomplex *x;     /* X component at step k. */
-		    /* NOTE: x and lsum are of same size. */
-    doublecomplex *lusup, *dest;
-    doublecomplex *recvbuf, *tempv;
-    doublecomplex *rtemp; /* Result of full matrix-vector multiply. */
-    int_t  **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
-    int_t  *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */
-    Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
-    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
-    int_t  iam, kcol, krow, mycol, myrow;
-    int_t  i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr;
-    int_t  nb, nlb, nub, nsupers;
-    int_t  *xsup, *supno, *lsub, *usub;
-    int_t  *ilsum;    /* Starting position of each supernode in lsum (LOCAL)*/
-    int_t  Pc, Pr;
-    int    knsupc, nsupr;
-    int    ldalsum;   /* Number of lsum entries locally owned. */
-    int    maxrecvsz, p, pi;
-    int_t  **Lrowind_bc_ptr;
-    doublecomplex **Lnzval_bc_ptr;
-    MPI_Status status;
-#ifdef ISEND_IRECV
-    MPI_Request *send_req, recv_req;
-#endif
-    pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm;
-
-    /*-- Counts used for L-solve --*/
-    int_t  *fmod;         /* Modification count for L-solve --
-                             Count the number of local block products to
-                             be summed into lsum[lk]. */
-    int_t  **fsendx_plist = Llu->fsendx_plist;
-    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
-    int_t  *frecv;        /* Count of lsum[lk] contributions to be received
-                             from processes in this row. 
-                             It is only valid on the diagonal processes. */
-    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
-    int_t  nleaf = 0, nroot = 0;
-
-    /*-- Counts used for U-solve --*/
-    int_t  *bmod;         /* Modification count for U-solve. */
-    int_t  **bsendx_plist = Llu->bsendx_plist;
-    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
-    int_t  *brecv;        /* Count of modifications to be recv'd from
-			     processes in this row. */
-    int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
-    double t;
-#if ( DEBUGlevel>=2 )
-    int_t Ublocks = 0;
-#endif
-
-    t = SuperLU_timer_();
-
-    /* Test input parameters. */
-    *info = 0;
-    if ( n < 0 ) *info = -1;
-    else if ( nrhs < 0 ) *info = -9;
-    if ( *info ) {
-	pxerbla("PZGSTRS", grid, -*info);
-	return;
-    }
-	
-    /*
-     * Initialization.
-     */
-    iam = grid->iam;
-    Pc = grid->npcol;
-    Pr = grid->nprow;
-    myrow = MYROW( iam, grid );
-    mycol = MYCOL( iam, grid );
-    xsup = Glu_persist->xsup;
-    supno = Glu_persist->supno;
-    nsupers = supno[n-1] + 1;
-    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
-    nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter pzgstrs()");
-#endif
-
-    stat->ops[SOLVE] = 0.0;
-    Llu->SolveMsgSent = 0;
-
-    /* Save the count to be altered so it can be used by
-       subsequent call to PDGSTRS. */
-    if ( !(fmod = intMalloc_dist(nlb)) )
-	ABORT("Calloc fails for fmod[].");
-    for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
-    if ( !(frecv = intMalloc_dist(nlb)) )
-	ABORT("Malloc fails for frecv[].");
-    Llu->frecv = frecv;
-
-#ifdef ISEND_IRECV
-    k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb;
-    if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) )
-	ABORT("Malloc fails for send_req[].");
-#endif
-
-#ifdef _CRAY
-    ftcs1 = _cptofcd("L", strlen("L"));
-    ftcs2 = _cptofcd("N", strlen("N"));
-    ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-
-
-    /* Obtain ilsum[] and ldalsum for process column 0. */
-    ilsum = Llu->ilsum;
-    ldalsum = Llu->ldalsum;
-
-    /* Allocate working storage. */
-    knsupc = sp_ienv_dist(3);
-    maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H );
-    if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum)*nrhs + nlb*LSUM_H)) )
-	ABORT("Calloc fails for lsum[].");
-    if ( !(x = doublecomplexMalloc_dist(ldalsum * nrhs + nlb * XK_H)) )
-	ABORT("Malloc fails for x[].");
-    if ( !(recvbuf = doublecomplexMalloc_dist(maxrecvsz)) )
-	ABORT("Malloc fails for recvbuf[].");
-    if ( !(rtemp = doublecomplexCalloc_dist(maxrecvsz)) )
-	ABORT("Malloc fails for rtemp[].");
-
-    
-    /*---------------------------------------------------
-     * Forward solve Ly = b.
-     *---------------------------------------------------*/
-    /* Redistribute B into X on the diagonal processes. */
-    pzReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, 
-			  ScalePermstruct, Glu_persist, grid, SOLVEstruct);
-
-    /* Set up the headers in lsum[]. */
-    ii = 0;
-    for (k = 0; k < nsupers; ++k) {
-	knsupc = SuperSize( k );
-	krow = PROW( k, grid );
-	if ( myrow == krow ) {
-	    lk = LBi( k, grid );   /* Local block number. */
-	    il = LSUM_BLK( lk );
-	    lsum[il - LSUM_H].r = k;/* Block number prepended in the header.*/
-	    lsum[il - LSUM_H].i = 0;
-	}
-	ii += knsupc;
-    }
-
-    /*
-     * Compute frecv[] and nfrecvmod counts on the diagonal processes.
-     */
-    {
-	superlu_scope_t *scp = &grid->rscp;
-
-	for (k = 0; k < nsupers; ++k) {
-	    krow = PROW( k, grid );
-	    if ( myrow == krow ) {
-		lk = LBi( k, grid );    /* Local block number. */
-		kcol = PCOL( k, grid ); /* Root process in this row scope. */
-		if ( mycol != kcol && fmod[lk] )
-		    i = 1;  /* Contribution from non-diagonal process. */
-		else i = 0;
-		MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t,
-			   MPI_SUM, kcol, scp->comm );
-		if ( mycol == kcol ) { /* Diagonal process. */
-		    nfrecvmod += frecv[lk];
-		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) frecv[%4d]  %2d\n", iam, k, frecv[lk]);
-		    assert( frecv[lk] < Pc );
-#endif
-		}
-	    }
-	}
-    }
-
-    /* ---------------------------------------------------------
-       Solve the leaf nodes first by all the diagonal processes.
-       --------------------------------------------------------- */
-#if ( DEBUGlevel>=2 )
-    printf("(%2d) nleaf %4d\n", iam, nleaf);
-#endif
-    for (k = 0; k < nsupers && nleaf; ++k) {
-	krow = PROW( k, grid );
-	kcol = PCOL( k, grid );
-	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid );
-	    if ( frecv[lk]==0 && fmod[lk]==0 ) {
-		fmod[lk] = -1;  /* Do not solve X[k] in the future. */
-		ii = X_BLK( lk );
-		lk = LBj( k, grid ); /* Local block number, column-wise. */
-		lsub = Lrowind_bc_ptr[lk];
-		lusup = Lnzval_bc_ptr[lk];
-		nsupr = lsub[1];
-#ifdef _CRAY
-		CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
-		      lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#else
-		ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs
-		    + 10 * knsupc * nrhs; /* complex division */
-		--nleaf;
-#if ( DEBUGlevel>=2 )
-		printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		
-		/*
-		 * Send Xk to process column Pc[k].
-		 */
-		for (p = 0; p < Pr; ++p) {
-		    if ( fsendx_plist[lk][p] != EMPTY ) {
-			pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
-                                   &send_req[Llu->SolveMsgSent++]);
-#else
-			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				 SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
-#endif
-#if ( DEBUGlevel>=2 )
-			printf("(%2d) Sent X[%2.0f] to P %2d\n",
-			       iam, x[ii-XK_H], pi);
-#endif
-		    }
-		}
-		/*
-		 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
-		 */
-		nb = lsub[0] - 1;
-		lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
-		luptr = knsupc; /* Skip diagonal block L(k,k). */
-		
-		zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
-			   fmod, nb, lptr, luptr, xsup, grid, Llu, 
-			   send_req, stat);
-	    }
-	} /* if diagonal process ... */
-    } /* for k ... */
-
-    /* -----------------------------------------------------------
-       Compute the internal nodes asynchronously by all processes.
-       ----------------------------------------------------------- */
-#if ( DEBUGlevel>=2 )
-    printf("(%2d) nfrecvx %4d,  nfrecvmod %4d,  nleaf %4d\n",
-	   iam, nfrecvx, nfrecvmod, nleaf);
-#endif
-
-    while ( nfrecvx || nfrecvmod ) { /* While not finished. */
-
-	/* Receive a message. */
-#ifdef ISEND_IRECV
-	/* -MPI- FATAL: Remote protocol queue full */
-	MPI_Irecv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX,
-                 MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &recv_req );
-	MPI_Wait( &recv_req, &status );
-#else
-	MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX,
-                  MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
-#endif
-
-        k = (*recvbuf).r;
-
-#if ( DEBUGlevel>=2 )
-	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
-#endif
-	
-	switch ( status.MPI_TAG ) {
-	  case Xk:
-	      --nfrecvx;
-	      lk = LBj( k, grid ); /* Local block number, column-wise. */
-	      lsub = Lrowind_bc_ptr[lk];
-	      lusup = Lnzval_bc_ptr[lk];
-	      if ( lsub ) {
-		  nb   = lsub[0];
-		  lptr = BC_HEADER;
-		  luptr = 0;
-		  knsupc = SuperSize( k );
-
-		  /*
-		   * Perform local block modifications: lsum[i] -= L_i,k * X[k]
-		   */
-		  zlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k,
-			     fmod, nb, lptr, luptr, xsup, grid, Llu, 
-			     send_req, stat);
-	      } /* if lsub */
-
-	      break;
-
-	  case LSUM: /* Receiver must be a diagonal process */
-	      --nfrecvmod;
-	      lk = LBi( k, grid ); /* Local block number, row-wise. */
-	      ii = X_BLK( lk );
-	      knsupc = SuperSize( k );
-	      tempv = &recvbuf[LSUM_H];
-	      RHS_ITERATE(j) {
-		  for (i = 0; i < knsupc; ++i)
-		      z_add(&x[i + ii + j*knsupc],
-			    &x[i + ii + j*knsupc],
-			    &tempv[i + j*knsupc]);
-	      }
-
-	      if ( (--frecv[lk])==0 && fmod[lk]==0 ) {
-		  fmod[lk] = -1; /* Do not solve X[k] in the future. */
-		  lk = LBj( k, grid ); /* Local block number, column-wise. */
-		  lsub = Lrowind_bc_ptr[lk];
-		  lusup = Lnzval_bc_ptr[lk];
-		  nsupr = lsub[1];
-#ifdef _CRAY
-		  CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
-			lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		  ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-			 lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#else
-		  ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-			 lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		  stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs
-		      + 10 * knsupc * nrhs; /* complex division */
-#if ( DEBUGlevel>=2 )
-		  printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		
-		  /*
-		   * Send Xk to process column Pc[k].
-		   */
-		  kcol = PCOL( k, grid );
-		  for (p = 0; p < Pr; ++p) {
-		      if ( fsendx_plist[lk][p] != EMPTY ) {
-			  pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-			  MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H,
-                                     SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
-                                     &send_req[Llu->SolveMsgSent++]);
-#else
-			  MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				    SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
-#endif
-#if ( DEBUGlevel>=2 )
-			  printf("(%2d) Sent X[%2.0f] to P %2d\n",
-				 iam, x[ii-XK_H], pi);
-#endif
-		      }
-                  }
-		  /*
-		   * Perform local block modifications.
-		   */
-		  nb = lsub[0] - 1;
-		  lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
-		  luptr = knsupc; /* Skip diagonal block L(k,k). */
-
-		  zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
-			     fmod, nb, lptr, luptr, xsup, grid, Llu,
-			     send_req, stat);
-	      } /* if */
-
-	      break;
-
-#if ( DEBUGlevel>=2 )
-	    default:
-	      printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG);
-	      break;
-#endif
-	  } /* switch */
-
-    } /* while not finished ... */
-
-
-#if ( PRNTlevel>=2 )
-    t = SuperLU_timer_() - t;
-    if ( !iam ) printf(".. L-solve time\t%8.2f\n", t);
-    t = SuperLU_timer_();
-#endif
-
-#if ( DEBUGlevel==2 )
-    {
-      printf("(%d) .. After L-solve: y =\n", iam);
-      for (i = 0, k = 0; k < nsupers; ++k) {
-	  krow = PROW( k, grid );
-	  kcol = PCOL( k, grid );
-	  if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
-	      knsupc = SuperSize( k );
-	      lk = LBi( k, grid );
-	      ii = X_BLK( lk );
-	      for (j = 0; j < knsupc; ++j)
-		printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]);
-	      fflush(stdout);
-	  }
-	  MPI_Barrier( grid->comm );
-      }
-    }
-#endif
-
-    SUPERLU_FREE(fmod);
-    SUPERLU_FREE(frecv);
-    SUPERLU_FREE(rtemp);
-
-#ifdef ISEND_IRECV
-    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
-    Llu->SolveMsgSent = 0;
-#endif
-
-
-    /*---------------------------------------------------
-     * Back solve Ux = y.
-     *
-     * The Y components from the forward solve is already
-     * on the diagonal processes.
-     *---------------------------------------------------*/
-
-    /* Save the count to be altered so it can be used by
-       subsequent call to PZGSTRS. */
-    if ( !(bmod = intMalloc_dist(nlb)) )
-	ABORT("Calloc fails for bmod[].");
-    for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
-    if ( !(brecv = intMalloc_dist(nlb)) )
-	ABORT("Malloc fails for brecv[].");
-    Llu->brecv = brecv;
-
-    /*
-     * Compute brecv[] and nbrecvmod counts on the diagonal processes.
-     */
-    {
-	superlu_scope_t *scp = &grid->rscp;
-
-	for (k = 0; k < nsupers; ++k) {
-	    krow = PROW( k, grid );
-	    if ( myrow == krow ) {
-		lk = LBi( k, grid );    /* Local block number. */
-		kcol = PCOL( k, grid ); /* Root process in this row scope. */
-		if ( mycol != kcol && bmod[lk] )
-		    i = 1;  /* Contribution from non-diagonal process. */
-		else i = 0;
-		MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t,
-			   MPI_SUM, kcol, scp->comm );
-		if ( mycol == kcol ) { /* Diagonal process. */
-		    nbrecvmod += brecv[lk];
-		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) brecv[%4d]  %2d\n", iam, k, brecv[lk]);
-		    assert( brecv[lk] < Pc );
-#endif
-		}
-	    }
-	}
-    }
-
-    /* Re-initialize lsum to zero. Each block header is already in place. */
-    for (k = 0; k < nsupers; ++k) {
-	krow = PROW( k, grid );
-	if ( myrow == krow ) {
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid );
-	    il = LSUM_BLK( lk );
-	    dest = &lsum[il];
-	    RHS_ITERATE(j) {
-		for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero;
-	    }
-	}
-    }
-
-    /* Set up additional pointers for the index and value arrays of U.
-       nub is the number of local block columns. */
-    nub = CEILING( nsupers, Pc ); /* Number of local block columns. */
-    if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) )
-	ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero
-					     blocks in a block column. */
-    Urbs1 = Urbs + nub;
-    if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) )
-        ABORT("Malloc fails for Ucb_indptr[]");
-    if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
-        ABORT("Malloc fails for Ucb_valptr[]");
-
-    /* Count number of row blocks in a block column. 
-       One pass of the skeleton graph of U. */
-    for (lk = 0; lk < nlb; ++lk) {
-	usub = Ufstnz_br_ptr[lk];
-	if ( usub ) { /* Not an empty block row. */
-	    /* usub[0] -- number of column blocks in this block row. */
-#if ( DEBUGlevel>=2 )
-	    Ublocks += usub[0];
-#endif
-	    i = BR_HEADER; /* Pointer in index array. */
-	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
-		k = usub[i];            /* Global block number */
-		++Urbs[LBj(k,grid)];
-		i += UB_DESCRIPTOR + SuperSize( k );
-	    }
-	}
-    }
-
-    /* Set up the vertical linked lists for the row blocks.
-       One pass of the skeleton graph of U. */
-    for (lb = 0; lb < nub; ++lb) {
-	if ( Urbs[lb] ) { /* Not an empty block column. */
-	    if ( !(Ucb_indptr[lb]
-		   = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
-		ABORT("Malloc fails for Ucb_indptr[lb][]");
-	    if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
-		ABORT("Malloc fails for Ucb_valptr[lb][]");
-	}
-    }
-    for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
-	usub = Ufstnz_br_ptr[lk];
-	if ( usub ) { /* Not an empty block row. */
-	    i = BR_HEADER; /* Pointer in index array. */
-	    j = 0;         /* Pointer in nzval array. */
-	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
-		k = usub[i];          /* Global block number, column-wise. */
-		ljb = LBj( k, grid ); /* Local block number, column-wise. */
-		Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk;
-		Ucb_indptr[ljb][Urbs1[ljb]].indpos = i;
-		Ucb_valptr[ljb][Urbs1[ljb]] = j;
-		++Urbs1[ljb];
-		j += usub[i+1];
-		i += UB_DESCRIPTOR + SuperSize( k );
-	    }
-	}
-    }
-
-#if ( DEBUGlevel>=2 )
-    for (p = 0; p < Pr*Pc; ++p) {
-	if (iam == p) {
-	    printf("(%2d) .. Ublocks %d\n", iam, Ublocks);
-	    for (lb = 0; lb < nub; ++lb) {
-		printf("(%2d) Local col %2d: # row blocks %2d\n",
-		       iam, lb, Urbs[lb]);
-		if ( Urbs[lb] ) {
-		    for (i = 0; i < Urbs[lb]; ++i)
-			printf("(%2d) .. row blk %2d:\
-                               lbnum %d, indpos %d, valpos %d\n",
-			       iam, i, 
-			       Ucb_indptr[lb][i].lbnum,
-			       Ucb_indptr[lb][i].indpos,
-			       Ucb_valptr[lb][i]);
-		}
-	    }
-	}
-	MPI_Barrier( grid->comm );
-    }
-    for (p = 0; p < Pr*Pc; ++p) {
-	if ( iam == p ) {
-	    printf("\n(%d) bsendx_plist[][]", iam);
-	    for (lb = 0; lb < nub; ++lb) {
-		printf("\n(%d) .. local col %2d: ", iam, lb);
-		for (i = 0; i < Pr; ++i)
-		    printf("%4d", bsendx_plist[lb][i]);
-	    }
-	    printf("\n");
-	}
-	MPI_Barrier( grid->comm );
-    }
-#endif /* DEBUGlevel */
-
-
-#if ( PRNTlevel>=3 )
-    t = SuperLU_timer_() - t;
-    if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t);
-    t = SuperLU_timer_();
-#endif
-
-    /*
-     * Solve the roots first by all the diagonal processes.
-     */
-#if ( DEBUGlevel>=2 )
-    printf("(%2d) nroot %4d\n", iam, nroot);
-#endif
-    for (k = nsupers-1; k >= 0 && nroot; --k) {
-	krow = PROW( k, grid );
-	kcol = PCOL( k, grid );
-	if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid ); /* Local block number, row-wise. */
-	    if ( brecv[lk]==0 && bmod[lk]==0 ) {
-		bmod[lk] = -1;       /* Do not solve X[k] in the future. */
-		ii = X_BLK( lk );
-		lk = LBj( k, grid ); /* Local block number, column-wise */
-		lsub = Lrowind_bc_ptr[lk];
-		lusup = Lnzval_bc_ptr[lk];
-		nsupr = lsub[1];
-#ifdef _CRAY
-		CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
-		      lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#else
-		ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		stat->ops[SOLVE] += 4 * knsupc * (knsupc + 1) * nrhs
-		    + 10 * knsupc * nrhs; /* complex division */
-		--nroot;
-#if ( DEBUGlevel>=2 )
-		printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		/*
-		 * Send Xk to process column Pc[k].
-		 */
-		for (p = 0; p < Pr; ++p) {
-		    if ( bsendx_plist[lk][p] != EMPTY ) {
-			pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
-                                   &send_req[Llu->SolveMsgSent++]);
-#else
-			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                  SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk,
-                                  grid->comm );
-#endif
-#if ( DEBUGlevel>=2 )
-			printf("(%2d) Sent X[%2.0f] to P %2d\n",
-			       iam, x[ii-XK_H], pi);
-#endif
-		    }
-		}
-		/*
-		 * Perform local block modifications: lsum[i] -= U_i,k * X[k]
-		 */
-		if ( Urbs[lk] ) 
-		    zlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
-			       Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-			       send_req, stat);
-	    } /* if root ... */
-	} /* if diagonal process ... */
-    } /* for k ... */
-
-
-    /*
-     * Compute the internal nodes asychronously by all processes.
-     */
-    while ( nbrecvx || nbrecvmod ) { /* While not finished. */
-
-	/* Receive a message. */
-	MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX,
-                  MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
-        k = (*recvbuf).r;
-
-#if ( DEBUGlevel>=2 )
-	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
-#endif
-
-	switch ( status.MPI_TAG ) {
-	    case Xk:
-	        --nbrecvx;
-		lk = LBj( k, grid ); /* Local block number, column-wise. */
-		/*
-		 * Perform local block modifications:
-		 *         lsum[i] -= U_i,k * X[k]
-		 */
-		zlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs,
-			   Ucb_indptr, Ucb_valptr, xsup, grid, Llu, 
-			   send_req, stat);
-
-	        break;
-
-	    case LSUM: /* Receiver must be a diagonal process */
-		--nbrecvmod;
-		lk = LBi( k, grid ); /* Local block number, row-wise. */
-		ii = X_BLK( lk );
-		knsupc = SuperSize( k );
-		tempv = &recvbuf[LSUM_H];
-		RHS_ITERATE(j) {
-		    for (i = 0; i < knsupc; ++i)
-                        z_add(&x[i + ii + j*knsupc],
-			      &x[i + ii + j*knsupc],
-			      &tempv[i + j*knsupc]);
-		}
-
-		if ( (--brecv[lk])==0 && bmod[lk]==0 ) {
-		    bmod[lk] = -1; /* Do not solve X[k] in the future. */
-		    lk = LBj( k, grid ); /* Local block number, column-wise. */
-		    lsub = Lrowind_bc_ptr[lk];
-		    lusup = Lnzval_bc_ptr[lk];
-		    nsupr = lsub[1];
-#ifdef _CRAY
-		    CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
-			  lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		    ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-			   lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#else
-		    ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-			   lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		    stat->ops[SOLVE] += 4 * knsupc * (knsupc + 1) * nrhs
-			+ 10 * knsupc * nrhs; /* complex division */
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		    /*
-		     * Send Xk to process column Pc[k].
-		     */
-		    kcol = PCOL( k, grid );
-		    for (p = 0; p < Pr; ++p) {
-			if ( bsendx_plist[lk][p] != EMPTY ) {
-			    pi = PNUM( p, kcol, grid );
-#ifdef ISEND_IRECV
-			    MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                       SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
-                                       &send_req[Llu->SolveMsgSent++] );
-#else
-			    MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                      SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk,
-                                      grid->comm );
-#endif
-#if ( DEBUGlevel>=2 )
-			    printf("(%2d) Sent X[%2.0f] to P %2d\n",
-				   iam, x[ii - XK_H], pi);
-#endif
-			}
-		    }
-		    /*
-		     * Perform local block modifications: 
-		     *         lsum[i] -= U_i,k * X[k]
-		     */
-		    if ( Urbs[lk] )
-			zlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
-				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-				   send_req, stat);
-		} /* if becomes solvable */
-		
-		break;
-
-#if ( DEBUGlevel>=2 )
-	      default:
-		printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG);
-		break;
-#endif		
-
-	} /* switch */
-
-    } /* while not finished ... */
-
-#if ( PRNTlevel>=3 )
-    t = SuperLU_timer_() - t;
-    if ( !iam ) printf(".. U-solve time\t%8.2f\n", t);
-#endif
-
-#if ( DEBUGlevel>=2 )
-    {
-	doublecomplex *x_col;
-	int diag;
-	printf("\n(%d) .. After U-solve: x (ON DIAG PROCS) = \n", iam);
-	ii = 0;
-	for (k = 0; k < nsupers; ++k) {
-	    knsupc = SuperSize( k );
-	    krow = PROW( k, grid );
-	    kcol = PCOL( k, grid );
-	    diag = PNUM( krow, kcol, grid);
-	    if ( iam == diag ) { /* Diagonal process. */
-		lk = LBi( k, grid );
-		jj = X_BLK( lk );
-		x_col = &x[jj];
-		RHS_ITERATE(j) {
-		    for (i = 0; i < knsupc; ++i) { /* X stored in blocks */
-			printf("\t(%d)\t%4d\t%.10f\n",
-			       iam, xsup[k]+i, x_col[i]);
-		    }
-		    x_col += knsupc;
-		}
-	    }
-	    ii += knsupc;
-	} /* for k ... */
-    }
-#endif
-
-    pzReDistribute_X_to_B(n, B, m_loc, ldb, fst_row, nrhs, x, ilsum,
-			  ScalePermstruct, Glu_persist, grid, SOLVEstruct);
-
-
-    /* Deallocate storage. */
-    SUPERLU_FREE(lsum);
-    SUPERLU_FREE(x);
-    SUPERLU_FREE(recvbuf);
-    for (i = 0; i < nub; ++i) {
-	if ( Urbs[i] ) {
-	    SUPERLU_FREE(Ucb_indptr[i]);
-	    SUPERLU_FREE(Ucb_valptr[i]);
-	}
-    }
-    SUPERLU_FREE(Ucb_indptr);
-    SUPERLU_FREE(Ucb_valptr);
-    SUPERLU_FREE(Urbs);
-    SUPERLU_FREE(bmod);
-    SUPERLU_FREE(brecv);
-#ifdef ISEND_IRECV
-    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
-    SUPERLU_FREE(send_req);
-#endif
-
-    stat->utime[SOLVE] = SuperLU_timer_() - t;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pzgstrs()");
-#endif
-
-} /* PZGSTRS */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgstrs_lsum.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgstrs_lsum.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzgstrs_lsum.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzgstrs_lsum.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,368 +0,0 @@
-
-/*
- * -- Distributed SuperLU routine (version 2.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * March 15, 2003
- *
- * Modified:
- *     Feburary 7, 2001    use MPI_Isend/MPI_Irecv
- *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
- */
-
-#include "superlu_zdefs.h"
-
-#define ISEND_IRECV
-
-/*
- * Function prototypes
- */
-#ifdef _CRAY
-fortran void CTRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, doublecomplex*,
-		   doublecomplex*, int*, doublecomplex*, int*);
-fortran void CGEMM(_fcd, _fcd, int*, int*, int*, doublecomplex*, doublecomplex*, 
-		   int*, doublecomplex*, int*, doublecomplex*, doublecomplex*, int*);
-_fcd ftcs1;
-_fcd ftcs2;
-_fcd ftcs3;
-#endif
-
-/************************************************************************/
-void zlsum_fmod
-/************************************************************************/
-(
- doublecomplex *lsum,    /* Sum of local modifications.                        */
- doublecomplex *x,       /* X array (local)                                    */
- doublecomplex *xk,      /* X[k].                                              */
- doublecomplex *rtemp,   /* Result of full matrix-vector multiply.             */
- int   nrhs,      /* Number of right-hand sides.                        */
- int   knsupc,    /* Size of supernode k.                               */
- int_t k,         /* The k-th component of X.                           */
- int_t *fmod,     /* Modification count for L-solve.                    */
- int_t nlb,       /* Number of L blocks.                                */
- int_t lptr,      /* Starting position in lsub[*].                      */
- int_t luptr,     /* Starting position in lusup[*].                     */
- int_t *xsup,
- gridinfo_t *grid,
- LocalLU_t *Llu,
- MPI_Request send_req[],
- SuperLUStat_t *stat
-)
-{
-/*
- * Purpose
- * =======
- *   Perform local block modifications: lsum[i] -= L_i,k * X[k].
- */
-    doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0};
-    doublecomplex *lusup, *lusup1;
-    doublecomplex *dest;
-    int    iam, iknsupc, myrow, nbrow, nsupr, nsupr1, p, pi;
-    int_t  i, ii, ik, il, ikcol, irow, j, lb, lk, rel;
-    int_t  *lsub, *lsub1, nlb1, lptr1, luptr1;
-    int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
-    int_t  *frecv = Llu->frecv;
-    int_t  **fsendx_plist = Llu->fsendx_plist;
-    MPI_Status status;
-    int test_flag;
-
-    iam = grid->iam;
-    myrow = MYROW( iam, grid );
-    lk = LBj( k, grid ); /* Local block number, column-wise. */
-    lsub = Llu->Lrowind_bc_ptr[lk];
-    lusup = Llu->Lnzval_bc_ptr[lk];
-    nsupr = lsub[1];
-
-    for (lb = 0; lb < nlb; ++lb) {
-	ik = lsub[lptr]; /* Global block number, row-wise. */
-	nbrow = lsub[lptr+1];
-#ifdef _CRAY
-	CGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc,
-	      &alpha, &lusup[luptr], &nsupr, xk,
-	      &knsupc, &beta, rtemp, &nbrow );
-#elif defined (USE_VENDOR_BLAS)
-	zgemm_( "N", "N", &nbrow, &nrhs, &knsupc,
-	       &alpha, &lusup[luptr], &nsupr, xk,
-	       &knsupc, &beta, rtemp, &nbrow, 1, 1 );
-#else
-	zgemm_( "N", "N", &nbrow, &nrhs, &knsupc,
-	       &alpha, &lusup[luptr], &nsupr, xk,
-	       &knsupc, &beta, rtemp, &nbrow );
-#endif
-	stat->ops[SOLVE] += 8 * nbrow * nrhs * knsupc + 2 * nbrow * nrhs;
-   
-	lk = LBi( ik, grid ); /* Local block number, row-wise. */
-	iknsupc = SuperSize( ik );
-	il = LSUM_BLK( lk );
-	dest = &lsum[il];
-	lptr += LB_DESCRIPTOR;
-	rel = xsup[ik]; /* Global row index of block ik. */
-	for (i = 0; i < nbrow; ++i) {
-	    irow = lsub[lptr++] - rel; /* Relative row. */
-	    RHS_ITERATE(j)
-		z_sub(&dest[irow + j*iknsupc],
-		      &dest[irow + j*iknsupc],
-		      &rtemp[i + j*nbrow]);
-	}
-	luptr += nbrow;
-		    
-	if ( (--fmod[lk])==0 ) { /* Local accumulation done. */
-	    ikcol = PCOL( ik, grid );
-	    p = PNUM( myrow, ikcol, grid );
-	    if ( iam != p ) {
-#ifdef ISEND_IRECV
-		MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
-			   SuperLU_MPI_DOUBLE_COMPLEX, p, LSUM, grid->comm,
-                           &send_req[Llu->SolveMsgSent++] );
-#else
-#ifdef BSEND
-		MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
-			   SuperLU_MPI_DOUBLE_COMPLEX, p, LSUM, grid->comm );
-#else
-		MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
-			 SuperLU_MPI_DOUBLE_COMPLEX, p, LSUM, grid->comm );
-#endif
-#endif
-#if ( DEBUGlevel>=2 )
-		printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n",
-		       iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p);
-#endif
-	    } else { /* Diagonal process: X[i] += lsum[i]. */
-		ii = X_BLK( lk );
-		RHS_ITERATE(j)
-		    for (i = 0; i < iknsupc; ++i)
-			z_add(&x[i + ii + j*iknsupc],
-			      &x[i + ii + j*iknsupc],
-			      &lsum[i + il + j*iknsupc]);
-		if ( frecv[lk]==0 ) { /* Becomes a leaf node. */
-		    fmod[lk] = -1; /* Do not solve X[k] in the future. */
-		    lk = LBj( ik, grid );/* Local block number, column-wise. */
-		    lsub1 = Llu->Lrowind_bc_ptr[lk];
-		    lusup1 = Llu->Lnzval_bc_ptr[lk];
-		    nsupr1 = lsub1[1];
-#ifdef _CRAY
-		    CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha,
-			  lusup1, &nsupr1, &x[ii], &iknsupc);
-#elif defined (USE_VENDOR_BLAS)
-		    ztrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
-			   lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1);
-#else
-		    ztrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
-			   lusup1, &nsupr1, &x[ii], &iknsupc);
-#endif
-		    stat->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs
-			+ 10 * knsupc * nrhs; /* complex division */
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) Solve X[%2d]\n", iam, ik);
-#endif
-		
-		    /*
-		     * Send Xk to process column Pc[k].
-		     */
-		    for (p = 0; p < grid->nprow; ++p) {
-			if ( fsendx_plist[lk][p] != EMPTY ) {
-			    pi = PNUM( p, ikcol, grid );
-#ifdef ISEND_IRECV
-			    MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H,
-				       SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
-				       &send_req[Llu->SolveMsgSent++] );
-#else
-#ifdef BSEND
-			    MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H,
-				       SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
-#else
-			    MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H,
-				     SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
-#endif
-#endif
-#if ( DEBUGlevel>=2 )
-			    printf("(%2d) Sent X[%2.0f] to P %2d\n",
-				   iam, x[ii-XK_H], pi);
-#endif
-			}
-                    }
-		    /*
-		     * Perform local block modifications.
-		     */
-		    nlb1 = lsub1[0] - 1;
-		    lptr1 = BC_HEADER + LB_DESCRIPTOR + iknsupc;
-		    luptr1 = iknsupc; /* Skip diagonal block L(I,I). */
-
-		    zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik,
-			       fmod, nlb1, lptr1, luptr1, xsup,
-			       grid, Llu, send_req, stat);
-		} /* if frecv[lk] == 0 */
-	    } /* if iam == p */
-	} /* if fmod[lk] == 0 */
-
-    } /* for lb ... */
-
-} /* zLSUM_FMOD */
-
-
-/************************************************************************/
-void zlsum_bmod
-/************************************************************************/
-(
- doublecomplex *lsum,        /* Sum of local modifications.                    */
- doublecomplex *x,           /* X array (local).                               */
- doublecomplex *xk,          /* X[k].                                          */
- int    nrhs,	      /* Number of right-hand sides.                    */
- int_t  k,            /* The k-th component of X.                       */
- int_t  *bmod,        /* Modification count for L-solve.                */
- int_t  *Urbs,        /* Number of row blocks in each block column of U.*/
- Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/
- int_t  **Ucb_valptr, /* Vertical linked list pointing to Unzval[].     */
- int_t  *xsup,
- gridinfo_t *grid,
- LocalLU_t *Llu,
- MPI_Request send_req[],
- SuperLUStat_t *stat
- )
-{
-/*
- * Purpose
- * =======
- *   Perform local block modifications: lsum[i] -= U_i,k * X[k].
- */
-    doublecomplex alpha = {1.0, 0.0};
-    int    iam, iknsupc, knsupc, myrow, nsupr, p, pi;
-    int_t  fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow,
-           j, jj, lk, lk1, nub, ub, uptr;
-    int_t  *usub;
-    doublecomplex *uval, *dest, *y;
-    doublecomplex temp;
-    int_t  *lsub;
-    doublecomplex *lusup;
-    int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
-    int_t  *brecv = Llu->brecv;
-    int_t  **bsendx_plist = Llu->bsendx_plist;
-    MPI_Status status;
-    int test_flag;
-
-    iam = grid->iam;
-    myrow = MYROW( iam, grid );
-    knsupc = SuperSize( k );
-    lk = LBj( k, grid ); /* Local block number, column-wise. */
-    nub = Urbs[lk];      /* Number of U blocks in block column lk */
-
-    for (ub = 0; ub < nub; ++ub) {
-	ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */
-	usub = Llu->Ufstnz_br_ptr[ik];
-	uval = Llu->Unzval_br_ptr[ik];
-	i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */
-	i += UB_DESCRIPTOR;
-	il = LSUM_BLK( ik );
-	gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */
-	iknsupc = SuperSize( gik );
-	ikfrow = FstBlockC( gik );
-	iklrow = FstBlockC( gik+1 );
-
-	RHS_ITERATE(j) {
-	    dest = &lsum[il + j*iknsupc];
-	    y = &xk[j*knsupc];
-	    uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */
-	    for (jj = 0; jj < knsupc; ++jj) {
-		fnz = usub[i + jj];
-		if ( fnz < iklrow ) { /* Nonzero segment. */
-		    /* AXPY */
-		    for (irow = fnz; irow < iklrow; ++irow) {
-			zz_mult(&temp, &uval[uptr], &y[jj]);
-			z_sub(&dest[irow - ikfrow], &dest[irow - ikfrow],
-			      &temp);
-			++uptr;
-		    }
-		    stat->ops[SOLVE] += 8 * (iklrow - fnz);
-		}
-	    } /* for jj ... */
-	}
-
-	if ( (--bmod[ik]) == 0 ) { /* Local accumulation done. */
-	    gikcol = PCOL( gik, grid );
-	    p = PNUM( myrow, gikcol, grid );
-	    if ( iam != p ) {
-#ifdef ISEND_IRECV
-		MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
-			   SuperLU_MPI_DOUBLE_COMPLEX, p, LSUM, grid->comm,
-                           &send_req[Llu->SolveMsgSent++] );
-#else
-#ifdef BSEND
-		MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
-			   SuperLU_MPI_DOUBLE_COMPLEX, p, LSUM, grid->comm );
-#else
-		MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H,
-			  SuperLU_MPI_DOUBLE_COMPLEX, p, LSUM, grid->comm );
-#endif
-#endif
-#if ( DEBUGlevel>=2 )
-		printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n",
-		       iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p);
-#endif
-	    } else { /* Diagonal process: X[i] += lsum[i]. */
-		ii = X_BLK( ik );
-		dest = &x[ii];
-		RHS_ITERATE(j)
-		    for (i = 0; i < iknsupc; ++i)
-			z_add(&dest[i + j*iknsupc], &dest[i + j*iknsupc],
-			      &lsum[i + il + j*iknsupc]);
-		if ( !brecv[ik] ) { /* Becomes a leaf node. */
-		    bmod[ik] = -1; /* Do not solve X[k] in the future. */
-		    lk1 = LBj( gik, grid ); /* Local block number. */
-		    lsub = Llu->Lrowind_bc_ptr[lk1];
-		    lusup = Llu->Lnzval_bc_ptr[lk1];
-		    nsupr = lsub[1];
-#ifdef _CRAY
-		    CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &iknsupc, &nrhs, &alpha,
-			  lusup, &nsupr, &x[ii], &iknsupc);
-#elif defined (USE_VENDOR_BLAS)
-		    ztrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, 
-			   lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1);
-#else
-		    ztrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, 
-			   lusup, &nsupr, &x[ii], &iknsupc);
-#endif
-		    stat->ops[SOLVE] += 4 * iknsupc * (iknsupc + 1) * nrhs
-			+ 10 * iknsupc * nrhs; /* complex division */
-#if ( DEBUGlevel>=2 )
-		    printf("(%2d) Solve X[%2d]\n", iam, gik);
-#endif
-
-		    /*
-		     * Send Xk to process column Pc[k].
-		     */
-		    for (p = 0; p < grid->nprow; ++p) {
-			if ( bsendx_plist[lk1][p] != EMPTY ) {
-			    pi = PNUM( p, gikcol, grid );
-#ifdef ISEND_IRECV
-			    MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H,
-				       SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
-				       &send_req[Llu->SolveMsgSent++] );
-#else
-#ifdef BSEND
-			    MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H,
-				       SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
-#else
-			    MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H,
-				     SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
-#endif
-#endif
-#if ( DEBUGlevel>=2 )
-			    printf("(%2d) Sent X[%2.0f] to P %2d\n",
-				   iam, x[ii-XK_H], pi);
-#endif
-			}
-                     }
-		    /*
-		     * Perform local block modifications.
-		     */
-		    if ( Urbs[lk1] )
-			zlsum_bmod(lsum, x, &x[ii], nrhs, gik, bmod, Urbs,
-				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-				   send_req, stat);
-		} /* if brecv[ik] == 0 */
-	    }
-	} /* if bmod[ik] == 0 */
-
-    } /* for ub ... */
-
-} /* zlSUM_BMOD */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzlangs.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzlangs.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzlangs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzlangs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,127 +0,0 @@
-
-/*
- * File name:	pzlangs.c
- * History:     Modified from lapack routine ZLANGE
- */
-#include <math.h>
-#include "superlu_zdefs.h"
-
-double pzlangs(char *norm, SuperMatrix *A, gridinfo_t *grid)
-{
-/* 
-    Purpose   
-    =======   
-
-    PZLANGS returns the value of the one norm, or the Frobenius norm, or 
-    the infinity norm, or the element of largest absolute value of a 
-    real matrix A.   
-
-    Description   
-    ===========   
-
-    PZLANGE returns the value   
-
-       PZLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'   
-                 (   
-                 ( norm1(A),         NORM = '1', 'O' or 'o'   
-                 (   
-                 ( normI(A),         NORM = 'I' or 'i'   
-                 (   
-                 ( normF(A),         NORM = 'F', 'f', 'E' or 'e'   
-
-    where  norm1  denotes the  one norm of a matrix (maximum column sum), 
-    normI  denotes the  infinity norm  of a matrix  (maximum row sum) and 
-    normF  denotes the  Frobenius norm of a matrix (square root of sum of 
-    squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.   
-
-    Arguments   
-    =========   
-
-    NORM    (input) CHARACTER*1   
-            Specifies the value to be returned in DLANGE as described above.   
-    A       (input) SuperMatrix*
-            The M by N sparse matrix A. 
-    GRID    (input) gridinof_t*
-            The 2D process mesh.
-   ===================================================================== 
-*/
-    
-    /* Local variables */
-    NRformat_loc *Astore;
-    int_t    m_loc;
-    doublecomplex   *Aval;
-    int_t    i, j, irow, jcol;
-    double   value=0., sum;
-    double   *rwork;
-    double   tempvalue;
-    double   *temprwork;
-
-    Astore = (NRformat_loc *) A->Store;
-    m_loc = Astore->m_loc;
-    Aval   = (doublecomplex *) Astore->nzval;
-    
-    if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) {
-	value = 0.;
-    } else if (lsame_(norm, "M")) {
-	/* Find max(abs(A(i,j))). */
-	value = 0.;
-	for (i = 0; i < m_loc; ++i) {
-	    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j)
-		value = SUPERLU_MAX( value, z_abs(&Aval[j]) );
-	}
-
-	MPI_Allreduce(&value, &tempvalue, 1, MPI_DOUBLE, MPI_MAX, grid->comm);
-	value = tempvalue;
-
-    } else if (lsame_(norm, "O") || *(unsigned char *)norm == '1') {
-	/* Find norm1(A). */
-	value = 0.;
-#if 0
-	for (j = 0; j < A->ncol; ++j) {
-	    sum = 0.;
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) 
-		sum += fabs(Aval[i]);
-	    value = SUPERLU_MAX(value,sum);
-	}
-#else /* XSL ==> */
-	if ( !(rwork = (double *) doubleCalloc_dist(A->ncol)) )
-	    ABORT("doubleCalloc_dist fails for rwork.");
-	for (i = 0; i < m_loc; ++i) {
-	    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
-	        jcol = Astore->colind[j];
-		rwork[jcol] += z_abs(&Aval[j]);
-	    }
-	}
-
-	if ( !(temprwork = (double *) doubleCalloc_dist(A->ncol)) )
-	    ABORT("doubleCalloc_dist fails for temprwork.");
-	MPI_Allreduce(rwork, temprwork, A->ncol, MPI_DOUBLE, MPI_SUM, grid->comm);
-	value = 0.;
-	for (j = 0; j < A->ncol; ++j) {
-	    value = SUPERLU_MAX(value, temprwork[j]);
-	}
-	SUPERLU_FREE (temprwork);
-	SUPERLU_FREE (rwork);
-#endif	
-    } else if (lsame_(norm, "I")) {
-	/* Find normI(A). */
-	value = 0.;
-	sum = 0.;
-	for (i = 0; i < m_loc; ++i) {
-	    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j)
-	        sum += z_abs(&Aval[j]);
-	    value = SUPERLU_MAX(value, sum);
-	}
-	MPI_Allreduce(&value, &tempvalue, 1, MPI_DOUBLE, MPI_MAX, grid->comm);
-	value = tempvalue;
-
-    } else if (lsame_(norm, "F") || lsame_(norm, "E")) {
-	/* Find normF(A). */
-	ABORT("Not implemented.");
-    } else {
-	ABORT("Illegal norm specified.");
-    }
-    
-    return (value);
-
-} /* pzlangs */
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzlaqgs.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzlaqgs.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzlaqgs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzlaqgs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,135 +0,0 @@
-
-/*
- * File name:	pzlaqgs.c
- * History:     Modified from LAPACK routine ZLAQGE
- */
-#include <math.h>
-#include "superlu_zdefs.h"
-
-void
-pzlaqgs(SuperMatrix *A, double *r, double *c, 
-       double rowcnd, double colcnd, double amax, char *equed)
-{
-/*
-    Purpose   
-    =======   
-
-    PZLAQGS equilibrates a general sparse M by N matrix A using the row
-    and column scaling factors in the vectors R and C.   
-
-    See supermatrix.h for the definition of 'SuperMatrix' structure.
-
-    Arguments   
-    =========   
-
-    A       (input/output) SuperMatrix*
-            On exit, the equilibrated matrix.  See EQUED for the form of 
-            the equilibrated matrix. The type of A can be:
-	    Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE.
-	    
-    R       (input) double*, dimension (A->nrow)
-            The row scale factors for A.
-	    
-    C       (input) double*, dimension (A->ncol)
-            The column scale factors for A.
-	    
-    ROWCND  (input) double
-            Ratio of the smallest R(i) to the largest R(i).
-	    
-    COLCND  (input) double
-            Ratio of the smallest C(i) to the largest C(i).
-	    
-    AMAX    (input) double
-            Absolute value of largest matrix entry.
-	    
-    EQUED   (output) char*
-            Specifies the form of equilibration that was done.   
-            = 'N':  No equilibration   
-            = 'R':  Row equilibration, i.e., A has been premultiplied by  
-                    diag(R).   
-            = 'C':  Column equilibration, i.e., A has been postmultiplied  
-                    by diag(C).   
-            = 'B':  Both row and column equilibration, i.e., A has been
-                    replaced by diag(R) * A * diag(C).   
-
-    Internal Parameters   
-    ===================   
-
-    THRESH is a threshold value used to decide if row or column scaling   
-    should be done based on the ratio of the row or column scaling   
-    factors.  If ROWCND < THRESH, row scaling is done, and if   
-    COLCND < THRESH, column scaling is done.   
-
-    LARGE and SMALL are threshold values used to decide if row scaling   
-    should be done based on the absolute size of the largest matrix   
-    element.  If AMAX > LARGE or AMAX < SMALL, row scaling is done.   
-
-    ===================================================================== 
-*/
-
-#define THRESH    (0.1)
-    
-    /* Local variables */
-    NRformat_loc *Astore;
-    doublecomplex *Aval;
-    int_t i, j, irow, jcol, m_loc;
-    double large, small, cj;
-    extern double dlamch_(char *);
-    double temp;
-
-    /* Quick return if possible */
-    if (A->nrow <= 0 || A->ncol <= 0) {
-	*(unsigned char *)equed = 'N';
-	return;
-    }
-
-    Astore = A->Store;
-    Aval = Astore->nzval;
-    m_loc = Astore->m_loc;
-    
-    /* Initialize LARGE and SMALL. */
-    small = dlamch_("Safe minimum") / dlamch_("Precision");
-    large = 1. / small;
-
-    if (rowcnd >= THRESH && amax >= small && amax <= large) {
-	if (colcnd >= THRESH)
-	    *(unsigned char *)equed = 'N';
-	else {
-	    /* Column scaling */
-	    irow = Astore->fst_row;
-	    for (i = 0; i < m_loc; ++i) {
-	        for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
-		    jcol = Astore->colind[j];
-                    zd_mult(&Aval[j], &Aval[j], c[jcol]);
-	      }
-	      ++irow;
-	    }
-	    *(unsigned char *)equed = 'C';
-	}
-    } else if (colcnd >= THRESH) {
-	/* Row scaling, no column scaling */
-	irow = Astore->fst_row;
-	for (i = 0; i < m_loc; ++i) {
-	    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j)
-                zd_mult(&Aval[j], &Aval[j], r[irow]);
-	    ++irow;
-	}
-	*(unsigned char *)equed = 'R';
-    } else {
-	/* Both row and column scaling */
-	irow = Astore->fst_row;
-	for (i = 0; i < m_loc; ++i) {
-	    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
-	        jcol = Astore->colind[j];
-                temp = r[irow] * c[jcol];
-                zd_mult(&Aval[j], &Aval[j], temp);
-	    }
-	    ++irow;
-	}
-	*(unsigned char *)equed = 'B';
-    }
-
-    return;
-
-} /* pzlaqgs */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzsymbfact_distdata.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzsymbfact_distdata.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzsymbfact_distdata.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzsymbfact_distdata.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,1942 +0,0 @@
-
-/*
- * -- Parallel symbolic factorization auxialiary routine (version 2.2) --
- * -- Distributes the data from parallel symbolic factorization 
- * -- to numeric factorization
- * INRIA France -  July 1, 2004
- * Laura Grigori
- *
- * November 1, 2007
- * Feburary 20, 2008
- */
-
-/* limits.h:  the largest positive integer (INT_MAX) */
-#include <limits.h>
-
-#include "superlu_zdefs.h"
-#include "psymbfact.h"
-
-static float
-dist_symbLU (int_t n, Pslu_freeable_t *Pslu_freeable, 
-	     Glu_persist_t *Glu_persist, 
-	     int_t **p_xlsub, int_t **p_lsub, int_t **p_xusub, int_t **p_usub,
-	     gridinfo_t *grid
-	     )
-/*
- * Purpose
- * =======
- * 
- * Redistribute the symbolic structure of L and U from the distribution
- * used in the parallel symbolic factorization step to the distdibution
- * used in the parallel numeric factorization step.  On exit, the L and U
- * structure for the 2D distribution used in the numeric factorization step is
- * stored in p_xlsub, p_lsub, p_xusub, p_usub.  The global supernodal 
- * information is also computed and it is stored in Glu_persist->supno
- * and Glu_persist->xsup.
- *
- * This routine allocates memory for storing the structure of L and U
- * and the supernodes information.  This represents the arrays:
- * p_xlsub, p_lsub, p_xusub, p_usub,
- * Glu_persist->supno,  Glu_persist->xsup.
- *
- * This routine also deallocates memory allocated during symbolic 
- * factorization routine.  That is, the folloing arrays are freed:
- * Pslu_freeable->xlsub,  Pslu_freeable->lsub, 
- * Pslu_freeable->xusub, Pslu_freeable->usub, 
- * Pslu_freeable->globToLoc, Pslu_freeable->supno_loc, 
- * Pslu_freeable->xsup_beg_loc, Pslu_freeable->xsup_end_loc.
- *
- * Arguments
- * =========
- *
- * n      (Input) int_t
- *        Order of the input matrix
- * Pslu_freeable  (Input) Pslu_freeable_t *
- *        Local L and U structure, 
- *        global to local indexing information.
- * 
- * Glu_persist (Output) Glu_persist_t *
- *        Stores on output the information on supernodes mapping.
- * 
- * p_xlsub (Output) int_t **
- *         Pointer to structure of L distributed on a 2D grid 
- *         of processors, stored by columns.
- * 
- * p_lsub  (Output) int_t **
- *         Structure of L distributed on a 2D grid of processors, 
- *         stored by columns.
- *
- * p_xusub (Output) int_t **
- *         Pointer to structure of U distributed on a 2D grid 
- *         of processors, stored by rows.
- * 
- * p_usub  (Output) int_t **
- *         Structure of U distributed on a 2D grid of processors, 
- *         stored by rows.
- * 
- * grid   (Input) gridinfo_t*
- *        The 2D process mesh.
- *
- * Return value
- * ============
- *   < 0, number of bytes allocated on return from the dist_symbLU.
- *   > 0, number of bytes allocated in this routine when out of memory.
- *        (an approximation).
- */
-{
-  int   iam, nprocs, pc, pr, p, np, p_diag;
-  int_t *nnzToSend, *nnzToRecv, *nnzToSend_l, *nnzToSend_u, 
-    *tmp_ptrToSend, *mem;
-  int_t *nnzToRecv_l, *nnzToRecv_u;
-  int_t *send_1, *send_2, nsend_1, nsend_2;
-  int_t *ptrToSend, *ptrToRecv, sendL, sendU, *snd_luind, *rcv_luind;
-  int_t nsupers, nsupers_i, nsupers_j;
-  int *nvtcs, *intBuf1, *intBuf2, *intBuf3, *intBuf4, intNvtcs_loc;
-  int_t maxszsn, maxNvtcsPProc;
-  int_t *xsup_n, *supno_n, *temp, *xsup_beg_s, *xsup_end_s, *supno_s;
-  int_t *xlsub_s, *lsub_s, *xusub_s, *usub_s;
-  int_t *xlsub_n, *lsub_n, *xusub_n, *usub_n;
-  int_t *xsub_s, *sub_s, *xsub_n, *sub_n;
-  int_t *globToLoc, nvtcs_loc;
-  int_t SendCnt_l, SendCnt_u, nnz_loc_l, nnz_loc_u, nnz_loc,
-    RecvCnt_l, RecvCnt_u, ind_loc;
-  int_t i, k, j, gb, szsn,  gb_n, gb_s, gb_l, fst_s, fst_s_l, lst_s, i_loc;
-  int_t nelts, isize;
-  float memAux;  /* Memory used during this routine and freed on return */
-  float memRet; /* Memory allocated and not freed on return */
-  int_t iword, dword;
-  
-  /* ------------------------------------------------------------
-     INITIALIZATION.
-     ------------------------------------------------------------*/
-  iam = grid->iam;
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Enter dist_symbLU()");
-#endif
-  nprocs = (int) grid->nprow * grid->npcol;
-  xlsub_s = Pslu_freeable->xlsub; lsub_s = Pslu_freeable->lsub;
-  xusub_s = Pslu_freeable->xusub; usub_s = Pslu_freeable->usub;
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  globToLoc     = Pslu_freeable->globToLoc;
-  nvtcs_loc     = Pslu_freeable->nvtcs_loc;
-  xsup_beg_s    = Pslu_freeable->xsup_beg_loc;
-  xsup_end_s    = Pslu_freeable->xsup_end_loc;
-  supno_s       = Pslu_freeable->supno_loc;
-  rcv_luind     = NULL;
-  iword = sizeof(int_t);
-  dword = sizeof(doublecomplex);
-  memAux = 0.; memRet = 0.;
-  
-  mem           = intCalloc_dist(12 * nprocs);
-  if (!mem)
-    return (ERROR_RET);
-  memAux     = (float) (12 * nprocs * sizeof(int_t));
-  nnzToRecv     = mem;
-  nnzToSend     = nnzToRecv + 2*nprocs;
-  nnzToSend_l   = nnzToSend + 2 * nprocs;
-  nnzToSend_u   = nnzToSend_l + nprocs;
-  send_1        = nnzToSend_u + nprocs;
-  send_2        = send_1 + nprocs;
-  tmp_ptrToSend = send_2 + nprocs;
-  nnzToRecv_l   = tmp_ptrToSend + nprocs;
-  nnzToRecv_u   = nnzToRecv_l + nprocs;
-  
-  ptrToSend = nnzToSend;
-  ptrToRecv = nnzToSend + nprocs;
-
-  nvtcs = (int *) SUPERLU_MALLOC(5 * nprocs * sizeof(int));
-  intBuf1 = nvtcs + nprocs;
-  intBuf2 = nvtcs + 2 * nprocs;
-  intBuf3 = nvtcs + 3 * nprocs;
-  intBuf4 = nvtcs + 4 * nprocs;
-  memAux += 5 * nprocs * sizeof(int);
-
-  maxszsn   = sp_ienv_dist(3);
-  
-  /* Allocate space for storing Glu_persist_n. */
-  if ( !(supno_n = intMalloc_dist(n+1)) ) {
-    fprintf (stderr, "Malloc fails for supno_n[].");
-    return (memAux);
-  }
-  memRet += (float) ((n+1) * sizeof(int_t));
-
-  /* ------------------------------------------------------------
-     DETERMINE SUPERNODES FOR NUMERICAL FACTORIZATION
-     ------------------------------------------------------------*/
-  
-  if (nvtcs_loc > INT_MAX)
-    ABORT("ERROR in dist_symbLU nvtcs_loc > INT_MAX\n");
-  intNvtcs_loc = (int) nvtcs_loc;
-  MPI_Gather (&intNvtcs_loc, 1, MPI_INT, nvtcs, 1, MPI_INT,
-	      0, grid->comm);
-
-  if (!iam) {
-    /* set ptrToRecv to point to the beginning of the data for
-       each processor */
-    for (k = 0, p = 0; p < nprocs; p++) {
-      ptrToRecv[p] = k;
-      k += nvtcs[p];
-    }
-  }
-  
-  if (nprocs > 1) {
-    temp = NULL;
-    if (!iam ) {
-      if ( !(temp = intMalloc_dist (n+1)) ) {
-	fprintf (stderr, "Malloc fails for temp[].");
-	return (memAux + memRet);
-      }
-      memAux += (float) (n+1) * iword;
-    }
-#if defined (_LONGINT)
-    for (p=0; p<nprocs; p++) {
-      if (ptrToRecv[p] > INT_MAX)
-	ABORT("ERROR in dist_symbLU size to send > INT_MAX\n");
-      intBuf1[p] = (int) ptrToRecv[p];
-    }
-#else  /* Default */
-    intBuf1 = ptrToRecv;
-#endif
-    MPI_Gatherv (supno_s, (int) nvtcs_loc, mpi_int_t, 
-		 temp, nvtcs, intBuf1, mpi_int_t, 0, grid->comm);
-  }
-  else
-    temp = supno_s;
-
-  if (!iam) {
-    nsupers = 0;
-    p = (int) OWNER( globToLoc[0] );
-    gb = temp[ptrToRecv[p]];
-    supno_n[0] = nsupers;
-    ptrToRecv[p] ++;
-    szsn = 1;
-    for (j = 1; j < n; j ++) {
-      if (p != (int) OWNER( globToLoc[j] ) || szsn >= maxszsn || gb != temp[ptrToRecv[p]]) {
-	nsupers ++;
-	p  = (int) OWNER( globToLoc[j] );
-	gb = temp[ptrToRecv[p]];
-	szsn = 1;
-      }
-      else {
-	szsn ++;
-      }
-      ptrToRecv[p] ++;
-      supno_n[j] = nsupers;
-    }
-    nsupers++;
-    if (nprocs > 1) {
-      SUPERLU_FREE (temp);
-      memAux -= (float) (n+1) * iword;
-    }
-    supno_n[n] = nsupers;
-  }
-
-  /* reset to 0 nnzToSend */
-  for (p = 0; p < 2 *nprocs; p++)
-    nnzToSend[p] = 0;
-  
-  MPI_Bcast (supno_n, n+1, mpi_int_t, 0, grid->comm);
-  nsupers = supno_n[n];
-  /* Allocate space for storing Glu_persist_n. */
-  if ( !(xsup_n = intMalloc_dist(nsupers+1)) ) {
-    fprintf (stderr, "Malloc fails for xsup_n[].");
-    return (memAux + memRet);
-  }
-  memRet += (float) (nsupers+1) * iword;  
-
-  /* ------------------------------------------------------------
-     COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
-     THEN ALLOCATE SPACE.
-     THIS ACCOUNTS FOR THE FIRST PASS OF L and U.
-     ------------------------------------------------------------*/
-  gb = EMPTY;
-  for (i = 0; i < n; i++) {
-    if (gb != supno_n[i]) {
-      /* a new supernode starts */
-      gb = supno_n[i];
-      xsup_n[gb] = i;
-    }
-  }
-  xsup_n[nsupers] = n;
-  
-  for (p = 0; p < nprocs; p++) {
-    send_1[p] = FALSE;
-    send_2[p] = FALSE;
-  }
-  for (gb_n = 0; gb_n < nsupers; gb_n ++) {
-    i = xsup_n[gb_n];
-    if (iam == (int) OWNER( globToLoc[i] )) {
-      pc = PCOL( gb_n, grid );
-      pr = PROW( gb_n, grid );
-      p_diag = PNUM( pr, pc, grid);
-      
-      i_loc = LOCAL_IND( globToLoc[i] );
-      gb_s  = supno_s[i_loc];
-      fst_s = xsup_beg_s[gb_s];
-      lst_s = xsup_end_s[gb_s];
-      fst_s_l = LOCAL_IND( globToLoc[fst_s] );
-      for (j = xlsub_s[fst_s_l]; j < xlsub_s[fst_s_l+1]; j++) {
-	k = lsub_s[j];
-	if (k >= i) {
-	  gb = supno_n[k];
-	  p = (int) PNUM( PROW(gb, grid), pc, grid );
-	  nnzToSend[2*p] ++;
-	  send_1[p] = TRUE;
-	}
-      }
-      for (j = xusub_s[fst_s_l]; j < xusub_s[fst_s_l+1]; j++) {
-	k = usub_s[j];
-	if (k >= i + xsup_n[gb_n+1] - xsup_n[gb_n]) {
-	  gb = supno_n[k];
-	  p = PNUM( pr, PCOL(gb, grid), grid);
-	  nnzToSend[2*p+1] ++;	
-	  send_2[p] = TRUE;
-	}
-      }
-      
-      nsend_2 = 0;
-      for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) {
-	nnzToSend[2*p+1] += 2;
-	if (send_2[p])  nsend_2 ++;	  
-      }
-      for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) 
-	if (send_2[p] || p == p_diag) {
-	  if (p == p_diag && !send_2[p])
-	    nnzToSend[2*p+1] += nsend_2;
-	  else
-	    nnzToSend[2*p+1] += nsend_2-1;
-	  send_2[p] = FALSE;
-	}
-      nsend_1 = 0;
-      for (p = pc; p < nprocs; p += grid->npcol) {
-	nnzToSend[2*p] += 2;
-	if (send_1[p]) nsend_1 ++;
-      }
-      for (p = pc; p < nprocs; p += grid->npcol) 
-	if (send_1[p]) {
-	  nnzToSend[2*p] += nsend_1-1;
-	  send_1[p] = FALSE;
-	}
-	else
-	  nnzToSend[2*p] += nsend_1;
-    }
-  }
-  
-  /* All-to-all communication */
-  MPI_Alltoall( nnzToSend, 2, mpi_int_t, nnzToRecv, 2, mpi_int_t,
-		grid->comm);
-  
-  nnz_loc_l = nnz_loc_u = 0;
-  SendCnt_l = SendCnt_u = RecvCnt_l = RecvCnt_u = 0;  
-  for (p = 0; p < nprocs; p++) {
-    if ( p != iam ) {
-      SendCnt_l += nnzToSend[2*p];   nnzToSend_l[p] = nnzToSend[2*p];
-      SendCnt_u += nnzToSend[2*p+1]; nnzToSend_u[p] = nnzToSend[2*p+1]; 
-      RecvCnt_l += nnzToRecv[2*p];   nnzToRecv_l[p] = nnzToRecv[2*p];
-      RecvCnt_u += nnzToRecv[2*p+1]; nnzToRecv_u[p] = nnzToRecv[2*p+1];
-    } else {
-      nnz_loc_l += nnzToRecv[2*p];
-      nnz_loc_u += nnzToRecv[2*p+1];
-      nnzToSend_l[p] = 0; nnzToSend_u[p] = 0;
-      nnzToRecv_l[p] = nnzToRecv[2*p]; 
-      nnzToRecv_u[p] = nnzToRecv[2*p+1];
-    }
-  }
-  
-  /* Allocate space for storing the symbolic structure after redistribution. */
-  nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
-  nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
-  if ( !(xlsub_n = intCalloc_dist(nsupers_j+1)) ) {
-    fprintf (stderr, "Malloc fails for xlsub_n[].");
-    return (memAux + memRet);
-  }
-  memRet += (float) (nsupers_j+1) * iword;
-
-  if ( !(xusub_n = intCalloc_dist(nsupers_i+1)) ) {
-    fprintf (stderr, "Malloc fails for xusub_n[].");
-    return (memAux + memRet);
-  }
-  memRet += (float) (nsupers_i+1) * iword;  
-
-  /* Allocate temp storage for sending/receiving the L/U symbolic structure. */
-  if ( (RecvCnt_l + nnz_loc_l) || (RecvCnt_u + nnz_loc_u) ) {
-    if (!(rcv_luind = 
-	  intMalloc_dist(SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u))) ) {
-      fprintf (stderr, "Malloc fails for rcv_luind[].");
-      return (memAux + memRet);
-    }
-    memAux += (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) 
-      * iword;
-  }
-  if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) {
-    if (!(snd_luind = intMalloc_dist(SUPERLU_MAX(SendCnt_l, SendCnt_u))) ) {
-      fprintf (stderr, "Malloc fails for index[].");
-      return (memAux + memRet);
-    }
-    memAux += (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword;
-  } 
-  
-  /* ------------------------------------------------------------------
-     LOAD THE SYMBOLIC STRUCTURE OF L AND U INTO THE STRUCTURES TO SEND.
-     THIS ACCOUNTS FOR THE SECOND PASS OF L and U.
-     ------------------------------------------------------------------*/
-  sendL = TRUE;
-  sendU = FALSE;
-  while (sendL || sendU) {
-    if (sendL) {
-      xsub_s = xlsub_s; sub_s = lsub_s; xsub_n = xlsub_n;
-      nnzToSend = nnzToSend_l; nnzToRecv = nnzToRecv_l;
-    }
-    if (sendU) {
-      xsub_s = xusub_s; sub_s = usub_s; xsub_n = xusub_n;
-      nnzToSend = nnzToSend_u; nnzToRecv = nnzToRecv_u;
-    }
-    for (i = 0, j = 0, p = 0; p < nprocs; p++) {
-      if ( p != iam ) {
-	ptrToSend[p] = i;  i += nnzToSend[p];
-      }
-      ptrToRecv[p] = j;  j += nnzToRecv[p];
-    }
-    nnzToRecv[iam] = 0;
-    
-    ind_loc = ptrToRecv[iam];
-    for (gb_n = 0; gb_n < nsupers; gb_n++) {
-      nsend_2 = 0;    
-      i = xsup_n[gb_n];
-      if (iam == OWNER( globToLoc[i] )) {
-	pc = PCOL( gb_n, grid );
-	pr = PROW( gb_n, grid );
-	p_diag = PNUM( pr, pc, grid );
-	
-	i_loc = LOCAL_IND( globToLoc[i] );
-	gb_s  = supno_s[i_loc];
-	fst_s = xsup_beg_s[gb_s];
-	lst_s = xsup_end_s[gb_s];
-	fst_s_l = LOCAL_IND( globToLoc[fst_s] );
-
-	if (sendL) {
-	  p = pc;                np = grid->nprow;	  
-	} else {
-	  p = pr * grid->npcol;  np = grid->npcol;
-	}
-	for (j = 0; j < np; j++) {
-	  if (p == iam) {
-	    rcv_luind[ind_loc] = gb_n;
-	    rcv_luind[ind_loc+1] = 0;
-	    tmp_ptrToSend[p] = ind_loc + 1;
-	    ind_loc += 2;	 
-	  }
-	  else {
-	    snd_luind[ptrToSend[p]] = gb_n;
-	    snd_luind[ptrToSend[p]+1] = 0;
-	    tmp_ptrToSend[p] = ptrToSend[p] + 1;
-	    ptrToSend[p] += 2;	 
-	  }
-	  if (sendL) p += grid->npcol;
-	  if (sendU) p++;
-	}
-	for (j = xsub_s[fst_s_l]; j < xsub_s[fst_s_l+1]; j++) {
-	  k = sub_s[j];
-	  if ((sendL && k >= i) || (sendU && k >= i + xsup_n[gb_n+1] - xsup_n[gb_n])) {
-	    gb = supno_n[k];
-	    if (sendL)
-	      p = PNUM( PROW(gb, grid), pc, grid );
-	    else 
-	      p = PNUM( pr, PCOL(gb, grid), grid);
-	    if (send_1[p] == FALSE) {
-	      send_1[p] = TRUE;
-	      send_2[nsend_2] = k; nsend_2 ++;
-	    }
-	    if (p == iam) {
-	      rcv_luind[ind_loc] = k;  ind_loc++;
-	      if (sendL)
-		xsub_n[LBj( gb_n, grid )] ++;
-	      else
-		xsub_n[LBi( gb_n, grid )] ++;
-	    }
-	    else {
-	      snd_luind[ptrToSend[p]] = k;
-	      ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++;
-	    }
-	  }
-	}
-	if (sendL)
-	  for (p = pc; p < nprocs; p += grid->npcol) {
-	      for (k = 0; k < nsend_2; k++) {
-		gb = supno_n[send_2[k]];
-		if (PNUM(PROW(gb, grid), pc, grid) != p) {
-		  if (p == iam) {
-		    rcv_luind[ind_loc] = send_2[k];  ind_loc++;
-		    xsub_n[LBj( gb_n, grid )] ++;
-		  }
-		  else {
-		    snd_luind[ptrToSend[p]] = send_2[k];
-		    ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++;
-		  }
-		}
-	      }
-	      send_1[p] = FALSE;
-	  }  
-	if (sendU)
-	  for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) {
-	    if (send_1[p] || p == p_diag) {	      
-	      for (k = 0; k < nsend_2; k++) {
-		gb = supno_n[send_2[k]];
-		if(PNUM( pr, PCOL(gb, grid), grid) != p) {
-		  if (p == iam) {
-		    rcv_luind[ind_loc] = send_2[k];  ind_loc++;
-		    xsub_n[LBi( gb_n, grid )] ++;
-		  }
-		  else {
-		    snd_luind[ptrToSend[p]] = send_2[k];
-		    ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++;
-		  }	     
-		}
-	      } 
-	      send_1[p] = FALSE;
-	    }
-	  }
-      }
-    }
-    
-    /* reset ptrToSnd to point to the beginning of the data for
-       each processor (structure needed in MPI_Alltoallv) */
-    for (i = 0, p = 0; p < nprocs; p++) {
-      ptrToSend[p] = i;  i += nnzToSend[p];
-    }
-
-    /* ------------------------------------------------------------
-       PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION.
-       Note: it uses MPI_Alltoallv.
-       ------------------------------------------------------------*/
-    if (nprocs > 1) {
-#if defined (_LONGINT)
-      nnzToSend[iam] = 0;
-      for (p=0; p<nprocs; p++) {
-	if (nnzToSend[p] > INT_MAX || ptrToSend[p] > INT_MAX ||
-	    nnzToRecv[p] > INT_MAX || ptrToRecv[p] > INT_MAX)
-	  ABORT("ERROR in dist_symbLU size to send > INT_MAX\n");
-	intBuf1[p] = (int) nnzToSend[p];
-	intBuf2[p] = (int) ptrToSend[p];
-	intBuf3[p] = (int) nnzToRecv[p];
-	intBuf4[p] = (int) ptrToRecv[p];
-      }
-#else  /* Default */
-      intBuf1 = nnzToSend;  intBuf2 = ptrToSend;
-      intBuf3 = nnzToRecv;  intBuf4 = ptrToRecv;
-#endif
-
-      MPI_Alltoallv (snd_luind, intBuf1, intBuf2, mpi_int_t, 
-		     rcv_luind, intBuf3, intBuf4, mpi_int_t,
-		     grid->comm);
-    }
-    if (sendL)
-      nnzToRecv[iam] = nnz_loc_l;
-    else 
-      nnzToRecv[iam] = nnz_loc_u;
-    
-    /* ------------------------------------------------------------
-       DEALLOCATE TEMPORARY STORAGE.
-       -------------------------------------------------------------*/
-    if (sendU) 
-      if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) {
-	SUPERLU_FREE (snd_luind);
-	memAux -= (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword;
-      }
-    
-    /* ------------------------------------------------------------
-       CONVERT THE FORMAT.
-       ------------------------------------------------------------*/
-    /* Initialize the array of column of L/ row of U pointers */
-    k = 0;
-    for (p = 0; p < nprocs; p ++) {
-      if (p != iam) {
-	i = k;
-	while (i < k + nnzToRecv[p]) {
-	  gb = rcv_luind[i];
-	  nelts = rcv_luind[i+1];
-	  if (sendL)
-	    xsub_n[LBj( gb, grid )] = nelts;
-	  else
-	    xsub_n[LBi( gb, grid )] = nelts;
-	  i += nelts + 2;
-	}
-      }
-      k += nnzToRecv[p];
-    }
-
-    if (sendL) j = nsupers_j;
-    else j = nsupers_i;
-    k = 0; 
-    isize = xsub_n[0];
-    xsub_n[0] = 0; 
-    for (gb_l = 1; gb_l < j; gb_l++) {
-      k += isize;
-      isize = xsub_n[gb_l];
-      xsub_n[gb_l] = k;
-    }
-    xsub_n[gb_l] = k + isize;
-    nnz_loc = xsub_n[gb_l];
-    if (sendL) {
-      lsub_n = NULL;
-      if (nnz_loc) {
-	if ( !(lsub_n = intMalloc_dist(nnz_loc)) ) {
-	  fprintf (stderr, "Malloc fails for lsub_n[].");
-	  return (memAux + memRet);
-	}
-	memRet += (float) (nnz_loc * iword);
-      }
-      sub_n = lsub_n;
-    }
-    if (sendU) {
-      usub_n = NULL;
-      if (nnz_loc) {
-	if ( !(usub_n = intMalloc_dist(nnz_loc)) ) {
-	  fprintf (stderr, "Malloc fails for usub_n[].");
-	  return (memAux + memRet);
-	}
-	memRet += (float) (nnz_loc * iword);
-      }
-      sub_n = usub_n;
-    }
-    
-    /* Copy the data into the L column / U row oriented storage */
-    k = 0;
-    for (p = 0; p < nprocs; p++) {
-      i = k;
-      while (i < k + nnzToRecv[p]) {
-	gb = rcv_luind[i];
-	if (gb >= nsupers)
-	  printf ("Pe[%d] p %d gb %d nsupers %d i %d i-k %d\n",
-		  iam, p, gb, nsupers, i, i-k);
-	i += 2;
-	if (sendL) gb_l = LBj( gb, grid );
-	if (sendU) gb_l = LBi( gb, grid );
-	for (j = xsub_n[gb_l]; j < xsub_n[gb_l+1]; i++, j++) {
-	  sub_n[j] = rcv_luind[i];
-	}
-      }      
-      k += nnzToRecv[p];
-    }
-    if (sendL) {
-      sendL = FALSE;  sendU = TRUE;
-    }
-    else
-      sendU = FALSE;
-  }
-
-  /* deallocate memory allocated during symbolic factorization routine */
-  if (rcv_luind != NULL) {
-    SUPERLU_FREE (rcv_luind);
-    memAux -= (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) * iword;
-  }
-  SUPERLU_FREE (mem);  
-  memAux -= (float) (12 * nprocs * iword);
-  SUPERLU_FREE(nvtcs);
-  memAux -= (float) (5 * nprocs * sizeof(int));
-  
-  if (xlsub_s != NULL) {
-    SUPERLU_FREE (xlsub_s); SUPERLU_FREE (lsub_s);
-  }
-  if (xusub_s != NULL) {
-    SUPERLU_FREE (xusub_s); SUPERLU_FREE (usub_s);
-  }
-  SUPERLU_FREE (globToLoc); 
-  if (supno_s != NULL) {
-    SUPERLU_FREE (xsup_beg_s); SUPERLU_FREE (xsup_end_s);
-    SUPERLU_FREE (supno_s);
-  }
-  
-  Glu_persist->supno = supno_n;  Glu_persist->xsup  = xsup_n;
-  *p_xlsub = xlsub_n; *p_lsub = lsub_n;
-  *p_xusub = xusub_n; *p_usub = usub_n;
-
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Exit dist_symbLU()");
-#endif
-  
-  return (-memRet);
-}
-  
-static float
-zdist_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct,
-	Glu_persist_t *Glu_persist, gridinfo_t *grid, 
-	int_t **p_ainf_colptr, int_t **p_ainf_rowind, doublecomplex **p_ainf_val,
-	int_t **p_asup_rowptr, int_t **p_asup_colind, doublecomplex **p_asup_val,
-	int_t *ilsum_i, int_t *ilsum_j
-	)
-{
-/*
- *
- * Purpose
- * =======
- *   Re-distribute A on the 2D process mesh.  The lower part is
- *   stored using a column format and the upper part
- *   is stored using a row format.
- * 
- * Arguments
- * =========
- * 
- * A      (Input) SuperMatrix*
- *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
- *        The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE.
- *
- * ScalePermstruct (Input) ScalePermstruct_t*
- *        The data structure to store the scaling and permutation vectors
- *        describing the transformations performed to the original matrix A.
- *
- * Glu_persist  (Input) Glu_persist_t *
- *        Information on supernodes mapping.
- * 
- * grid   (Input) gridinfo_t*
- *        The 2D process mesh.
- *
- * p_ainf_colptr (Output) int_t**
- *         Pointer to the lower part of A distributed on a 2D grid 
- *         of processors, stored by columns.
- *
- * p_ainf_rowind (Output) int_t**
- *         Structure of of the lower part of A distributed on a 
- *         2D grid of processors, stored by columns.
- *
- * p_ainf_val    (Output) doublecomplex**
- *         Numerical values of the lower part of A, distributed on a 
- *         2D grid of processors, stored by columns.
- *
- * p_asup_rowptr (Output) int_t**
- *         Pointer to the upper part of A distributed on a 2D grid 
- *         of processors, stored by rows.
- *
- * p_asup_colind (Output) int_t**
- *         Structure of of the upper part of A distributed on a 
- *         2D grid of processors, stored by rows.
- *
- * p_asup_val    (Output) doublecomplex**
- *         Numerical values of the upper part of A, distributed on a 
- *         2D grid of processors, stored by rows.
- *
- * ilsum_i  (Input) int_t *
- *       Starting position of each supernode in 
- *       the full array (local, block row wise).
- *
- * ilsum_j  (Input) int_t *
- *       Starting position of each supernode in 
- *       the full array (local, block column wise).
- *
- * Return value
- * ============
- *   < 0, number of bytes allocated on return from the dist_symbLU
- *   > 0, number of bytes allocated when out of memory.
- *        (an approximation).
- *
- */
-  int    iam, p, procs;
-  NRformat_loc *Astore;
-  int_t  *perm_r; /* row permutation vector */
-  int_t  *perm_c; /* column permutation vector */
-  int_t  i, it, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize, isize;
-  int_t  nsupers, nsupers_i, nsupers_j;
-  int_t  nnz_loc, nnz_loc_ainf, nnz_loc_asup;    /* number of local nonzeros */
-  int_t  nnz_remote; /* number of remote nonzeros to be sent */
-  int_t  SendCnt; /* number of remote nonzeros to be sent */
-  int_t  RecvCnt; /* number of remote nonzeros to be sent */
-  int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind;
-  doublecomplex *asup_val, *ainf_val;
-  int_t  *nnzToSend, *nnzToRecv, maxnnzToRecv;
-  int_t  *ia, *ja, **ia_send, *index, *itemp;
-  int_t  *ptr_to_send;
-  doublecomplex *aij, **aij_send, *nzval, *dtemp;
-  doublecomplex *nzval_a;
-  MPI_Request *send_req;
-  MPI_Status  status;
-  int_t *xsup = Glu_persist->xsup;    /* supernode and column mapping */
-  int_t *supno = Glu_persist->supno;   
-  float memAux;  /* Memory used during this routine and freed on return */
-  float memRet; /* Memory allocated and not freed on return */
-  int_t iword, dword, szbuf;
-
-  /* ------------------------------------------------------------
-     INITIALIZATION.
-     ------------------------------------------------------------*/
-  iam = grid->iam;
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Enter zdist_A()");
-#endif
-  iword = sizeof(int_t);
-  dword = sizeof(double);
-  
-  perm_r = ScalePermstruct->perm_r;
-  perm_c = ScalePermstruct->perm_c;
-  procs = grid->nprow * grid->npcol;
-  Astore = (NRformat_loc *) A->Store;
-  n = A->ncol;
-  m_loc = Astore->m_loc;
-  fst_row = Astore->fst_row;
-  if (!(nnzToRecv = intCalloc_dist(2*procs))) {
-    fprintf (stderr, "Malloc fails for nnzToRecv[].");
-    return (ERROR_RET);
-  }
-  memAux = (float) (2 * procs * iword);
-  memRet = 0.;
-  nnzToSend = nnzToRecv + procs;
-  nsupers  = supno[n-1] + 1;  
-
-  /* ------------------------------------------------------------
-     COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
-     THEN ALLOCATE SPACE.
-     THIS ACCOUNTS FOR THE FIRST PASS OF A.
-     ------------------------------------------------------------*/
-  for (i = 0; i < m_loc; ++i) {
-    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
-      irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
-      jcol = Astore->colind[j];
-      gbi = BlockNum( irow );
-      gbj = BlockNum( jcol );
-      p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
-      ++nnzToSend[p]; 
-    }
-  }
-  
-  /* All-to-all communication */
-  MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t,
-		grid->comm);
-  
-  maxnnzToRecv = 0;
-  nnz_loc = SendCnt = RecvCnt = 0;
-  
-  for (p = 0; p < procs; ++p) {
-    if ( p != iam ) {
-      SendCnt += nnzToSend[p];
-      RecvCnt += nnzToRecv[p];
-      maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv );
-    } else {
-      nnz_loc += nnzToRecv[p];
-      /*assert(nnzToSend[p] == nnzToRecv[p]);*/
-    }
-  }
-  k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */
-  szbuf = k;
-
-  /* Allocate space for storing the triplets after redistribution. */
-  if ( !(ia = intMalloc_dist(2*k)) ) {
-    fprintf (stderr, "Malloc fails for ia[].");
-    return (memAux);
-  }
-  memAux += (float) (2*k*iword);
-  ja = ia + k;
-  if ( !(aij = doublecomplexMalloc_dist(k)) ) {
-    fprintf (stderr, "Malloc fails for aij[].");
-    return (memAux);
-  }
-  memAux += (float) (k*dword);
-  
-  /* Allocate temporary storage for sending/receiving the A triplets. */
-  if ( procs > 1 ) {
-    if ( !(send_req = (MPI_Request *)
-	   SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) {
-      fprintf (stderr, "Malloc fails for send_req[].");
-      return (memAux);
-    }
-    memAux += (float) (2*procs *sizeof(MPI_Request));
-    if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) {
-      fprintf(stderr, "Malloc fails for ia_send[].");
-      return (memAux);
-    }
-    memAux += (float) (procs*sizeof(int_t*));
-    if ( !(aij_send = (doublecomplex **)SUPERLU_MALLOC(procs*sizeof(doublecomplex*))) ) {
-      fprintf(stderr, "Malloc fails for aij_send[].");
-      return (memAux);
-    }
-    memAux += (float) (procs*sizeof(doublecomplex*));    
-    if ( !(index = intMalloc_dist(2*SendCnt)) ) {
-      fprintf(stderr, "Malloc fails for index[].");
-      return (memAux);
-    }
-    memAux += (float) (2*SendCnt*iword);
-    if ( !(nzval = doublecomplexMalloc_dist(SendCnt)) ) {
-      fprintf(stderr, "Malloc fails for nzval[].");
-      return (memAux);
-    }
-    memAux += (float) (SendCnt * dword);
-    if ( !(ptr_to_send = intCalloc_dist(procs)) ) {
-      fprintf(stderr, "Malloc fails for ptr_to_send[].");
-      return (memAux);
-    }
-    memAux += (float) (procs * iword);
-    if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) {
-      fprintf(stderr, "Malloc fails for itemp[].");
-      return (memAux);
-    }
-    memAux += (float) (2*maxnnzToRecv*iword);
-    if ( !(dtemp = doublecomplexMalloc_dist(maxnnzToRecv)) ) {
-      fprintf(stderr, "Malloc fails for dtemp[].");
-      return (memAux);
-    }
-    memAux += (float) (maxnnzToRecv * dword);
-    
-    for (i = 0, j = 0, p = 0; p < procs; ++p) {
-      if ( p != iam ) {
-	ia_send[p] = &index[i];
-	i += 2 * nnzToSend[p]; /* ia/ja indices alternate */
-	aij_send[p] = &nzval[j];
-	j += nnzToSend[p];
-      }
-    }
-  } /* if procs > 1 */
-  
-  nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
-  nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
-  if ( !(ainf_colptr = intCalloc_dist(ilsum_j[nsupers_j] + 1)) ) {
-    fprintf (stderr, "Malloc fails for *ainf_colptr[].");
-    return (memAux);
-  }
-  memRet += (float) (ilsum_j[nsupers_j] + 1) * iword;
-  if ( !(asup_rowptr = intCalloc_dist(ilsum_i[nsupers_i] + 1)) ) {
-    fprintf (stderr, "Malloc fails for *asup_rowptr[].");
-    return (memAux+memRet);
-  }
-  memRet += (float) (ilsum_i[nsupers_i] + 1) * iword;
-  
-  /* ------------------------------------------------------------
-     LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND.
-     THIS ACCOUNTS FOR THE SECOND PASS OF A.
-     ------------------------------------------------------------*/
-  nnz_loc = 0; /* Reset the local nonzero count. */
-  nnz_loc_ainf = nnz_loc_asup = 0;
-  nzval_a = Astore->nzval;
-  for (i = 0; i < m_loc; ++i) {
-    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
-      irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
-      jcol = Astore->colind[j];
-      gbi = BlockNum( irow );
-      gbj = BlockNum( jcol );
-      p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
-      
-      if ( p != iam ) { /* remote */
-	k = ptr_to_send[p];
-	ia_send[p][k] = irow;
-	ia_send[p][k + nnzToSend[p]] = jcol;
-	aij_send[p][k] = nzval_a[j];
-	++ptr_to_send[p]; 
-      } else {          /* local */
-	ia[nnz_loc] = irow;
-	ja[nnz_loc] = jcol;
-	aij[nnz_loc] = nzval_a[j];
-	++nnz_loc;
-	/* Count nonzeros in each column of L / row of U */
-	if (gbi >= gbj) {
-	  ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++;
-	  nnz_loc_ainf ++;
-	}
-	else {
-	  asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++;
-	  nnz_loc_asup ++;
-	}
-      }
-    }
-  }
-
-  /* ------------------------------------------------------------
-     PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION.
-     NOTE: Can possibly use MPI_Alltoallv.
-     ------------------------------------------------------------*/
-  for (p = 0; p < procs; ++p) {
-    if ( p != iam ) {
-      it = 2*nnzToSend[p];
-      MPI_Isend( ia_send[p], it, mpi_int_t,
-		 p, iam, grid->comm, &send_req[p] );
-      it = nnzToSend[p];
-      MPI_Isend( aij_send[p], it, SuperLU_MPI_DOUBLE_COMPLEX,
-		 p, iam+procs, grid->comm, &send_req[procs+p] ); 
-    }
-  }
-  
-  for (p = 0; p < procs; ++p) {
-    if ( p != iam ) {
-      it = 2*nnzToRecv[p];
-      MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); 
-      it = nnzToRecv[p];
-      MPI_Recv( dtemp, it, SuperLU_MPI_DOUBLE_COMPLEX, p, p+procs,
-		grid->comm, &status );
-      for (i = 0; i < nnzToRecv[p]; ++i) {
-	ia[nnz_loc] = itemp[i];
-	irow = itemp[i];
-	jcol = itemp[i + nnzToRecv[p]];
-	/* assert(jcol<n); */
-	ja[nnz_loc] = jcol;
-	aij[nnz_loc] = dtemp[i];
-	++nnz_loc;
-	
-	gbi = BlockNum( irow );
-	gbj = BlockNum( jcol );
-	/* Count nonzeros in each column of L / row of U */
-	if (gbi >= gbj) {
-	  ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++;
-	  nnz_loc_ainf ++;
-	}
-	else {
-	  asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++;
-	  nnz_loc_asup ++;
-	}
-      }
-    }
-  }
-  
-  for (p = 0; p < procs; ++p) {
-    if ( p != iam ) {
-      MPI_Wait( &send_req[p], &status);
-      MPI_Wait( &send_req[procs+p], &status);
-    }
-  }
-  
-  /* ------------------------------------------------------------
-     DEALLOCATE TEMPORARY STORAGE
-     ------------------------------------------------------------*/
-  
-  SUPERLU_FREE(nnzToRecv);
-  memAux -= 2 * procs * iword;
-  if ( procs > 1 ) {
-    SUPERLU_FREE(send_req);
-    SUPERLU_FREE(ia_send);
-    SUPERLU_FREE(aij_send);
-    SUPERLU_FREE(index);
-    SUPERLU_FREE(nzval);
-    SUPERLU_FREE(ptr_to_send);
-    SUPERLU_FREE(itemp);
-    SUPERLU_FREE(dtemp);
-    memAux -= 2*procs *sizeof(MPI_Request) + procs*sizeof(int_t*) +
-      procs*sizeof(doublecomplex*) + 2*SendCnt * iword +
-      SendCnt* dword + procs*iword +
-      2*maxnnzToRecv*iword + maxnnzToRecv*dword;
-  }
-  
-  /* ------------------------------------------------------------
-     CONVERT THE TRIPLET FORMAT.
-     ------------------------------------------------------------*/
-  if (nnz_loc_ainf != 0) {
-    if ( !(ainf_rowind = intMalloc_dist(nnz_loc_ainf)) ) {
-      fprintf (stderr, "Malloc fails for *ainf_rowind[].");
-      return (memAux+memRet);
-    }
-    memRet += (float) (nnz_loc_ainf * iword);
-    if ( !(ainf_val = doublecomplexMalloc_dist(nnz_loc_ainf)) ) {
-      fprintf (stderr, "Malloc fails for *ainf_val[].");
-      return (memAux+memRet);
-    }
-    memRet += (float) (nnz_loc_ainf * dword);
-  }
-  else {
-    ainf_rowind = NULL;
-    ainf_val = NULL;
-  }
-  if (nnz_loc_asup != 0) {
-    if ( !(asup_colind = intMalloc_dist(nnz_loc_asup)) ) {
-      fprintf (stderr, "Malloc fails for *asup_colind[].");
-      return (memAux + memRet);
-    }
-    memRet += (float) (nnz_loc_asup * iword);
-    if ( !(asup_val = doublecomplexMalloc_dist(nnz_loc_asup)) ) {
-      fprintf (stderr, "Malloc fails for *asup_val[].");
-      return (memAux  + memRet);
-    }
-    memRet += (float) (nnz_loc_asup * dword);
-  }
-  else {
-    asup_colind = NULL;
-    asup_val = NULL;
-  }
-
-  /* Initialize the array of column pointers */
-  k = 0; 
-  jsize = ainf_colptr[0];  ainf_colptr[0] = 0; 
-  for (j = 1; j < ilsum_j[nsupers_j]; j++) {
-    k += jsize;              
-    jsize = ainf_colptr[j];  
-    ainf_colptr[j] = k;
-  }
-  ainf_colptr[ilsum_j[nsupers_j]] = k + jsize;
-  i = 0;
-  isize = asup_rowptr[0];  asup_rowptr[0] = 0;
-  for (j = 1; j < ilsum_i[nsupers_i]; j++) {
-    i += isize;
-    isize = asup_rowptr[j];  
-    asup_rowptr[j] = i;
-  }
-  asup_rowptr[ilsum_i[nsupers_i]] = i + isize;
-
-  /* Copy the triplets into the column oriented storage */
-  for (i = 0; i < nnz_loc; ++i) {
-    jcol = ja[i];
-    irow = ia[i];
-    gbi = BlockNum( irow );
-    gbj = BlockNum( jcol );
-    /* Count nonzeros in each column of L / row of U */
-    if (gbi >= gbj) {
-      j = ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj );
-      k = ainf_colptr[j];
-      ainf_rowind[k] = irow;
-      ainf_val[k] = aij[i];
-      ainf_colptr[j] ++;
-    }
-    else {
-      j = ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi );
-      k = asup_rowptr[j];
-      asup_colind[k] = jcol;
-      asup_val[k] = aij[i];
-      asup_rowptr[j] ++;
-    }
-  }
-
-  /* Reset the column pointers to the beginning of each column */
-  for (j = ilsum_j[nsupers_j]; j > 0; j--) 
-    ainf_colptr[j] = ainf_colptr[j-1];
-  for (j = ilsum_i[nsupers_i]; j > 0; j--) 
-    asup_rowptr[j] = asup_rowptr[j-1];
-  ainf_colptr[0] = 0;
-  asup_rowptr[0] = 0;
-  
-  SUPERLU_FREE(ia);
-  SUPERLU_FREE(aij);
-  memAux -= 2*szbuf*iword + szbuf*dword;
-  
-  *p_ainf_colptr = ainf_colptr;
-  *p_ainf_rowind = ainf_rowind; 
-  *p_ainf_val    = ainf_val;
-  *p_asup_rowptr = asup_rowptr;
-  *p_asup_colind = asup_colind;
-  *p_asup_val    = asup_val;
-
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Exit zdist_A()");
-  fprintf (stdout, "Size of allocated memory (MB) %.3f\n", memRet*1e-6);
-#endif
-
-  return (-memRet);
-} /* dist_A */
-
-int_t
-zdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A,
-		ScalePermstruct_t *ScalePermstruct,
-		Pslu_freeable_t *Pslu_freeable, 
-		LUstruct_t *LUstruct, gridinfo_t *grid)
-/*
- *
- *
- * Purpose
- * =======
- *   Distribute the input matrix onto the 2D process mesh.
- * 
- * Arguments
- * =========
- * 
- * fact (input) fact_t
- *        Specifies whether or not the L and U structures will be re-used.
- *        = SamePattern_SameRowPerm: L and U structures are input, and
- *                                   unchanged on exit.
- *          This routine should not be called for this case, an error
- *          is generated.  Instead, pddistribute routine should be called.
- *        = DOFACT or SamePattern: L and U structures are computed and output.
- *
- * n      (Input) int
- *        Dimension of the matrix.
- *
- * A      (Input) SuperMatrix*
- *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
- *        A may be overwritten by diag(R)*A*diag(C)*Pc^T.
- *        The type of A can be: Stype = NR; Dtype = SLU_D; Mtype = GE.
- *
- * ScalePermstruct (Input) ScalePermstruct_t*
- *        The data structure to store the scaling and permutation vectors
- *        describing the transformations performed to the original matrix A.
- *
- * Glu_freeable (Input) *Glu_freeable_t
- *        The global structure describing the graph of L and U.
- * 
- * LUstruct (Input) LUstruct_t*
- *        Data structures for L and U factors.
- *
- * grid   (Input) gridinfo_t*
- *        The 2D process mesh.
- *
- * Return value
- * ============
- *   < 0, number of bytes allocated on return from the dist_symbLU
- *   > 0, number of bytes allocated for performing the distribution
- *       of the data, when out of memory.
- *        (an approximation).
- *
- */
-{
-  Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-  Glu_freeable_t Glu_freeable_n;
-  LocalLU_t *Llu = LUstruct->Llu;
-  int_t bnnz, fsupc, i, irow, istart, j, jb, jj, k, 
-    len, len1, nsupc, nsupc_gb, ii, nprocs;
-  int_t ljb;  /* local block column number */
-  int_t nrbl; /* number of L blocks in current block column */
-  int_t nrbu; /* number of U blocks in current block column */
-  int_t gb;   /* global block number; 0 < gb <= nsuper */
-  int_t lb;   /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
-  int iam, jbrow, jbcol, jcol, kcol, mycol, myrow, pc, pr, ljb_i, ljb_j, p;
-  int_t mybufmax[NBUFFERS];
-  NRformat_loc *Astore;
-  doublecomplex *a;
-  int_t *asub, *xa;
-  int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind;
-  doublecomplex *asup_val, *ainf_val;
-  int_t *xsup, *supno;    /* supernode and column mapping */
-  int_t *lsub, *xlsub, *usub, *xusub;
-  int_t nsupers, nsupers_i, nsupers_j, nsupers_ij;
-  int_t next_ind;      /* next available position in index[*] */
-  int_t next_val;      /* next available position in nzval[*] */
-  int_t *index;         /* indices consist of headers and row subscripts */
-  doublecomplex *lusup, *uval; /* nonzero values in L and U */
-  int_t *recvBuf;
-  int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend;
-  doublecomplex **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
-  int_t  **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
-  doublecomplex **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
-  int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
-  
-  /*-- Counts to be used in factorization. --*/
-  int_t  *ToRecv, *ToSendD, **ToSendR;
-  
-  /*-- Counts to be used in lower triangular solve. --*/
-  int_t  *fmod;          /* Modification count for L-solve.        */
-  int_t  **fsendx_plist; /* Column process list to send down Xk.   */
-  int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
-  int_t  nfsendx = 0;    /* Number of Xk I will send               */
-  int_t  kseen;
-  
-  /*-- Counts to be used in upper triangular solve. --*/
-  int_t  *bmod;          /* Modification count for U-solve.        */
-  int_t  **bsendx_plist; /* Column process list to send down Xk.   */
-  int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
-  int_t  nbsendx = 0;    /* Number of Xk I will send               */  
-  int_t  *ilsum;         /* starting position of each supernode in 
-			    the full array (local)                 */  
-  int_t  *ilsum_j, ldaspa_j; /* starting position of each supernode in 
-				the full array (local, block column wise) */  
-  /*-- Auxiliary arrays; freed on return --*/
-  int_t *Urb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
-  int_t *LUb_length; /* L,U block length; size nsupers_ij */
-  int_t *LUb_indptr; /* pointers to L,U index[]; size nsupers_ij */
-  int_t *LUb_number; /* global block number; size nsupers_ij */
-  int_t *LUb_valptr; /* pointers to U nzval[]; size ceil(NSUPERS/Pc)      */
-  int_t *Lrb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
-  doublecomplex *dense, *dense_col; /* SPA */
-  doublecomplex zero = {0.0, 0.0};
-  int_t ldaspa;     /* LDA of SPA */
-  int_t iword, dword;
-  float memStrLU, memA,
-    memDist = 0.; /* memory used for redistributing the data, which does
-		   not include the memory for the numerical values of L and U */
-  float  memNLU = 0.; /* memory allocated for storing the numerical values of 
-		    L and U, that will be used in the numeric factorization */
-
-#if ( PRNTlevel>=1 )
-  int_t nLblocks = 0, nUblocks = 0;
-#endif
-  
-  /* Initialization. */
-  iam = grid->iam;
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Enter dist_psymbtonum()");
-#endif
-  myrow = MYROW( iam, grid );
-  mycol = MYCOL( iam, grid );
-  nprocs = grid->npcol * grid->nprow;
-  for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0;
-  Astore   = (NRformat_loc *) A->Store;
-  
-  iword = sizeof(int_t);
-  dword = sizeof(doublecomplex);
-
-  if (fact == SamePattern_SameRowPerm) {
-    ABORT ("ERROR: call of dist_psymbtonum with fact equals SamePattern_SameRowPerm.");  
-  }
-
-  if ((memStrLU = 
-       dist_symbLU (n, Pslu_freeable, 
-		    Glu_persist, &xlsub, &lsub, &xusub, &usub,	grid)) > 0)
-    return (memStrLU);
-  memDist += (-memStrLU);
-  xsup  = Glu_persist->xsup;    /* supernode and column mapping */
-  supno = Glu_persist->supno;   
-  nsupers  = supno[n-1] + 1;
-  nsupers_i = CEILING( nsupers, grid->nprow );/* No of local row blocks */
-  nsupers_j = CEILING( nsupers, grid->npcol );/* No of local column blocks */
-  nsupers_ij = SUPERLU_MAX(nsupers_i, nsupers_j);
-  if ( !(ilsum = intMalloc_dist(nsupers_i+1)) ) {
-    fprintf (stderr, "Malloc fails for ilsum[].");  
-    return (memDist + memNLU);
-  }
-  memNLU += (nsupers_i+1) * iword;
-  if ( !(ilsum_j = intMalloc_dist(nsupers_j+1)) ) {
-    fprintf (stderr, "Malloc fails for ilsum_j[].");
-    return (memDist + memNLU);
-  }
-  memDist += (nsupers_j+1) * iword;
-
-  /* Compute ldaspa and ilsum[], ldaspa_j and ilsum_j[]. */
-  ilsum[0] = 0;
-  ldaspa = 0;
-  for (gb = 0; gb < nsupers; gb++) 
-    if ( myrow == PROW( gb, grid ) ) {
-      i = SuperSize( gb );
-      ldaspa += i;
-      lb = LBi( gb, grid );
-      ilsum[lb + 1] = ilsum[lb] + i;
-    }
-  ilsum[nsupers_i] = ldaspa;
-
-  ldaspa_j = 0; ilsum_j[0] = 0;  
-  for (gb = 0; gb < nsupers; gb++) 
-    if (mycol == PCOL( gb, grid )) {
-      i = SuperSize( gb );
-      ldaspa_j += i;
-      lb = LBj( gb, grid );
-      ilsum_j[lb + 1] = ilsum_j[lb] + i;
-    }
-  ilsum_j[nsupers_j] = ldaspa_j;
-  
-  if ((memA = zdist_A(A, ScalePermstruct, Glu_persist,
-		      grid, &ainf_colptr, &ainf_rowind, &ainf_val,
-		      &asup_rowptr, &asup_colind, &asup_val,
-		      ilsum, ilsum_j)) > 0)
-    return (memDist + memA + memNLU);
-  memDist += (-memA);
-
-  /* ------------------------------------------------------------
-     FIRST TIME CREATING THE L AND U DATA STRUCTURES.
-     ------------------------------------------------------------*/
-  
-  /* We first need to set up the L and U data structures and then
-   * propagate the values of A into them.
-   */
-  if ( !(ToRecv = intCalloc_dist(nsupers)) ) {
-    fprintf(stderr, "Calloc fails for ToRecv[].");
-    return (memDist + memNLU);
-  }
-  memNLU += nsupers * iword;
-  
-  k = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */
-  if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) {
-    fprintf(stderr, "Malloc fails for ToSendR[].");
-    return (memDist + memNLU);
-  }
-  memNLU += k*sizeof(int_t*);
-  j = k * grid->npcol;
-  if ( !(index = intMalloc_dist(j)) ) {
-    fprintf(stderr, "Malloc fails for index[].");
-    return (memDist + memNLU);
-  }
-  memNLU += j*iword;
-  
-  for (i = 0; i < j; ++i) index[i] = EMPTY;
-  for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j];
-  
-  /* Auxiliary arrays used to set up L and U block data structures.
-     They are freed on return. */
-  if ( !(LUb_length = intCalloc_dist(nsupers_ij)) ) {
-    fprintf(stderr, "Calloc fails for LUb_length[].");
-    return (memDist + memNLU);
-  }
-  if ( !(LUb_indptr = intMalloc_dist(nsupers_ij)) ) {
-    fprintf(stderr, "Malloc fails for LUb_indptr[].");
-    return (memDist + memNLU);
-  }
-  if ( !(LUb_number = intCalloc_dist(nsupers_ij)) ) {
-    fprintf(stderr, "Calloc fails for LUb_number[].");
-    return (memDist + memNLU);
-  }    
-  if ( !(LUb_valptr = intCalloc_dist(nsupers_ij)) ) {
-    fprintf(stderr, "Calloc fails for LUb_valptr[].");
-    return (memDist + memNLU);
-  }
-  memDist += 4 * nsupers_ij * iword;
-  
-  k = CEILING( nsupers, grid->nprow ); 
-  /* Pointers to the beginning of each block row of U. */
-  if ( !(Unzval_br_ptr = 
-	 (doublecomplex**)SUPERLU_MALLOC(nsupers_i * sizeof(doublecomplex*))) ) {
-    fprintf(stderr, "Malloc fails for Unzval_br_ptr[].");
-    return (memDist + memNLU);
-  }
-  if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(nsupers_i * sizeof(int_t*))) ) {
-    fprintf(stderr, "Malloc fails for Ufstnz_br_ptr[].");
-    return (memDist + memNLU);
-  }
-  memNLU += nsupers_i*sizeof(doublecomplex*) + nsupers_i*sizeof(int_t*);
-  Unzval_br_ptr[nsupers_i-1] = NULL;
-  Ufstnz_br_ptr[nsupers_i-1] = NULL;
-
-  if ( !(ToSendD = intCalloc_dist(nsupers_i)) ) {
-    fprintf(stderr, "Malloc fails for ToSendD[].");
-    return (memDist + memNLU);
-  }
-  memNLU += nsupers_i*iword;  
-  if ( !(Urb_marker = intCalloc_dist(nsupers_j))) {
-    fprintf(stderr, "Calloc fails for rb_marker[].");
-    return (memDist + memNLU);
-  }
-  if ( !(Lrb_marker = intCalloc_dist( nsupers_i ))) {
-    fprintf(stderr, "Calloc fails for rb_marker[].");
-    return (memDist + memNLU);
-  }
-  memDist += (nsupers_i + nsupers_j)*iword;
-  
-  /* Auxiliary arrays used to set up L, U block data structures.
-     They are freed on return.
-     k is the number of local row blocks.   */
-  if ( !(dense = doublecomplexCalloc_dist(SUPERLU_MAX(ldaspa, ldaspa_j) 
-				   * sp_ienv_dist(3))) ) {
-    fprintf(stderr, "Calloc fails for SPA dense[].");
-    return (memDist + memNLU);
-  }
-  /* These counts will be used for triangular solves. */
-  if ( !(fmod = intCalloc_dist(nsupers_i)) ) {
-    fprintf(stderr, "Calloc fails for fmod[].");
-    return (memDist + memNLU);
-  }
-  if ( !(bmod = intCalloc_dist(nsupers_i)) ) {
-    fprintf(stderr, "Calloc fails for bmod[].");
-    return (memDist + memNLU);
-  }
-  /* ------------------------------------------------ */
-  memNLU += 2*nsupers_i*iword + 
-    SUPERLU_MAX(ldaspa, ldaspa_j)*sp_ienv_dist(3)*dword; 
-  
-  /* Pointers to the beginning of each block column of L. */
-  if ( !(Lnzval_bc_ptr = 
-	 (doublecomplex**)SUPERLU_MALLOC(nsupers_j * sizeof(doublecomplex*))) ) {
-    fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[].");
-    return (memDist + memNLU);
-  }
-  if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ) {
-    fprintf(stderr, "Malloc fails for Lrowind_bc_ptr[].");
-    return (memDist + memNLU);
-  }
-  memNLU += nsupers_j * sizeof(doublecomplex*) + nsupers_j * sizeof(int_t*);
-  Lnzval_bc_ptr[nsupers_j-1] = NULL;
-  Lrowind_bc_ptr[nsupers_j-1] = NULL;
-  
-  /* These lists of processes will be used for triangular solves. */
-  if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) {
-    fprintf(stderr, "Malloc fails for fsendx_plist[].");
-    return (memDist + memNLU);
-  }
-  len = nsupers_j * grid->nprow;
-  if ( !(index = intMalloc_dist(len)) ) {
-    fprintf(stderr, "Malloc fails for fsendx_plist[0]");
-    return (memDist + memNLU);
-  }
-  for (i = 0; i < len; ++i) index[i] = EMPTY;
-  for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow)
-    fsendx_plist[i] = &index[j];
-  if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) {
-    fprintf(stderr, "Malloc fails for bsendx_plist[].");
-    return (memDist + memNLU);
-  }
-  if ( !(index = intMalloc_dist(len)) ) {
-    fprintf(stderr, "Malloc fails for bsendx_plist[0]");
-    return (memDist + memNLU);
-  }
-  for (i = 0; i < len; ++i) index[i] = EMPTY;
-  for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow)
-    bsendx_plist[i] = &index[j];
-  /* -------------------------------------------------------------- */
-  memNLU += 2*nsupers_j*sizeof(int_t*) + 2*len*iword;
-  
-  /*------------------------------------------------------------
-    PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
-    THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
-    ------------------------------------------------------------*/
-  for (jb = 0; jb < nsupers; jb++) {
-    jbcol = PCOL( jb, grid );
-    jbrow = PROW( jb, grid );
-    ljb_j = LBj( jb, grid ); /* Local block number column wise */
-    ljb_i = LBi( jb, grid);  /* Local block number row wise */
-    fsupc = FstBlockC( jb );
-    nsupc = SuperSize( jb );
-    
-    if ( myrow == jbrow ) { /* Block row jb in my process row */
-      /* Scatter A into SPA. */
-      for (j = ilsum[ljb_i], dense_col = dense; j < ilsum[ljb_i]+nsupc; j++) {
-	for (i = asup_rowptr[j]; i < asup_rowptr[j+1]; i++) {
-	  if (i >= asup_rowptr[ilsum[nsupers_i]]) 
-	    printf ("ERR7\n");
-	  jcol = asup_colind[i];
-	  if (jcol >= n)
-	    printf ("Pe[%d] ERR distsn jb %d gb %d j %d jcol %d\n",
-		    iam, jb, gb, j, jcol);
-	  gb = BlockNum( jcol );
-	  lb = LBj( gb, grid );
-	  if (gb >= nsupers || lb >= nsupers_j) printf ("ERR8\n");
-	  jcol = ilsum_j[lb] + jcol - FstBlockC( gb );
-	  if (jcol >= ldaspa_j)
-	    printf ("Pe[%d] ERR1 jb %d gb %d j %d jcol %d\n",
-		    iam, jb, gb, j, jcol);
-	  dense_col[jcol] = asup_val[i];
-	}
-	dense_col += ldaspa_j;
-      }
-      
-      /*------------------------------------------------
-       * SET UP U BLOCKS.
-       *------------------------------------------------*/
-      /* Count number of blocks and length of each block. */
-      nrbu = 0;
-      len = 0; /* Number of column subscripts I own. */
-      len1 = 0; /* number of fstnz subscripts */
-      for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) {
-	if (i >= xusub[nsupers_i]) printf ("ERR10\n");
-	jcol = usub[i];
-	gb = BlockNum( jcol ); /* Global block number */
-	
-	/*if (fsupc <= 146445 && 146445 < fsupc + nsupc && jcol == 397986)
-	  printf ("Pe[%d] [%d %d] elt [%d] jbcol %d pc %d\n",
-	  iam, jb, gb, jcol, jbcol, pc); */
-	
-	lb = LBj( gb, grid );  /* Local block number */
-	pc = PCOL( gb, grid ); /* Process col owning this block */
-	if (mycol == jbcol) ToSendR[ljb_j][pc] = YES;
-	/* if (mycol == jbcol && mycol != pc) ToSendR[ljb_j][pc] = YES; */
-	pr = PROW( gb, grid );
-	if ( pr != jbrow  && mycol == pc)
-	  bsendx_plist[lb][jbrow] = YES; 
-	if (mycol == pc) {
-	  len += nsupc;
-	  LUb_length[lb] += nsupc;
-	  ToSendD[ljb_i] = YES;
-	  if (Urb_marker[lb] <= jb) { /* First see this block */
-	    if (Urb_marker[lb] == FALSE && gb != jb && myrow != pr) nbrecvx ++;
-	    Urb_marker[lb] = jb + 1;
-	    LUb_number[nrbu] = gb;
-	    /* if (gb == 391825 && jb == 145361)
-	       printf ("Pe[%d] T1 [%d %d] nrbu %d \n",
-	       iam, jb, gb, nrbu); */
-	    nrbu ++;
-	    len1 += SuperSize( gb );
-	    if ( gb != jb )/* Exclude diagonal block. */
-	      ++bmod[ljb_i];/* Mod. count for back solve */
-#if ( PRNTlevel>=1 )
-	    ++nUblocks;
-#endif
-	  }
-	}
-      } /* for i ... */
-      
-      if ( nrbu ) { 
-	/* Sort the blocks of U in increasing block column index.
-	   SuperLU_DIST assumes this is true */
-	/* simple insert sort algorithm */
-	/* to be transformed in quick sort */
-	for (j = 1; j < nrbu; j++) {
-	  k = LUb_number[j];
-	  for (i=j-1; i>=0 && LUb_number[i] > k; i--) {
-	    LUb_number[i+1] = LUb_number[i];
-	  }
-	  LUb_number[i+1] = k;
-	} 
-	
-	/* Set up the initial pointers for each block in
-	   index[] and nzval[]. */
-	/* Add room for descriptors */
-	len1 += BR_HEADER + nrbu * UB_DESCRIPTOR;
-	if ( !(index = intMalloc_dist(len1+1)) ) {
-	  fprintf (stderr, "Malloc fails for Uindex[]");
-	  return (memDist + memNLU);
-	}
-	Ufstnz_br_ptr[ljb_i] = index;
-	if (!(Unzval_br_ptr[ljb_i] =
-	      doublecomplexMalloc_dist(len))) {
-	  fprintf (stderr, "Malloc fails for Unzval_br_ptr[*][]");
-	  return (memDist + memNLU);
-	}
-	memNLU += (len1+1)*iword + len*dword;
-	uval = Unzval_br_ptr[ljb_i];
-	mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
-	mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
-	index[0] = nrbu;  /* Number of column blocks */
-	index[1] = len;   /* Total length of nzval[] */
-	index[2] = len1;  /* Total length of index */
-	index[len1] = -1; /* End marker */
-	next_ind = BR_HEADER;
-	next_val = 0;
-	for (k = 0; k < nrbu; k++) {
-	  gb = LUb_number[k];
-	  lb = LBj( gb, grid );
-	  len = LUb_length[lb];
-	  LUb_length[lb] = 0;  /* Reset vector of block length */
-	  index[next_ind++] = gb; /* Descriptor */
-	  index[next_ind++] = len;
-	  LUb_indptr[lb] = next_ind;
-	  for (; next_ind < LUb_indptr[lb] + SuperSize( gb ); next_ind++)
-	    index[next_ind] = FstBlockC( jb + 1 );
-	  LUb_valptr[lb] = next_val;
-	  next_val += len;
-	}
-	/* Propagate the fstnz subscripts to Ufstnz_br_ptr[],
-	   and the initial values of A from SPA into Unzval_br_ptr[]. */
-	for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) {
-	  jcol = usub[i];
-	  gb = BlockNum( jcol );
-	  
-	  if ( mycol == PCOL( gb, grid ) ) {
-	    lb = LBj( gb, grid );
-	    k = LUb_indptr[lb]; /* Start fstnz in index */
-	    index[k + jcol - FstBlockC( gb )] = FstBlockC( jb );
-	  }
-	}  /* for i ... */
-	
-	for (i = 0; i < nrbu; i++) {
-	  gb = LUb_number[i];
-	  lb = LBj( gb, grid );   
-	  next_ind = LUb_indptr[lb];
-	  k = FstBlockC( jb + 1);
-	  jcol = ilsum_j[lb];
-	  for (jj = 0; jj < SuperSize( gb ); jj++, jcol++) {
-	    dense_col = dense;
-	    j = index[next_ind+jj];
-	    for (ii = j; ii < k; ii++) {
-	      uval[LUb_valptr[lb]++] = dense_col[jcol];
-	      dense_col[jcol] = zero;
-	      dense_col += ldaspa_j;	      
-	    }
-	  }
-	}
-      } else {
-	Ufstnz_br_ptr[ljb_i] = NULL;
-	Unzval_br_ptr[ljb_i] = NULL;
-      } /* if nrbu ... */	
-    } /* if myrow == jbrow */
-    
-      /*------------------------------------------------
-       * SET UP L BLOCKS.
-       *------------------------------------------------*/
-    if (mycol == jbcol) {  /* Block column jb in my process column */
-      /* Scatter A_inf into SPA. */
-      for (j = ilsum_j[ljb_j], dense_col = dense; j < ilsum_j[ljb_j] + nsupc; j++) {
-	for (i = ainf_colptr[j]; i < ainf_colptr[j+1]; i++) {
-	  irow = ainf_rowind[i];
-	  if (irow >= n) printf ("Pe[%d] ERR1\n", iam);
-	  gb = BlockNum( irow );
-	  if (gb >= nsupers) printf ("Pe[%d] ERR5\n", iam);
-	  if ( myrow == PROW( gb, grid ) ) {
-	    lb = LBi( gb, grid );
-	    irow = ilsum[lb] + irow - FstBlockC( gb );
-	    if (irow >= ldaspa) printf ("Pe[%d] ERR0\n", iam);
-	    dense_col[irow] = ainf_val[i];
-	  }
-	}
-	dense_col += ldaspa;
-      }      
-      
-      /* sort the indices of the diagonal block at the beginning of xlsub */
-      if (myrow == jbrow) {
-	k = xlsub[ljb_j];
-	for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) {
-	  irow = lsub[i];
-	  if (irow < nsupc + fsupc && i != k+irow-fsupc) {
-	    lsub[i] = lsub[k + irow - fsupc];
-	    lsub[k + irow - fsupc] = irow;
-	    i --;
-	  }
-	}
-      }
-      
-      /* Count number of blocks and length of each block. */
-      nrbl = 0;
-      len = 0; /* Number of row subscripts I own. */
-      kseen = 0;
-      for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) {
-	irow = lsub[i];
-	gb = BlockNum( irow ); /* Global block number */	  
-	pr = PROW( gb, grid ); /* Process row owning this block */
-	if ( pr != jbrow && fsendx_plist[ljb_j][pr] == EMPTY &&
-	     myrow == jbrow) {
-	  fsendx_plist[ljb_j][pr] = YES;
-	  ++nfsendx;
-	}
-	if ( myrow == pr ) {
-	  lb = LBi( gb, grid );  /* Local block number */
-	  if (Lrb_marker[lb] <= jb) { /* First see this block */
-	    Lrb_marker[lb] = jb + 1;
-	    LUb_length[lb] = 1;
-	    LUb_number[nrbl++] = gb;
-	    if ( gb != jb ) /* Exclude diagonal block. */
-	      ++fmod[lb]; /* Mod. count for forward solve */
-	    if ( kseen == 0 && myrow != jbrow ) {
-	      ++nfrecvx;
-	      kseen = 1;
-	    }
-#if ( PRNTlevel>=1 )
-	    ++nLblocks;
-#endif
-	  } else 
-	    ++LUb_length[lb];	    
-	  ++len;
-	}
-      } /* for i ... */
-      
-      if ( nrbl ) { /* Do not ensure the blocks are sorted! */
-	/* Set up the initial pointers for each block in 
-	   index[] and nzval[]. */
-	/* If I am the owner of the diagonal block, order it first in LUb_number.
-	   Necessary for SuperLU_DIST routines */
-	kseen = EMPTY;
-	for (j = 0; j < nrbl; j++) {
-	  if (LUb_number[j] == jb)
-	    kseen = j;
-	}
-	if (kseen != EMPTY && kseen != 0) {
-	  LUb_number[kseen] = LUb_number[0];
-	  LUb_number[0] = jb;
-	}
-	
-	/* Add room for descriptors */
-	len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
-	if ( !(index = intMalloc_dist(len1)) ) {
-	  fprintf (stderr, "Malloc fails for index[]");
-	  return (memDist + memNLU);
-	}
-	Lrowind_bc_ptr[ljb_j] = index;
-	if (!(Lnzval_bc_ptr[ljb_j] = 
-	      doublecomplexMalloc_dist(len*nsupc))) {
-	  fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block %d ", jb);
-	  return (memDist + memNLU);
-	}
-	memNLU += len1*iword + len*nsupc*dword;
-	
-	lusup = Lnzval_bc_ptr[ljb_j];
-	mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
-	mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
-	mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
-	index[0] = nrbl;  /* Number of row blocks */
-	index[1] = len;   /* LDA of the nzval[] */
-	next_ind = BC_HEADER;
-	next_val = 0;
-	for (k = 0; k < nrbl; ++k) {
-	  gb = LUb_number[k];
-	  lb = LBi( gb, grid );
-	  len = LUb_length[lb];
-	  LUb_length[lb] = 0;
-	  index[next_ind++] = gb; /* Descriptor */
-	  index[next_ind++] = len; 
-	  LUb_indptr[lb] = next_ind;
-	    LUb_valptr[lb] = next_val;
-	    next_ind += len;
-	    next_val += len;
-	  }
-	  /* Propagate the compressed row subscripts to Lindex[],
-	     and the initial values of A from SPA into Lnzval[]. */
-	  len = index[1];  /* LDA of lusup[] */
-	  for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) {
-	    irow = lsub[i];
-	    gb = BlockNum( irow );
-	    if ( myrow == PROW( gb, grid ) ) {
-	      lb = LBi( gb, grid );
-	      k = LUb_indptr[lb]++; /* Random access a block */
-	      index[k] = irow;
-	      k = LUb_valptr[lb]++;
-	      irow = ilsum[lb] + irow - FstBlockC( gb );
-	      for (j = 0, dense_col = dense; j < nsupc; ++j) {
-		lusup[k] = dense_col[irow];
-		dense_col[irow] = zero;
-		k += len;
-		dense_col += ldaspa;
-	      }
-	    }
-	  } /* for i ... */
-	} else {
-	  Lrowind_bc_ptr[ljb_j] = NULL;
-	  Lnzval_bc_ptr[ljb_j] = NULL;
-	} /* if nrbl ... */		  
-      } /* if mycol == pc */
-  } /* for jb ... */
-
-  SUPERLU_FREE(ilsum_j);
-  SUPERLU_FREE(Urb_marker);
-  SUPERLU_FREE(LUb_length);
-  SUPERLU_FREE(LUb_indptr);
-  SUPERLU_FREE(LUb_number);
-  SUPERLU_FREE(LUb_valptr);
-  SUPERLU_FREE(Lrb_marker);
-  SUPERLU_FREE(dense);
-  
-  /* Free the memory used for storing L and U */
-  SUPERLU_FREE(xlsub); SUPERLU_FREE(xusub);
-  if (lsub != NULL)
-    SUPERLU_FREE(lsub);  
-  if (usub != NULL)
-    SUPERLU_FREE(usub);
-  
-  /* Free the memory used for storing A */
-  SUPERLU_FREE(ainf_colptr);
-  if (ainf_rowind != NULL) {
-    SUPERLU_FREE(ainf_rowind);
-    SUPERLU_FREE(ainf_val);
-  }
-  SUPERLU_FREE(asup_rowptr);
-  if (asup_colind != NULL) {
-    SUPERLU_FREE(asup_colind);	
-    SUPERLU_FREE(asup_val);	
-  }
-  
-  /* exchange information about bsendx_plist in between column of processors */
-  k = SUPERLU_MAX( grid->nprow, grid->npcol);
-  if ( !(recvBuf = (int_t *) SUPERLU_MALLOC(nsupers*k*iword)) ) {
-    fprintf (stderr, "Malloc fails for recvBuf[].");
-    return (memDist + memNLU);
-  }
-  if ( !(nnzToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
-    fprintf (stderr, "Malloc fails for nnzToRecv[].");
-    return (memDist + memNLU);
-  }
-  if ( !(ptrToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
-    fprintf (stderr, "Malloc fails for ptrToRecv[].");
-    return (memDist + memNLU);
-  }
-  if ( !(nnzToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
-    fprintf (stderr, "Malloc fails for nnzToRecv[].");
-    return (memDist + memNLU);
-  }
-  if ( !(ptrToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
-    fprintf (stderr, "Malloc fails for ptrToRecv[].");
-    return (memDist + memNLU);
-  }
-  
-  if (memDist < (nsupers*k*iword +4*nprocs * sizeof(int)))
-    memDist = nsupers*k*iword +4*nprocs * sizeof(int);
-  
-  for (p = 0; p < nprocs; p++)
-    nnzToRecv[p] = 0;
-  
-  for (jb = 0; jb < nsupers; jb++) {
-    jbcol = PCOL( jb, grid );
-    jbrow = PROW( jb, grid );
-    p = PNUM(jbrow, jbcol, grid);
-    nnzToRecv[p] += grid->npcol;
-  }    
-  i = 0;
-  for (p = 0; p < nprocs; p++) {
-    ptrToRecv[p] = i;
-    i += nnzToRecv[p];
-    ptrToSend[p] = 0;
-    if (p != iam)
-      nnzToSend[p] = nnzToRecv[iam];
-    else
-      nnzToSend[p] = 0;
-  }
-  nnzToRecv[iam] = 0;
-  i = ptrToRecv[iam];
-  for (jb = 0; jb < nsupers; jb++) {
-    jbcol = PCOL( jb, grid );
-    jbrow = PROW( jb, grid );
-    p = PNUM(jbrow, jbcol, grid);
-    if (p == iam) {
-      ljb_j = LBj( jb, grid ); /* Local block number column wise */	
-      for (j = 0; j < grid->npcol; j++, i++)
-	recvBuf[i] = ToSendR[ljb_j][j];
-    }
-  }   
-  
-  MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, mpi_int_t,
-		 recvBuf, nnzToRecv, ptrToRecv, mpi_int_t, grid->comm);
-  
-  for (jb = 0; jb < nsupers; jb++) {
-    jbcol = PCOL( jb, grid );
-    jbrow = PROW( jb, grid );
-    p = PNUM(jbrow, jbcol, grid);
-    ljb_j = LBj( jb, grid ); /* Local block number column wise */	
-    ljb_i = LBi( jb, grid ); /* Local block number row wise */	
-    /* (myrow == jbrow) {
-       if (ToSendD[ljb_i] == YES)
-       ToRecv[jb] = 1;
-       }
-       else {
-       if (recvBuf[ptrToRecv[p] + mycol] == YES)
-       ToRecv[jb] = 2;
-       } */
-    if (recvBuf[ptrToRecv[p] + mycol] == YES) {
-      if (myrow == jbrow)
-	ToRecv[jb] = 1;
-      else
-	ToRecv[jb] = 2;
-    }
-    if (mycol == jbcol) {
-      for (i = 0, j = ptrToRecv[p]; i < grid->npcol; i++, j++) 
-	ToSendR[ljb_j][i] = recvBuf[j];  
-      ToSendR[ljb_j][mycol] = EMPTY;
-    }
-    ptrToRecv[p] += grid->npcol;
-  }   
-  
-  /* exchange information about bsendx_plist in between column of processors */
-  MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, mpi_int_t,
-		 MPI_MAX, grid->cscp.comm);
-  
-  for (jb = 0; jb < nsupers; jb ++) {
-    jbcol = PCOL( jb, grid);
-    jbrow = PROW( jb, grid);
-    if (mycol == jbcol) {
-      ljb_j = LBj( jb, grid ); /* Local block number column wise */	
-      if (myrow == jbrow ) {
-	for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) {
-	  (*bsendx_plist)[k] = recvBuf[k];
-	  if ((*bsendx_plist)[k] != EMPTY)
-	    nbsendx ++;
-	}
-      }
-      else {
-	for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) 
-	  (*bsendx_plist)[k] = EMPTY;
-      }
-    }
-  }
-  
-  SUPERLU_FREE(nnzToRecv);
-  SUPERLU_FREE(ptrToRecv);
-  SUPERLU_FREE(nnzToSend);
-  SUPERLU_FREE(ptrToSend);
-  SUPERLU_FREE(recvBuf);
-  
-  Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
-  Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
-  Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
-  Llu->Unzval_br_ptr = Unzval_br_ptr;
-  Llu->ToRecv = ToRecv;
-  Llu->ToSendD = ToSendD;
-  Llu->ToSendR = ToSendR;
-  Llu->fmod = fmod;
-  Llu->fsendx_plist = fsendx_plist;
-  Llu->nfrecvx = nfrecvx;
-  Llu->nfsendx = nfsendx;
-  Llu->bmod = bmod;
-  Llu->bsendx_plist = bsendx_plist;
-  Llu->nbrecvx = nbrecvx;
-  Llu->nbsendx = nbsendx;
-  Llu->ilsum = ilsum;
-  Llu->ldalsum = ldaspa;
-  LUstruct->Glu_persist = Glu_persist;	
-#if ( PRNTlevel>=1 )
-  if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n",
-		     nLblocks, nUblocks);
-#endif
-  
-  /* Find the maximum buffer size. */
-  MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, 
-		MPI_MAX, grid->comm);
-  
-#if ( DEBUGlevel>=1 )
-  /* Memory allocated but not freed:
-     ilsum, fmod, fsendx_plist, bmod, bsendx_plist,
-     ToRecv, ToSendR, ToSendD
-  */
-  CHECK_MALLOC(iam, "Exit dist_psymbtonum()");
-#endif
-    
-  return (- (memDist+memNLU));
-} /* dist_psymbtonum */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzsymbfact_distdata.c.old hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzsymbfact_distdata.c.old
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzsymbfact_distdata.c.old	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzsymbfact_distdata.c.old	1970-01-01 00:00:00.000000000 +0000
@@ -1,1939 +0,0 @@
-
-/*
- * -- Parallel symbolic factorization auxialiary routine (version 2.1) --
- * -- Distributes the data from parallel symbolic factorization 
- * -- to numeric factorization
- * INRIA France -  July 1, 2004
- * Laura Grigori
- *
- * November 1, 2007
- */
-
-/* limits.h:  the largest positive integer (INT_MAX) */
-#include <limits.h>
-
-#include "superlu_zdefs.h"
-
-static float
-dist_symbLU (int_t n, Pslu_freeable_t *Pslu_freeable, 
-	     Glu_persist_t *Glu_persist, 
-	     int_t **p_xlsub, int_t **p_lsub, int_t **p_xusub, int_t **p_usub,
-	     gridinfo_t *grid
-	     )
-/*
- * Purpose
- * =======
- * 
- * Redistribute the symbolic structure of L and U from the distribution
- * used in the parallel symbolic factorization step to the distdibution
- * used in the parallel numeric factorization step.  On exit, the L and U
- * structure for the 2D distribution used in the numeric factorization step is
- * stored in p_xlsub, p_lsub, p_xusub, p_usub.  The global supernodal 
- * information is also computed and it is stored in Glu_persist->supno
- * and Glu_persist->xsup.
- *
- * This routine allocates memory for storing the structure of L and U
- * and the supernodes information.  This represents the arrays:
- * p_xlsub, p_lsub, p_xusub, p_usub,
- * Glu_persist->supno,  Glu_persist->xsup.
- *
- * This routine also deallocates memory allocated during symbolic 
- * factorization routine.  That is, the folloing arrays are freed:
- * Pslu_freeable->xlsub,  Pslu_freeable->lsub, 
- * Pslu_freeable->xusub, Pslu_freeable->usub, 
- * Pslu_freeable->globToLoc, Pslu_freeable->supno_loc, 
- * Pslu_freeable->xsup_beg_loc, Pslu_freeable->xsup_end_loc.
- *
- * Arguments
- * =========
- *
- * n      (Input) int_t
- *        Order of the input matrix
- * Pslu_freeable  (Input) Pslu_freeable_t *
- *        Local L and U structure, 
- *        global to local indexing information.
- * 
- * Glu_persist (Output) Glu_persist_t *
- *        Stores on output the information on supernodes mapping.
- * 
- * p_xlsub (Output) int_t **
- *         Pointer to structure of L distributed on a 2D grid 
- *         of processors, stored by columns.
- * 
- * p_lsub  (Output) int_t **
- *         Structure of L distributed on a 2D grid of processors, 
- *         stored by columns.
- *
- * p_xusub (Output) int_t **
- *         Pointer to structure of U distributed on a 2D grid 
- *         of processors, stored by rows.
- * 
- * p_usub  (Output) int_t **
- *         Structure of U distributed on a 2D grid of processors, 
- *         stored by rows.
- * 
- * grid   (Input) gridinfo_t*
- *        The 2D process mesh.
- *
- * Return value
- * ============
- *   < 0, number of bytes allocated on return from the dist_symbLU.
- *   > 0, number of bytes allocated in this routine when out of memory.
- *        (an approximation).
- */
-{
-  int   iam, nprocs, pc, pr, p, np, p_diag;
-  int_t *nnzToSend, *nnzToRecv, *nnzToSend_l, *nnzToSend_u, 
-    *tmp_ptrToSend, *mem;
-  int_t *nnzToRecv_l, *nnzToRecv_u;
-  int_t *send_1, *send_2, nsend_1, nsend_2;
-  int_t *ptrToSend, *ptrToRecv, sendL, sendU, *snd_luind, *rcv_luind;
-  int_t nsupers, nsupers_i, nsupers_j;
-  int *nvtcs, *intBuf1, *intBuf2, *intBuf3, *intBuf4, intNvtcs_loc;
-  int_t maxszsn, maxNvtcsPProc;
-  int_t *xsup_n, *supno_n, *temp, *xsup_beg_s, *xsup_end_s, *supno_s;
-  int_t *xlsub_s, *lsub_s, *xusub_s, *usub_s;
-  int_t *xlsub_n, *lsub_n, *xusub_n, *usub_n;
-  int_t *xsub_s, *sub_s, *xsub_n, *sub_n;
-  int_t *globToLoc, nvtcs_loc;
-  int_t SendCnt_l, SendCnt_u, nnz_loc_l, nnz_loc_u, nnz_loc,
-    RecvCnt_l, RecvCnt_u, ind_loc;
-  int_t i, k, j, gb, szsn,  gb_n, gb_s, gb_l, fst_s, fst_s_l, lst_s, i_loc;
-  int_t nelts, isize;
-  float memAux;  /* Memory used during this routine and freed on return */
-  float memRet; /* Memory allocated and not freed on return */
-  int_t iword, dword;
-  
-  /* ------------------------------------------------------------
-     INITIALIZATION.
-     ------------------------------------------------------------*/
-  iam = grid->iam;
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Enter dist_symbLU()");
-#endif
-  nprocs = (int) grid->nprow * grid->npcol;
-  xlsub_s = Pslu_freeable->xlsub; lsub_s = Pslu_freeable->lsub;
-  xusub_s = Pslu_freeable->xusub; usub_s = Pslu_freeable->usub;
-  maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
-  globToLoc     = Pslu_freeable->globToLoc;
-  nvtcs_loc     = Pslu_freeable->nvtcs_loc;
-  xsup_beg_s    = Pslu_freeable->xsup_beg_loc;
-  xsup_end_s    = Pslu_freeable->xsup_end_loc;
-  supno_s       = Pslu_freeable->supno_loc;
-  rcv_luind     = NULL;
-  iword = sizeof(int_t);
-  dword = sizeof(doublecomplex);
-  memAux = 0.; memRet = 0.;
-  
-  mem           = intCalloc_dist(12 * nprocs);
-  if (!mem)
-    return (ERROR_RET);
-  memAux     = (float) (12 * nprocs * sizeof(int_t));
-  nnzToRecv     = mem;
-  nnzToSend     = nnzToRecv + 2*nprocs;
-  nnzToSend_l   = nnzToSend + 2 * nprocs;
-  nnzToSend_u   = nnzToSend_l + nprocs;
-  send_1        = nnzToSend_u + nprocs;
-  send_2        = send_1 + nprocs;
-  tmp_ptrToSend = send_2 + nprocs;
-  nnzToRecv_l   = tmp_ptrToSend + nprocs;
-  nnzToRecv_u   = nnzToRecv_l + nprocs;
-  
-  ptrToSend = nnzToSend;
-  ptrToRecv = nnzToSend + nprocs;
-
-  nvtcs = (int *) SUPERLU_MALLOC(5 * nprocs * sizeof(int));
-  intBuf1 = nvtcs + nprocs;
-  intBuf2 = nvtcs + 2 * nprocs;
-  intBuf3 = nvtcs + 3 * nprocs;
-  intBuf4 = nvtcs + 4 * nprocs;
-  memAux += 5 * nprocs * sizeof(int);
-
-  maxszsn   = sp_ienv_dist(3);
-  
-  /* Allocate space for storing Glu_persist_n. */
-  if ( !(supno_n = intMalloc_dist(n+1)) ) {
-    fprintf (stderr, "Malloc fails for supno_n[].");
-    return (memAux);
-  }
-  memRet += (float) ((n+1) * sizeof(int_t));
-
-  /* ------------------------------------------------------------
-     DETERMINE SUPERNODES FOR NUMERICAL FACTORIZATION
-     ------------------------------------------------------------*/
-  
-  if (nvtcs_loc > INT_MAX)
-    ABORT("ERROR in dist_symbLU nvtcs_loc > INT_MAX\n");
-  intNvtcs_loc = (int) nvtcs_loc;
-  MPI_Gather (&intNvtcs_loc, 1, MPI_INT, nvtcs, 1, MPI_INT,
-	      0, grid->comm);
-
-  if (!iam) {
-    /* set ptrToRecv to point to the beginning of the data for
-       each processor */
-    for (k = 0, p = 0; p < nprocs; p++) {
-      ptrToRecv[p] = k;
-      k += nvtcs[p];
-    }
-  }
-  
-  if (nprocs > 1) {
-    temp = NULL;
-    if (!iam ) {
-      if ( !(temp = intMalloc_dist (n+1)) ) {
-	fprintf (stderr, "Malloc fails for temp[].");
-	return (memAux + memRet);
-      }
-      memAux += (float) (n+1) * iword;
-    }
-#if defined (_LONGINT)
-    for (p=0; p<nprocs; p++) {
-      if (ptrToRecv[p] > INT_MAX)
-	ABORT("ERROR in dist_symbLU size to send > INT_MAX\n");
-      intBuf1[p] = (int) ptrToRecv[p];
-    }
-#else  /* Default */
-    intBuf1 = ptrToRecv;
-#endif
-    MPI_Gatherv (supno_s, (int) nvtcs_loc, mpi_int_t, 
-		 temp, nvtcs, intBuf1, mpi_int_t, 0, grid->comm);
-  }
-  else
-    temp = supno_s;
-
-  if (!iam) {
-    nsupers = 0;
-    p = (int) OWNER( globToLoc[0] );
-    gb = temp[ptrToRecv[p]];
-    supno_n[0] = nsupers;
-    ptrToRecv[p] ++;
-    szsn = 1;
-    for (j = 1; j < n; j ++) {
-      if (p != (int) OWNER( globToLoc[j] ) || szsn >= maxszsn || gb != temp[ptrToRecv[p]]) {
-	nsupers ++;
-	p  = (int) OWNER( globToLoc[j] );
-	gb = temp[ptrToRecv[p]];
-	szsn = 1;
-      }
-      else {
-	szsn ++;
-      }
-      ptrToRecv[p] ++;
-      supno_n[j] = nsupers;
-    }
-    nsupers++;
-    if (nprocs > 1) {
-      SUPERLU_FREE (temp);
-      memAux -= (float) (n+1) * iword;
-    }
-    supno_n[n] = nsupers;
-  }
-
-  /* reset to 0 nnzToSend */
-  for (p = 0; p < 2 *nprocs; p++)
-    nnzToSend[p] = 0;
-  
-  MPI_Bcast (supno_n, n+1, mpi_int_t, 0, grid->comm);
-  nsupers = supno_n[n];
-  /* Allocate space for storing Glu_persist_n. */
-  if ( !(xsup_n = intMalloc_dist(nsupers+1)) ) {
-    fprintf (stderr, "Malloc fails for xsup_n[].");
-    return (memAux + memRet);
-  }
-  memRet += (float) (nsupers+1) * iword;  
-
-  /* ------------------------------------------------------------
-     COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
-     THEN ALLOCATE SPACE.
-     THIS ACCOUNTS FOR THE FIRST PASS OF L and U.
-     ------------------------------------------------------------*/
-  gb = EMPTY;
-  for (i = 0; i < n; i++) {
-    if (gb != supno_n[i]) {
-      /* a new supernode starts */
-      gb = supno_n[i];
-      xsup_n[gb] = i;
-    }
-  }
-  xsup_n[nsupers] = n;
-  
-  for (p = 0; p < nprocs; p++) {
-    send_1[p] = FALSE;
-    send_2[p] = FALSE;
-  }
-  for (gb_n = 0; gb_n < nsupers; gb_n ++) {
-    i = xsup_n[gb_n];
-    if (iam == (int) OWNER( globToLoc[i] )) {
-      pc = PCOL( gb_n, grid );
-      pr = PROW( gb_n, grid );
-      p_diag = PNUM( pr, pc, grid);
-      
-      i_loc = LOCAL_IND( globToLoc[i] );
-      gb_s  = supno_s[i_loc];
-      fst_s = xsup_beg_s[gb_s];
-      lst_s = xsup_end_s[gb_s];
-      fst_s_l = LOCAL_IND( globToLoc[fst_s] );
-      for (j = xlsub_s[fst_s_l]; j < xlsub_s[fst_s_l+1]; j++) {
-	k = lsub_s[j];
-	if (k >= i) {
-	  gb = supno_n[k];
-	  p = (int) PNUM( PROW(gb, grid), pc, grid );
-	  nnzToSend[2*p] ++;
-	  send_1[p] = TRUE;
-	}
-      }
-      for (j = xusub_s[fst_s_l]; j < xusub_s[fst_s_l+1]; j++) {
-	k = usub_s[j];
-	if (k >= i + xsup_n[gb_n+1] - xsup_n[gb_n]) {
-	  gb = supno_n[k];
-	  p = PNUM( pr, PCOL(gb, grid), grid);
-	  nnzToSend[2*p+1] ++;	
-	  send_2[p] = TRUE;
-	}
-      }
-      
-      nsend_2 = 0;
-      for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) {
-	nnzToSend[2*p+1] += 2;
-	if (send_2[p])  nsend_2 ++;	  
-      }
-      for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) 
-	if (send_2[p] || p == p_diag) {
-	  if (p == p_diag && !send_2[p])
-	    nnzToSend[2*p+1] += nsend_2;
-	  else
-	    nnzToSend[2*p+1] += nsend_2-1;
-	  send_2[p] = FALSE;
-	}
-      nsend_1 = 0;
-      for (p = pc; p < nprocs; p += grid->npcol) {
-	nnzToSend[2*p] += 2;
-	if (send_1[p]) nsend_1 ++;
-      }
-      for (p = pc; p < nprocs; p += grid->npcol) 
-	if (send_1[p]) {
-	  nnzToSend[2*p] += nsend_1-1;
-	  send_1[p] = FALSE;
-	}
-	else
-	  nnzToSend[2*p] += nsend_1;
-    }
-  }
-  
-  /* All-to-all communication */
-  MPI_Alltoall( nnzToSend, 2, mpi_int_t, nnzToRecv, 2, mpi_int_t,
-		grid->comm);
-  
-  nnz_loc_l = nnz_loc_u = 0;
-  SendCnt_l = SendCnt_u = RecvCnt_l = RecvCnt_u = 0;  
-  for (p = 0; p < nprocs; p++) {
-    if ( p != iam ) {
-      SendCnt_l += nnzToSend[2*p];   nnzToSend_l[p] = nnzToSend[2*p];
-      SendCnt_u += nnzToSend[2*p+1]; nnzToSend_u[p] = nnzToSend[2*p+1]; 
-      RecvCnt_l += nnzToRecv[2*p];   nnzToRecv_l[p] = nnzToRecv[2*p];
-      RecvCnt_u += nnzToRecv[2*p+1]; nnzToRecv_u[p] = nnzToRecv[2*p+1];
-    } else {
-      nnz_loc_l += nnzToRecv[2*p];
-      nnz_loc_u += nnzToRecv[2*p+1];
-      nnzToSend_l[p] = 0; nnzToSend_u[p] = 0;
-      nnzToRecv_l[p] = nnzToRecv[2*p]; 
-      nnzToRecv_u[p] = nnzToRecv[2*p+1];
-    }
-  }
-  
-  /* Allocate space for storing the symbolic structure after redistribution. */
-  nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
-  nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
-  if ( !(xlsub_n = intCalloc_dist(nsupers_j+1)) ) {
-    fprintf (stderr, "Malloc fails for xlsub_n[].");
-    return (memAux + memRet);
-  }
-  memRet += (float) (nsupers_j+1) * iword;
-
-  if ( !(xusub_n = intCalloc_dist(nsupers_i+1)) ) {
-    fprintf (stderr, "Malloc fails for xusub_n[].");
-    return (memAux + memRet);
-  }
-  memRet += (float) (nsupers_i+1) * iword;  
-
-  /* Allocate temp storage for sending/receiving the L/U symbolic structure. */
-  if ( (RecvCnt_l + nnz_loc_l) || (RecvCnt_u + nnz_loc_u) ) {
-    if (!(rcv_luind = 
-	  intMalloc_dist(SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u))) ) {
-      fprintf (stderr, "Malloc fails for rcv_luind[].");
-      return (memAux + memRet);
-    }
-    memAux += (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) 
-      * iword;
-  }
-  if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) {
-    if (!(snd_luind = intMalloc_dist(SUPERLU_MAX(SendCnt_l, SendCnt_u))) ) {
-      fprintf (stderr, "Malloc fails for index[].");
-      return (memAux + memRet);
-    }
-    memAux += (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword;
-  } 
-  
-  /* ------------------------------------------------------------------
-     LOAD THE SYMBOLIC STRUCTURE OF L AND U INTO THE STRUCTURES TO SEND.
-     THIS ACCOUNTS FOR THE SECOND PASS OF L and U.
-     ------------------------------------------------------------------*/
-  sendL = TRUE;
-  sendU = FALSE;
-  while (sendL || sendU) {
-    if (sendL) {
-      xsub_s = xlsub_s; sub_s = lsub_s; xsub_n = xlsub_n;
-      nnzToSend = nnzToSend_l; nnzToRecv = nnzToRecv_l;
-    }
-    if (sendU) {
-      xsub_s = xusub_s; sub_s = usub_s; xsub_n = xusub_n;
-      nnzToSend = nnzToSend_u; nnzToRecv = nnzToRecv_u;
-    }
-    for (i = 0, j = 0, p = 0; p < nprocs; p++) {
-      if ( p != iam ) {
-	ptrToSend[p] = i;  i += nnzToSend[p];
-      }
-      ptrToRecv[p] = j;  j += nnzToRecv[p];
-    }
-    nnzToRecv[iam] = 0;
-    
-    ind_loc = ptrToRecv[iam];
-    for (gb_n = 0; gb_n < nsupers; gb_n++) {
-      nsend_2 = 0;    
-      i = xsup_n[gb_n];
-      if (iam == OWNER( globToLoc[i] )) {
-	pc = PCOL( gb_n, grid );
-	pr = PROW( gb_n, grid );
-	p_diag = PNUM( pr, pc, grid );
-	
-	i_loc = LOCAL_IND( globToLoc[i] );
-	gb_s  = supno_s[i_loc];
-	fst_s = xsup_beg_s[gb_s];
-	lst_s = xsup_end_s[gb_s];
-	fst_s_l = LOCAL_IND( globToLoc[fst_s] );
-
-	if (sendL) {
-	  p = pc;                np = grid->nprow;	  
-	} else {
-	  p = pr * grid->npcol;  np = grid->npcol;
-	}
-	for (j = 0; j < np; j++) {
-	  if (p == iam) {
-	    rcv_luind[ind_loc] = gb_n;
-	    rcv_luind[ind_loc+1] = 0;
-	    tmp_ptrToSend[p] = ind_loc + 1;
-	    ind_loc += 2;	 
-	  }
-	  else {
-	    snd_luind[ptrToSend[p]] = gb_n;
-	    snd_luind[ptrToSend[p]+1] = 0;
-	    tmp_ptrToSend[p] = ptrToSend[p] + 1;
-	    ptrToSend[p] += 2;	 
-	  }
-	  if (sendL) p += grid->npcol;
-	  if (sendU) p++;
-	}
-	for (j = xsub_s[fst_s_l]; j < xsub_s[fst_s_l+1]; j++) {
-	  k = sub_s[j];
-	  if ((sendL && k >= i) || (sendU && k >= i + xsup_n[gb_n+1] - xsup_n[gb_n])) {
-	    gb = supno_n[k];
-	    if (sendL)
-	      p = PNUM( PROW(gb, grid), pc, grid );
-	    else 
-	      p = PNUM( pr, PCOL(gb, grid), grid);
-	    if (send_1[p] == FALSE) {
-	      send_1[p] = TRUE;
-	      send_2[nsend_2] = k; nsend_2 ++;
-	    }
-	    if (p == iam) {
-	      rcv_luind[ind_loc] = k;  ind_loc++;
-	      if (sendL)
-		xsub_n[LBj( gb_n, grid )] ++;
-	      else
-		xsub_n[LBi( gb_n, grid )] ++;
-	    }
-	    else {
-	      snd_luind[ptrToSend[p]] = k;
-	      ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++;
-	    }
-	  }
-	}
-	if (sendL)
-	  for (p = pc; p < nprocs; p += grid->npcol) {
-	      for (k = 0; k < nsend_2; k++) {
-		gb = supno_n[send_2[k]];
-		if (PNUM(PROW(gb, grid), pc, grid) != p) {
-		  if (p == iam) {
-		    rcv_luind[ind_loc] = send_2[k];  ind_loc++;
-		    xsub_n[LBj( gb_n, grid )] ++;
-		  }
-		  else {
-		    snd_luind[ptrToSend[p]] = send_2[k];
-		    ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++;
-		  }
-		}
-	      }
-	      send_1[p] = FALSE;
-	  }  
-	if (sendU)
-	  for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) {
-	    if (send_1[p] || p == p_diag) {	      
-	      for (k = 0; k < nsend_2; k++) {
-		gb = supno_n[send_2[k]];
-		if(PNUM( pr, PCOL(gb, grid), grid) != p) {
-		  if (p == iam) {
-		    rcv_luind[ind_loc] = send_2[k];  ind_loc++;
-		    xsub_n[LBi( gb_n, grid )] ++;
-		  }
-		  else {
-		    snd_luind[ptrToSend[p]] = send_2[k];
-		    ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++;
-		  }	     
-		}
-	      } 
-	      send_1[p] = FALSE;
-	    }
-	  }
-      }
-    }
-    
-    /* reset ptrToSnd to point to the beginning of the data for
-       each processor (structure needed in MPI_Alltoallv) */
-    for (i = 0, p = 0; p < nprocs; p++) {
-      ptrToSend[p] = i;  i += nnzToSend[p];
-    }
-
-    /* ------------------------------------------------------------
-       PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION.
-       Note: it uses MPI_Alltoallv.
-       ------------------------------------------------------------*/
-    if (nprocs > 1) {
-#if defined (_LONGINT)
-      for (p=0; p<nprocs; p++) {
-	if (nnzToSend[p] > INT_MAX || ptrToSend[p] > INT_MAX ||
-	    nnzToRecv[p] > INT_MAX || ptrToRecv[p] > INT_MAX)
-	  ABORT("ERROR in dist_symbLU size to send > INT_MAX\n");
-	intBuf1[p] = (int) nnzToSend[p];
-	intBuf2[p] = (int) ptrToSend[p];
-	intBuf3[p] = (int) nnzToRecv[p];
-	intBuf4[p] = (int) ptrToRecv[p];
-      }
-#else  /* Default */
-      intBuf1 = nnzToSend;  intBuf2 = ptrToSend;
-      intBuf3 = nnzToRecv;  intBuf4 = ptrToRecv;
-#endif
-
-      MPI_Alltoallv (snd_luind, intBuf1, intBuf2, mpi_int_t, 
-		     rcv_luind, intBuf3, intBuf4, mpi_int_t,
-		     grid->comm);
-    }
-    if (sendL)
-      nnzToRecv[iam] = nnz_loc_l;
-    else 
-      nnzToRecv[iam] = nnz_loc_u;
-    
-    /* ------------------------------------------------------------
-       DEALLOCATE TEMPORARY STORAGE.
-       -------------------------------------------------------------*/
-    if (sendU) 
-      if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) {
-	SUPERLU_FREE (snd_luind);
-	memAux -= (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword;
-      }
-    
-    /* ------------------------------------------------------------
-       CONVERT THE FORMAT.
-       ------------------------------------------------------------*/
-    /* Initialize the array of column of L/ row of U pointers */
-    k = 0;
-    for (p = 0; p < nprocs; p ++) {
-      if (p != iam) {
-	i = k;
-	while (i < k + nnzToRecv[p]) {
-	  gb = rcv_luind[i];
-	  nelts = rcv_luind[i+1];
-	  if (sendL)
-	    xsub_n[LBj( gb, grid )] = nelts;
-	  else
-	    xsub_n[LBi( gb, grid )] = nelts;
-	  i += nelts + 2;
-	}
-      }
-      k += nnzToRecv[p];
-    }
-
-    if (sendL) j = nsupers_j;
-    else j = nsupers_i;
-    k = 0; 
-    isize = xsub_n[0];
-    xsub_n[0] = 0; 
-    for (gb_l = 1; gb_l < j; gb_l++) {
-      k += isize;
-      isize = xsub_n[gb_l];
-      xsub_n[gb_l] = k;
-    }
-    xsub_n[gb_l] = k + isize;
-    nnz_loc = xsub_n[gb_l];
-    if (sendL) {
-      lsub_n = NULL;
-      if (nnz_loc) {
-	if ( !(lsub_n = intMalloc_dist(nnz_loc)) ) {
-	  fprintf (stderr, "Malloc fails for lsub_n[].");
-	  return (memAux + memRet);
-	}
-	memRet += (float) (nnz_loc * iword);
-      }
-      sub_n = lsub_n;
-    }
-    if (sendU) {
-      usub_n = NULL;
-      if (nnz_loc) {
-	if ( !(usub_n = intMalloc_dist(nnz_loc)) ) {
-	  fprintf (stderr, "Malloc fails for usub_n[].");
-	  return (memAux + memRet);
-	}
-	memRet += (float) (nnz_loc * iword);
-      }
-      sub_n = usub_n;
-    }
-    
-    /* Copy the data into the L column / U row oriented storage */
-    k = 0;
-    for (p = 0; p < nprocs; p++) {
-      i = k;
-      while (i < k + nnzToRecv[p]) {
-	gb = rcv_luind[i];
-	if (gb >= nsupers)
-	  printf ("Pe[%d] p %d gb %d nsupers %d i %d i-k %d\n",
-		  iam, p, gb, nsupers, i, i-k);
-	i += 2;
-	if (sendL) gb_l = LBj( gb, grid );
-	if (sendU) gb_l = LBi( gb, grid );
-	for (j = xsub_n[gb_l]; j < xsub_n[gb_l+1]; i++, j++) {
-	  sub_n[j] = rcv_luind[i];
-	}
-      }      
-      k += nnzToRecv[p];
-    }
-    if (sendL) {
-      sendL = FALSE;  sendU = TRUE;
-    }
-    else
-      sendU = FALSE;
-  }
-
-  /* deallocate memory allocated during symbolic factorization routine */
-  if (rcv_luind != NULL) {
-    SUPERLU_FREE (rcv_luind);
-    memAux -= (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) * iword;
-  }
-  SUPERLU_FREE (mem);  
-  memAux -= (float) (12 * nprocs * iword);
-  SUPERLU_FREE(nvtcs);
-  memAux -= (float) (5 * nprocs * sizeof(int));
-  
-  if (xlsub_s != NULL) {
-    SUPERLU_FREE (xlsub_s); SUPERLU_FREE (lsub_s);
-  }
-  if (xusub_s != NULL) {
-    SUPERLU_FREE (xusub_s); SUPERLU_FREE (usub_s);
-  }
-  SUPERLU_FREE (globToLoc); 
-  if (supno_s != NULL) {
-    SUPERLU_FREE (xsup_beg_s); SUPERLU_FREE (xsup_end_s);
-    SUPERLU_FREE (supno_s);
-  }
-  
-  Glu_persist->supno = supno_n;  Glu_persist->xsup  = xsup_n;
-  *p_xlsub = xlsub_n; *p_lsub = lsub_n;
-  *p_xusub = xusub_n; *p_usub = usub_n;
-
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Exit dist_symbLU()");
-#endif
-  
-  return (-memRet);
-}
-  
-static float
-zdist_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct,
-	Glu_persist_t *Glu_persist, gridinfo_t *grid, 
-	int_t **p_ainf_colptr, int_t **p_ainf_rowind, doublecomplex **p_ainf_val,
-	int_t **p_asup_rowptr, int_t **p_asup_colind, doublecomplex **p_asup_val,
-	int_t *ilsum_i, int_t *ilsum_j
-	)
-{
-/*
- *
- * Purpose
- * =======
- *   Re-distribute A on the 2D process mesh.  The lower part is
- *   stored using a column format and the upper part
- *   is stored using a row format.
- * 
- * Arguments
- * =========
- * 
- * A      (Input) SuperMatrix*
- *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
- *        The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE.
- *
- * ScalePermstruct (Input) ScalePermstruct_t*
- *        The data structure to store the scaling and permutation vectors
- *        describing the transformations performed to the original matrix A.
- *
- * Glu_persist  (Input) Glu_persist_t *
- *        Information on supernodes mapping.
- * 
- * grid   (Input) gridinfo_t*
- *        The 2D process mesh.
- *
- * p_ainf_colptr (Output) int_t**
- *         Pointer to the lower part of A distributed on a 2D grid 
- *         of processors, stored by columns.
- *
- * p_ainf_rowind (Output) int_t**
- *         Structure of of the lower part of A distributed on a 
- *         2D grid of processors, stored by columns.
- *
- * p_ainf_val    (Output) doublecomplex**
- *         Numerical values of the lower part of A, distributed on a 
- *         2D grid of processors, stored by columns.
- *
- * p_asup_rowptr (Output) int_t**
- *         Pointer to the upper part of A distributed on a 2D grid 
- *         of processors, stored by rows.
- *
- * p_asup_colind (Output) int_t**
- *         Structure of of the upper part of A distributed on a 
- *         2D grid of processors, stored by rows.
- *
- * p_asup_val    (Output) doublecomplex**
- *         Numerical values of the upper part of A, distributed on a 
- *         2D grid of processors, stored by rows.
- *
- * ilsum_i  (Input) int_t *
- *       Starting position of each supernode in 
- *       the full array (local, block row wise).
- *
- * ilsum_j  (Input) int_t *
- *       Starting position of each supernode in 
- *       the full array (local, block column wise).
- *
- * Return value
- * ============
- *   < 0, number of bytes allocated on return from the dist_symbLU
- *   > 0, number of bytes allocated when out of memory.
- *        (an approximation).
- *
- */
-  int    iam, p, procs;
-  NRformat_loc *Astore;
-  int_t  *perm_r; /* row permutation vector */
-  int_t  *perm_c; /* column permutation vector */
-  int_t  i, it, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize, isize;
-  int_t  nsupers, nsupers_i, nsupers_j;
-  int_t  nnz_loc, nnz_loc_ainf, nnz_loc_asup;    /* number of local nonzeros */
-  int_t  nnz_remote; /* number of remote nonzeros to be sent */
-  int_t  SendCnt; /* number of remote nonzeros to be sent */
-  int_t  RecvCnt; /* number of remote nonzeros to be sent */
-  int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind;
-  doublecomplex *asup_val, *ainf_val;
-  int_t  *nnzToSend, *nnzToRecv, maxnnzToRecv;
-  int_t  *ia, *ja, **ia_send, *index, *itemp;
-  int_t  *ptr_to_send;
-  doublecomplex *aij, **aij_send, *nzval, *dtemp;
-  doublecomplex *nzval_a;
-  MPI_Request *send_req;
-  MPI_Status  status;
-  int_t *xsup = Glu_persist->xsup;    /* supernode and column mapping */
-  int_t *supno = Glu_persist->supno;   
-  float memAux;  /* Memory used during this routine and freed on return */
-  float memRet; /* Memory allocated and not freed on return */
-  int_t iword, dword, szbuf;
-
-  /* ------------------------------------------------------------
-     INITIALIZATION.
-     ------------------------------------------------------------*/
-  iam = grid->iam;
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Enter zdist_A()");
-#endif
-  iword = sizeof(int_t);
-  dword = sizeof(double);
-  
-  perm_r = ScalePermstruct->perm_r;
-  perm_c = ScalePermstruct->perm_c;
-  procs = grid->nprow * grid->npcol;
-  Astore = (NRformat_loc *) A->Store;
-  n = A->ncol;
-  m_loc = Astore->m_loc;
-  fst_row = Astore->fst_row;
-  if (!(nnzToRecv = intCalloc_dist(2*procs))) {
-    fprintf (stderr, "Malloc fails for nnzToRecv[].");
-    return (ERROR_RET);
-  }
-  memAux = (float) (2 * procs * iword);
-  memRet = 0.;
-  nnzToSend = nnzToRecv + procs;
-  nsupers  = supno[n-1] + 1;  
-
-  /* ------------------------------------------------------------
-     COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
-     THEN ALLOCATE SPACE.
-     THIS ACCOUNTS FOR THE FIRST PASS OF A.
-     ------------------------------------------------------------*/
-  for (i = 0; i < m_loc; ++i) {
-    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
-      irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
-      jcol = Astore->colind[j];
-      gbi = BlockNum( irow );
-      gbj = BlockNum( jcol );
-      p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
-      ++nnzToSend[p]; 
-    }
-  }
-  
-  /* All-to-all communication */
-  MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t,
-		grid->comm);
-  
-  maxnnzToRecv = 0;
-  nnz_loc = SendCnt = RecvCnt = 0;
-  
-  for (p = 0; p < procs; ++p) {
-    if ( p != iam ) {
-      SendCnt += nnzToSend[p];
-      RecvCnt += nnzToRecv[p];
-      maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv );
-    } else {
-      nnz_loc += nnzToRecv[p];
-      /*assert(nnzToSend[p] == nnzToRecv[p]);*/
-    }
-  }
-  k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */
-  szbuf = k;
-
-  /* Allocate space for storing the triplets after redistribution. */
-  if ( !(ia = intMalloc_dist(2*k)) ) {
-    fprintf (stderr, "Malloc fails for ia[].");
-    return (memAux);
-  }
-  memAux += (float) (2*k*iword);
-  ja = ia + k;
-  if ( !(aij = doublecomplexMalloc_dist(k)) ) {
-    fprintf (stderr, "Malloc fails for aij[].");
-    return (memAux);
-  }
-  memAux += (float) (k*dword);
-  
-  /* Allocate temporary storage for sending/receiving the A triplets. */
-  if ( procs > 1 ) {
-    if ( !(send_req = (MPI_Request *)
-	   SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) {
-      fprintf (stderr, "Malloc fails for send_req[].");
-      return (memAux);
-    }
-    memAux += (float) (2*procs *sizeof(MPI_Request));
-    if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) {
-      fprintf(stderr, "Malloc fails for ia_send[].");
-      return (memAux);
-    }
-    memAux += (float) (procs*sizeof(int_t*));
-    if ( !(aij_send = (doublecomplex **)SUPERLU_MALLOC(procs*sizeof(doublecomplex*))) ) {
-      fprintf(stderr, "Malloc fails for aij_send[].");
-      return (memAux);
-    }
-    memAux += (float) (procs*sizeof(doublecomplex*));    
-    if ( !(index = intMalloc_dist(2*SendCnt)) ) {
-      fprintf(stderr, "Malloc fails for index[].");
-      return (memAux);
-    }
-    memAux += (float) (2*SendCnt*iword);
-    if ( !(nzval = doublecomplexMalloc_dist(SendCnt)) ) {
-      fprintf(stderr, "Malloc fails for nzval[].");
-      return (memAux);
-    }
-    memAux += (float) (SendCnt * dword);
-    if ( !(ptr_to_send = intCalloc_dist(procs)) ) {
-      fprintf(stderr, "Malloc fails for ptr_to_send[].");
-      return (memAux);
-    }
-    memAux += (float) (procs * iword);
-    if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) {
-      fprintf(stderr, "Malloc fails for itemp[].");
-      return (memAux);
-    }
-    memAux += (float) (2*maxnnzToRecv*iword);
-    if ( !(dtemp = doublecomplexMalloc_dist(maxnnzToRecv)) ) {
-      fprintf(stderr, "Malloc fails for dtemp[].");
-      return (memAux);
-    }
-    memAux += (float) (maxnnzToRecv * dword);
-    
-    for (i = 0, j = 0, p = 0; p < procs; ++p) {
-      if ( p != iam ) {
-	ia_send[p] = &index[i];
-	i += 2 * nnzToSend[p]; /* ia/ja indices alternate */
-	aij_send[p] = &nzval[j];
-	j += nnzToSend[p];
-      }
-    }
-  } /* if procs > 1 */
-  
-  nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
-  nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
-  if ( !(ainf_colptr = intCalloc_dist(ilsum_j[nsupers_j] + 1)) ) {
-    fprintf (stderr, "Malloc fails for *ainf_colptr[].");
-    return (memAux);
-  }
-  memRet += (float) (ilsum_j[nsupers_j] + 1) * iword;
-  if ( !(asup_rowptr = intCalloc_dist(ilsum_i[nsupers_i] + 1)) ) {
-    fprintf (stderr, "Malloc fails for *asup_rowptr[].");
-    return (memAux+memRet);
-  }
-  memRet += (float) (ilsum_i[nsupers_i] + 1) * iword;
-  
-  /* ------------------------------------------------------------
-     LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND.
-     THIS ACCOUNTS FOR THE SECOND PASS OF A.
-     ------------------------------------------------------------*/
-  nnz_loc = 0; /* Reset the local nonzero count. */
-  nnz_loc_ainf = nnz_loc_asup = 0;
-  nzval_a = Astore->nzval;
-  for (i = 0; i < m_loc; ++i) {
-    for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
-      irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
-      jcol = Astore->colind[j];
-      gbi = BlockNum( irow );
-      gbj = BlockNum( jcol );
-      p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
-      
-      if ( p != iam ) { /* remote */
-	k = ptr_to_send[p];
-	ia_send[p][k] = irow;
-	ia_send[p][k + nnzToSend[p]] = jcol;
-	aij_send[p][k] = nzval_a[j];
-	++ptr_to_send[p]; 
-      } else {          /* local */
-	ia[nnz_loc] = irow;
-	ja[nnz_loc] = jcol;
-	aij[nnz_loc] = nzval_a[j];
-	++nnz_loc;
-	/* Count nonzeros in each column of L / row of U */
-	if (gbi >= gbj) {
-	  ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++;
-	  nnz_loc_ainf ++;
-	}
-	else {
-	  asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++;
-	  nnz_loc_asup ++;
-	}
-      }
-    }
-  }
-
-  /* ------------------------------------------------------------
-     PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION.
-     NOTE: Can possibly use MPI_Alltoallv.
-     ------------------------------------------------------------*/
-  for (p = 0; p < procs; ++p) {
-    if ( p != iam ) {
-      it = 2*nnzToSend[p];
-      MPI_Isend( ia_send[p], it, mpi_int_t,
-		 p, iam, grid->comm, &send_req[p] );
-      it = nnzToSend[p];
-      MPI_Isend( aij_send[p], it, SuperLU_MPI_DOUBLE_COMPLEX,
-		 p, iam+procs, grid->comm, &send_req[procs+p] ); 
-    }
-  }
-  
-  for (p = 0; p < procs; ++p) {
-    if ( p != iam ) {
-      it = 2*nnzToRecv[p];
-      MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); 
-      it = nnzToRecv[p];
-      MPI_Recv( dtemp, it, SuperLU_MPI_DOUBLE_COMPLEX, p, p+procs,
-		grid->comm, &status );
-      for (i = 0; i < nnzToRecv[p]; ++i) {
-	ia[nnz_loc] = itemp[i];
-	irow = itemp[i];
-	jcol = itemp[i + nnzToRecv[p]];
-	/* assert(jcol<n); */
-	ja[nnz_loc] = jcol;
-	aij[nnz_loc] = dtemp[i];
-	++nnz_loc;
-	
-	gbi = BlockNum( irow );
-	gbj = BlockNum( jcol );
-	/* Count nonzeros in each column of L / row of U */
-	if (gbi >= gbj) {
-	  ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++;
-	  nnz_loc_ainf ++;
-	}
-	else {
-	  asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++;
-	  nnz_loc_asup ++;
-	}
-      }
-    }
-  }
-  
-  for (p = 0; p < procs; ++p) {
-    if ( p != iam ) {
-      MPI_Wait( &send_req[p], &status);
-      MPI_Wait( &send_req[procs+p], &status);
-    }
-  }
-  
-  /* ------------------------------------------------------------
-     DEALLOCATE TEMPORARY STORAGE
-     ------------------------------------------------------------*/
-  
-  SUPERLU_FREE(nnzToRecv);
-  memAux -= 2 * procs * iword;
-  if ( procs > 1 ) {
-    SUPERLU_FREE(send_req);
-    SUPERLU_FREE(ia_send);
-    SUPERLU_FREE(aij_send);
-    SUPERLU_FREE(index);
-    SUPERLU_FREE(nzval);
-    SUPERLU_FREE(ptr_to_send);
-    SUPERLU_FREE(itemp);
-    SUPERLU_FREE(dtemp);
-    memAux -= 2*procs *sizeof(MPI_Request) + procs*sizeof(int_t*) +
-      procs*sizeof(doublecomplex*) + 2*SendCnt * iword +
-      SendCnt* dword + procs*iword +
-      2*maxnnzToRecv*iword + maxnnzToRecv*dword;
-  }
-  
-  /* ------------------------------------------------------------
-     CONVERT THE TRIPLET FORMAT.
-     ------------------------------------------------------------*/
-  if (nnz_loc_ainf != 0) {
-    if ( !(ainf_rowind = intMalloc_dist(nnz_loc_ainf)) ) {
-      fprintf (stderr, "Malloc fails for *ainf_rowind[].");
-      return (memAux+memRet);
-    }
-    memRet += (float) (nnz_loc_ainf * iword);
-    if ( !(ainf_val = doublecomplexMalloc_dist(nnz_loc_ainf)) ) {
-      fprintf (stderr, "Malloc fails for *ainf_val[].");
-      return (memAux+memRet);
-    }
-    memRet += (float) (nnz_loc_ainf * dword);
-  }
-  else {
-    ainf_rowind = NULL;
-    ainf_val = NULL;
-  }
-  if (nnz_loc_asup != 0) {
-    if ( !(asup_colind = intMalloc_dist(nnz_loc_asup)) ) {
-      fprintf (stderr, "Malloc fails for *asup_colind[].");
-      return (memAux + memRet);
-    }
-    memRet += (float) (nnz_loc_asup * iword);
-    if ( !(asup_val = doublecomplexMalloc_dist(nnz_loc_asup)) ) {
-      fprintf (stderr, "Malloc fails for *asup_val[].");
-      return (memAux  + memRet);
-    }
-    memRet += (float) (nnz_loc_asup * dword);
-  }
-  else {
-    asup_colind = NULL;
-    asup_val = NULL;
-  }
-
-  /* Initialize the array of column pointers */
-  k = 0; 
-  jsize = ainf_colptr[0];  ainf_colptr[0] = 0; 
-  for (j = 1; j < ilsum_j[nsupers_j]; j++) {
-    k += jsize;              
-    jsize = ainf_colptr[j];  
-    ainf_colptr[j] = k;
-  }
-  ainf_colptr[ilsum_j[nsupers_j]] = k + jsize;
-  i = 0;
-  isize = asup_rowptr[0];  asup_rowptr[0] = 0;
-  for (j = 1; j < ilsum_i[nsupers_i]; j++) {
-    i += isize;
-    isize = asup_rowptr[j];  
-    asup_rowptr[j] = i;
-  }
-  asup_rowptr[ilsum_i[nsupers_i]] = i + isize;
-
-  /* Copy the triplets into the column oriented storage */
-  for (i = 0; i < nnz_loc; ++i) {
-    jcol = ja[i];
-    irow = ia[i];
-    gbi = BlockNum( irow );
-    gbj = BlockNum( jcol );
-    /* Count nonzeros in each column of L / row of U */
-    if (gbi >= gbj) {
-      j = ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj );
-      k = ainf_colptr[j];
-      ainf_rowind[k] = irow;
-      ainf_val[k] = aij[i];
-      ainf_colptr[j] ++;
-    }
-    else {
-      j = ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi );
-      k = asup_rowptr[j];
-      asup_colind[k] = jcol;
-      asup_val[k] = aij[i];
-      asup_rowptr[j] ++;
-    }
-  }
-
-  /* Reset the column pointers to the beginning of each column */
-  for (j = ilsum_j[nsupers_j]; j > 0; j--) 
-    ainf_colptr[j] = ainf_colptr[j-1];
-  for (j = ilsum_i[nsupers_i]; j > 0; j--) 
-    asup_rowptr[j] = asup_rowptr[j-1];
-  ainf_colptr[0] = 0;
-  asup_rowptr[0] = 0;
-  
-  SUPERLU_FREE(ia);
-  SUPERLU_FREE(aij);
-  memAux -= 2*szbuf*iword + szbuf*dword;
-  
-  *p_ainf_colptr = ainf_colptr;
-  *p_ainf_rowind = ainf_rowind; 
-  *p_ainf_val    = ainf_val;
-  *p_asup_rowptr = asup_rowptr;
-  *p_asup_colind = asup_colind;
-  *p_asup_val    = asup_val;
-
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Exit zdist_A()");
-  fprintf (stdout, "Size of allocated memory (MB) %.3f\n", memRet*1e-6);
-#endif
-
-  return (-memRet);
-} /* dist_A */
-
-int_t
-zdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A,
-		ScalePermstruct_t *ScalePermstruct,
-		Pslu_freeable_t *Pslu_freeable, 
-		LUstruct_t *LUstruct, gridinfo_t *grid)
-/*
- *
- *
- * Purpose
- * =======
- *   Distribute the input matrix onto the 2D process mesh.
- * 
- * Arguments
- * =========
- * 
- * fact (input) fact_t
- *        Specifies whether or not the L and U structures will be re-used.
- *        = SamePattern_SameRowPerm: L and U structures are input, and
- *                                   unchanged on exit.
- *          This routine should not be called for this case, an error
- *          is generated.  Instead, pddistribute routine should be called.
- *        = DOFACT or SamePattern: L and U structures are computed and output.
- *
- * n      (Input) int
- *        Dimension of the matrix.
- *
- * A      (Input) SuperMatrix*
- *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
- *        A may be overwritten by diag(R)*A*diag(C)*Pc^T.
- *        The type of A can be: Stype = NR; Dtype = SLU_D; Mtype = GE.
- *
- * ScalePermstruct (Input) ScalePermstruct_t*
- *        The data structure to store the scaling and permutation vectors
- *        describing the transformations performed to the original matrix A.
- *
- * Glu_freeable (Input) *Glu_freeable_t
- *        The global structure describing the graph of L and U.
- * 
- * LUstruct (Input) LUstruct_t*
- *        Data structures for L and U factors.
- *
- * grid   (Input) gridinfo_t*
- *        The 2D process mesh.
- *
- * Return value
- * ============
- *   < 0, number of bytes allocated on return from the dist_symbLU
- *   > 0, number of bytes allocated for performing the distribution
- *       of the data, when out of memory.
- *        (an approximation).
- *
- */
-{
-  Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-  Glu_freeable_t Glu_freeable_n;
-  LocalLU_t *Llu = LUstruct->Llu;
-  int_t bnnz, fsupc, i, irow, istart, j, jb, jj, k, 
-    len, len1, nsupc, nsupc_gb, ii, nprocs;
-  int_t ljb;  /* local block column number */
-  int_t nrbl; /* number of L blocks in current block column */
-  int_t nrbu; /* number of U blocks in current block column */
-  int_t gb;   /* global block number; 0 < gb <= nsuper */
-  int_t lb;   /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
-  int iam, jbrow, jbcol, jcol, kcol, mycol, myrow, pc, pr, ljb_i, ljb_j, p;
-  int_t mybufmax[NBUFFERS];
-  NRformat_loc *Astore;
-  doublecomplex *a;
-  int_t *asub, *xa;
-  int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind;
-  doublecomplex *asup_val, *ainf_val;
-  int_t *xsup, *supno;    /* supernode and column mapping */
-  int_t *lsub, *xlsub, *usub, *xusub;
-  int_t nsupers, nsupers_i, nsupers_j, nsupers_ij;
-  int_t next_ind;      /* next available position in index[*] */
-  int_t next_val;      /* next available position in nzval[*] */
-  int_t *index;         /* indices consist of headers and row subscripts */
-  doublecomplex *lusup, *uval; /* nonzero values in L and U */
-  int_t *recvBuf;
-  int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend;
-  doublecomplex **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
-  int_t  **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
-  doublecomplex **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
-  int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
-  
-  /*-- Counts to be used in factorization. --*/
-  int_t  *ToRecv, *ToSendD, **ToSendR;
-  
-  /*-- Counts to be used in lower triangular solve. --*/
-  int_t  *fmod;          /* Modification count for L-solve.        */
-  int_t  **fsendx_plist; /* Column process list to send down Xk.   */
-  int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
-  int_t  nfsendx = 0;    /* Number of Xk I will send               */
-  int_t  kseen;
-  
-  /*-- Counts to be used in upper triangular solve. --*/
-  int_t  *bmod;          /* Modification count for U-solve.        */
-  int_t  **bsendx_plist; /* Column process list to send down Xk.   */
-  int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
-  int_t  nbsendx = 0;    /* Number of Xk I will send               */  
-  int_t  *ilsum;         /* starting position of each supernode in 
-			    the full array (local)                 */  
-  int_t  *ilsum_j, ldaspa_j; /* starting position of each supernode in 
-				the full array (local, block column wise) */  
-  /*-- Auxiliary arrays; freed on return --*/
-  int_t *Urb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
-  int_t *LUb_length; /* L,U block length; size nsupers_ij */
-  int_t *LUb_indptr; /* pointers to L,U index[]; size nsupers_ij */
-  int_t *LUb_number; /* global block number; size nsupers_ij */
-  int_t *LUb_valptr; /* pointers to U nzval[]; size ceil(NSUPERS/Pc)      */
-  int_t *Lrb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
-  doublecomplex *dense, *dense_col; /* SPA */
-  doublecomplex zero = {0.0, 0.0};
-  int_t ldaspa;     /* LDA of SPA */
-  int_t iword, dword;
-  float memStrLU, memA,
-    memDist = 0.; /* memory used for redistributing the data, which does
-		   not include the memory for the numerical values of L and U */
-  float  memNLU = 0.; /* memory allocated for storing the numerical values of 
-		    L and U, that will be used in the numeric factorization */
-
-#if ( PRNTlevel>=1 )
-  int_t nLblocks = 0, nUblocks = 0;
-#endif
-  
-  /* Initialization. */
-  iam = grid->iam;
-#if ( DEBUGlevel>=1 )
-  CHECK_MALLOC(iam, "Enter dist_psymbtonum()");
-#endif
-  myrow = MYROW( iam, grid );
-  mycol = MYCOL( iam, grid );
-  nprocs = grid->npcol * grid->nprow;
-  for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0;
-  Astore   = (NRformat_loc *) A->Store;
-  
-  iword = sizeof(int_t);
-  dword = sizeof(doublecomplex);
-
-  if (fact == SamePattern_SameRowPerm) {
-    ABORT ("ERROR: call of dist_psymbtonum with fact equals SamePattern_SameRowPerm.");  
-  }
-
-  if ((memStrLU = 
-       dist_symbLU (n, Pslu_freeable, 
-		    Glu_persist, &xlsub, &lsub, &xusub, &usub,	grid)) > 0)
-    return (memStrLU);
-  memDist += (-memStrLU);
-  xsup  = Glu_persist->xsup;    /* supernode and column mapping */
-  supno = Glu_persist->supno;   
-  nsupers  = supno[n-1] + 1;
-  nsupers_i = CEILING( nsupers, grid->nprow );/* No of local row blocks */
-  nsupers_j = CEILING( nsupers, grid->npcol );/* No of local column blocks */
-  nsupers_ij = SUPERLU_MAX(nsupers_i, nsupers_j);
-  if ( !(ilsum = intMalloc_dist(nsupers_i+1)) ) {
-    fprintf (stderr, "Malloc fails for ilsum[].");  
-    return (memDist + memNLU);
-  }
-  memNLU += (nsupers_i+1) * iword;
-  if ( !(ilsum_j = intMalloc_dist(nsupers_j+1)) ) {
-    fprintf (stderr, "Malloc fails for ilsum_j[].");
-    return (memDist + memNLU);
-  }
-  memDist += (nsupers_j+1) * iword;
-
-  /* Compute ldaspa and ilsum[], ldaspa_j and ilsum_j[]. */
-  ilsum[0] = 0;
-  ldaspa = 0;
-  for (gb = 0; gb < nsupers; gb++) 
-    if ( myrow == PROW( gb, grid ) ) {
-      i = SuperSize( gb );
-      ldaspa += i;
-      lb = LBi( gb, grid );
-      ilsum[lb + 1] = ilsum[lb] + i;
-    }
-  ilsum[nsupers_i] = ldaspa;
-
-  ldaspa_j = 0; ilsum_j[0] = 0;  
-  for (gb = 0; gb < nsupers; gb++) 
-    if (mycol == PCOL( gb, grid )) {
-      i = SuperSize( gb );
-      ldaspa_j += i;
-      lb = LBj( gb, grid );
-      ilsum_j[lb + 1] = ilsum_j[lb] + i;
-    }
-  ilsum_j[nsupers_j] = ldaspa_j;
-  
-  if ((memA = zdist_A(A, ScalePermstruct, Glu_persist,
-		      grid, &ainf_colptr, &ainf_rowind, &ainf_val,
-		      &asup_rowptr, &asup_colind, &asup_val,
-		      ilsum, ilsum_j)) > 0)
-    return (memDist + memA + memNLU);
-  memDist += (-memA);
-
-  /* ------------------------------------------------------------
-     FIRST TIME CREATING THE L AND U DATA STRUCTURES.
-     ------------------------------------------------------------*/
-  
-  /* We first need to set up the L and U data structures and then
-   * propagate the values of A into them.
-   */
-  if ( !(ToRecv = intCalloc_dist(nsupers)) ) {
-    fprintf(stderr, "Calloc fails for ToRecv[].");
-    return (memDist + memNLU);
-  }
-  memNLU += nsupers * iword;
-  
-  k = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */
-  if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) {
-    fprintf(stderr, "Malloc fails for ToSendR[].");
-    return (memDist + memNLU);
-  }
-  memNLU += k*sizeof(int_t*);
-  j = k * grid->npcol;
-  if ( !(index = intMalloc_dist(j)) ) {
-    fprintf(stderr, "Malloc fails for index[].");
-    return (memDist + memNLU);
-  }
-  memNLU += j*iword;
-  
-  for (i = 0; i < j; ++i) index[i] = EMPTY;
-  for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j];
-  
-  /* Auxiliary arrays used to set up L and U block data structures.
-     They are freed on return. */
-  if ( !(LUb_length = intCalloc_dist(nsupers_ij)) ) {
-    fprintf(stderr, "Calloc fails for LUb_length[].");
-    return (memDist + memNLU);
-  }
-  if ( !(LUb_indptr = intMalloc_dist(nsupers_ij)) ) {
-    fprintf(stderr, "Malloc fails for LUb_indptr[].");
-    return (memDist + memNLU);
-  }
-  if ( !(LUb_number = intCalloc_dist(nsupers_ij)) ) {
-    fprintf(stderr, "Calloc fails for LUb_number[].");
-    return (memDist + memNLU);
-  }    
-  if ( !(LUb_valptr = intCalloc_dist(nsupers_ij)) ) {
-    fprintf(stderr, "Calloc fails for LUb_valptr[].");
-    return (memDist + memNLU);
-  }
-  memDist += 4 * nsupers_ij * iword;
-  
-  k = CEILING( nsupers, grid->nprow ); 
-  /* Pointers to the beginning of each block row of U. */
-  if ( !(Unzval_br_ptr = 
-	 (doublecomplex**)SUPERLU_MALLOC(nsupers_i * sizeof(doublecomplex*))) ) {
-    fprintf(stderr, "Malloc fails for Unzval_br_ptr[].");
-    return (memDist + memNLU);
-  }
-  if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(nsupers_i * sizeof(int_t*))) ) {
-    fprintf(stderr, "Malloc fails for Ufstnz_br_ptr[].");
-    return (memDist + memNLU);
-  }
-  memNLU += nsupers_i*sizeof(doublecomplex*) + nsupers_i*sizeof(int_t*);
-  Unzval_br_ptr[nsupers_i-1] = NULL;
-  Ufstnz_br_ptr[nsupers_i-1] = NULL;
-
-  if ( !(ToSendD = intCalloc_dist(nsupers_i)) ) {
-    fprintf(stderr, "Malloc fails for ToSendD[].");
-    return (memDist + memNLU);
-  }
-  memNLU += nsupers_i*iword;  
-  if ( !(Urb_marker = intCalloc_dist(nsupers_j))) {
-    fprintf(stderr, "Calloc fails for rb_marker[].");
-    return (memDist + memNLU);
-  }
-  if ( !(Lrb_marker = intCalloc_dist( nsupers_i ))) {
-    fprintf(stderr, "Calloc fails for rb_marker[].");
-    return (memDist + memNLU);
-  }
-  memDist += (nsupers_i + nsupers_j)*iword;
-  
-  /* Auxiliary arrays used to set up L, U block data structures.
-     They are freed on return.
-     k is the number of local row blocks.   */
-  if ( !(dense = doublecomplexCalloc_dist(SUPERLU_MAX(ldaspa, ldaspa_j) 
-				   * sp_ienv_dist(3))) ) {
-    fprintf(stderr, "Calloc fails for SPA dense[].");
-    return (memDist + memNLU);
-  }
-  /* These counts will be used for triangular solves. */
-  if ( !(fmod = intCalloc_dist(nsupers_i)) ) {
-    fprintf(stderr, "Calloc fails for fmod[].");
-    return (memDist + memNLU);
-  }
-  if ( !(bmod = intCalloc_dist(nsupers_i)) ) {
-    fprintf(stderr, "Calloc fails for bmod[].");
-    return (memDist + memNLU);
-  }
-  /* ------------------------------------------------ */
-  memNLU += 2*nsupers_i*iword + 
-    SUPERLU_MAX(ldaspa, ldaspa_j)*sp_ienv_dist(3)*dword; 
-  
-  /* Pointers to the beginning of each block column of L. */
-  if ( !(Lnzval_bc_ptr = 
-	 (doublecomplex**)SUPERLU_MALLOC(nsupers_j * sizeof(doublecomplex*))) ) {
-    fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[].");
-    return (memDist + memNLU);
-  }
-  if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ) {
-    fprintf(stderr, "Malloc fails for Lrowind_bc_ptr[].");
-    return (memDist + memNLU);
-  }
-  memNLU += nsupers_j * sizeof(doublecomplex*) + nsupers_j * sizeof(int_t*);
-  Lnzval_bc_ptr[nsupers_j-1] = NULL;
-  Lrowind_bc_ptr[nsupers_j-1] = NULL;
-  
-  /* These lists of processes will be used for triangular solves. */
-  if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) {
-    fprintf(stderr, "Malloc fails for fsendx_plist[].");
-    return (memDist + memNLU);
-  }
-  len = nsupers_j * grid->nprow;
-  if ( !(index = intMalloc_dist(len)) ) {
-    fprintf(stderr, "Malloc fails for fsendx_plist[0]");
-    return (memDist + memNLU);
-  }
-  for (i = 0; i < len; ++i) index[i] = EMPTY;
-  for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow)
-    fsendx_plist[i] = &index[j];
-  if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) {
-    fprintf(stderr, "Malloc fails for bsendx_plist[].");
-    return (memDist + memNLU);
-  }
-  if ( !(index = intMalloc_dist(len)) ) {
-    fprintf(stderr, "Malloc fails for bsendx_plist[0]");
-    return (memDist + memNLU);
-  }
-  for (i = 0; i < len; ++i) index[i] = EMPTY;
-  for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow)
-    bsendx_plist[i] = &index[j];
-  /* -------------------------------------------------------------- */
-  memNLU += 2*nsupers_j*sizeof(int_t*) + 2*len*iword;
-  
-  /*------------------------------------------------------------
-    PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
-    THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
-    ------------------------------------------------------------*/
-  for (jb = 0; jb < nsupers; jb++) {
-    jbcol = PCOL( jb, grid );
-    jbrow = PROW( jb, grid );
-    ljb_j = LBj( jb, grid ); /* Local block number column wise */
-    ljb_i = LBi( jb, grid);  /* Local block number row wise */
-    fsupc = FstBlockC( jb );
-    nsupc = SuperSize( jb );
-    
-    if ( myrow == jbrow ) { /* Block row jb in my process row */
-      /* Scatter A into SPA. */
-      for (j = ilsum[ljb_i], dense_col = dense; j < ilsum[ljb_i]+nsupc; j++) {
-	for (i = asup_rowptr[j]; i < asup_rowptr[j+1]; i++) {
-	  if (i >= asup_rowptr[ilsum[nsupers_i]]) 
-	    printf ("ERR7\n");
-	  jcol = asup_colind[i];
-	  if (jcol >= n)
-	    printf ("Pe[%d] ERR distsn jb %d gb %d j %d jcol %d\n",
-		    iam, jb, gb, j, jcol);
-	  gb = BlockNum( jcol );
-	  lb = LBj( gb, grid );
-	  if (gb >= nsupers || lb >= nsupers_j) printf ("ERR8\n");
-	  jcol = ilsum_j[lb] + jcol - FstBlockC( gb );
-	  if (jcol >= ldaspa_j)
-	    printf ("Pe[%d] ERR1 jb %d gb %d j %d jcol %d\n",
-		    iam, jb, gb, j, jcol);
-	  dense_col[jcol] = asup_val[i];
-	}
-	dense_col += ldaspa_j;
-      }
-      
-      /*------------------------------------------------
-       * SET UP U BLOCKS.
-       *------------------------------------------------*/
-      /* Count number of blocks and length of each block. */
-      nrbu = 0;
-      len = 0; /* Number of column subscripts I own. */
-      len1 = 0; /* number of fstnz subscripts */
-      for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) {
-	if (i >= xusub[nsupers_i]) printf ("ERR10\n");
-	jcol = usub[i];
-	gb = BlockNum( jcol ); /* Global block number */
-	
-	/*if (fsupc <= 146445 && 146445 < fsupc + nsupc && jcol == 397986)
-	  printf ("Pe[%d] [%d %d] elt [%d] jbcol %d pc %d\n",
-	  iam, jb, gb, jcol, jbcol, pc); */
-	
-	lb = LBj( gb, grid );  /* Local block number */
-	pc = PCOL( gb, grid ); /* Process col owning this block */
-	if (mycol == jbcol) ToSendR[ljb_j][pc] = YES;
-	/* if (mycol == jbcol && mycol != pc) ToSendR[ljb_j][pc] = YES; */
-	pr = PROW( gb, grid );
-	if ( pr != jbrow  && mycol == pc)
-	  bsendx_plist[lb][jbrow] = YES; 
-	if (mycol == pc) {
-	  len += nsupc;
-	  LUb_length[lb] += nsupc;
-	  ToSendD[ljb_i] = YES;
-	  if (Urb_marker[lb] <= jb) { /* First see this block */
-	    if (Urb_marker[lb] == FALSE && gb != jb && myrow != pr) nbrecvx ++;
-	    Urb_marker[lb] = jb + 1;
-	    LUb_number[nrbu] = gb;
-	    /* if (gb == 391825 && jb == 145361)
-	       printf ("Pe[%d] T1 [%d %d] nrbu %d \n",
-	       iam, jb, gb, nrbu); */
-	    nrbu ++;
-	    len1 += SuperSize( gb );
-	    if ( gb != jb )/* Exclude diagonal block. */
-	      ++bmod[ljb_i];/* Mod. count for back solve */
-#if ( PRNTlevel>=1 )
-	    ++nUblocks;
-#endif
-	  }
-	}
-      } /* for i ... */
-      
-      if ( nrbu ) { 
-	/* Sort the blocks of U in increasing block column index.
-	   SuperLU_DIST assumes this is true */
-	/* simple insert sort algorithm */
-	/* to be transformed in quick sort */
-	for (j = 1; j < nrbu; j++) {
-	  k = LUb_number[j];
-	  for (i=j-1; i>=0 && LUb_number[i] > k; i--) {
-	    LUb_number[i+1] = LUb_number[i];
-	  }
-	  LUb_number[i+1] = k;
-	} 
-	
-	/* Set up the initial pointers for each block in
-	   index[] and nzval[]. */
-	/* Add room for descriptors */
-	len1 += BR_HEADER + nrbu * UB_DESCRIPTOR;
-	if ( !(index = intMalloc_dist(len1+1)) ) {
-	  fprintf (stderr, "Malloc fails for Uindex[]");
-	  return (memDist + memNLU);
-	}
-	Ufstnz_br_ptr[ljb_i] = index;
-	if (!(Unzval_br_ptr[ljb_i] =
-	      doublecomplexMalloc_dist(len))) {
-	  fprintf (stderr, "Malloc fails for Unzval_br_ptr[*][]");
-	  return (memDist + memNLU);
-	}
-	memNLU += (len1+1)*iword + len*dword;
-	uval = Unzval_br_ptr[ljb_i];
-	mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
-	mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
-	index[0] = nrbu;  /* Number of column blocks */
-	index[1] = len;   /* Total length of nzval[] */
-	index[2] = len1;  /* Total length of index */
-	index[len1] = -1; /* End marker */
-	next_ind = BR_HEADER;
-	next_val = 0;
-	for (k = 0; k < nrbu; k++) {
-	  gb = LUb_number[k];
-	  lb = LBj( gb, grid );
-	  len = LUb_length[lb];
-	  LUb_length[lb] = 0;  /* Reset vector of block length */
-	  index[next_ind++] = gb; /* Descriptor */
-	  index[next_ind++] = len;
-	  LUb_indptr[lb] = next_ind;
-	  for (; next_ind < LUb_indptr[lb] + SuperSize( gb ); next_ind++)
-	    index[next_ind] = FstBlockC( jb + 1 );
-	  LUb_valptr[lb] = next_val;
-	  next_val += len;
-	}
-	/* Propagate the fstnz subscripts to Ufstnz_br_ptr[],
-	   and the initial values of A from SPA into Unzval_br_ptr[]. */
-	for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) {
-	  jcol = usub[i];
-	  gb = BlockNum( jcol );
-	  
-	  if ( mycol == PCOL( gb, grid ) ) {
-	    lb = LBj( gb, grid );
-	    k = LUb_indptr[lb]; /* Start fstnz in index */
-	    index[k + jcol - FstBlockC( gb )] = FstBlockC( jb );
-	  }
-	}  /* for i ... */
-	
-	for (i = 0; i < nrbu; i++) {
-	  gb = LUb_number[i];
-	  lb = LBj( gb, grid );   
-	  next_ind = LUb_indptr[lb];
-	  k = FstBlockC( jb + 1);
-	  jcol = ilsum_j[lb];
-	  for (jj = 0; jj < SuperSize( gb ); jj++, jcol++) {
-	    dense_col = dense;
-	    j = index[next_ind+jj];
-	    for (ii = j; ii < k; ii++) {
-	      uval[LUb_valptr[lb]++] = dense_col[jcol];
-	      dense_col[jcol] = zero;
-	      dense_col += ldaspa_j;	      
-	    }
-	  }
-	}
-      } else {
-	Ufstnz_br_ptr[ljb_i] = NULL;
-	Unzval_br_ptr[ljb_i] = NULL;
-      } /* if nrbu ... */	
-    } /* if myrow == jbrow */
-    
-      /*------------------------------------------------
-       * SET UP L BLOCKS.
-       *------------------------------------------------*/
-    if (mycol == jbcol) {  /* Block column jb in my process column */
-      /* Scatter A_inf into SPA. */
-      for (j = ilsum_j[ljb_j], dense_col = dense; j < ilsum_j[ljb_j] + nsupc; j++) {
-	for (i = ainf_colptr[j]; i < ainf_colptr[j+1]; i++) {
-	  irow = ainf_rowind[i];
-	  if (irow >= n) printf ("Pe[%d] ERR1\n", iam);
-	  gb = BlockNum( irow );
-	  if (gb >= nsupers) printf ("Pe[%d] ERR5\n", iam);
-	  if ( myrow == PROW( gb, grid ) ) {
-	    lb = LBi( gb, grid );
-	    irow = ilsum[lb] + irow - FstBlockC( gb );
-	    if (irow >= ldaspa) printf ("Pe[%d] ERR0\n", iam);
-	    dense_col[irow] = ainf_val[i];
-	  }
-	}
-	dense_col += ldaspa;
-      }      
-      
-      /* sort the indices of the diagonal block at the beginning of xlsub */
-      if (myrow == jbrow) {
-	k = xlsub[ljb_j];
-	for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) {
-	  irow = lsub[i];
-	  if (irow < nsupc + fsupc && i != k+irow-fsupc) {
-	    lsub[i] = lsub[k + irow - fsupc];
-	    lsub[k + irow - fsupc] = irow;
-	    i --;
-	  }
-	}
-      }
-      
-      /* Count number of blocks and length of each block. */
-      nrbl = 0;
-      len = 0; /* Number of row subscripts I own. */
-      kseen = 0;
-      for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) {
-	irow = lsub[i];
-	gb = BlockNum( irow ); /* Global block number */	  
-	pr = PROW( gb, grid ); /* Process row owning this block */
-	if ( pr != jbrow && fsendx_plist[ljb_j][pr] == EMPTY &&
-	     myrow == jbrow) {
-	  fsendx_plist[ljb_j][pr] = YES;
-	  ++nfsendx;
-	}
-	if ( myrow == pr ) {
-	  lb = LBi( gb, grid );  /* Local block number */
-	  if (Lrb_marker[lb] <= jb) { /* First see this block */
-	    Lrb_marker[lb] = jb + 1;
-	    LUb_length[lb] = 1;
-	    LUb_number[nrbl++] = gb;
-	    if ( gb != jb ) /* Exclude diagonal block. */
-	      ++fmod[lb]; /* Mod. count for forward solve */
-	    if ( kseen == 0 && myrow != jbrow ) {
-	      ++nfrecvx;
-	      kseen = 1;
-	    }
-#if ( PRNTlevel>=1 )
-	    ++nLblocks;
-#endif
-	  } else 
-	    ++LUb_length[lb];	    
-	  ++len;
-	}
-      } /* for i ... */
-      
-      if ( nrbl ) { /* Do not ensure the blocks are sorted! */
-	/* Set up the initial pointers for each block in 
-	   index[] and nzval[]. */
-	/* If I am the owner of the diagonal block, order it first in LUb_number.
-	   Necessary for SuperLU_DIST routines */
-	kseen = EMPTY;
-	for (j = 0; j < nrbl; j++) {
-	  if (LUb_number[j] == jb)
-	    kseen = j;
-	}
-	if (kseen != EMPTY && kseen != 0) {
-	  LUb_number[kseen] = LUb_number[0];
-	  LUb_number[0] = jb;
-	}
-	
-	/* Add room for descriptors */
-	len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
-	if ( !(index = intMalloc_dist(len1)) ) {
-	  fprintf (stderr, "Malloc fails for index[]");
-	  return (memDist + memNLU);
-	}
-	Lrowind_bc_ptr[ljb_j] = index;
-	if (!(Lnzval_bc_ptr[ljb_j] = 
-	      doublecomplexMalloc_dist(len*nsupc))) {
-	  fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block %d ", jb);
-	  return (memDist + memNLU);
-	}
-	memNLU += len1*iword + len*nsupc*dword;
-	
-	lusup = Lnzval_bc_ptr[ljb_j];
-	mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
-	mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
-	mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
-	index[0] = nrbl;  /* Number of row blocks */
-	index[1] = len;   /* LDA of the nzval[] */
-	next_ind = BC_HEADER;
-	next_val = 0;
-	for (k = 0; k < nrbl; ++k) {
-	  gb = LUb_number[k];
-	  lb = LBi( gb, grid );
-	  len = LUb_length[lb];
-	  LUb_length[lb] = 0;
-	  index[next_ind++] = gb; /* Descriptor */
-	  index[next_ind++] = len; 
-	  LUb_indptr[lb] = next_ind;
-	    LUb_valptr[lb] = next_val;
-	    next_ind += len;
-	    next_val += len;
-	  }
-	  /* Propagate the compressed row subscripts to Lindex[],
-	     and the initial values of A from SPA into Lnzval[]. */
-	  len = index[1];  /* LDA of lusup[] */
-	  for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) {
-	    irow = lsub[i];
-	    gb = BlockNum( irow );
-	    if ( myrow == PROW( gb, grid ) ) {
-	      lb = LBi( gb, grid );
-	      k = LUb_indptr[lb]++; /* Random access a block */
-	      index[k] = irow;
-	      k = LUb_valptr[lb]++;
-	      irow = ilsum[lb] + irow - FstBlockC( gb );
-	      for (j = 0, dense_col = dense; j < nsupc; ++j) {
-		lusup[k] = dense_col[irow];
-		dense_col[irow] = zero;
-		k += len;
-		dense_col += ldaspa;
-	      }
-	    }
-	  } /* for i ... */
-	} else {
-	  Lrowind_bc_ptr[ljb_j] = NULL;
-	  Lnzval_bc_ptr[ljb_j] = NULL;
-	} /* if nrbl ... */		  
-      } /* if mycol == pc */
-  } /* for jb ... */
-
-  SUPERLU_FREE(ilsum_j);
-  SUPERLU_FREE(Urb_marker);
-  SUPERLU_FREE(LUb_length);
-  SUPERLU_FREE(LUb_indptr);
-  SUPERLU_FREE(LUb_number);
-  SUPERLU_FREE(LUb_valptr);
-  SUPERLU_FREE(Lrb_marker);
-  SUPERLU_FREE(dense);
-  
-  /* Free the memory used for storing L and U */
-  SUPERLU_FREE(xlsub); SUPERLU_FREE(xusub);
-  if (lsub != NULL)
-    SUPERLU_FREE(lsub);  
-  if (usub != NULL)
-    SUPERLU_FREE(usub);
-  
-  /* Free the memory used for storing A */
-  SUPERLU_FREE(ainf_colptr);
-  if (ainf_rowind != NULL) {
-    SUPERLU_FREE(ainf_rowind);
-    SUPERLU_FREE(ainf_val);
-  }
-  SUPERLU_FREE(asup_rowptr);
-  if (asup_colind != NULL) {
-    SUPERLU_FREE(asup_colind);	
-    SUPERLU_FREE(asup_val);	
-  }
-  
-  /* exchange information about bsendx_plist in between column of processors */
-  k = SUPERLU_MAX( grid->nprow, grid->npcol);
-  if ( !(recvBuf = (int_t *) SUPERLU_MALLOC(nsupers*k*iword)) ) {
-    fprintf (stderr, "Malloc fails for recvBuf[].");
-    return (memDist + memNLU);
-  }
-  if ( !(nnzToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
-    fprintf (stderr, "Malloc fails for nnzToRecv[].");
-    return (memDist + memNLU);
-  }
-  if ( !(ptrToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
-    fprintf (stderr, "Malloc fails for ptrToRecv[].");
-    return (memDist + memNLU);
-  }
-  if ( !(nnzToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
-    fprintf (stderr, "Malloc fails for nnzToRecv[].");
-    return (memDist + memNLU);
-  }
-  if ( !(ptrToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
-    fprintf (stderr, "Malloc fails for ptrToRecv[].");
-    return (memDist + memNLU);
-  }
-  
-  if (memDist < (nsupers*k*iword +4*nprocs * sizeof(int)))
-    memDist = nsupers*k*iword +4*nprocs * sizeof(int);
-  
-  for (p = 0; p < nprocs; p++)
-    nnzToRecv[p] = 0;
-  
-  for (jb = 0; jb < nsupers; jb++) {
-    jbcol = PCOL( jb, grid );
-    jbrow = PROW( jb, grid );
-    p = PNUM(jbrow, jbcol, grid);
-    nnzToRecv[p] += grid->npcol;
-  }    
-  i = 0;
-  for (p = 0; p < nprocs; p++) {
-    ptrToRecv[p] = i;
-    i += nnzToRecv[p];
-    ptrToSend[p] = 0;
-    if (p != iam)
-      nnzToSend[p] = nnzToRecv[iam];
-    else
-      nnzToSend[p] = 0;
-  }
-  nnzToRecv[iam] = 0;
-  i = ptrToRecv[iam];
-  for (jb = 0; jb < nsupers; jb++) {
-    jbcol = PCOL( jb, grid );
-    jbrow = PROW( jb, grid );
-    p = PNUM(jbrow, jbcol, grid);
-    if (p == iam) {
-      ljb_j = LBj( jb, grid ); /* Local block number column wise */	
-      for (j = 0; j < grid->npcol; j++, i++)
-	recvBuf[i] = ToSendR[ljb_j][j];
-    }
-  }   
-  
-  MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, mpi_int_t,
-		 recvBuf, nnzToRecv, ptrToRecv, mpi_int_t, grid->comm);
-  
-  for (jb = 0; jb < nsupers; jb++) {
-    jbcol = PCOL( jb, grid );
-    jbrow = PROW( jb, grid );
-    p = PNUM(jbrow, jbcol, grid);
-    ljb_j = LBj( jb, grid ); /* Local block number column wise */	
-    ljb_i = LBi( jb, grid ); /* Local block number row wise */	
-    /* (myrow == jbrow) {
-       if (ToSendD[ljb_i] == YES)
-       ToRecv[jb] = 1;
-       }
-       else {
-       if (recvBuf[ptrToRecv[p] + mycol] == YES)
-       ToRecv[jb] = 2;
-       } */
-    if (recvBuf[ptrToRecv[p] + mycol] == YES) {
-      if (myrow == jbrow)
-	ToRecv[jb] = 1;
-      else
-	ToRecv[jb] = 2;
-    }
-    if (mycol == jbcol) {
-      for (i = 0, j = ptrToRecv[p]; i < grid->npcol; i++, j++) 
-	ToSendR[ljb_j][i] = recvBuf[j];  
-      ToSendR[ljb_j][mycol] = EMPTY;
-    }
-    ptrToRecv[p] += grid->npcol;
-  }   
-  
-  /* exchange information about bsendx_plist in between column of processors */
-  MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, mpi_int_t,
-		 MPI_MAX, grid->cscp.comm);
-  
-  for (jb = 0; jb < nsupers; jb ++) {
-    jbcol = PCOL( jb, grid);
-    jbrow = PROW( jb, grid);
-    if (mycol == jbcol) {
-      ljb_j = LBj( jb, grid ); /* Local block number column wise */	
-      if (myrow == jbrow ) {
-	for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) {
-	  (*bsendx_plist)[k] = recvBuf[k];
-	  if ((*bsendx_plist)[k] != EMPTY)
-	    nbsendx ++;
-	}
-      }
-      else {
-	for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) 
-	  (*bsendx_plist)[k] = EMPTY;
-      }
-    }
-  }
-  
-  SUPERLU_FREE(nnzToRecv);
-  SUPERLU_FREE(ptrToRecv);
-  SUPERLU_FREE(nnzToSend);
-  SUPERLU_FREE(ptrToSend);
-  SUPERLU_FREE(recvBuf);
-  
-  Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
-  Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
-  Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
-  Llu->Unzval_br_ptr = Unzval_br_ptr;
-  Llu->ToRecv = ToRecv;
-  Llu->ToSendD = ToSendD;
-  Llu->ToSendR = ToSendR;
-  Llu->fmod = fmod;
-  Llu->fsendx_plist = fsendx_plist;
-  Llu->nfrecvx = nfrecvx;
-  Llu->nfsendx = nfsendx;
-  Llu->bmod = bmod;
-  Llu->bsendx_plist = bsendx_plist;
-  Llu->nbrecvx = nbrecvx;
-  Llu->nbsendx = nbsendx;
-  Llu->ilsum = ilsum;
-  Llu->ldalsum = ldaspa;
-  LUstruct->Glu_persist = Glu_persist;	
-#if ( PRNTlevel>=1 )
-  if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n",
-		     nLblocks, nUblocks);
-#endif
-  
-  /* Find the maximum buffer size. */
-  MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, 
-		MPI_MAX, grid->comm);
-  
-#if ( DEBUGlevel>=1 )
-  /* Memory allocated but not freed:
-     ilsum, fmod, fsendx_plist, bmod, bsendx_plist,
-     ToRecv, ToSendR, ToSendD
-  */
-  CHECK_MALLOC(iam, "Exit dist_psymbtonum()");
-#endif
-    
-  return (- (memDist+memNLU));
-} /* dist_psymbtonum */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzutil.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzutil.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/pzutil.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/pzutil.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,530 +0,0 @@
-
-/*
- * -- Distributed SuperLU routine (version 2.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * March 15, 2003
- *
- */
-
-#include <math.h>
-#include "superlu_zdefs.h"
-
-/*
- * Gather A from the distributed compressed row format to
- * global A in compressed column format.
- */
-int pzCompRow_loc_to_CompCol_global
-(
- int_t need_value, /* Input. Whether need to gather numerical values */
- SuperMatrix *A,   /* Input. Distributed matrix in NRformat_loc format. */
- gridinfo_t *grid, /* Input */
- SuperMatrix *GA   /* Output */
-)
-{
-    NRformat_loc *Astore;
-    NCformat *GAstore;
-    doublecomplex *a, *a_loc;
-    int_t *colind, *rowptr;
-    int_t *colptr_loc, *rowind_loc;
-    int_t m_loc, n, i, j, k, l;
-    int_t colnnz, fst_row, m_loc_max, nnz_loc, nnz_max, nnz;
-    doublecomplex *a_recv;  /* Buffer to receive the blocks of values. */
-    doublecomplex *a_buf;   /* Buffer to merge blocks into block columns. */
-    int_t *colcnt, *itemp;
-    int_t *colptr_send; /* Buffer to redistribute the column pointers of the 
-			   local block rows.
-			   Use n_loc+1 pointers for each block. */
-    int_t *colptr_blk;  /* The column pointers for each block, after
-			   redistribution to the local block columns. 
-			   Use n_loc+1 pointers for each block. */
-    int_t *rowind_recv; /* Buffer to receive the blocks of row indices. */
-    int_t *rowind_buf;  /* Buffer to merge blocks into block columns. */
-    int_t *fst_rows, *n_locs;
-    int   *sendcnts, *sdispls, *recvcnts, *rdispls, *itemp_32;
-    int   it, n_loc, procs;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(grid->iam, "Enter pzCompRow_loc_to_CompCol_global");
-#endif
-
-    /* Initialization. */
-    n = A->ncol;
-    Astore = (NRformat_loc *) A->Store;
-    nnz_loc = Astore->nnz_loc;
-    m_loc = Astore->m_loc;
-    fst_row = Astore->fst_row;
-    a = Astore->nzval;
-    rowptr = Astore->rowptr;
-    colind = Astore->colind;
-    n_loc = m_loc; /* NOTE: CURRENTLY ONLY WORK FOR SQUARE MATRIX */
-
-    /* ------------------------------------------------------------
-       FIRST PHASE: TRANSFORM A INTO DISTRIBUTED COMPRESSED COLUMN.
-       ------------------------------------------------------------*/
-    zCompRow_to_CompCol_dist(m_loc, n, nnz_loc, a, colind, rowptr, &a_loc,
-                             &rowind_loc, &colptr_loc);
-    /* Change local row index numbers to global numbers. */
-    for (i = 0; i < nnz_loc; ++i) rowind_loc[i] += fst_row;
-
-#if ( DEBUGlevel>=2 )
-    printf("Proc %d\n", grid->iam);
-    PrintInt10("rowind_loc", nnz_loc, rowind_loc);
-    PrintInt10("colptr_loc", n+1, colptr_loc);
-#endif
-
-    procs = grid->nprow * grid->npcol;
-    if ( !(fst_rows = (int_t *) intMalloc_dist(2*procs)) )
-	  ABORT("Malloc fails for fst_rows[]");
-    n_locs = fst_rows + procs;
-    MPI_Allgather(&fst_row, 1, mpi_int_t, fst_rows, 1, mpi_int_t,
-		  grid->comm);
-    for (i = 0; i < procs-1; ++i) n_locs[i] = fst_rows[i+1] - fst_rows[i];
-    n_locs[procs-1] = n - fst_rows[procs-1];
-    if ( !(recvcnts = SUPERLU_MALLOC(5*procs * sizeof(int))) )
-	  ABORT("Malloc fails for recvcnts[]");
-    sendcnts = recvcnts + procs;
-    rdispls = sendcnts + procs;
-    sdispls = rdispls + procs;
-    itemp_32 = sdispls + procs;
-
-    /* All-to-all transfer column pointers of each block.
-       Now the matrix view is P-by-P block-partition. */
-    /* n column starts for each column, and procs column ends for each block */
-    if ( !(colptr_send = intMalloc_dist(n + procs)) )
-	   ABORT("Malloc fails for colptr_send[]");
-    if ( !(colptr_blk = intMalloc_dist( (((size_t) n_loc)+1)*procs)) )
-	   ABORT("Malloc fails for colptr_blk[]");
-    for (i = 0, j = 0; i < procs; ++i) {
-        for (k = j; k < j + n_locs[i]; ++k) colptr_send[i+k] = colptr_loc[k];
-	colptr_send[i+k] = colptr_loc[k]; /* Add an END marker */
-	sendcnts[i] = n_locs[i] + 1;
-#if ( DEBUGlevel>=1 )
-	assert(j == fst_rows[i]);
-#endif
-	sdispls[i] = j + i;
-	recvcnts[i] = n_loc + 1;
-	rdispls[i] = i * (n_loc + 1);
-	j += n_locs[i]; /* First column of next block in colptr_loc[] */
-    }
-    MPI_Alltoallv(colptr_send, sendcnts, sdispls, mpi_int_t,
-		  colptr_blk, recvcnts, rdispls, mpi_int_t, grid->comm);
-
-    /* Adjust colptr_blk[] so that they contain the local indices of the
-       column pointers in the receive buffer. */
-    nnz = 0; /* The running sum of the nonzeros counted by far */
-    k = 0;
-    for (i = 0; i < procs; ++i) {
-	for (j = rdispls[i]; j < rdispls[i] + n_loc; ++j) {
-	    colnnz = colptr_blk[j+1] - colptr_blk[j];
-	    /*assert(k<=j);*/
-	    colptr_blk[k] = nnz;
-	    nnz += colnnz; /* Start of the next column */
-	    ++k;
-	}
-	colptr_blk[k++] = nnz; /* Add an END marker for each block */
-    }
-    /*assert(k == (n_loc+1)*procs);*/
-
-    /* Now prepare to transfer row indices and values. */
-    sdispls[0] = 0;
-    for (i = 0; i < procs-1; ++i) {
-        sendcnts[i] = colptr_loc[fst_rows[i+1]] - colptr_loc[fst_rows[i]];
-	sdispls[i+1] = sdispls[i] + sendcnts[i];
-    }
-    sendcnts[procs-1] = colptr_loc[n] - colptr_loc[fst_rows[procs-1]];
-    for (i = 0; i < procs; ++i) {
-        j = rdispls[i]; /* Point to this block in colptr_blk[]. */
-	recvcnts[i] = colptr_blk[j+n_loc] - colptr_blk[j];
-    }
-    rdispls[0] = 0; /* Recompute rdispls[] for row indices. */
-    for (i = 0; i < procs-1; ++i) rdispls[i+1] = rdispls[i] + recvcnts[i];
-
-    k = rdispls[procs-1] + recvcnts[procs-1]; /* Total received */
-    if ( !(rowind_recv = (int_t *) intMalloc_dist(2*k)) )
-        ABORT("Malloc fails for rowind_recv[]");
-    rowind_buf = rowind_recv + k;
-    MPI_Alltoallv(rowind_loc, sendcnts, sdispls, mpi_int_t,
-		  rowind_recv, recvcnts, rdispls, mpi_int_t, grid->comm);
-    if ( need_value ) {
-        if ( !(a_recv = (doublecomplex *) doublecomplexMalloc_dist(2*k)) )
-	    ABORT("Malloc fails for rowind_recv[]");
-	a_buf = a_recv + k;
-	MPI_Alltoallv(a_loc, sendcnts, sdispls, SuperLU_MPI_DOUBLE_COMPLEX,
-                      a_recv, recvcnts, rdispls, SuperLU_MPI_DOUBLE_COMPLEX,
-                      grid->comm);
-    }
-      
-    /* Reset colptr_loc[] to point to the n_loc global columns. */
-    colptr_loc[0] = 0;
-    itemp = colptr_send;
-    for (j = 0; j < n_loc; ++j) {
-        colnnz = 0;
-	for (i = 0; i < procs; ++i) {
-	    k = i * (n_loc + 1) + j; /* j-th column in i-th block */
-	    colnnz += colptr_blk[k+1] - colptr_blk[k];
-	}
-	colptr_loc[j+1] = colptr_loc[j] + colnnz;
-	itemp[j] = colptr_loc[j]; /* Save a copy of the column starts */
-    }
-    itemp[n_loc] = colptr_loc[n_loc];
-      
-    /* Merge blocks of row indices into columns of row indices. */
-    for (i = 0; i < procs; ++i) {
-        k = i * (n_loc + 1);
-	for (j = 0; j < n_loc; ++j) { /* i-th block */
-	    for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) {
-	        rowind_buf[itemp[j]] = rowind_recv[l];
-		++itemp[j];
-	    }
-	}
-    }
-
-    if ( need_value ) {
-        for (j = 0; j < n_loc+1; ++j) itemp[j] = colptr_loc[j];
-        for (i = 0; i < procs; ++i) {
-	    k = i * (n_loc + 1);
-	    for (j = 0; j < n_loc; ++j) { /* i-th block */
-	        for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) {
-		    a_buf[itemp[j]] = a_recv[l];
-		    ++itemp[j];
-		}
-	    }
-	}
-    }
-
-    /* ------------------------------------------------------------
-       SECOND PHASE: GATHER TO GLOBAL A IN COMPRESSED COLUMN FORMAT.
-       ------------------------------------------------------------*/
-    GA->nrow  = A->nrow;
-    GA->ncol  = A->ncol;
-    GA->Stype = SLU_NC;
-    GA->Dtype = A->Dtype;
-    GA->Mtype = A->Mtype;
-    GAstore = GA->Store = (NCformat *) SUPERLU_MALLOC ( sizeof(NCformat) );
-    if ( !GAstore ) ABORT ("SUPERLU_MALLOC fails for GAstore");
-
-    /* First gather the size of each piece. */
-    nnz_loc = colptr_loc[n_loc];
-    MPI_Allgather(&nnz_loc, 1, mpi_int_t, itemp, 1, mpi_int_t, grid->comm);
-    for (i = 0, nnz = 0; i < procs; ++i) nnz += itemp[i];
-    GAstore->nnz = nnz;
-    
-    if ( !(GAstore->rowind = (int_t *) intMalloc_dist (nnz)) )
-        ABORT ("SUPERLU_MALLOC fails for GAstore->rowind[]");
-    if ( !(GAstore->colptr = (int_t *) intMalloc_dist (n+1)) )
-        ABORT ("SUPERLU_MALLOC fails for GAstore->colptr[]");
-      
-    /* Allgatherv for row indices. */
-    rdispls[0] = 0;
-    for (i = 0; i < procs-1; ++i) {
-        rdispls[i+1] = rdispls[i] + itemp[i];
-        itemp_32[i] = itemp[i];
-    }
-    itemp_32[procs-1] = itemp[procs-1];
-    it = nnz_loc;
-    MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind, 
-		   itemp_32, rdispls, mpi_int_t, grid->comm);
-    if ( need_value ) {
-      if ( !(GAstore->nzval = (doublecomplex *) doublecomplexMalloc_dist (nnz)) )
-          ABORT ("SUPERLU_MALLOC fails for GAstore->rnzval[]");
-      MPI_Allgatherv(a_buf, it, SuperLU_MPI_DOUBLE_COMPLEX, GAstore->nzval, 
-		     itemp_32, rdispls, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm);
-    } else GAstore->nzval = NULL;
-
-    /* Now gather the column pointers. */
-    rdispls[0] = 0;
-    for (i = 0; i < procs-1; ++i) {
-        rdispls[i+1] = rdispls[i] + n_locs[i];
-        itemp_32[i] = n_locs[i];
-    }
-    itemp_32[procs-1] = n_locs[procs-1];
-    MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr, 
-		   itemp_32, rdispls, mpi_int_t, grid->comm);
-
-    /* Recompute column pointers. */
-    for (i = 1; i < procs; ++i) {
-        k = rdispls[i];
-	for (j = 0; j < n_locs[i]; ++j) GAstore->colptr[k++] += itemp[i-1];
-	itemp[i] += itemp[i-1]; /* prefix sum */
-    }
-    GAstore->colptr[n] = nnz;
-
-#if ( DEBUGlevel>=2 )
-    if ( !grid->iam ) {
-        printf("After pdCompRow_loc_to_CompCol_global()\n");
-	zPrint_CompCol_Matrix_dist(GA);
-    }
-#endif
-
-    SUPERLU_FREE(a_loc);
-    SUPERLU_FREE(rowind_loc);
-    SUPERLU_FREE(colptr_loc);
-    SUPERLU_FREE(fst_rows);
-    SUPERLU_FREE(recvcnts);
-    SUPERLU_FREE(colptr_send);
-    SUPERLU_FREE(colptr_blk);
-    SUPERLU_FREE(rowind_recv);
-    if ( need_value) SUPERLU_FREE(a_recv);
-#if ( DEBUGlevel>=1 )
-    if ( !grid->iam ) printf("sizeof(NCformat) %d\n", sizeof(NCformat));
-    CHECK_MALLOC(grid->iam, "Exit pzCompRow_loc_to_CompCol_global");
-#endif
-    return 0;
-} /* pzCompRow_loc_to_CompCol_global */
-
-
-/*
- * Permute the distributed dense matrix: B <= perm(X).
- * perm[i] = j means the i-th row of X is in the j-th row of B.
- */
-int pzPermute_Dense_Matrix
-(
- int_t fst_row,
- int_t m_loc,
- int_t row_to_proc[],
- int_t perm[],
- doublecomplex X[], int ldx,
- doublecomplex B[], int ldb,
- int nrhs,
- gridinfo_t *grid
-)
-{
-    int_t i, j, k, l;
-    int p, procs;
-    int *sendcnts, *sendcnts_nrhs, *recvcnts, *recvcnts_nrhs;
-    int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs;
-    int *ptr_to_ibuf, *ptr_to_dbuf;
-    int_t *send_ibuf, *recv_ibuf;
-    doublecomplex *send_dbuf, *recv_dbuf;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(grid->iam, "Enter pzPermute_Dense_Matrix()");
-#endif
-
-    procs = grid->nprow * grid->npcol;
-    if ( !(sendcnts = SUPERLU_MALLOC(10*procs * sizeof(int))) )
-        ABORT("Malloc fails for sendcnts[].");
-    sendcnts_nrhs = sendcnts + procs;
-    recvcnts = sendcnts_nrhs + procs;
-    recvcnts_nrhs = recvcnts + procs;
-    sdispls = recvcnts_nrhs + procs;
-    sdispls_nrhs = sdispls + procs;
-    rdispls = sdispls_nrhs + procs;
-    rdispls_nrhs = rdispls + procs;
-    ptr_to_ibuf = rdispls_nrhs + procs;
-    ptr_to_dbuf = ptr_to_ibuf + procs;
-
-    for (i = 0; i < procs; ++i) sendcnts[i] = 0;
-
-    /* Count the number of X entries to be sent to each process.*/
-    for (i = fst_row; i < fst_row + m_loc; ++i) {
-        p = row_to_proc[perm[i]];
-	++sendcnts[p];
-    }
-    MPI_Alltoall(sendcnts, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm);
-    sdispls[0] = rdispls[0] = 0;
-    sdispls_nrhs[0] = rdispls_nrhs[0] = 0;
-    sendcnts_nrhs[0] = sendcnts[0] * nrhs;
-    recvcnts_nrhs[0] = recvcnts[0] * nrhs;
-    for (i = 1; i < procs; ++i) {
-        sdispls[i] = sdispls[i-1] + sendcnts[i-1];
-	sdispls_nrhs[i] = sdispls[i] * nrhs;
-	rdispls[i] = rdispls[i-1] + recvcnts[i-1];
-	rdispls_nrhs[i] = rdispls[i] * nrhs;
-	sendcnts_nrhs[i] = sendcnts[i] * nrhs;
-	recvcnts_nrhs[i] = recvcnts[i] * nrhs;
-    }
-    k = sdispls[procs-1] + sendcnts[procs-1];/* Total number of sends */
-    l = rdispls[procs-1] + recvcnts[procs-1];/* Total number of recvs */
-    /*assert(k == m_loc);*/
-    /*assert(l == m_loc);*/
-    if ( !(send_ibuf = intMalloc_dist(k + l)) )
-        ABORT("Malloc fails for send_ibuf[].");
-    recv_ibuf = send_ibuf + k;
-    if ( !(send_dbuf = doublecomplexMalloc_dist((k + l)*nrhs)) )
-        ABORT("Malloc fails for send_dbuf[].");
-    recv_dbuf = send_dbuf + k * nrhs;
-
-    for (i = 0; i < procs; ++i) {
-        ptr_to_ibuf[i] = sdispls[i];
-	ptr_to_dbuf[i] = sdispls_nrhs[i];
-    }
-
-    /* Fill in the send buffers: send_ibuf[] and send_dbuf[]. */
-    for (i = fst_row; i < fst_row + m_loc; ++i) {
-        j = perm[i];
-	p = row_to_proc[j];
-	send_ibuf[ptr_to_ibuf[p]] = j;
-	j = ptr_to_dbuf[p];
-	RHS_ITERATE(k) { /* RHS stored in row major in the buffer */
-	    send_dbuf[j++] = X[i-fst_row + k*ldx];
-	}
-	++ptr_to_ibuf[p];
-	ptr_to_dbuf[p] += nrhs;
-    }
-	  
-    /* Transfer the (permuted) row indices and numerical values. */
-    MPI_Alltoallv(send_ibuf, sendcnts, sdispls, mpi_int_t,
-		  recv_ibuf, recvcnts, rdispls, mpi_int_t, grid->comm);
-    MPI_Alltoallv(send_dbuf, sendcnts_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX,
-		  recv_dbuf, recvcnts_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX,
-		  grid->comm);
-
-    /* Copy the buffer into b. */
-    for (i = 0, l = 0; i < m_loc; ++i) {
-        j = recv_ibuf[i] - fst_row; /* Relative row number */
-	RHS_ITERATE(k) { /* RHS stored in row major in the buffer */
-	    B[j + k*ldb] = recv_dbuf[l++];
-	}
-    }
-
-    SUPERLU_FREE(sendcnts);
-    SUPERLU_FREE(send_ibuf);
-    SUPERLU_FREE(send_dbuf);
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(grid->iam, "Exit pzPermute_Dense_Matrix()");
-#endif
-    return 0;
-} /* pzPermute_Dense_Matrix */
-
-
-/*
- * Initialize the data structure for the solution phase.
- */
-int zSolveInit(superlu_options_t *options, SuperMatrix *A, 
-	       int_t perm_r[], int_t perm_c[], int_t nrhs,
-	       LUstruct_t *LUstruct, gridinfo_t *grid,
-	       SOLVEstruct_t *SOLVEstruct)
-{
-    int_t *row_to_proc, *inv_perm_c, *itemp;
-    NRformat_loc *Astore;
-    int_t        i, fst_row, m_loc, p;
-    int          procs;
-
-    Astore = (NRformat_loc *) A->Store;
-    fst_row = Astore->fst_row;
-    m_loc = Astore->m_loc;
-    procs = grid->nprow * grid->npcol;
-    
-    if ( !(row_to_proc = intMalloc_dist(A->nrow)) )
-	ABORT("Malloc fails for row_to_proc[]");
-    SOLVEstruct->row_to_proc = row_to_proc;
-    if ( !(inv_perm_c = intMalloc_dist(A->ncol)) )
-        ABORT("Malloc fails for inv_perm_c[].");
-    for (i = 0; i < A->ncol; ++i) inv_perm_c[perm_c[i]] = i;
-    SOLVEstruct->inv_perm_c = inv_perm_c;
-
-    /* ------------------------------------------------------------
-       EVERY PROCESS NEEDS TO KNOW GLOBAL PARTITION.
-       SET UP THE MAPPING BETWEEN ROWS AND PROCESSES.
-       
-       NOTE: For those processes that do not own any row, it must
-             must be set so that fst_row == A->nrow. 
-       ------------------------------------------------------------*/
-    if ( !(itemp = intMalloc_dist(procs+1)) )
-        ABORT("Malloc fails for itemp[]");
-    MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t,
-		  grid->comm);
-    itemp[procs] = A->nrow;
-    for (p = 0; p < procs; ++p) {
-        for (i = itemp[p] ; i < itemp[p+1]; ++i) row_to_proc[i] = p;
-    }
-#if ( DEBUGlevel>=2 )
-    if ( !grid->iam ) {
-      printf("fst_row = %d\n", fst_row);
-      PrintInt10("row_to_proc", A->nrow, row_to_proc);
-      PrintInt10("inv_perm_c", A->ncol, inv_perm_c);
-    }
-#endif
-    SUPERLU_FREE(itemp);
-
-#if 0
-    /* Compute the mapping between rows and processes. */
-    /* XSL NOTE: What happens if # of mapped processes is smaller
-       than total Procs?  For the processes without any row, let
-       fst_row be EMPTY (-1). Make sure this case works! */
-    MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t,
-		  grid->comm);
-    itemp[procs] = n;
-    for (p = 0; p < procs; ++p) {
-        j = itemp[p];
-	if ( j != EMPTY ) {
-	    k = itemp[p+1];
-	    if ( k == EMPTY ) k = n;
-	    for (i = j ; i < k; ++i) row_to_proc[i] = p;
-	}
-    }
-#endif    
-
-    get_diag_procs(A->ncol, LUstruct->Glu_persist, grid,
-		   &SOLVEstruct->num_diag_procs,
-		   &SOLVEstruct->diag_procs,
-		   &SOLVEstruct->diag_len);
-
-    if ( !(SOLVEstruct->gstrs_comm = (pxgstrs_comm_t *)
-	   SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) )
-        ABORT("Malloc fails for gstrs_comm[]");
-    pxgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, 
-		 LUstruct->Glu_persist, SOLVEstruct);
-
-    if ( !(SOLVEstruct->gsmv_comm = (pzgsmv_comm_t *)
-           SUPERLU_MALLOC(sizeof(pzgsmv_comm_t))) )
-        ABORT("Malloc fails for gsmv_comm[]");
-    SOLVEstruct->A_colind_gsmv = NULL;
-    
-    options->SolveInitialized = YES;
-    return 0;
-} /* zSolveInit */
-
-/*
- * Release the resources used for the solution phase.
- */
-void zSolveFinalize(superlu_options_t *options, SOLVEstruct_t *SOLVEstruct)
-{
-    int_t *it;
-    pxgstrs_finalize(SOLVEstruct->gstrs_comm);
-    if ( options->RefineInitialized ) {
-        pzgsmv_finalize(SOLVEstruct->gsmv_comm);
-	options->RefineInitialized = NO;
-    }
-    SUPERLU_FREE(SOLVEstruct->gsmv_comm);
-    SUPERLU_FREE(SOLVEstruct->row_to_proc);
-    SUPERLU_FREE(SOLVEstruct->inv_perm_c);
-    SUPERLU_FREE(SOLVEstruct->diag_procs);
-    SUPERLU_FREE(SOLVEstruct->diag_len);
-    if ( it = SOLVEstruct->A_colind_gsmv ) SUPERLU_FREE(it);
-    options->SolveInitialized = NO;
-} /* zSolveFinalize */
-
-/* 
- * Check the inf-norm of the error vector 
- */
-void pzinf_norm_error(int iam, int_t n, int_t nrhs, doublecomplex x[], int_t ldx,
-		      doublecomplex xtrue[], int_t ldxtrue, gridinfo_t *grid) 
-{
-    double err, xnorm, temperr, tempxnorm;
-    doublecomplex *x_work, *xtrue_work;
-    doublecomplex temp;
-    int i, j;
-
-    for (j = 0; j < nrhs; j++) {
-      x_work = &x[j*ldx];
-      xtrue_work = &xtrue[j*ldxtrue];
-      err = xnorm = 0.0;
-      for (i = 0; i < n; i++) {
-        z_sub(&temp, &x_work[i], &xtrue_work[i]);
-	err = SUPERLU_MAX(err, z_abs(&temp));
-	xnorm = SUPERLU_MAX(xnorm, z_abs(&x_work[i]));
-      }
-
-      /* get the golbal max err & xnrom */
-      temperr = err;
-      tempxnorm = xnorm;
-      MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, grid->comm);
-      MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, grid->comm);
-
-      err = err / xnorm;
-      if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err);
-    }
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/slamch.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/slamch.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/slamch.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/slamch.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,983 +0,0 @@
-#include <stdio.h>
-#include "Cnames.h"
-#define TRUE_ (1)
-#define FALSE_ (0)
-#define min(a,b) ((a) <= (b) ? (a) : (b))
-#define max(a,b) ((a) >= (b) ? (a) : (b))
-#define abs(x) ((x) >= 0 ? (x) : -(x))
-#define dabs(x) (double)abs(x)
-
-float slamch_(char *cmach)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    SLAMCH determines single precision machine parameters.   
-
-    Arguments   
-    =========   
-
-    CMACH   (input) CHARACTER*1   
-            Specifies the value to be returned by SLAMCH:   
-            = 'E' or 'e',   SLAMCH := eps   
-            = 'S' or 's ,   SLAMCH := sfmin   
-            = 'B' or 'b',   SLAMCH := base   
-            = 'P' or 'p',   SLAMCH := eps*base   
-            = 'N' or 'n',   SLAMCH := t   
-            = 'R' or 'r',   SLAMCH := rnd   
-            = 'M' or 'm',   SLAMCH := emin   
-            = 'U' or 'u',   SLAMCH := rmin   
-            = 'L' or 'l',   SLAMCH := emax   
-            = 'O' or 'o',   SLAMCH := rmax   
-
-            where   
-
-            eps   = relative machine precision   
-            sfmin = safe minimum, such that 1/sfmin does not overflow   
-            base  = base of the machine   
-            prec  = eps*base   
-            t     = number of (base) digits in the mantissa   
-            rnd   = 1.0 when rounding occurs in addition, 0.0 otherwise   
-            emin  = minimum exponent before (gradual) underflow   
-            rmin  = underflow threshold - base**(emin-1)   
-            emax  = largest exponent before overflow   
-            rmax  = overflow threshold  - (base**emax)*(1-eps)   
-
-   ===================================================================== 
-*/
-/* >>Start of File<<   
-       Initialized data */
-    static int first = TRUE_;
-    /* System generated locals */
-    int i__1;
-    float ret_val;
-    /* Builtin functions */
-    double pow_ri(float *, int *);
-    /* Local variables */
-    static float base;
-    static int beta;
-    static float emin, prec, emax;
-    static int imin, imax;
-    static int lrnd;
-    static float rmin, rmax, t, rmach;
-    extern int lsame_(char *, char *);
-    static float small, sfmin;
-    extern /* Subroutine */ int slamc2_(int *, int *, int *, float 
-	    *, int *, float *, int *, float *);
-    static int it;
-    static float rnd, eps;
-
-
-
-    if (first) {
-	first = FALSE_;
-	slamc2_(&beta, &it, &lrnd, &eps, &imin, &rmin, &imax, &rmax);
-	base = (float) beta;
-	t = (float) it;
-	if (lrnd) {
-	    rnd = 1.f;
-	    i__1 = 1 - it;
-	    eps = pow_ri(&base, &i__1) / 2;
-	} else {
-	    rnd = 0.f;
-	    i__1 = 1 - it;
-	    eps = pow_ri(&base, &i__1);
-	}
-	prec = eps * base;
-	emin = (float) imin;
-	emax = (float) imax;
-	sfmin = rmin;
-	small = 1.f / rmax;
-	if (small >= sfmin) {
-
-/*           Use SMALL plus a bit, to avoid the possibility of rou
-nding   
-             causing overflow when computing  1/sfmin. */
-
-	    sfmin = small * (eps + 1.f);
-	}
-    }
-
-    if (lsame_(cmach, "E")) {
-	rmach = eps;
-    } else if (lsame_(cmach, "S")) {
-	rmach = sfmin;
-    } else if (lsame_(cmach, "B")) {
-	rmach = base;
-    } else if (lsame_(cmach, "P")) {
-	rmach = prec;
-    } else if (lsame_(cmach, "N")) {
-	rmach = t;
-    } else if (lsame_(cmach, "R")) {
-	rmach = rnd;
-    } else if (lsame_(cmach, "M")) {
-	rmach = emin;
-    } else if (lsame_(cmach, "U")) {
-	rmach = rmin;
-    } else if (lsame_(cmach, "L")) {
-	rmach = emax;
-    } else if (lsame_(cmach, "O")) {
-	rmach = rmax;
-    }
-
-    ret_val = rmach;
-    return ret_val;
-
-/*     End of SLAMCH */
-
-} /* slamch_ */
-
-
-/* Subroutine */ int slamc1_(int *beta, int *t, int *rnd, int 
-	*ieee1)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    SLAMC1 determines the machine parameters given by BETA, T, RND, and   
-    IEEE1.   
-
-    Arguments   
-    =========   
-
-    BETA    (output) INT   
-            The base of the machine.   
-
-    T       (output) INT   
-            The number of ( BETA ) digits in the mantissa.   
-
-    RND     (output) INT   
-            Specifies whether proper rounding  ( RND = .TRUE. )  or   
-            chopping  ( RND = .FALSE. )  occurs in addition. This may not 
-  
-            be a reliable guide to the way in which the machine performs 
-  
-            its arithmetic.   
-
-    IEEE1   (output) INT   
-            Specifies whether rounding appears to be done in the IEEE   
-            'round to nearest' style.   
-
-    Further Details   
-    ===============   
-
-    The routine is based on the routine  ENVRON  by Malcolm and   
-    incorporates suggestions by Gentleman and Marovich. See   
-
-       Malcolm M. A. (1972) Algorithms to reveal properties of   
-          floating-point arithmetic. Comms. of the ACM, 15, 949-951.   
-
-       Gentleman W. M. and Marovich S. B. (1974) More on algorithms   
-          that reveal properties of floating point arithmetic units.   
-          Comms. of the ACM, 17, 276-277.   
-
-   ===================================================================== 
-*/
-    /* Initialized data */
-    static int first = TRUE_;
-    /* System generated locals */
-    float r__1, r__2;
-    /* Local variables */
-    static int lrnd;
-    static float a, b, c, f;
-    static int lbeta;
-    static float savec;
-    static int lieee1;
-    static float t1, t2;
-    extern double slamc3_(float *, float *);
-    static int lt;
-    static float one, qtr;
-
-
-
-    if (first) {
-	first = FALSE_;
-	one = 1.f;
-
-/*        LBETA,  LIEEE1,  LT and  LRND  are the  local values  of  BE
-TA,   
-          IEEE1, T and RND.   
-
-          Throughout this routine  we use the function  SLAMC3  to ens
-ure   
-          that relevant values are  stored and not held in registers, 
- or   
-          are not affected by optimizers.   
-
-          Compute  a = 2.0**m  with the  smallest positive integer m s
-uch   
-          that   
-
-             fl( a + 1.0 ) = a. */
-
-	a = 1.f;
-	c = 1.f;
-
-/* +       WHILE( C.EQ.ONE )LOOP */
-L10:
-	if (c == one) {
-	    a *= 2;
-	    c = slamc3_(&a, &one);
-	    r__1 = -(double)a;
-	    c = slamc3_(&c, &r__1);
-	    goto L10;
-	}
-/* +       END WHILE   
-
-          Now compute  b = 2.0**m  with the smallest positive integer 
-m   
-          such that   
-
-             fl( a + b ) .gt. a. */
-
-	b = 1.f;
-	c = slamc3_(&a, &b);
-
-/* +       WHILE( C.EQ.A )LOOP */
-L20:
-	if (c == a) {
-	    b *= 2;
-	    c = slamc3_(&a, &b);
-	    goto L20;
-	}
-/* +       END WHILE   
-
-          Now compute the base.  a and c  are neighbouring floating po
-int   
-          numbers  in the  interval  ( beta**t, beta**( t + 1 ) )  and
- so   
-          their difference is beta. Adding 0.25 to c is to ensure that
- it   
-          is truncated to beta and not ( beta - 1 ). */
-
-	qtr = one / 4;
-	savec = c;
-	r__1 = -(double)a;
-	c = slamc3_(&c, &r__1);
-	lbeta = c + qtr;
-
-/*        Now determine whether rounding or chopping occurs,  by addin
-g a   
-          bit  less  than  beta/2  and a  bit  more  than  beta/2  to 
- a. */
-
-	b = (float) lbeta;
-	r__1 = b / 2;
-	r__2 = -(double)b / 100;
-	f = slamc3_(&r__1, &r__2);
-	c = slamc3_(&f, &a);
-	if (c == a) {
-	    lrnd = TRUE_;
-	} else {
-	    lrnd = FALSE_;
-	}
-	r__1 = b / 2;
-	r__2 = b / 100;
-	f = slamc3_(&r__1, &r__2);
-	c = slamc3_(&f, &a);
-	if (lrnd && c == a) {
-	    lrnd = FALSE_;
-	}
-
-/*        Try and decide whether rounding is done in the  IEEE  'round
- to   
-          nearest' style. B/2 is half a unit in the last place of the 
-two   
-          numbers A and SAVEC. Furthermore, A is even, i.e. has last  
-bit   
-          zero, and SAVEC is odd. Thus adding B/2 to A should not  cha
-nge   
-          A, but adding B/2 to SAVEC should change SAVEC. */
-
-	r__1 = b / 2;
-	t1 = slamc3_(&r__1, &a);
-	r__1 = b / 2;
-	t2 = slamc3_(&r__1, &savec);
-	lieee1 = t1 == a && t2 > savec && lrnd;
-
-/*        Now find  the  mantissa, t.  It should  be the  integer part
- of   
-          log to the base beta of a,  however it is safer to determine
-  t   
-          by powering.  So we find t as the smallest positive integer 
-for   
-          which   
-
-             fl( beta**t + 1.0 ) = 1.0. */
-
-	lt = 0;
-	a = 1.f;
-	c = 1.f;
-
-/* +       WHILE( C.EQ.ONE )LOOP */
-L30:
-	if (c == one) {
-	    ++lt;
-	    a *= lbeta;
-	    c = slamc3_(&a, &one);
-	    r__1 = -(double)a;
-	    c = slamc3_(&c, &r__1);
-	    goto L30;
-	}
-/* +       END WHILE */
-
-    }
-
-    *beta = lbeta;
-    *t = lt;
-    *rnd = lrnd;
-    *ieee1 = lieee1;
-    return 0;
-
-/*     End of SLAMC1 */
-
-} /* slamc1_ */
-
-
-/* Subroutine */ int slamc2_(int *beta, int *t, int *rnd, float *
-	eps, int *emin, float *rmin, int *emax, float *rmax)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    SLAMC2 determines the machine parameters specified in its argument   
-    list.   
-
-    Arguments   
-    =========   
-
-    BETA    (output) INT   
-            The base of the machine.   
-
-    T       (output) INT   
-            The number of ( BETA ) digits in the mantissa.   
-
-    RND     (output) INT   
-            Specifies whether proper rounding  ( RND = .TRUE. )  or   
-            chopping  ( RND = .FALSE. )  occurs in addition. This may not 
-  
-            be a reliable guide to the way in which the machine performs 
-  
-            its arithmetic.   
-
-    EPS     (output) FLOAT   
-            The smallest positive number such that   
-
-               fl( 1.0 - EPS ) .LT. 1.0,   
-
-            where fl denotes the computed value.   
-
-    EMIN    (output) INT   
-            The minimum exponent before (gradual) underflow occurs.   
-
-    RMIN    (output) FLOAT   
-            The smallest normalized number for the machine, given by   
-            BASE**( EMIN - 1 ), where  BASE  is the floating point value 
-  
-            of BETA.   
-
-    EMAX    (output) INT   
-            The maximum exponent before overflow occurs.   
-
-    RMAX    (output) FLOAT   
-            The largest positive number for the machine, given by   
-            BASE**EMAX * ( 1 - EPS ), where  BASE  is the floating point 
-  
-            value of BETA.   
-
-    Further Details   
-    ===============   
-
-    The computation of  EPS  is based on a routine PARANOIA by   
-    W. Kahan of the University of California at Berkeley.   
-
-   ===================================================================== 
-*/
-    /* Table of constant values */
-    static int c__1 = 1;
-    
-    /* Initialized data */
-    static int first = TRUE_;
-    static int iwarn = FALSE_;
-    /* System generated locals */
-    int i__1;
-    float r__1, r__2, r__3, r__4, r__5;
-    /* Builtin functions */
-    double pow_ri(float *, int *);
-    /* Local variables */
-    static int ieee;
-    static float half;
-    static int lrnd;
-    static float leps, zero, a, b, c;
-    static int i, lbeta;
-    static float rbase;
-    static int lemin, lemax, gnmin;
-    static float small;
-    static int gpmin;
-    static float third, lrmin, lrmax, sixth;
-    static int lieee1;
-    extern /* Subroutine */ int slamc1_(int *, int *, int *, 
-	    int *);
-    extern double slamc3_(float *, float *);
-    extern /* Subroutine */ int slamc4_(int *, float *, int *), 
-	    slamc5_(int *, int *, int *, int *, int *, 
-	    float *);
-    static int lt, ngnmin, ngpmin;
-    static float one, two;
-
-
-
-    if (first) {
-	first = FALSE_;
-	zero = 0.f;
-	one = 1.f;
-	two = 2.f;
-
-/*        LBETA, LT, LRND, LEPS, LEMIN and LRMIN  are the local values
- of   
-          BETA, T, RND, EPS, EMIN and RMIN.   
-
-          Throughout this routine  we use the function  SLAMC3  to ens
-ure   
-          that relevant values are stored  and not held in registers, 
- or   
-          are not affected by optimizers.   
-
-          SLAMC1 returns the parameters  LBETA, LT, LRND and LIEEE1. 
-*/
-
-	slamc1_(&lbeta, &lt, &lrnd, &lieee1);
-
-/*        Start to find EPS. */
-
-	b = (float) lbeta;
-	i__1 = -lt;
-	a = pow_ri(&b, &i__1);
-	leps = a;
-
-/*        Try some tricks to see whether or not this is the correct  E
-PS. */
-
-	b = two / 3;
-	half = one / 2;
-	r__1 = -(double)half;
-	sixth = slamc3_(&b, &r__1);
-	third = slamc3_(&sixth, &sixth);
-	r__1 = -(double)half;
-	b = slamc3_(&third, &r__1);
-	b = slamc3_(&b, &sixth);
-	b = dabs(b);
-	if (b < leps) {
-	    b = leps;
-	}
-
-	leps = 1.f;
-
-/* +       WHILE( ( LEPS.GT.B ).AND.( B.GT.ZERO ) )LOOP */
-L10:
-	if (leps > b && b > zero) {
-	    leps = b;
-	    r__1 = half * leps;
-/* Computing 5th power */
-	    r__3 = two, r__4 = r__3, r__3 *= r__3;
-/* Computing 2nd power */
-	    r__5 = leps;
-	    r__2 = r__4 * (r__3 * r__3) * (r__5 * r__5);
-	    c = slamc3_(&r__1, &r__2);
-	    r__1 = -(double)c;
-	    c = slamc3_(&half, &r__1);
-	    b = slamc3_(&half, &c);
-	    r__1 = -(double)b;
-	    c = slamc3_(&half, &r__1);
-	    b = slamc3_(&half, &c);
-	    goto L10;
-	}
-/* +       END WHILE */
-
-	if (a < leps) {
-	    leps = a;
-	}
-
-/*        Computation of EPS complete.   
-
-          Now find  EMIN.  Let A = + or - 1, and + or - (1 + BASE**(-3
-)).   
-          Keep dividing  A by BETA until (gradual) underflow occurs. T
-his   
-          is detected when we cannot recover the previous A. */
-
-	rbase = one / lbeta;
-	small = one;
-	for (i = 1; i <= 3; ++i) {
-	    r__1 = small * rbase;
-	    small = slamc3_(&r__1, &zero);
-/* L20: */
-	}
-	a = slamc3_(&one, &small);
-	slamc4_(&ngpmin, &one, &lbeta);
-	r__1 = -(double)one;
-	slamc4_(&ngnmin, &r__1, &lbeta);
-	slamc4_(&gpmin, &a, &lbeta);
-	r__1 = -(double)a;
-	slamc4_(&gnmin, &r__1, &lbeta);
-	ieee = FALSE_;
-
-	if (ngpmin == ngnmin && gpmin == gnmin) {
-	    if (ngpmin == gpmin) {
-		lemin = ngpmin;
-/*            ( Non twos-complement machines, no gradual under
-flow;   
-                e.g.,  VAX ) */
-	    } else if (gpmin - ngpmin == 3) {
-		lemin = ngpmin - 1 + lt;
-		ieee = TRUE_;
-/*            ( Non twos-complement machines, with gradual und
-erflow;   
-                e.g., IEEE standard followers ) */
-	    } else {
-		lemin = min(ngpmin,gpmin);
-/*            ( A guess; no known machine ) */
-		iwarn = TRUE_;
-	    }
-
-	} else if (ngpmin == gpmin && ngnmin == gnmin) {
-	    if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1) {
-		lemin = max(ngpmin,ngnmin);
-/*            ( Twos-complement machines, no gradual underflow
-;   
-                e.g., CYBER 205 ) */
-	    } else {
-		lemin = min(ngpmin,ngnmin);
-/*            ( A guess; no known machine ) */
-		iwarn = TRUE_;
-	    }
-
-	} else if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1 && gpmin == gnmin)
-		 {
-	    if (gpmin - min(ngpmin,ngnmin) == 3) {
-		lemin = max(ngpmin,ngnmin) - 1 + lt;
-/*            ( Twos-complement machines with gradual underflo
-w;   
-                no known machine ) */
-	    } else {
-		lemin = min(ngpmin,ngnmin);
-/*            ( A guess; no known machine ) */
-		iwarn = TRUE_;
-	    }
-
-	} else {
-/* Computing MIN */
-	    i__1 = min(ngpmin,ngnmin), i__1 = min(i__1,gpmin);
-	    lemin = min(i__1,gnmin);
-/*         ( A guess; no known machine ) */
-	    iwarn = TRUE_;
-	}
-/* **   
-   Comment out this if block if EMIN is ok */
-	if (iwarn) {
-	    first = TRUE_;
-	    printf("\n\n WARNING. The value EMIN may be incorrect:- ");
-	    printf("EMIN = %8i\n",lemin);
-	    printf("If, after inspection, the value EMIN looks acceptable");
-            printf("please comment out \n the IF block as marked within the"); 
-            printf("code of routine SLAMC2, \n otherwise supply EMIN"); 
-            printf("explicitly.\n");
-	}
-/* **   
-
-          Assume IEEE arithmetic if we found denormalised  numbers abo
-ve,   
-          or if arithmetic seems to round in the  IEEE style,  determi
-ned   
-          in routine SLAMC1. A true IEEE machine should have both  thi
-ngs   
-          true; however, faulty machines may have one or the other. */
-
-	ieee = ieee || lieee1;
-
-/*        Compute  RMIN by successive division by  BETA. We could comp
-ute   
-          RMIN as BASE**( EMIN - 1 ),  but some machines underflow dur
-ing   
-          this computation. */
-
-	lrmin = 1.f;
-	i__1 = 1 - lemin;
-	for (i = 1; i <= 1-lemin; ++i) {
-	    r__1 = lrmin * rbase;
-	    lrmin = slamc3_(&r__1, &zero);
-/* L30: */
-	}
-
-/*        Finally, call SLAMC5 to compute EMAX and RMAX. */
-
-	slamc5_(&lbeta, &lt, &lemin, &ieee, &lemax, &lrmax);
-    }
-
-    *beta = lbeta;
-    *t = lt;
-    *rnd = lrnd;
-    *eps = leps;
-    *emin = lemin;
-    *rmin = lrmin;
-    *emax = lemax;
-    *rmax = lrmax;
-
-    return 0;
-
-
-/*     End of SLAMC2 */
-
-} /* slamc2_ */
-
-
-double slamc3_(float *a, float *b)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    SLAMC3  is intended to force  A  and  B  to be stored prior to doing 
-  
-    the addition of  A  and  B ,  for use in situations where optimizers 
-  
-    might hold one of these in a register.   
-
-    Arguments   
-    =========   
-
-    A, B    (input) FLOAT   
-            The values A and B.   
-
-   ===================================================================== 
-*/
-/* >>Start of File<<   
-       System generated locals */
-    float ret_val;
-
-
-
-    ret_val = *a + *b;
-
-    return ret_val;
-
-/*     End of SLAMC3 */
-
-} /* slamc3_ */
-
-
-/* Subroutine */ int slamc4_(int *emin, float *start, int *base)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    SLAMC4 is a service routine for SLAMC2.   
-
-    Arguments   
-    =========   
-
-    EMIN    (output) EMIN   
-            The minimum exponent before (gradual) underflow, computed by 
-  
-            setting A = START and dividing by BASE until the previous A   
-            can not be recovered.   
-
-    START   (input) FLOAT   
-            The starting point for determining EMIN.   
-
-    BASE    (input) INT   
-            The base of the machine.   
-
-   ===================================================================== 
-*/
-    /* System generated locals */
-    int i__1;
-    float r__1;
-    /* Local variables */
-    static float zero, a;
-    static int i;
-    static float rbase, b1, b2, c1, c2, d1, d2;
-    extern double slamc3_(float *, float *);
-    static float one;
-
-
-
-    a = *start;
-    one = 1.f;
-    rbase = one / *base;
-    zero = 0.f;
-    *emin = 1;
-    r__1 = a * rbase;
-    b1 = slamc3_(&r__1, &zero);
-    c1 = a;
-    c2 = a;
-    d1 = a;
-    d2 = a;
-/* +    WHILE( ( C1.EQ.A ).AND.( C2.EQ.A ).AND.   
-      $       ( D1.EQ.A ).AND.( D2.EQ.A )      )LOOP */
-L10:
-    if (c1 == a && c2 == a && d1 == a && d2 == a) {
-	--(*emin);
-	a = b1;
-	r__1 = a / *base;
-	b1 = slamc3_(&r__1, &zero);
-	r__1 = b1 * *base;
-	c1 = slamc3_(&r__1, &zero);
-	d1 = zero;
-	i__1 = *base;
-	for (i = 1; i <= *base; ++i) {
-	    d1 += b1;
-/* L20: */
-	}
-	r__1 = a * rbase;
-	b2 = slamc3_(&r__1, &zero);
-	r__1 = b2 / rbase;
-	c2 = slamc3_(&r__1, &zero);
-	d2 = zero;
-	i__1 = *base;
-	for (i = 1; i <= *base; ++i) {
-	    d2 += b2;
-/* L30: */
-	}
-	goto L10;
-    }
-/* +    END WHILE */
-
-    return 0;
-
-/*     End of SLAMC4 */
-
-} /* slamc4_ */
-
-
-/* Subroutine */ int slamc5_(int *beta, int *p, int *emin, 
-	int *ieee, int *emax, float *rmax)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    SLAMC5 attempts to compute RMAX, the largest machine floating-point   
-    number, without overflow.  It assumes that EMAX + abs(EMIN) sum   
-    approximately to a power of 2.  It will fail on machines where this   
-    assumption does not hold, for example, the Cyber 205 (EMIN = -28625, 
-  
-    EMAX = 28718).  It will also fail if the value supplied for EMIN is   
-    too large (i.e. too close to zero), probably with overflow.   
-
-    Arguments   
-    =========   
-
-    BETA    (input) INT   
-            The base of floating-point arithmetic.   
-
-    P       (input) INT   
-            The number of base BETA digits in the mantissa of a   
-            floating-point value.   
-
-    EMIN    (input) INT   
-            The minimum exponent before (gradual) underflow.   
-
-    IEEE    (input) INT   
-            A logical flag specifying whether or not the arithmetic   
-            system is thought to comply with the IEEE standard.   
-
-    EMAX    (output) INT   
-            The largest exponent before overflow   
-
-    RMAX    (output) FLOAT   
-            The largest machine floating-point number.   
-
-   ===================================================================== 
-  
-
-
-       First compute LEXP and UEXP, two powers of 2 that bound   
-       abs(EMIN). We then assume that EMAX + abs(EMIN) will sum   
-       approximately to the bound that is closest to abs(EMIN).   
-       (EMAX is the exponent of the required number RMAX). */
-    /* Table of constant values */
-    static float c_b5 = 0.f;
-    
-    /* System generated locals */
-    int i__1;
-    float r__1;
-    /* Local variables */
-    static int lexp;
-    static float oldy;
-    static int uexp, i;
-    static float y, z;
-    static int nbits;
-    extern double slamc3_(float *, float *);
-    static float recbas;
-    static int exbits, expsum, try__;
-
-
-
-    lexp = 1;
-    exbits = 1;
-L10:
-    try__ = lexp << 1;
-    if (try__ <= -(*emin)) {
-	lexp = try__;
-	++exbits;
-	goto L10;
-    }
-    if (lexp == -(*emin)) {
-	uexp = lexp;
-    } else {
-	uexp = try__;
-	++exbits;
-    }
-
-/*     Now -LEXP is less than or equal to EMIN, and -UEXP is greater   
-       than or equal to EMIN. EXBITS is the number of bits needed to   
-       store the exponent. */
-
-    if (uexp + *emin > -lexp - *emin) {
-	expsum = lexp << 1;
-    } else {
-	expsum = uexp << 1;
-    }
-
-/*     EXPSUM is the exponent range, approximately equal to   
-       EMAX - EMIN + 1 . */
-
-    *emax = expsum + *emin - 1;
-    nbits = exbits + 1 + *p;
-
-/*     NBITS is the total number of bits needed to store a   
-       floating-point number. */
-
-    if (nbits % 2 == 1 && *beta == 2) {
-
-/*        Either there are an odd number of bits used to store a   
-          floating-point number, which is unlikely, or some bits are 
-  
-          not used in the representation of numbers, which is possible
-,   
-          (e.g. Cray machines) or the mantissa has an implicit bit,   
-          (e.g. IEEE machines, Dec Vax machines), which is perhaps the
-   
-          most likely. We have to assume the last alternative.   
-          If this is true, then we need to reduce EMAX by one because 
-  
-          there must be some way of representing zero in an implicit-b
-it   
-          system. On machines like Cray, we are reducing EMAX by one 
-  
-          unnecessarily. */
-
-	--(*emax);
-    }
-
-    if (*ieee) {
-
-/*        Assume we are on an IEEE machine which reserves one exponent
-   
-          for infinity and NaN. */
-
-	--(*emax);
-    }
-
-/*     Now create RMAX, the largest machine number, which should   
-       be equal to (1.0 - BETA**(-P)) * BETA**EMAX .   
-
-       First compute 1.0 - BETA**(-P), being careful that the   
-       result is less than 1.0 . */
-
-    recbas = 1.f / *beta;
-    z = *beta - 1.f;
-    y = 0.f;
-    i__1 = *p;
-    for (i = 1; i <= *p; ++i) {
-	z *= recbas;
-	if (y < 1.f) {
-	    oldy = y;
-	}
-	y = slamc3_(&y, &z);
-/* L20: */
-    }
-    if (y >= 1.f) {
-	y = oldy;
-    }
-
-/*     Now multiply by BETA**EMAX to get RMAX. */
-
-    i__1 = *emax;
-    for (i = 1; i <= *emax; ++i) {
-	r__1 = y * *beta;
-	y = slamc3_(&r__1, &c_b5);
-/* L30: */
-    }
-
-    *rmax = y;
-    return 0;
-
-/*     End of SLAMC5 */
-
-} /* slamc5_ */
-
-
-double pow_ri(float *ap, int *bp)
-{
-double pow, x;
-int n;
-
-pow = 1;
-x = *ap;
-n = *bp;
-
-if(n != 0)
-	{
-	if(n < 0)
-		{
-		n = -n;
-		x = 1/x;
-		}
-	for( ; ; )
-		{
-		if(n & 01)
-			pow *= x;
-		if(n >>= 1)
-			x *= x;
-		else
-			break;
-		}
-	}
-return(pow);
-}
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/sp_colorder.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/sp_colorder.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/sp_colorder.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/sp_colorder.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,221 +0,0 @@
-#include "superlu_ddefs.h"
-
-
-int check_perm_dist(char *, int_t, int_t *);
-
-void
-sp_colorder(superlu_options_t *options,  SuperMatrix *A, int_t *perm_c, 
-	    int_t *etree, SuperMatrix *AC)
-{
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- *
- * Purpose
- * =======
- *
- * sp_colorder() permutes the columns of the original matrix. It performs
- * the following steps:
- *
- *    1. Apply column permutation perm_c[] to A's column pointers to form AC;
- *
- *    2. If options->Fact = DOFACT, then
- *       (1) Compute column elimination tree etree[] of AC'AC;
- *       (2) Post order etree[] to get a postordered elimination tree etree[],
- *           and a postorder permutation post[];
- *       (3) Apply post[] permutation to columns of AC;
- *       (4) Overwrite perm_c[] with the product perm_c * post.
- *
- * Arguments
- * =========
- *
- * options (input) superlu_options_t*
- *         Specifies whether or not the elimination tree will be re-used.
- *         If options->Fact == DOFACT, this means first time factor A, 
- *         etree is computed and output.
- *         Otherwise, re-factor A, etree is input, unchanged on exit.
- *
- * A       (input) SuperMatrix*
- *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
- *         of the linear equations is A->nrow. Currently, the type of A can be:
- *         Stype = SLU_NC or SLU_NCP; Dtype = SLU__D; Mtype = SLU_GE.
- *         In the future, more general A can be handled.
- *
- * perm_c  (input/output) int*
- *	   Column permutation vector of size A->ncol, which defines the 
- *         permutation matrix Pc; perm_c[i] = j means column i of A is 
- *         in position j in A*Pc.
- *         If options->Fact == DOFACT, perm_c is both input and output.
- *         On output, it is changed according to a postorder of etree.
- *         Otherwise, perm_c is input.
- *         
- * etree   (input/output) int*
- *         Elimination tree of Pc*(A'+A)*Pc', dimension A->ncol.
- *         If options->Fact == DOFACT, etree is an output argument,
- *         otherwise it is an input argument.
- *         Note: etree is a vector of parent pointers for a forest whose
- *         vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
- *
- * AC      (output) SuperMatrix*
- *         The resulting matrix after applied the column permutation
- *         perm_c[] to matrix A. The type of AC can be:
- *         Stype = SLU_NCP; Dtype = A->Dtype; Mtype = SLU_GE.
- *
- */
-
-    NCformat  *Astore;
-    NCPformat *ACstore;
-    int_t       *iwork, *post;
-    register  int_t n, i;
-#if ( DEBUGlevel>=1 )
-    int iam;
-    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
-    CHECK_MALLOC(iam, "Enter sp_colorder()");
-#endif
-
-    n = A->ncol;
-    
-    /* Apply column permutation perm_c to A's column pointers so to
-       obtain NCP format in AC = A*Pc.  */
-    AC->Stype       = SLU_NCP;
-    AC->Dtype       = A->Dtype;
-    AC->Mtype       = A->Mtype;
-    AC->nrow        = A->nrow;
-    AC->ncol        = A->ncol;
-    Astore          = A->Store;
-    ACstore = AC->Store = (void *) SUPERLU_MALLOC( sizeof(NCPformat) );
-    if ( !ACstore ) ABORT("SUPERLU_MALLOC fails for ACstore");
-    ACstore->nnz    = Astore->nnz;
-    ACstore->nzval  = Astore->nzval;
-    ACstore->rowind = Astore->rowind;
-    ACstore->colbeg = (int_t*) SUPERLU_MALLOC(n*sizeof(int_t));
-    if ( !(ACstore->colbeg) ) ABORT("SUPERLU_MALLOC fails for ACstore->colbeg");
-    ACstore->colend = (int_t*) SUPERLU_MALLOC(n*sizeof(int_t));
-    if ( !(ACstore->colend) ) ABORT("SUPERLU_MALLOC fails for ACstore->colend");
-
-#if ( DEBUGlevel>=3 )
-    if ( !iam ) {
-	PrintInt10("pre_order:", n, perm_c);
-	check_perm_dist("Initial perm_c", n, perm_c);
-    }
-#endif      
-
-    for (i = 0; i < n; i++) {
-	ACstore->colbeg[perm_c[i]] = Astore->colptr[i]; 
-	ACstore->colend[perm_c[i]] = Astore->colptr[i+1];
-    }
-	
-    if ( options->Fact == DOFACT ) { 
-	/* Factor A "from scratch" -- we also compute the etree, and
-	 * make perm_c consistent with the postorder of the etree.
-	 */
-
-	iwork = (int_t*) SUPERLU_MALLOC((n+1)*sizeof(int_t)); 
-	if ( !iwork ) ABORT("SUPERLU_MALLOC fails for iwork[]");
-
-	if ( A->nrow != A->ncol  /* Rectangular matrix */
-	     || options->ColPerm == MMD_ATA ) {
-	    /* Compute the column etree of A*Pc'. */
-	    sp_coletree_dist(ACstore->colbeg, ACstore->colend, ACstore->rowind,
-			     A->nrow, A->ncol, etree);
-	} else {
-	    /* Compute the etree of Pc*(A'+A)*Pc'. */
-	    int_t *b_colptr, *b_rowind, bnz, j;
-	    int_t *c_colbeg, *c_colend;
-
-	    /* Form B = A + A'. */
-	    at_plus_a_dist(n, Astore->nnz, Astore->colptr, Astore->rowind,
-			   &bnz, &b_colptr, &b_rowind);
-
-	    /* Form C = Pc*B*Pc'. */
-	    c_colbeg = (int_t*) SUPERLU_MALLOC(n*sizeof(int_t));
-	    c_colend = (int_t*) SUPERLU_MALLOC(n*sizeof(int_t));
-	    if (!(c_colbeg) || !(c_colend) )
-		ABORT("SUPERLU_MALLOC fails for c_colbeg/c_colend");
-	    for (i = 0; i < n; i++) {
-		c_colbeg[perm_c[i]] = b_colptr[i]; 
-		c_colend[perm_c[i]] = b_colptr[i+1];
-	    }
-	    for (j = 0; j < n; ++j) {
-		for (i = c_colbeg[j]; i < c_colend[j]; ++i) {
-		    b_rowind[i] = perm_c[b_rowind[i]];
-		}
-	    }
-
-	    /* Compute etree of C. */
-	    sp_symetree_dist(c_colbeg, c_colend, b_rowind, n, etree);
-
-	    SUPERLU_FREE(b_colptr);
-	    if ( bnz ) SUPERLU_FREE(b_rowind);
-	    SUPERLU_FREE(c_colbeg);
-	    SUPERLU_FREE(c_colend);
-	}
-#if ( DEBUGlevel>=3 )
-	if ( !iam ) PrintInt10("etree:", n, etree);
-#endif	
-	
-	/* Post order etree */
-	post = (int_t *) TreePostorder_dist(n, etree);
-	/* for (i = 0; i < n+1; ++i) inv_post[post[i]] = i;
-	   iwork = post; */
-
-	/* Renumber etree in postorder */
-	for (i = 0; i < n; ++i) iwork[post[i]] = post[etree[i]];
-	for (i = 0; i < n; ++i) etree[i] = iwork[i];
-
-#if ( DEBUGlevel>=3 )
-	if ( !iam ) PrintInt10("postorder etree:", n, etree);
-#endif
-	
-	/* Postmultiply A*Pc by post[] */
-	for (i = 0; i < n; ++i) iwork[post[i]] = ACstore->colbeg[i];
-	for (i = 0; i < n; ++i) ACstore->colbeg[i] = iwork[i];
-	for (i = 0; i < n; ++i) iwork[post[i]] = ACstore->colend[i];
-	for (i = 0; i < n; ++i) ACstore->colend[i] = iwork[i];
-
-	for (i = 0; i < n; ++i)
-	    iwork[i] = post[perm_c[i]];  /* product of perm_c and post */
-	for (i = 0; i < n; ++i) perm_c[i] = iwork[i];
-
-#if ( DEBUGlevel>=3 )
-	if ( !iam ) {
-	    PrintInt10("Pc*post:", n, perm_c);
-	    check_perm_dist("final perm_c", n, perm_c);
-	}
-#endif
-
-	SUPERLU_FREE (post);
-	SUPERLU_FREE (iwork);
-
-    } /* end if options->Fact == DOFACT ... */
-
-
-#if ( DEBUGlevel>=1 )
-    /* Memory allocated but not freed:
-       ACstore, ACstore->colbeg, ACstore->colend  */
-    CHECK_MALLOC(iam, "Exit sp_colorder()");
-#endif
-
-} /* SP_COLORDER */
-
-int
-check_perm_dist(char *what, int_t n, int_t *perm)
-{
-    register int_t i;
-    int_t          *marker;
-    marker = (int_t *) intCalloc_dist(n);
-
-    for (i = 0; i < n; ++i) {
-	if ( marker[perm[i]] == 1 || perm[i] >= n ) {
-	    printf("%s: Not a valid PERM[%d] = %d\n", what, i, perm[i]);
-	    ABORT("check_perm_dist");
-	} else {
-	    marker[perm[i]] = 1;
-	}
-    }
-
-    SUPERLU_FREE(marker);
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/sp_ienv.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/sp_ienv.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/sp_ienv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/sp_ienv.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,71 +0,0 @@
-/*
- * File name:		sp_ienv.c
- * History:             Modified from lapack routine ILAENV
- */
-#include "superlu_ddefs.h"
-#include "machines.h"
-
-int_t
-sp_ienv_dist(int_t ispec)
-{
-/*
-    Purpose   
-    =======   
-
-    sp_ienv_dist() is inquired to choose machine-dependent parameters for the
-    local environment. See ISPEC for a description of the parameters.   
-
-    This version provides a set of parameters which should give good,   
-    but not optimal, performance on many of the currently available   
-    computers.  Users are encouraged to modify this subroutine to set   
-    the tuning parameters for their particular machine using the option   
-    and problem size information in the arguments.   
-
-    Arguments   
-    =========   
-
-    ISPEC   (input) int
-            Specifies the parameter to be returned as the value of SP_IENV_DIST.
-            = 1: the panel size w; a panel consists of w consecutive
-	         columns of matrix A in the process of Gaussian elimination.
-		 The best value depends on machine's cache characters.
-            = 2: the relaxation parameter relax; if the number of
-	         nodes (columns) in a subtree of the elimination tree is less
-		 than relax, this subtree is considered as one supernode,
-		 regardless of the their row structures.
-            = 3: the maximum size for a supernode, which must be greater
-                 than or equal to relaxation parameter (see case 2);
-	    = 4: the minimum row dimension for 2-D blocking to be used;
-	    = 5: the minimum column dimension for 2-D blocking to be used;
-	    = 6: the estimated fills factor for the adjacency structures 
-	         of L and U, compared with A;
-	    
-   (SP_IENV_DIST) (output) int
-            >= 0: the value of the parameter specified by ISPEC   
-            < 0:  if SP_IENV_DIST = -k, the k-th argument had an illegal value.
-  
-    ===================================================================== 
-*/
-    int i;
-
-    switch (ispec) {
-#if ( MACH==CRAY_T3E )
-	case 2: return (4);
-	case 3: return (30);
-#elif ( MACH==IBM )
-	case 2: return (10);
-	case 3: return (60);
-#else
-	case 2: return (6);
-	case 3: return (50);
-#endif
-        case 6: return (4);
-    }
-
-    /* Invalid value for ISPEC */
-    i = 1;
-    xerbla_("sp_ienv", &i);
-    return 0;
-
-} /* sp_ienv_dist */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/superlu_ddefs.h hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/superlu_ddefs.h
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/superlu_ddefs.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/superlu_ddefs.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,309 +0,0 @@
-
-
-/*
- * -- Distributed SuperLU routine (version 2.1) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * November 1, 2007
- *
- */
-
-#ifndef __SUPERLU_dDEFS /* allow multiple inclusions */
-#define __SUPERLU_dDEFS
-
-/*
- * File name:	superlu_ddefs.h
- * Purpose:     Distributed SuperLU data types and function prototypes
- * History:
- */
-
-#include "superlu_defs.h"
-
-/* 
- * On each processor, the blocks in L are stored in compressed block
- * column format, the blocks in U are stored in compressed block row format.
- */
-typedef struct {
-    int_t   **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc)                 */
-    double  **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc)                 */
-    int_t   **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr)                 */
-    double  **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr)                 */
-#if 0
-    int_t   *Lsub_buf;        /* Buffer for the remote subscripts of L */
-    double  *Lval_buf;        /* Buffer for the remote nonzeros of L   */
-#endif
-    int_t   *Lsub_buf_2[2];   /* Buffers for the remote subscripts of L*/
-    double  *Lval_buf_2[2];   /* Buffers for the remote nonzeros of L  */
-    int_t   *Usub_buf;        /* Buffer for the remote subscripts of U */
-    double  *Uval_buf;        /* Buffer for the remote nonzeros of U   */
-    double  *ujrow;           /* used in panel factorization.          */
-    int_t   bufmax[NBUFFERS]; /* Buffer size; 5 entries
-			       *     0 : size of Lsub_buf[]
-			       *     1 : size of Lval_buf[]
-			       *     2 : size of Usub_buf[] 
-			       *     3 : size of Uval_buf[]
-			       *     4 : size of tempv[LDA]
-			       */
-
-    /*-- Record communication schedule for factorization. --*/
-    int_t   *ToRecv;          /* Recv from no one (0), left (1), and up (2).*/
-    int_t   *ToSendD;         /* Whether need to send down block row.       */
-    int_t   **ToSendR;        /* List of processes to send right block col. */
-
-    /*-- Record communication schedule for solves. --*/
-    int_t   *fmod;            /* Modification count for L-solve            */
-    int_t   **fsendx_plist;   /* Column process list to send down Xk       */
-    int_t   *frecv;           /* Modifications to be recv'd in proc row    */
-    int_t   nfrecvx;          /* Number of Xk I will receive in L-solve    */
-    int_t   nfsendx;          /* Number of Xk I will send in L-solve       */
-    int_t   *bmod;            /* Modification count for U-solve            */
-    int_t   **bsendx_plist;   /* Column process list to send down Xk       */
-    int_t   *brecv;           /* Modifications to be recv'd in proc row    */
-    int_t   nbrecvx;          /* Number of Xk I will receive in U-solve    */
-    int_t   nbsendx;          /* Number of Xk I will send in U-solve       */
-
-    /*-- Auxiliary arrays used for solves. --*/
-    int_t   *ilsum;           /* Starting position of each supernode in lsum
-				 (local)  */
-    int_t   ldalsum;          /* LDA of lsum (local) */
-    int_t   SolveMsgSent;     /* Number of actual messages sent in LU-solve */
-    int_t   SolveMsgVol;      /* Volume of messages sent in the solve phase */
-} LocalLU_t;
-
-typedef struct {
-    int_t *etree;
-    Glu_persist_t *Glu_persist;
-    LocalLU_t *Llu;
-} LUstruct_t;
-
-/*-- Auxiliary data type used in PxGSTRS/PxGSTRS1. */
-typedef struct {
-    int_t lbnum;  /* Row block number (local).      */
-    int_t indpos; /* Starting position in Uindex[]. */
-} Ucb_indptr_t;
-
-/*-- Data structure for communication during matrix-vector multiplication. */
-typedef struct {
-    int_t *extern_start;
-    int_t *ind_tosend;    /* X indeices to be sent to other processes */
-    int_t *ind_torecv;    /* X indeices to be received from other processes */
-    int_t *ptr_ind_tosend;/* Printers to ind_tosend[] (Size procs)
-			     (also point to val_torecv) */
-    int_t *ptr_ind_torecv;/* Printers to ind_torecv[] (Size procs)
-			     (also point to val_tosend) */
-    int   *SendCounts;    /* Numbers of X indices to be sent
-			     (also numbers of X values to be received) */
-    int   *RecvCounts;    /* Numbers of X indices to be received
-			     (also numbers of X values to be sent) */
-    double *val_tosend;   /* X values to be sent to other processes */
-    double *val_torecv;   /* X values to be received from other processes */
-    int_t TotalIndSend;   /* Total number of indices to be sent
-			     (also total number of values to be received) */
-    int_t TotalValSend;   /* Total number of values to be sent.
-			     (also total number of indices to be received) */
-} pdgsmv_comm_t;
-
-/*-- Data structure for redistribution of B and X --*/
-typedef struct {
-    int  *B_to_X_SendCnt;
-    int  *X_to_B_SendCnt;
-    int  *ptr_to_ibuf, *ptr_to_dbuf;
-} pxgstrs_comm_t;
-
-/*-- Data structure holding the information for the solution phase --*/
-typedef struct {
-    int_t *row_to_proc;
-    int_t *inv_perm_c;
-    int_t num_diag_procs, *diag_procs, *diag_len;
-    pdgsmv_comm_t *gsmv_comm;
-    pxgstrs_comm_t *gstrs_comm;
-    int_t *A_colind_gsmv; /* After pdgsmv_init(), the global column
-                             indices of A are translated into the relative
-                             positions in the gathered x-vector.
-                             This is re-used in repeated calls to pdgsmv() */
-} SOLVEstruct_t;
-
-
-/***********************************************************************
- * Function prototypes
- ***********************************************************************/
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-/* Supernodal LU factor related */
-extern void
-dCreate_CompCol_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, double *,
-			    int_t *, int_t *, Stype_t, Dtype_t, Mtype_t);
-extern void
-dCreate_CompRowLoc_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, int_t,
-			       int_t, double *, int_t *, int_t *,
-			       Stype_t, Dtype_t, Mtype_t);
-extern void
-dCompRow_to_CompCol_dist(int_t, int_t, int_t, double *, int_t *, int_t *,
-                         double **, int_t **, int_t **);
-extern int
-pdCompRow_loc_to_CompCol_global(int_t, SuperMatrix *, gridinfo_t *,
-	 		        SuperMatrix *);
-extern void
-dCopy_CompCol_Matrix_dist(SuperMatrix *, SuperMatrix *);
-extern void
-dCreate_Dense_Matrix_dist(SuperMatrix *, int_t, int_t, double *, int_t,
-			  Stype_t, Dtype_t, Mtype_t);
-extern void
-dCreate_SuperNode_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, double *, 
-			      int_t *, int_t *, int_t *, int_t *, int_t *,
-			      Stype_t, Dtype_t, Mtype_t);
-extern void
-dCopy_Dense_Matrix_dist(int_t, int_t, double *, int_t,
-                        double *, int_t);
-
-extern void    dallocateA_dist (int_t, int_t, double **, int_t **, int_t **);
-extern void    dGenXtrue_dist (int_t, int_t, double *, int_t);
-extern void    dFillRHS_dist (char *, int_t, double *, int_t,
-                              SuperMatrix *, double *, int_t);
-extern int     dcreate_matrix(SuperMatrix *, int, double **, int *, 
-			      double **, int *, FILE *, gridinfo_t *);
-
-/* Driver related */
-extern void    dgsequ_dist (SuperMatrix *, double *, double *, double *,
-			    double *, double *, int_t *);
-extern double  dlangs_dist (char *, SuperMatrix *);
-extern void    dlaqgs_dist (SuperMatrix *, double *, double *, double,
-			    double, double, char *);
-extern void    pdgsequ (SuperMatrix *, double *, double *, double *,
-			double *, double *, int_t *, gridinfo_t *);
-extern double  pdlangs (char *, SuperMatrix *, gridinfo_t *);
-extern void    pdlaqgs (SuperMatrix *, double *, double *, double,
-			double, double, char *);
-extern int     pdPermute_Dense_Matrix(int_t, int_t, int_t [], int_t[],
-				      double [], int, double [], int, int,
-				      gridinfo_t *);
-
-extern int     sp_dtrsv_dist (char *, char *, char *, SuperMatrix *,
-			      SuperMatrix *, double *, int *);
-extern int     sp_dgemv_dist (char *, double, SuperMatrix *, double *,
-			      int, double, double *, int);
-extern int     sp_dgemm_dist (char *, char *, int, int, int, double,
-			SuperMatrix *, double *, int, double, 
-			double *, int);
-
-extern int_t ddistribute(fact_t, int_t, SuperMatrix *, Glu_freeable_t *, 
-			 LUstruct_t *, gridinfo_t *);
-extern void  pdgssvx_ABglobal(superlu_options_t *, SuperMatrix *, 
-			      ScalePermstruct_t *, double *,
-			      int, int, gridinfo_t *, LUstruct_t *, double *,
-			      SuperLUStat_t *, int *);
-extern int_t pddistribute(fact_t, int_t, SuperMatrix *, 
-			 ScalePermstruct_t *, Glu_freeable_t *, 
-			 LUstruct_t *, gridinfo_t *);
-extern void  pdgssvx(superlu_options_t *, SuperMatrix *, 
-		     ScalePermstruct_t *, double *,
-		     int, int, gridinfo_t *, LUstruct_t *,
-		     SOLVEstruct_t *, double *, SuperLUStat_t *, int *);
-extern int  dSolveInit(superlu_options_t *, SuperMatrix *, int_t [], int_t [],
-		       int_t, LUstruct_t *, gridinfo_t *, SOLVEstruct_t *);
-extern int_t pxgstrs_init(int_t, int_t, int_t, int_t,
-	                  int_t [], int_t [], gridinfo_t *grid,
-	                  Glu_persist_t *, SOLVEstruct_t *);
-extern void pxgstrs_finalize(pxgstrs_comm_t *);
-extern void dSolveFinalize(superlu_options_t *, SOLVEstruct_t *);
-extern void dldperm(int_t, int_t, int_t, int_t [], int_t [],
-		    double [], int_t *, double [], double []);
-extern int_t pdgstrf(superlu_options_t *, int, int, double,
-		    LUstruct_t*, gridinfo_t*, SuperLUStat_t*, int*);
-extern void pdgstrs_Bglobal(int_t, LUstruct_t *, gridinfo_t *,
-			     double *, int_t, int, SuperLUStat_t *, int *);
-extern void pdgstrs(int_t, LUstruct_t *, ScalePermstruct_t *, gridinfo_t *,
-		    double *, int_t, int_t, int_t, int, SOLVEstruct_t *,
-		    SuperLUStat_t *, int *);
-extern void dlsum_fmod(double *, double *, double *, double *,
-		       int, int, int_t , int_t *, int_t, int_t, int_t,
-		       int_t *, gridinfo_t *, LocalLU_t *, 
-		       MPI_Request [], SuperLUStat_t *);
-extern void dlsum_bmod(double *, double *, double *,
-                       int, int_t, int_t *, int_t *, Ucb_indptr_t **,
-                       int_t **, int_t *, gridinfo_t *, LocalLU_t *,
-		       MPI_Request [], SuperLUStat_t *);
-extern void pdgsrfs(int_t, SuperMatrix *, double, LUstruct_t *,
-		    ScalePermstruct_t *, gridinfo_t *,
-		    double [], int_t, double [], int_t, int,
-		    SOLVEstruct_t *, double *, SuperLUStat_t *, int *);
-extern void pdgsrfs_ABXglobal(int_t, SuperMatrix *, double, LUstruct_t *,
-		  gridinfo_t *, double *, int_t, double *, int_t,
-		  int, double *, SuperLUStat_t *, int *);
-extern int   pdgsmv_AXglobal_setup(SuperMatrix *, Glu_persist_t *,
-				   gridinfo_t *, int_t *, int_t *[],
-				   double *[], int_t *[], int_t []);
-extern int  pdgsmv_AXglobal(int_t, int_t [], double [], int_t [],
-	                       double [], double []);
-extern int  pdgsmv_AXglobal_abs(int_t, int_t [], double [], int_t [],
-				 double [], double []);
-extern void pdgsmv_init(SuperMatrix *, int_t *, gridinfo_t *,
-			pdgsmv_comm_t *);
-extern void pdgsmv(int_t, SuperMatrix *, gridinfo_t *, pdgsmv_comm_t *,
-		   double x[], double ax[]);
-extern void pdgsmv_finalize(pdgsmv_comm_t *);
-
-/* Memory-related */
-extern double  *doubleMalloc_dist(int_t);
-extern double  *doubleCalloc_dist(int_t);
-extern void  *duser_malloc_dist (int_t, int_t);
-extern void  duser_free_dist (int_t, int_t);
-extern int_t dQuerySpace_dist(int_t, LUstruct_t *, gridinfo_t *, mem_usage_t *);
-extern void    Destroy_LU(int_t, gridinfo_t *, LUstruct_t *);
-extern void    LUstructInit(const int_t, const int_t, LUstruct_t *);
-extern void    LUstructFree(LUstruct_t *);
-
-/* Auxiliary routines */
-extern void    dfill_dist (double *, int_t, double);
-extern void    dinf_norm_error_dist (int_t, int_t, double*, int_t,
-                                     double*, int_t, gridinfo_t*);
-extern void    pdinf_norm_error(int, int_t, int_t, double [], int_t,
-				double [], int_t , gridinfo_t *);
-extern void  dreadhb_dist (int, FILE *, int_t *, int_t *, int_t *, 
-			   double **, int_t **, int_t **);
-
-/* Distribute the data for numerical factorization */
-extern int_t ddist_psymbtonum
-(fact_t, int_t, SuperMatrix *,
- ScalePermstruct_t *, Pslu_freeable_t *, 
- LUstruct_t *, gridinfo_t *);
-
-/* Routines for debugging */
-extern void  dPrintLblocks(int_t, int_t, gridinfo_t *, Glu_persist_t *,
-		 	   LocalLU_t *);
-extern void  dPrintUblocks(int_t, int_t, gridinfo_t *, Glu_persist_t *,
-			   LocalLU_t *);
-extern void  dPrint_CompCol_Matrix_dist(SuperMatrix *);
-extern void  dPrint_Dense_Matrix_dist(SuperMatrix *);
-extern int   dPrint_CompRowLoc_Matrix_dist(SuperMatrix *);
-extern int   file_PrintDouble5(FILE *, char *, int_t, double *);
-
-/* BLAS */
-
-#ifdef USE_VENDOR_BLAS
-extern int dgemm_(char*, char*, int*, int*, int*, double*,
-                  double*, int*, double*, int*, double*,
-                  double*, int*, int, int);
-extern int dtrsv_(char*, char*, char*, int*, double*, int*,
-                  double*, int*, int, int, int);
-#else
-extern int dgemm_(char*, char*, int*, int*, int*, double*,
-                  double*, int*, double*, int*, double*,
-                  double*, int*);
-extern int dtrsv_(char*, char*, char*, int*, double*, int*,
-                  double*, int*);
-#endif
-
-extern int dger_(int*, int*, double*, double*, int*,
-                 double*, int*, double*, int*);
-
-
-#ifdef __cplusplus
-  }
-#endif
-
-#endif /* __SUPERLU_dDEFS */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/superlu_defs.h hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/superlu_defs.h
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/superlu_defs.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/superlu_defs.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,572 +0,0 @@
-/*
- * -- Distributed SuperLU routine (version 2.2) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * November 1, 2007
- * Feburary 20, 2008
- */
-
-#ifndef __SUPERLU_DEFS /* allow multiple inclusions */
-#define __SUPERLU_DEFS
-
-/*
- * File name:	superlu_defs.h
- * Purpose:     Definitions which are precision-neutral
- */
-
-#include <HYPRE_utilities.h>
-#include <fortran.h>
-#ifdef _CRAY
-#include <fortran.h>
-#include <string.h>
-#endif
-#include <mpi.h>
-#include <stdlib.h>
-#include <stdio.h>
-
-
-/* Define my integer size int_t */
-#ifdef _CRAY
-typedef short int_t;
-/*#undef int   Revert back to int of default size. */
-#define mpi_int_t   MPI_SHORT
-#elif defined (_LONGINT)
-typedef long int int_t;
-#define mpi_int_t   MPI_LONG
-#else /* Default */
-typedef int int_t;
-#define mpi_int_t   MPI_INT
-#endif
-
-/***********************************************************************
- * Enumerated types
- ***********************************************************************/
-/*typedef enum {FALSE, TRUE}                                      boolean_t;*/
-typedef enum {NO, YES}                                          yes_no_t;
-typedef enum {DOFACT, SamePattern, SamePattern_SameRowPerm, FACTORED} fact_t;
-typedef enum {NOROWPERM, LargeDiag, MY_PERMR}                   rowperm_t;
-typedef enum {NATURAL, MMD_AT_PLUS_A, MMD_ATA, METIS_AT_PLUS_A,
-	      PARMETIS, MY_PERMC}                               colperm_t;
-typedef enum {NOTRANS, TRANS, CONJ}                             trans_t;
-typedef enum {NOEQUIL, ROW, COL, BOTH}                          DiagScale_t;
-typedef enum {NOREFINE, SINGLE=1, DOUBLE, EXTRA}                IterRefine_t;
-typedef enum {LUSUP, UCOL, LSUB, USUB}                          MemType;
-typedef enum {HEAD, TAIL}                                       stack_end_t;
-typedef enum {SYSTEM, USER}                                     LU_space_t;
-
-#include "Cnames.h"
-#include "supermatrix.h"
-#include "util_dist.h"
-#include "psymbfact.h"
-
-
-/***********************************************************************
- * Constants
- ***********************************************************************/
-/* 
- * For each block column of L, the index[] array contains both the row 
- * subscripts and the integers describing the size of the blocks.
- * The organization of index[] looks like:
- *
- *     [ BLOCK COLUMN HEADER (size BC_HEADER)
- *           number of blocks 
- *           number of row subscripts, i.e., LDA of nzval[]
- *       BLOCK 0                                        <----
- *           BLOCK DESCRIPTOR (of size LB_DESCRIPTOR)  |
- *               block number (global)                      |
- *               number of full rows in the block           |
- *           actual row subscripts                          |
- *       BLOCK 1                                            | Repeat ...
- *           BLOCK DESCRIPTOR                               | number of blocks
- *               block number (global)                      | 
- *               number of full rows in the block           |
- *           actual row subscripts                          |
- *       .                                                  |
- *       .                                                  |
- *       .                                              <----
- *     ]
- *
- * For each block row of U, the organization of index[] looks like:
- *
- *     [ BLOCK ROW HEADER (of size BR_HEADER)
- *           number of blocks 
- *           number of entries in nzval[]
- *           number of entries in index[]
- *       BLOCK 0                                        <----
- *           BLOCK DESCRIPTOR (of size UB_DESCRIPTOR)  |
- *               block number (global)                      |
- *               number of nonzeros in the block            |
- *           actual fstnz subscripts                        |
- *       BLOCK 1                                            | Repeat ...
- *           BLOCK DESCRIPTOR                               | number of blocks
- *               block number (global)                      |
- *               number of nonzeros in the block            |
- *           actual fstnz subscripts                        |
- *       .                                                  |
- *       .                                                  |
- *       .                                              <----
- *     ]
- *
- */
-#define BC_HEADER      2
-#define LB_DESCRIPTOR  2
-#define BR_HEADER      3
-#define UB_DESCRIPTOR  2
-#define NBUFFERS       5
-
-/*
- * Communication tags
- */
-    /* For numeric factorization. */
-#define NTAGS    10000
-#define UjROW    10
-#define UkSUB    11
-#define UkVAL    12
-#define LkSUB    13
-#define LkVAL    14
-#define LkkDIAG  15
-    /* For triangular solves. */
-#define XK_H     1  /* The header preceeding each X block. */
-#define LSUM_H   1  /* The header preceeding each MOD block. */
-#define GSUM     20 
-#define Xk       21
-#define Yk       22
-#define LSUM     23
-
-/* 
- * Communication scopes
- */
-#define COMM_ALL      100
-#define COMM_COLUMN   101
-#define COMM_ROW      102
-
-/*
- * Matrix distribution for sparse matrix-vector multiplication
- */
-#define SUPER_LINEAR     11
-#define SUPER_BLOCK      12
-
-/*
- * No of marker arrays used in the symbolic factorization, each of size n
- */
-#define NO_MARKER     3
-
-
-
-/***********************************************************************
- * Macros
- ***********************************************************************/
-#define IAM(comm)    { int rank; MPI_Comm_rank ( comm, &rank ); rank};
-#define MYROW(iam,grid) ( (iam) / grid->npcol )
-#define MYCOL(iam,grid) ( (iam) % grid->npcol )
-#define BlockNum(i)     ( supno[i] )
-#define FstBlockC(bnum) ( xsup[bnum] )
-#define SuperSize(bnum) ( xsup[bnum+1]-xsup[bnum] )
-#define LBi(bnum,grid)  ( (bnum)/grid->nprow )/* Global to local block rowwise */
-#define LBj(bnum,grid)  ( (bnum)/grid->npcol )/* Global to local block columnwise*/
-#define PROW(bnum,grid) ( (bnum) % grid->nprow )
-#define PCOL(bnum,grid) ( (bnum) % grid->npcol )
-#define PNUM(i,j,grid)  ( (i)*grid->npcol + j ) /* Process number at coord(i,j) */
-#define CEILING(a,b)    ( ((a)%(b)) ? ((a)/(b) + 1) : ((a)/(b)) )
-    /* For triangular solves */
-#define RHS_ITERATE(i)                    \
-        for (i = 0; i < nrhs; ++i)
-#define X_BLK(i)                          \
-        ilsum[i] * nrhs + (i+1) * XK_H
-#define LSUM_BLK(i)                       \
-        ilsum[i] * nrhs + (i+1) * LSUM_H
-
-#define SuperLU_timer_  SuperLU_timer_dist_
-#define LOG2(x)   (log10((double) x) / log10(2.0))
-
-
-#if ( VAMPIR>=1 ) 
-#define VT_TRACEON    VT_traceon()
-#define VT_TRACEOFF   VT_traceoff()
-#else
-#define VT_TRACEON 
-#define VT_TRACEOFF
-#endif
-
-
-/***********************************************************************
- * New data types
- ***********************************************************************/
-
-/* 
- *   Define the 2D mapping of matrix blocks to process grid.
- *
- *   Process grid:
- *     Processes are numbered (0 : P-1).
- *     P = Pr x Pc, where Pr, Pc are the number of process rows and columns.
- *     (pr,pc) is the coordinate of IAM; 0 <= pr < Pr, 0 <= pc < Pc.
- *
- *   Matrix blocks:
- *     Matrix is partitioned according to supernode partitions, both
- *     column and row-wise. 
- *     The k-th block columns (rows) contains columns (rows) (s:t), where
- *             s=xsup[k], t=xsup[k+1]-1.
- *     Block A(I,J) contains
- *             rows from (xsup[I]:xsup[I+1]-1) and
- *             columns from (xsup[J]:xsup[J+1]-1)
- *
- *  Mapping of matrix entry (i,j) to matrix block (I,J):
- *     (I,J) = ( supno[i], supno[j] )
- *
- *  Mapping of matrix block (I,J) to process grid (pr,pc):
- *     (pr,pc) = ( MOD(I,NPROW), MOD(J,NPCOL) )
- *  
- *  (xsup[nsupers],supno[n]) are replicated on all processors.
- *
- */
-
-/*-- Communication subgroup */
-typedef struct {
-    MPI_Comm comm;        /* MPI communicator */
-    int Np;               /* number of processes */
-    int Iam;              /* my process number */
-} superlu_scope_t;
-
-/*-- Process grid definition */
-typedef struct {
-    MPI_Comm comm;        /* MPI communicator */
-    superlu_scope_t rscp; /* row scope */
-    superlu_scope_t cscp; /* column scope */
-    int iam;              /* my process number in this scope */
-    int_t nprow;          /* number of process rows */
-    int_t npcol;          /* number of process columns */
-} gridinfo_t;
-
-
-/*
- *-- The structures are determined by SYMBFACT and used thereafter.
- *
- * (xsup,supno) describes mapping between supernode and column:
- *	xsup[s] is the leading column of the s-th supernode.
- *      supno[i] is the supernode no to which column i belongs;
- *	e.g.   supno 0 1 2 2 3 3 3 4 4 4 4 4   (n=12)
- *	        xsup 0 1 2 4 7 12
- *	Note: dfs will be performed on supernode rep. relative to the new 
- *	      row pivoting ordering
- *
- * This is allocated during symbolic factorization SYMBFACT.
- */
-typedef struct {
-    int_t     *xsup;
-    int_t     *supno;
-} Glu_persist_t;
-
-/*
- *-- The structures are determined by SYMBFACT and used by DDISTRIBUTE.
- * 
- * (xlsub,lsub): lsub[*] contains the compressed subscript of
- *	rectangular supernodes; xlsub[j] points to the starting
- *	location of the j-th column in lsub[*]. Note that xlsub 
- *	is indexed by column.
- *	Storage: original row subscripts
- *
- *      During the course of sparse LU factorization, we also use
- *	(xlsub,lsub) for the purpose of symmetric pruning. For each
- *	supernode {s,s+1,...,t=s+r} with first column s and last
- *	column t, the subscript set
- *		lsub[j], j=xlsub[s], .., xlsub[s+1]-1
- *	is the structure of column s (i.e. structure of this supernode).
- *	It is used for the storage of numerical values.
- *	Furthermore,
- *		lsub[j], j=xlsub[t], .., xlsub[t+1]-1
- *	is the structure of the last column t of this supernode.
- *	It is for the purpose of symmetric pruning. Therefore, the
- *	structural subscripts can be rearranged without making physical
- *	interchanges among the numerical values.
- *
- *	However, if the supernode has only one column, then we
- *	only keep one set of subscripts. For any subscript interchange
- *	performed, similar interchange must be done on the numerical
- *	values.
- *
- *	The last column structures (for pruning) will be removed
- *	after the numercial LU factorization phase.
- *
- * (xusub,usub): xusub[i] points to the starting location of column i
- *      in usub[]. For each U-segment, only the row index of first nonzero
- *      is stored in usub[].
- *
- *      Each U column consists of a number of full segments. Each full segment
- *      starts from a leading nonzero, running up to the supernode (block)
- *      boundary. (Recall that the column-wise supernode partition is also
- *      imposed on the rows.) Because the segment is full, we don't store all
- *      the row indices. Instead, only the leading nonzero index is stored.
- *      The rest can be found together with xsup/supno pair.
- *      For example, 
- *          usub[xsub[j+1]] - usub[xsub[j]] = number of segments in column j.
- *          for any i in usub[], 
- *              supno[i]         = block number in which i belongs to
- *  	        xsup[supno[i]+1] = first row of the next block
- *              The nonzeros of this segment are: 
- *                  i, i+1 ... xsup[supno[i]+1]-1 (only i is stored in usub[])
- *
- */
-typedef struct {
-    int_t     *lsub;     /* compressed L subscripts */
-    int_t     *xlsub;
-    int_t     *usub;     /* compressed U subscripts */
-    int_t     *xusub;
-    int_t     nzlmax;    /* current max size of lsub */
-    int_t     nzumax;    /*    "    "    "      usub */
-    LU_space_t MemModel; /* 0 - system malloc'd; 1 - user provided */
-} Glu_freeable_t;
-
-
-/* 
- *-- The structure used to store matrix A of the linear system and
- *   several vectors describing the transformations done to matrix A.
- *
- * A      (SuperMatrix*)
- *        Matrix A in A*X=B, of dimension (A->nrow, A->ncol).
- *        The number of linear equations is A->nrow. The type of A can be:
- *        Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE.
- *         
- * DiagScale  (DiagScale_t)
- *        Specifies the form of equilibration that was done.
- *        = NOEQUIL: No equilibration.
- *        = ROW:  Row equilibration, i.e., A was premultiplied by diag(R).
- *        = COL:  Column equilibration, i.e., A was postmultiplied by diag(C).
- *        = BOTH: Both row and column equilibration, i.e., A was replaced 
- *                 by diag(R)*A*diag(C).
- *
- * R      double*, dimension (A->nrow)
- *        The row scale factors for A.
- *        If DiagScale = ROW or BOTH, A is multiplied on the left by diag(R).
- *        If DiagScale = NOEQUIL or COL, R is not defined.
- *
- * C      double*, dimension (A->ncol)
- *        The column scale factors for A.
- *        If DiagScale = COL or BOTH, A is multiplied on the right by diag(C).
- *        If DiagScale = NOEQUIL or ROW, C is not defined.
- *         
- * perm_r (int*) dimension (A->nrow)
- *        Row permutation vector which defines the permutation matrix Pr,
- *        perm_r[i] = j means row i of A is in position j in Pr*A.
- *
- * perm_c (int*) dimension (A->ncol)
- *	  Column permutation vector, which defines the 
- *        permutation matrix Pc; perm_c[i] = j means column i of A is 
- *        in position j in A*Pc.
- *
- */
-typedef struct {
-    DiagScale_t DiagScale;
-    double *R;
-    double *C; 
-    int_t  *perm_r;
-    int_t  *perm_c;
-} ScalePermstruct_t;
-
-/* 
- *-- This contains the options used to control the solve process.
- *
- * Fact   (fact_t)
- *        Specifies whether or not the factored form of the matrix
- *        A is supplied on entry, and if not, how the matrix A should
- *        be factorizaed.
- *        = DOFACT: The matrix A will be factorized from scratch, and the
- *             factors will be stored in L and U.
- *        = SamePattern: The matrix A will be factorized assuming
- *             that a factorization of a matrix with the same sparsity
- *             pattern was performed prior to this one. Therefore, this
- *             factorization will reuse column permutation vector 
- *             ScalePermstruct->perm_c and the column elimination tree
- *             LUstruct->etree.
- *        = SamePattern_SameRowPerm: The matrix A will be factorized
- *             assuming that a factorization of a matrix with the same
- *             sparsity	pattern and similar numerical values was performed
- *             prior to this one. Therefore, this factorization will reuse
- *             both row and column scaling factors R and C, and the
- *             both row and column permutation vectors perm_r and perm_c,
- *             distributed data structure set up from the previous symbolic
- *             factorization.
- *        = FACTORED: On entry, L, U, perm_r and perm_c contain the 
- *              factored form of A. If DiagScale is not NOEQUIL, the matrix
- *              A has been equilibrated with scaling factors R and C.
- *
- * Equil  (yes_no_t)
- *        Specifies whether to equilibrate the system (scale A's row and
- *        columns to have unit norm).
- *
- * ColPerm (colperm_t)
- *        Specifies what type of column permutation to use to reduce fill.
- *        = NATURAL: use the natural ordering 
- *        = MMD_ATA: use minimum degree ordering on structure of A'*A
- *        = MMD_AT_PLUS_A: use minimum degree ordering on structure of A'+A
- *        = MY_PERMC: use the ordering specified in ScalePermstruct->perm_c[]
- *         
- * Trans  (trans_t)
- *        Specifies the form of the system of equations:
- *        = NOTRANS: A * X = B        (No transpose)
- *        = TRANS:   A**T * X = B     (Transpose)
- *        = CONJ:    A**H * X = B     (Transpose)
- *
- * IterRefine (IterRefine_t)
- *        Specifies whether to perform iterative refinement.
- *        = NO: no iterative refinement
- *        = WorkingPrec: perform iterative refinement in working precision
- *        = ExtraPrec: perform iterative refinement in extra precision
- *
- * PrintStat (yes_no_t)
- *        Specifies whether to print the solver's statistics.
- *
- * DiagPivotThresh (double, in [0.0, 1.0]) (only for sequential SuperLU)
- *        Specifies the threshold used for a diagonal entry to be an
- *        acceptable pivot.
- *
- * RowPerm (rowperm_t) (only for SuperLU_DIST)
- *        Specifies whether to permute rows of the original matrix.
- *        = NO: not to permute the rows
- *        = LargeDiag: make the diagonal large relative to the off-diagonal
- *        = MY_PERMR: use the permutation given in ScalePermstruct->perm_r[]
- *           
- * ReplaceTinyPivot (yes_no_t) (only for SuperLU_DIST)
- *        Specifies whether to replace the tiny diagonals by
- *        sqrt(epsilon)*||A|| during LU factorization.
- *
- * SolveInitialized (yes_no_t) (only for SuperLU_DIST)
- *        Specifies whether the initialization has been performed to the
- *        triangular solve.
- *
- * RefineInitialized (yes_no_t) (only for SuperLU_DIST)
- *        Specifies whether the initialization has been performed to the
- *        sparse matrix-vector multiplication routine needed in iterative
- *        refinement.
- */
-typedef struct {
-    fact_t        Fact;
-    yes_no_t      Equil;
-    yes_no_t      ParSymbFact;
-    colperm_t     ColPerm;
-    rowperm_t     RowPerm;
-    double        DiagPivotThresh;
-    IterRefine_t  IterRefine;
-    trans_t       Trans;
-    yes_no_t      ReplaceTinyPivot;
-    yes_no_t      SolveInitialized;
-    yes_no_t      RefineInitialized;
-    yes_no_t      PrintStat;
-} superlu_options_t;
-
-typedef struct {
-    float for_lu;
-    float total;
-    int_t expansions;
-} mem_usage_t;
-
-
-/***********************************************************************
- * Function prototypes
- ***********************************************************************/
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-extern void    set_default_options_dist(superlu_options_t *);
-extern void    print_options_dist(superlu_options_t *);
-extern void    Destroy_CompCol_Matrix_dist(SuperMatrix *);
-extern void    Destroy_SuperNode_Matrix_dist(SuperMatrix *);
-extern void    Destroy_SuperMatrix_Store_dist(SuperMatrix *);
-extern void    Destroy_CompCol_Permuted_dist(SuperMatrix *);
-extern void    Destroy_CompRowLoc_Matrix_dist(SuperMatrix *);
-extern void    Destroy_CompRow_Matrix_dist(SuperMatrix *);
-extern void    sp_colorder (superlu_options_t*, SuperMatrix*, int_t*, int_t*,
-			    SuperMatrix*);
-extern int_t   sp_coletree_dist (int_t *, int_t *, int_t *, int_t, int_t,
-				 int_t *);
-extern void    countnz_dist (const int_t, int_t *, int_t *, int_t *,
-			     Glu_persist_t *, Glu_freeable_t *);
-extern int_t   fixupL_dist (const int_t, const int_t *, Glu_persist_t *,
-			    Glu_freeable_t *);
-extern int_t   *TreePostorder_dist (int_t, int_t *);
-extern float   slamch_(char *);
-extern double  dlamch_(char *);
-extern void    *superlu_malloc_dist (size_t);
-extern void    superlu_free_dist (void*);
-extern int_t   *intMalloc_dist (int_t);
-extern int_t   *intCalloc_dist (int_t);
-
-/* Auxiliary routines */
-extern double  SuperLU_timer_ ();
-extern void    superlu_abort_and_exit_dist(char *);
-extern int_t   sp_ienv_dist (int_t);
-extern int     lsame_ (char *, char *);
-extern int     xerbla_ (char *, int *);
-extern void    ifill_dist (int_t *, int_t, int_t);
-extern void    super_stats_dist (int_t, int_t *);
-extern void    ScalePermstructInit(const int_t, const int_t, 
-				   ScalePermstruct_t *);
-extern void    ScalePermstructFree(ScalePermstruct_t *);
-extern void  superlu_gridinit(MPI_Comm, int_t, int_t, gridinfo_t *);
-extern void  superlu_gridmap(MPI_Comm, int_t, int_t, int_t [], int_t,
-			     gridinfo_t *);
-extern void  superlu_gridexit(gridinfo_t *);
-extern void  get_perm_c_dist(int_t, int_t, SuperMatrix *, int_t *);
-extern void  a_plus_at_dist(const int_t, const int_t, int_t *, int_t *,
-			    int_t *, int_t **, int_t **);
-extern void  bcast_tree(void *, int, MPI_Datatype, int, int,
-			gridinfo_t *, int, int *);
-extern int_t symbfact(superlu_options_t *, int, SuperMatrix *, int_t *,
-                      int_t *, Glu_persist_t *, Glu_freeable_t *);
-extern int_t symbfact_SubInit(fact_t, void *, int_t, int_t, int_t, int_t,
-			      Glu_persist_t *, Glu_freeable_t *);
-extern int_t symbfact_SubXpand(int_t, int_t, int_t, MemType, int_t *,
-			       Glu_freeable_t *);
-extern int_t symbfact_SubFree(Glu_freeable_t *);
-extern void  get_diag_procs(int_t, Glu_persist_t *, gridinfo_t *, int_t *,
-			    int_t **, int_t **);
-extern int_t QuerySpace_dist(int_t, int_t, Glu_freeable_t *, mem_usage_t *);
-extern int   xerbla_ (char *, int *);
-extern void  pxerbla (char *, gridinfo_t *, int_t);
-extern void  PStatInit(SuperLUStat_t *);
-extern void  PStatFree(SuperLUStat_t *);
-extern void  PStatPrint(superlu_options_t *, SuperLUStat_t *, gridinfo_t *);
-
-
-/* Prototypes for parallel symbolic factorization */
-extern float symbfact_dist
-(int,  int, SuperMatrix *, int_t *, int_t *,  int_t *, int_t *,
- Pslu_freeable_t *, MPI_Comm *, MPI_Comm *,  mem_usage_t *);
-
-/* Get the column permutation using parmetis */
-extern float get_perm_c_parmetis 
-(SuperMatrix *, int_t *, int_t *, int, int, 
- int_t **, int_t **, gridinfo_t *, MPI_Comm *);
-
-/* Auxiliary routines for memory expansions used during
-   the parallel symbolic factorization routine */
-
-extern int_t psymbfact_LUXpandMem
-(int_t, int_t, int_t, int_t, int_t, int_t, int_t, int_t, 
- Pslu_freeable_t *, Llu_symbfact_t *,  vtcsInfo_symbfact_t *, psymbfact_stat_t *);
-
-extern int_t psymbfact_LUXpand
-(int_t, int_t, int_t, int_t, int_t *, int_t, int_t, int_t, int_t, 
- Pslu_freeable_t *, Llu_symbfact_t *,  vtcsInfo_symbfact_t *, psymbfact_stat_t *);
-
-extern int_t psymbfact_LUXpand_RL
-(int_t, int_t, int_t, int_t, int_t, int_t,
- Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *);
-
-extern int_t psymbfact_prLUXpand
-(int_t,  int_t,
- MemType, Llu_symbfact_t *, psymbfact_stat_t *);
-
-/* Routines for debugging */
-extern void  print_panel_seg_dist(int_t, int_t, int_t, int_t, int_t *, int_t *);
-extern void  check_repfnz_dist(int_t, int_t, int_t, int_t *);
-extern int_t CheckZeroDiagonal(int_t, int_t *, int_t *, int_t *);
-extern void  PrintDouble5(char *, int_t, double *);
-extern void  PrintInt10(char *, int_t, int_t *);
-extern int   file_PrintInt10(FILE *, char *, int_t, int_t *);
-
-#ifdef __cplusplus
-  }
-#endif
-
-#endif /* __SUPERLU_DEFS */
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/superlu_grid.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/superlu_grid.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/superlu_grid.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/superlu_grid.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,169 +0,0 @@
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-
-#include "superlu_ddefs.h"
-
-/* Define global variables */
-MPI_Datatype SuperLU_MPI_DOUBLE_COMPLEX;
-
-/*
- * All processes in the MPI communicator must call this routine.
- */
-void superlu_gridinit(MPI_Comm Bcomm, /* The base communicator upon which
-					 the new grid is formed. */
-		      int_t nprow, int_t npcol, gridinfo_t *grid)
-{
-    int Np = nprow * npcol;
-    int_t *usermap;
-    int i, j, info;
-
-    /* Make a list of the processes in the new communicator. */
-    usermap = (int_t *) SUPERLU_MALLOC(Np*sizeof(int_t));
-    for (j = 0; j < npcol; ++j)
-	for (i = 0; i < nprow; ++i) usermap[j*nprow+i] = i*npcol+j;
-    
-    /* Check MPI environment initialization. */
-    MPI_Initialized( &info );
-    if ( !info )
-	ABORT("C main program must explicitly call MPI_Init()");
-
-    MPI_Comm_size( Bcomm, &info );
-    if ( info < Np )
-	ABORT("Number of processes is smaller than NPROW * NPCOL");
-
-    superlu_gridmap(Bcomm, nprow, npcol, usermap, nprow, grid);
-    
-    SUPERLU_FREE(usermap);
-}
-
-
-/*
- * All processes in the MPI communicator must call this routine.
- */
-void superlu_gridmap(
-		     MPI_Comm Bcomm, /* The base communicator upon which
-					the new grid is formed. */
-		     int_t nprow,
-		     int_t npcol,
-		     int_t usermap[], /* usermap(i,j) holds the process
-					 number to be placed in {i,j} of
-					 the process grid.  */
-		     int_t ldumap,    /* The leading dimension of the
-					 2D array usermap[].  */
-		     gridinfo_t *grid)
-{
-    MPI_Group mpi_base_group, superlu_grp;
-    int Np = nprow * npcol, mycol, myrow;
-    int *pranks;
-    int i, j, info;
-    static int first_init = 1;
-
-    /* Create datatype in C for MPI complex. */
-    if ( first_init ) {
-	MPI_Type_contiguous( 2, MPI_DOUBLE, &SuperLU_MPI_DOUBLE_COMPLEX );
-	MPI_Type_commit( &SuperLU_MPI_DOUBLE_COMPLEX );
-	first_init = 0;
-    }
-
-    /* Check MPI environment initialization. */
-    MPI_Initialized( &info );
-    if ( !info )
-	ABORT("C main program must explicitly call MPI_Init()");
-
-    grid->nprow = nprow;
-    grid->npcol = npcol;
-
-    /* Make a list of the processes in the new communicator. */
-    pranks = (int *) SUPERLU_MALLOC(Np*sizeof(int));
-    for (j = 0; j < npcol; ++j)
-	for (i = 0; i < nprow; ++i)
-	    pranks[i*npcol+j] = usermap[j*ldumap+i];
-    
-    /*
-     * Form MPI communicator for all.
-     */
-    /* Get the group underlying Bcomm. */
-    MPI_Comm_group( Bcomm, &mpi_base_group );
-    /* Create the new group. */
-    MPI_Group_incl( mpi_base_group, Np, pranks, &superlu_grp );
-    /* Create the new communicator. */
-    /* NOTE: The call is to be executed by all processes in Bcomm,
-       even if they do not belong in the new group -- superlu_grp. */
-    MPI_Comm_create( Bcomm, superlu_grp, &grid->comm );
-
-    /* Bail out if I am not in the group, superlu_group. */
-    if ( grid->comm == MPI_COMM_NULL ) {
-	grid->comm = Bcomm;
-	MPI_Comm_rank( Bcomm, &i );
-	grid->iam = i;
-	/*grid->iam = -1;*/
-	SUPERLU_FREE(pranks);
-	return;
-    }
-
-    MPI_Comm_rank( grid->comm, &(grid->iam) );
-    myrow = grid->iam / npcol;
-    mycol = grid->iam % npcol;
-
-    /*
-     * Form MPI communicator for myrow, scope = COMM_ROW.
-     */
-#if 0
-    for (i = 0; i < npcol; ++i) pranks[i] = myrow*npcol + i;
-    MPI_Comm_group( grid->comm, &superlu_grp );          /* Find all's group */
-    MPI_Group_incl( superlu_grp, npcol, pranks, &grp );  /* Form new group */
-    MPI_Comm_create( grid->comm, grp, &grid->rscp.comm );/* Create new comm */
-#else
-    MPI_Comm_split(grid->comm, myrow, mycol, &(grid->rscp.comm));
-#endif
-
-    /*
-     * Form MPI communicator for mycol, scope = COMM_COLUMN.
-     */
-#if 0
-    for (i = 0; i < nprow; ++i) pranks[i] = i*npcol + mycol;
-    MPI_Group_incl( superlu_grp, nprow, pranks, &grp );  /* Form new group */
-    MPI_Comm_create( grid->comm, grp, &grid->cscp.comm );/* Create new comm */
-#else
-    MPI_Comm_split(grid->comm, mycol, myrow, &(grid->cscp.comm));
-#endif
-
-    grid->rscp.Np = npcol;
-    grid->rscp.Iam = mycol;
-    grid->cscp.Np = nprow;
-    grid->cscp.Iam = myrow;
-
-#if 0
-    {
-	int tag_ub;
-	if ( !grid->iam ) {
-	    MPI_Attr_get(Bcomm, MPI_TAG_UB, &tag_ub, &info);
-	    printf("MPI_TAG_UB %d\n", tag_ub);
-	    /* returns 4295677672
-	       In reality it is restricted to no greater than 16384. */
-	}
-	exit(0);
-    }
-#endif
-
-    SUPERLU_FREE(pranks);
-    MPI_Group_free(&superlu_grp);
-    MPI_Group_free(&mpi_base_group);
-}
-
-void superlu_gridexit(gridinfo_t *grid)
-{
-    if ( grid->comm != MPI_COMM_NULL && grid->comm != MPI_COMM_WORLD ) {
-	/* Marks the communicator objects for deallocation. */
-	MPI_Comm_free( &grid->rscp.comm );
-	MPI_Comm_free( &grid->cscp.comm );
-	MPI_Comm_free( &grid->comm );
-    }
-    if ( SuperLU_MPI_DOUBLE_COMPLEX != MPI_DATATYPE_NULL ) {
-	MPI_Type_free( &SuperLU_MPI_DOUBLE_COMPLEX );
-    }
-}
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/superlu_timer.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/superlu_timer.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/superlu_timer.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/superlu_timer.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,67 +0,0 @@
-/* 
- * Purpose
- * ======= 
- *	Returns the time in seconds used by the process.
- *
- * Note: the timer function call is machine dependent. Use conditional
- *       compilation to choose the appropriate function.
- *
- */
-
-#include "superlu_defs.h"
-
-#ifdef SUN 
-/*
- * 	It uses the system call gethrtime(3C), which is accurate to 
- *	nanoseconds. 
-*/
-#include <sys/time.h>
- 
-double SuperLU_timer_() {
-    return ( (double)gethrtime() / 1e9 );
-}
-
-#elif defined ( UNIX_TIMER )
-
-#include <sys/types.h>
-#include <sys/times.h>
-#include <time.h>
-#include <sys/time.h>
-
-#ifndef CLK_TCK
-#define CLK_TCK 60
-#endif
-
-double SuperLU_timer_()
-{
-    struct tms use;
-    double tmp;
-    times(&use);
-    tmp = use.tms_utime;
-    tmp += use.tms_stime;
-    return (double)(tmp) / (double) CLK_TCK;
-}
-
-#elif _WIN32
-
-#include <time.h>
-
-double SuperLU_timer_()
-{
-    clock_t t;
-    t=clock();
-
-    return ((double)t)/CLOCKS_PER_SEC;
-}
-
-#else
-
-#include <mpi.h>
-
-double SuperLU_timer_()
-{
-    return MPI_Wtime();
-}
-
-#endif
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/superlu_zdefs.h hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/superlu_zdefs.h
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/superlu_zdefs.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/superlu_zdefs.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,312 +0,0 @@
-
-/*
- * -- Distributed SuperLU routine (version 2.1) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * November 1, 2007
- *
- */
-
-#ifndef __SUPERLU_zDEFS /* allow multiple inclusions */
-#define __SUPERLU_zDEFS
-
-/*
- * File name:	superlu_zdefs.h
- * Purpose:     Distributed SuperLU data types and function prototypes
- * History:
- */
-
-#include "superlu_defs.h"
-#include "dcomplex.h"
-
-/* 
- * On each processor, the blocks in L are stored in compressed block
- * column format, the blocks in U are stored in compressed block row format.
- */
-typedef struct {
-    int_t   **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc)                 */
-    doublecomplex  **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc)                 */
-    int_t   **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr)                 */
-    doublecomplex  **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr)                 */
-#if 0
-    int_t   *Lsub_buf;        /* Buffer for the remote subscripts of L */
-    double  *Lval_buf;        /* Buffer for the remote nonzeros of L   */
-#endif
-    int_t   *Lsub_buf_2[2];   /* Buffers for the remote subscripts of L*/
-    doublecomplex  *Lval_buf_2[2];   /* Buffers for the remote nonzeros of L  */
-    int_t   *Usub_buf;        /* Buffer for the remote subscripts of U */
-    doublecomplex  *Uval_buf;        /* Buffer for the remote nonzeros of U   */
-    doublecomplex  *ujrow;           /* used in panel factorization.          */
-    int_t   bufmax[NBUFFERS]; /* Buffer size; 5 entries
-			       *     0 : size of Lsub_buf[]
-			       *     1 : size of Lval_buf[]
-			       *     2 : size of Usub_buf[] 
-			       *     3 : size of Uval_buf[]
-			       *     4 : size of tempv[LDA]
-			       */
-
-    /*-- Record communication schedule for factorization. --*/
-    int_t   *ToRecv;          /* Recv from no one (0), left (1), and up (2).*/
-    int_t   *ToSendD;         /* Whether need to send down block row.       */
-    int_t   **ToSendR;        /* List of processes to send right block col. */
-
-    /*-- Record communication schedule for solves. --*/
-    int_t   *fmod;            /* Modification count for L-solve            */
-    int_t   **fsendx_plist;   /* Column process list to send down Xk       */
-    int_t   *frecv;           /* Modifications to be recv'd in proc row    */
-    int_t   nfrecvx;          /* Number of Xk I will receive in L-solve    */
-    int_t   nfsendx;          /* Number of Xk I will send in L-solve       */
-    int_t   *bmod;            /* Modification count for U-solve            */
-    int_t   **bsendx_plist;   /* Column process list to send down Xk       */
-    int_t   *brecv;           /* Modifications to be recv'd in proc row    */
-    int_t   nbrecvx;          /* Number of Xk I will receive in U-solve    */
-    int_t   nbsendx;          /* Number of Xk I will send in U-solve       */
-
-    /*-- Auxiliary arrays used for solves. --*/
-    int_t   *ilsum;           /* Starting position of each supernode in lsum
-				 (local)  */
-    int_t   ldalsum;          /* LDA of lsum (local) */
-    int_t   SolveMsgSent;     /* Number of actual messages sent in LU-solve */
-    int_t   SolveMsgVol;      /* Volume of messages sent in the solve phase */
-} LocalLU_t;
-
-typedef struct {
-    int_t *etree;
-    Glu_persist_t *Glu_persist;
-    LocalLU_t *Llu;
-} LUstruct_t;
-
-/*-- Auxiliary data type used in PxGSTRS/PxGSTRS1. */
-typedef struct {
-    int_t lbnum;  /* Row block number (local).      */
-    int_t indpos; /* Starting position in Uindex[]. */
-} Ucb_indptr_t;
-
-/*-- Data structure for communication during matrix-vector multiplication. */
-typedef struct {
-    int_t *extern_start;
-    int_t *ind_tosend;    /* X indeices to be sent to other processes */
-    int_t *ind_torecv;    /* X indeices to be received from other processes */
-    int_t *ptr_ind_tosend;/* Printers to ind_tosend[] (Size procs)
-			     (also point to val_torecv) */
-    int_t *ptr_ind_torecv;/* Printers to ind_torecv[] (Size procs)
-			     (also point to val_tosend) */
-    int   *SendCounts;    /* Numbers of X indices to be sent
-			     (also numbers of X values to be received) */
-    int   *RecvCounts;    /* Numbers of X indices to be received
-			     (also numbers of X values to be sent) */
-    doublecomplex *val_tosend;   /* X values to be sent to other processes */
-    doublecomplex *val_torecv;   /* X values to be received from other processes */
-    int_t TotalIndSend;   /* Total number of indices to be sent
-			     (also total number of values to be received) */
-    int_t TotalValSend;   /* Total number of values to be sent.
-			     (also total number of indices to be received) */
-} pzgsmv_comm_t;
-
-/*-- Data structure for redistribution of B and X --*/
-typedef struct {
-    int  *B_to_X_SendCnt;
-    int  *X_to_B_SendCnt;
-    int  *ptr_to_ibuf, *ptr_to_dbuf;
-} pxgstrs_comm_t;
-
-/*-- Data structure holding the information for the solution phase --*/
-typedef struct {
-    int_t *row_to_proc;
-    int_t *inv_perm_c;
-    int_t num_diag_procs, *diag_procs, *diag_len;
-    pzgsmv_comm_t *gsmv_comm;
-    pxgstrs_comm_t *gstrs_comm;
-    int_t *A_colind_gsmv; /* After pzgsmv_init(), the global column
-                             indices of A are translated into the relative
-                             positions in the gathered x-vector.
-                             This is re-used in repeated calls to pzgsmv() */
-} SOLVEstruct_t;
-
-
-/***********************************************************************
- * Function prototypes
- ***********************************************************************/
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-/* Supernodal LU factor related */
-extern void
-zCreate_CompCol_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, doublecomplex *,
-			    int_t *, int_t *, Stype_t, Dtype_t, Mtype_t);
-extern void
-zCreate_CompRowLoc_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, int_t,
-			       int_t, doublecomplex *, int_t *, int_t *,
-			       Stype_t, Dtype_t, Mtype_t);
-extern void
-zCompRow_to_CompCol_dist(int_t, int_t, int_t, doublecomplex *, int_t *, int_t *,
-                         doublecomplex **, int_t **, int_t **);
-extern int
-pzCompRow_loc_to_CompCol_global(int_t, SuperMatrix *, gridinfo_t *,
-	 		        SuperMatrix *);
-extern void
-zCopy_CompCol_Matrix_dist(SuperMatrix *, SuperMatrix *);
-extern void
-zCreate_Dense_Matrix_dist(SuperMatrix *, int_t, int_t, doublecomplex *, int_t,
-			  Stype_t, Dtype_t, Mtype_t);
-extern void
-zCreate_SuperNode_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, doublecomplex *, 
-			      int_t *, int_t *, int_t *, int_t *, int_t *,
-			      Stype_t, Dtype_t, Mtype_t);
-extern void
-zCopy_Dense_Matrix_dist(int_t, int_t, doublecomplex *, int_t,
-                        doublecomplex *, int_t);
-
-extern void    zallocateA_dist (int_t, int_t, doublecomplex **, int_t **, int_t **);
-extern void    zGenXtrue_dist (int_t, int_t, doublecomplex *, int_t);
-extern void    zFillRHS_dist (char *, int_t, doublecomplex *, int_t,
-                              SuperMatrix *, doublecomplex *, int_t);
-extern int     zcreate_matrix(SuperMatrix *, int, doublecomplex **, int *, 
-			      doublecomplex **, int *, FILE *, gridinfo_t *);
-
-/* Driver related */
-extern void    zgsequ_dist (SuperMatrix *, double *, double *, double *,
-			    double *, double *, int_t *);
-extern double  zlangs_dist (char *, SuperMatrix *);
-extern void    zlaqgs_dist (SuperMatrix *, double *, double *, double,
-			    double, double, char *);
-extern void    pzgsequ (SuperMatrix *, double *, double *, double *,
-			double *, double *, int_t *, gridinfo_t *);
-extern double  pzlangs (char *, SuperMatrix *, gridinfo_t *);
-extern void    pzlaqgs (SuperMatrix *, double *, double *, double,
-			double, double, char *);
-extern int     pzPermute_Dense_Matrix(int_t, int_t, int_t [], int_t[],
-				      doublecomplex [], int, doublecomplex [], int, int,
-				      gridinfo_t *);
-
-extern int     sp_ztrsv_dist (char *, char *, char *, SuperMatrix *,
-			      SuperMatrix *, doublecomplex *, int *);
-extern int     sp_zgemv_dist (char *, doublecomplex, SuperMatrix *, doublecomplex *,
-			      int, doublecomplex, doublecomplex *, int);
-extern int     sp_zgemm_dist (char *, char *, int, int, int, doublecomplex,
-			SuperMatrix *, doublecomplex *, int, doublecomplex, 
-			doublecomplex *, int);
-
-extern int_t zdistribute(fact_t, int_t, SuperMatrix *, Glu_freeable_t *, 
-			 LUstruct_t *, gridinfo_t *);
-extern void  pzgssvx_ABglobal(superlu_options_t *, SuperMatrix *, 
-			      ScalePermstruct_t *, doublecomplex *,
-			      int, int, gridinfo_t *, LUstruct_t *, double *,
-			      SuperLUStat_t *, int *);
-extern int_t pzdistribute(fact_t, int_t, SuperMatrix *, 
-			 ScalePermstruct_t *, Glu_freeable_t *, 
-			 LUstruct_t *, gridinfo_t *);
-extern void  pzgssvx(superlu_options_t *, SuperMatrix *, 
-		     ScalePermstruct_t *, doublecomplex *,
-		     int, int, gridinfo_t *, LUstruct_t *,
-		     SOLVEstruct_t *, double *, SuperLUStat_t *, int *);
-extern int  zSolveInit(superlu_options_t *, SuperMatrix *, int_t [], int_t [],
-		       int_t, LUstruct_t *, gridinfo_t *, SOLVEstruct_t *);
-extern int_t pxgstrs_init(int_t, int_t, int_t, int_t,
-	                  int_t [], int_t [], gridinfo_t *grid,
-	                  Glu_persist_t *, SOLVEstruct_t *);
-extern void pxgstrs_finalize(pxgstrs_comm_t *);
-extern void zSolveFinalize(superlu_options_t *, SOLVEstruct_t *);
-extern void zldperm(int_t, int_t, int_t, int_t [], int_t [],
-		    doublecomplex [], int_t *, double [], double []);
-extern int_t pzgstrf(superlu_options_t *, int, int, double,
-		    LUstruct_t*, gridinfo_t*, SuperLUStat_t*, int*);
-extern void pzgstrs_Bglobal(int_t, LUstruct_t *, gridinfo_t *,
-			     doublecomplex *, int_t, int, SuperLUStat_t *, int *);
-extern void pzgstrs(int_t, LUstruct_t *, ScalePermstruct_t *, gridinfo_t *,
-		    doublecomplex *, int_t, int_t, int_t, int, SOLVEstruct_t *,
-		    SuperLUStat_t *, int *);
-extern void zlsum_fmod(doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *,
-		       int, int, int_t , int_t *, int_t, int_t, int_t,
-		       int_t *, gridinfo_t *, LocalLU_t *, 
-		       MPI_Request [], SuperLUStat_t *);
-extern void zlsum_bmod(doublecomplex *, doublecomplex *, doublecomplex *,
-                       int, int_t, int_t *, int_t *, Ucb_indptr_t **,
-                       int_t **, int_t *, gridinfo_t *, LocalLU_t *,
-		       MPI_Request [], SuperLUStat_t *);
-extern void pzgsrfs(int_t, SuperMatrix *, double, LUstruct_t *,
-		    ScalePermstruct_t *, gridinfo_t *,
-		    doublecomplex [], int_t, doublecomplex [], int_t, int,
-		    SOLVEstruct_t *, double *, SuperLUStat_t *, int *);
-extern void pzgsrfs_ABXglobal(int_t, SuperMatrix *, double, LUstruct_t *,
-		  gridinfo_t *, doublecomplex *, int_t, doublecomplex *, int_t,
-		  int, double *, SuperLUStat_t *, int *);
-extern int   pzgsmv_AXglobal_setup(SuperMatrix *, Glu_persist_t *,
-				   gridinfo_t *, int_t *, int_t *[],
-				   doublecomplex *[], int_t *[], int_t []);
-extern int  pzgsmv_AXglobal(int_t, int_t [], doublecomplex [], int_t [],
-	                       doublecomplex [], doublecomplex []);
-extern int  pzgsmv_AXglobal_abs(int_t, int_t [], doublecomplex [], int_t [],
-				 doublecomplex [], double []);
-extern void pzgsmv_init(SuperMatrix *, int_t *, gridinfo_t *,
-			pzgsmv_comm_t *);
-extern void pzgsmv(int_t, SuperMatrix *, gridinfo_t *, pzgsmv_comm_t *,
-		   doublecomplex x[], doublecomplex ax[]);
-extern void pzgsmv_finalize(pzgsmv_comm_t *);
-
-/* Memory-related */
-extern doublecomplex  *doublecomplexMalloc_dist(int_t);
-extern doublecomplex  *doublecomplexCalloc_dist(int_t);
-extern double  *doubleMalloc_dist(int_t);
-extern double  *doubleCalloc_dist(int_t);
-extern void  *duser_malloc_dist (int_t, int_t);
-extern void  duser_free_dist (int_t, int_t);
-extern int_t zQuerySpace_dist(int_t, LUstruct_t *, gridinfo_t *, mem_usage_t *);
-extern void    Destroy_LU(int_t, gridinfo_t *, LUstruct_t *);
-extern void    LUstructInit(const int_t, const int_t, LUstruct_t *);
-extern void    LUstructFree(LUstruct_t *);
-
-/* Auxiliary routines */
-extern void    zfill_dist (doublecomplex *, int_t, doublecomplex);
-extern void    zinf_norm_error_dist (int_t, int_t, doublecomplex*, int_t,
-                                     doublecomplex*, int_t, gridinfo_t*);
-extern void    pzinf_norm_error(int, int_t, int_t, doublecomplex [], int_t,
-				doublecomplex [], int_t , gridinfo_t *);
-extern void  zreadhb_dist (int, FILE *, int_t *, int_t *, int_t *, 
-			   doublecomplex **, int_t **, int_t **);
-
-/* Distribute the data for numerical factorization */
-extern int_t zdist_psymbtonum
-(fact_t, int_t, SuperMatrix *,
- ScalePermstruct_t *, Pslu_freeable_t *, 
- LUstruct_t *, gridinfo_t *);
-
-/* Routines for debugging */
-extern void  zPrintLblocks(int_t, int_t, gridinfo_t *, Glu_persist_t *,
-		 	   LocalLU_t *);
-extern void  zPrintUblocks(int_t, int_t, gridinfo_t *, Glu_persist_t *,
-			   LocalLU_t *);
-extern void  zPrint_CompCol_Matrix_dist(SuperMatrix *);
-extern void  zPrint_Dense_Matrix_dist(SuperMatrix *);
-extern int   zPrint_CompRowLoc_Matrix_dist(SuperMatrix *);
-extern void  PrintDoublecomplex(char *, int_t, doublecomplex *);
-extern int   file_PrintDoublecomplex(FILE *fp, char *, int_t, doublecomplex *);
-
-/* BLAS */
-
-#ifdef USE_VENDOR_BLAS
-extern int zgemm_(char*, char*, int*, int*, int*, doublecomplex*,
-                  doublecomplex*, int*, doublecomplex*, int*, doublecomplex*,
-                  doublecomplex*, int*, int, int);
-extern int ztrsv_(char*, char*, char*, int*, doublecomplex*, int*,
-                  doublecomplex*, int*, int, int, int);
-#else
-extern int zgemm_(char*, char*, int*, int*, int*, doublecomplex*,
-                  doublecomplex*, int*, doublecomplex*, int*, doublecomplex*,
-                  doublecomplex*, int*);
-extern int ztrsv_(char*, char*, char*, int*, doublecomplex*, int*,
-                  doublecomplex*, int*);
-#endif
-
-extern int zger_(int*, int*, doublecomplex*, doublecomplex*, int*,
-                 doublecomplex*, int*, doublecomplex*, int*);
-
-
-#ifdef __cplusplus
-  }
-#endif
-
-#endif /* __SUPERLU_dDEFS */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/supermatrix.h hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/supermatrix.h
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/supermatrix.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/supermatrix.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,177 +0,0 @@
-#ifndef __SUPERLU_SUPERMATRIX /* allow multiple inclusions */
-#define __SUPERLU_SUPERMATRIX
-
-
-/********************************************
- * The matrix types are defined as follows. *
- ********************************************/
-typedef enum {
-    SLU_NC,    /* column-wise, no supernode */
-    SLU_NCP,   /* column-wise, column-permuted, no supernode 
-                  (The consecutive columns of nonzeros, after permutation,
-		   may not be stored  contiguously.) */
-    SLU_NR,    /* row-wize, no supernode */
-    SLU_SC,    /* column-wise, supernode */
-    SLU_SCP,   /* supernode, column-wise, permuted */    
-    SLU_SR,    /* row-wise, supernode */
-    SLU_DN,     /* Fortran style column-wise storage for dense matrix */
-    SLU_NR_loc  /* distributed compressed row format  */ 
-} Stype_t;
-
-typedef enum {
-    SLU_S,     /* single */
-    SLU_D,     /* double */
-    SLU_C,     /* single complex */
-    SLU_Z      /* double complex */
-} Dtype_t;
-
-typedef enum {
-    SLU_GE,    /* general */
-    SLU_TRLU,  /* lower triangular, unit diagonal */
-    SLU_TRUU,  /* upper triangular, unit diagonal */
-    SLU_TRL,   /* lower triangular */
-    SLU_TRU,   /* upper triangular */
-    SLU_SYL,   /* symmetric, store lower half */
-    SLU_SYU,   /* symmetric, store upper half */
-    SLU_HEL,   /* Hermitian, store lower half */
-    SLU_HEU    /* Hermitian, store upper half */
-} Mtype_t;
-
-typedef struct {
-	Stype_t Stype; /* Storage type: interprets the storage structure 
-		   	  pointed to by *Store. */
-	Dtype_t Dtype; /* Data type. */
-	Mtype_t Mtype; /* Matrix type: describes the mathematical property of 
-			  the matrix. */
-	int_t  nrow;   /* number of rows */
-	int_t  ncol;   /* number of columns */
-	void *Store;   /* pointer to the actual storage of the matrix */
-} SuperMatrix;
-
-/***********************************************
- * The storage schemes are defined as follows. *
- ***********************************************/
-
-/* Stype == SLU_NC (Also known as Harwell-Boeing sparse matrix format) */
-typedef struct {
-    int_t  nnz;	    /* number of nonzeros in the matrix */
-    void *nzval;    /* pointer to array of nonzero values, packed by column */
-    int_t  *rowind; /* pointer to array of row indices of the nonzeros */
-    int_t  *colptr; /* pointer to array of beginning of columns in nzval[] 
-		       and rowind[]  */
-                    /* Note:
-		       Zero-based indexing is used;
-		       colptr[] has ncol+1 entries, the last one pointing
-		       beyond the last column, so that colptr[ncol] = nnz. */
-} NCformat;
-
-/* Stype == SLU_NR */
-typedef struct {
-    int_t  nnz;	    /* number of nonzeros in the matrix */
-    void *nzval;    /* pointer to array of nonzero values, packed by raw */
-    int_t  *colind; /* pointer to array of columns indices of the nonzeros */
-    int_t  *rowptr; /* pointer to array of beginning of rows in nzval[] 
-		       and colind[]  */
-                    /* Note:
-		       Zero-based indexing is used;
-		       rowptr[] has nrow+1 entries, the last one pointing
-		       beyond the last row, so that rowptr[nrow] = nnz. */
-} NRformat;
-
-/* Stype == SLU_SC */
-typedef struct {
-  int_t  nnz;	     /* number of nonzeros in the matrix */
-  int_t  nsuper;     /* number of supernodes, minus 1 */
-  void *nzval;       /* pointer to array of nonzero values, packed by column */
-  int_t *nzval_colptr;/* pointer to array of beginning of columns in nzval[] */
-  int_t *rowind;     /* pointer to array of compressed row indices of 
-			rectangular supernodes */
-  int_t *rowind_colptr;/* pointer to array of beginning of columns in rowind[] */
-  int_t *col_to_sup;   /* col_to_sup[j] is the supernode number to which column 
-			j belongs; mapping from column to supernode number. */
-  int_t *sup_to_col;   /* sup_to_col[s] points to the start of the s-th 
-			supernode; mapping from supernode number to column.
-		        e.g.: col_to_sup: 0 1 2 2 3 3 3 4 4 4 4 4 4 (ncol=12)
-		              sup_to_col: 0 1 2 4 7 12           (nsuper=4) */
-                     /* Note:
-		        Zero-based indexing is used;
-		        nzval_colptr[], rowind_colptr[], col_to_sup and
-		        sup_to_col[] have ncol+1 entries, the last one
-		        pointing beyond the last column.
-		        For col_to_sup[], only the first ncol entries are
-		        defined. For sup_to_col[], only the first nsuper+2
-		        entries are defined. */
-} SCformat;
-
-/* Stype == SLU_SCP */
-typedef struct {
-  int_t  nnz;	     /* number of nonzeros in the matrix */
-  int_t  nsuper;     /* number of supernodes */
-  void *nzval;       /* pointer to array of nonzero values, packed by column */
-  int_t  *nzval_colbeg;/* nzval_colbeg[j] points to beginning of column j
-			  in nzval[] */
-  int_t  *nzval_colend;/* nzval_colend[j] points to one past the last element
-			  of column j in nzval[] */
-  int_t  *rowind;      /* pointer to array of compressed row indices of 
-			  rectangular supernodes */
-  int_t *rowind_colbeg;/* rowind_colbeg[j] points to beginning of column j
-			  in rowind[] */
-  int_t *rowind_colend;/* rowind_colend[j] points to one past the last element
-			  of column j in rowind[] */
-  int_t *col_to_sup;   /* col_to_sup[j] is the supernode number to which column
-			  j belongs; mapping from column to supernode. */
-  int_t *sup_to_colbeg; /* sup_to_colbeg[s] points to the start of the s-th 
-			   supernode; mapping from supernode to column.*/
-  int_t *sup_to_colend; /* sup_to_colend[s] points to one past the end of the
-			   s-th supernode; mapping from supernode number to
-			   column.
-		        e.g.: col_to_sup: 0 1 2 2 3 3 3 4 4 4 4 4 4 (ncol=12)
-		              sup_to_colbeg: 0 1 2 4 7              (nsuper=4)
-			      sup_to_colend: 1 2 4 7 12                    */
-                     /* Note:
-		        Zero-based indexing is used;
-		        nzval_colptr[], rowind_colptr[], col_to_sup and
-		        sup_to_col[] have ncol+1 entries, the last one
-		        pointing beyond the last column.         */
-} SCPformat;
-
-/* Stype == SLU_NCP */
-typedef struct {
-    int_t nnz;	  /* number of nonzeros in the matrix */
-    void *nzval;  /* pointer to array of nonzero values, packed by column */
-    int_t *rowind;/* pointer to array of row indices of the nonzeros */
-		  /* Note: nzval[]/rowind[] always have the same length */
-    int_t *colbeg;/* colbeg[j] points to the beginning of column j in nzval[] 
-                     and rowind[]  */
-    int_t *colend;/* colend[j] points to one past the last element of column
-		     j in nzval[] and rowind[]  */
-		  /* Note:
-		     Zero-based indexing is used;
-		     The consecutive columns of the nonzeros may not be 
-		     contiguous in storage, because the matrix has been 
-		     postmultiplied by a column permutation matrix. */
-} NCPformat;
-
-/* Stype == SLU_DN */
-typedef struct {
-    int_t lda;    /* leading dimension */
-    void *nzval;  /* array of size lda*ncol to represent a dense matrix */
-} DNformat;
-
-/* Stype == SLU_NR_loc (Distributed Compressed Row Format) */
-typedef struct {
-    int_t nnz_loc;   /* number of nonzeros in the local submatrix */
-    int_t m_loc;     /* number of rows local to this processor */
-    int_t fst_row;   /* global index of the first row */
-    void  *nzval;    /* pointer to array of nonzero values, packed by row */
-    int_t *rowptr;   /* pointer to array of beginning of rows in nzval[] 
-			and colind[]  */
-    int_t *colind;   /* pointer to array of column indices of the nonzeros */
-                     /* Note:
-			Zero-based indexing is used;
-			rowptr[] has n_loc + 1 entries, the last one pointing
-			beyond the last row, so that rowptr[n_loc] = nnz_loc.*/
-} NRformat_loc;
-
-
-#endif  /* __SUPERLU_SUPERMATRIX */
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/supermatrix.h.old hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/supermatrix.h.old
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/supermatrix.h.old	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/supermatrix.h.old	1970-01-01 00:00:00.000000000 +0000
@@ -1,144 +0,0 @@
-#ifndef __SUPERLU_SUPERMATRIX /* allow multiple inclusions */
-#define __SUPERLU_SUPERMATRIX
-
-
-/********************************************
- * The matrix types are defined as follows. *
- ********************************************/
-typedef enum {
-    SLU_NC,    /* column-wise, no supernode */
-    SLU_NR,    /* row-wize, no supernode */
-    SLU_SC,    /* column-wise, supernode */
-    SLU_SR,    /* row-wise, supernode */
-    SLU_NCP,   /* column-wise, column-permuted, no supernode 
-                  (The consecutive columns of nonzeros, after permutation,
-		   may not be stored  contiguously.) */
-    SLU_DN,     /* Fortran style column-wise storage for dense matrix */
-    SLU_NR_loc  /* distributed compressed row format  */ 
-} Stype_t;
-
-typedef enum {
-    SLU_S,     /* single */
-    SLU_D,     /* double */
-    SLU_C,     /* single complex */
-    SLU_Z      /* double complex */
-} Dtype_t;
-
-typedef enum {
-    SLU_GE,    /* general */
-    SLU_TRLU,  /* lower triangular, unit diagonal */
-    SLU_TRUU,  /* upper triangular, unit diagonal */
-    SLU_TRL,   /* lower triangular */
-    SLU_TRU,   /* upper triangular */
-    SLU_SYL,   /* symmetric, store lower half */
-    SLU_SYU,   /* symmetric, store upper half */
-    SLU_HEL,   /* Hermitian, store lower half */
-    SLU_HEU    /* Hermitian, store upper half */
-} Mtype_t;
-
-typedef struct {
-	Stype_t Stype; /* Storage type: interprets the storage structure 
-		   	  pointed to by *Store. */
-	Dtype_t Dtype; /* Data type. */
-	Mtype_t Mtype; /* Matrix type: describes the mathematical property of 
-			  the matrix. */
-	int_t  nrow;   /* number of rows */
-	int_t  ncol;   /* number of columns */
-	void *Store;   /* pointer to the actual storage of the matrix */
-} SuperMatrix;
-
-/***********************************************
- * The storage schemes are defined as follows. *
- ***********************************************/
-
-/* Stype == SLU_NC (Also known as Harwell-Boeing sparse matrix format) */
-typedef struct {
-    int_t  nnz;	    /* number of nonzeros in the matrix */
-    void *nzval;    /* pointer to array of nonzero values, packed by column */
-    int_t  *rowind; /* pointer to array of row indices of the nonzeros */
-    int_t  *colptr; /* pointer to array of beginning of columns in nzval[] 
-		       and rowind[]  */
-                    /* Note:
-		       Zero-based indexing is used;
-		       colptr[] has ncol+1 entries, the last one pointing
-		       beyond the last column, so that colptr[ncol] = nnz. */
-} NCformat;
-
-/* Stype == SLU_NR */
-typedef struct {
-    int_t  nnz;	    /* number of nonzeros in the matrix */
-    void *nzval;    /* pointer to array of nonzero values, packed by raw */
-    int_t  *colind; /* pointer to array of columns indices of the nonzeros */
-    int_t  *rowptr; /* pointer to array of beginning of rows in nzval[] 
-		       and colind[]  */
-                    /* Note:
-		       Zero-based indexing is used;
-		       rowptr[] has nrow+1 entries, the last one pointing
-		       beyond the last row, so that rowptr[nrow] = nnz. */
-} NRformat;
-
-/* Stype == SLU_SC */
-typedef struct {
-  int_t  nnz;	     /* number of nonzeros in the matrix */
-  int_t  nsuper;     /* number of supernodes, minus 1 */
-  void *nzval;       /* pointer to array of nonzero values, packed by column */
-  int_t *nzval_colptr;/* pointer to array of beginning of columns in nzval[] */
-  int_t *rowind;     /* pointer to array of compressed row indices of 
-			rectangular supernodes */
-  int_t *rowind_colptr;/* pointer to array of beginning of columns in rowind[] */
-  int_t *col_to_sup;   /* col_to_sup[j] is the supernode number to which column 
-			j belongs; mapping from column to supernode number. */
-  int_t *sup_to_col;   /* sup_to_col[s] points to the start of the s-th 
-			supernode; mapping from supernode number to column.
-		        e.g.: col_to_sup: 0 1 2 2 3 3 3 4 4 4 4 4 4 (ncol=12)
-		              sup_to_col: 0 1 2 4 7 12           (nsuper=4) */
-                     /* Note:
-		        Zero-based indexing is used;
-		        nzval_colptr[], rowind_colptr[], col_to_sup and
-		        sup_to_col[] have ncol+1 entries, the last one
-		        pointing beyond the last column.
-		        For col_to_sup[], only the first ncol entries are
-		        defined. For sup_to_col[], only the first nsuper+2
-		        entries are defined. */
-} SCformat;
-
-/* Stype == SLU_NCP */
-typedef struct {
-    int_t nnz;	  /* number of nonzeros in the matrix */
-    void *nzval;  /* pointer to array of nonzero values, packed by column */
-    int_t *rowind;/* pointer to array of row indices of the nonzeros */
-		  /* Note: nzval[]/rowind[] always have the same length */
-    int_t *colbeg;/* colbeg[j] points to the beginning of column j in nzval[] 
-                     and rowind[]  */
-    int_t *colend;/* colend[j] points to one past the last element of column
-		     j in nzval[] and rowind[]  */
-		  /* Note:
-		     Zero-based indexing is used;
-		     The consecutive columns of the nonzeros may not be 
-		     contiguous in storage, because the matrix has been 
-		     postmultiplied by a column permutation matrix. */
-} NCPformat;
-
-/* Stype == SLU_DN */
-typedef struct {
-    int_t lda;    /* leading dimension */
-    void *nzval;  /* array of size lda*ncol to represent a dense matrix */
-} DNformat;
-
-/* Stype == SLU_NR_loc (Distributed Compressed Row Format) */
-typedef struct {
-    int_t nnz_loc;   /* number of nonzeros in the local submatrix */
-    int_t m_loc;     /* number of rows local to this processor */
-    int_t fst_row;   /* global index of the first row */
-    void  *nzval;    /* pointer to array of nonzero values, packed by row */
-    int_t *rowptr;   /* pointer to array of beginning of rows in nzval[] 
-			and colind[]  */
-    int_t *colind;   /* pointer to array of column indices of the nonzeros */
-                     /* Note:
-			Zero-based indexing is used;
-			rowptr[] has n_loc + 1 entries, the last one pointing
-			beyond the last row, so that rowptr[n_loc] = nnz_loc.*/
-} NRformat_loc;
-
-
-#endif  /* __SUPERLU_SUPERMATRIX */
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/symbfact.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/symbfact.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/symbfact.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/symbfact.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,863 +0,0 @@
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-/*
- * Modified by X. S. Li.
- */
-
-#include "superlu_ddefs.h"
-
-/* What type of supernodes we want */
-#define T2_SUPER
-
-
-/*
- * Internal protypes
- */
-static void  relax_snode(int_t, int_t *, int_t, int_t *, int_t *);
-static int_t snode_dfs(SuperMatrix *, const int_t, const int_t, int_t *,
-		       int_t *,	Glu_persist_t *, Glu_freeable_t *);
-static int_t column_dfs(SuperMatrix *, const int_t, int_t *, int_t *, int_t *,
-			int_t *, int_t *, int_t *, int_t *, int_t *,
-			Glu_persist_t *, Glu_freeable_t *);
-static int_t pivotL(const int_t, int_t *, int_t *,
-		    Glu_persist_t *, Glu_freeable_t *);
-static int_t set_usub(const int_t, const int_t, const int_t, int_t *, int_t *,
-		      Glu_persist_t *, Glu_freeable_t *);
-static void  pruneL(const int_t, const int_t *, const int_t, const int_t,
-		    const int_t *, const int_t *, int_t *,
-		    Glu_persist_t *, Glu_freeable_t *);
-
-
-/************************************************************************/
-int_t symbfact
-/************************************************************************/
-(
- superlu_options_t *options, /* input options */
- int         pnum,     /* process number */
- SuperMatrix *A,       /* original matrix A permuted by columns (input) */
- int_t       *perm_c,  /* column permutation vector (input) */
- int_t       *etree,   /* column elimination tree (input) */
- Glu_persist_t *Glu_persist,  /* output */
- Glu_freeable_t *Glu_freeable /* output */
- )
-{
-/* 
- * Purpose
- * =======
- *   symbfact() performs a symbolic factorization on matrix A and sets up 
- *   the nonzero data structures which are suitable for supernodal Gaussian
- *   elimination with no pivoting (GENP). This routine features:
- *        o depth-first search (DFS)
- *        o supernodes
- *        o symmetric structure pruning
- *
- * Return value
- * ============
- *   < 0, number of bytes needed for LSUB.
- *   = 0, matrix dimension is 1.
- *   > 0, number of bytes allocated when out of memory.
- *
- */
-    int_t m, n, min_mn, j, i, k, irep, nseg, pivrow, info;
-    int_t *iwork, *perm_r, *segrep, *repfnz;
-    int_t *xprune, *marker, *parent, *xplore;
-    int_t relax, *desc, *relax_end;
-    int_t nnzL, nnzU;
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(pnum, "Enter symbfact()");
-#endif
-
-    m = A->nrow;
-    n = A->ncol;
-    min_mn = SUPERLU_MIN(m, n);
-
-    /* Allocate storage common to the symbolic factor routines */
-    info = symbfact_SubInit(DOFACT, NULL, 0, m, n, ((NCPformat*)A->Store)->nnz,
-			    Glu_persist, Glu_freeable);
-
-    iwork = (int_t *) intMalloc_dist(6*m+2*n);
-    perm_r = iwork;
-    segrep = iwork + m;
-    repfnz = segrep + m;
-    marker = repfnz + m;
-    parent = marker + m;
-    xplore = parent + m;
-    xprune = xplore + m;
-    relax_end = xprune + n;
-    relax = sp_ienv_dist(2);
-    ifill_dist(perm_r, m, EMPTY);
-    ifill_dist(repfnz, m, EMPTY);
-    ifill_dist(marker, m, EMPTY);
-    Glu_persist->supno[0] = -1;
-    Glu_persist->xsup[0] = 0;
-    Glu_freeable->xlsub[0] = 0;
-    Glu_freeable->xusub[0] = 0;
-
-    /*for (j = 0; j < n; ++j) iperm_c[perm_c[j]] = j;*/
-
-    /* Identify relaxed supernodes. */
-    if ( !(desc = intMalloc_dist(n+1)) )
-	ABORT("Malloc fails for desc[]");;
-    relax_snode(n, etree, relax, desc, relax_end);
-    SUPERLU_FREE(desc);
-    
-    for (j = 0; j < min_mn; ) {
-	if ( relax_end[j] != EMPTY ) { /* beginning of a relaxed snode */
-   	    k = relax_end[j];          /* end of the relaxed snode */
-	 
-	    /* Determine union of the row structure of supernode (j:k). */
-	    if ( (info = snode_dfs(A, j, k, xprune, marker,
-				   Glu_persist, Glu_freeable)) != 0 )
-		return info;
-
-	    for (i = j; i <= k; ++i)
-		pivotL(i, perm_r, &pivrow, Glu_persist, Glu_freeable); 
-
-	    j = k+1;
-	} else {
-	    /* Perform a symbolic factorization on column j, and detects
-	       whether column j starts a new supernode. */
-	    if ((info = column_dfs(A, j, perm_r, &nseg, segrep, repfnz,
-				   xprune, marker, parent, xplore,
-				   Glu_persist, Glu_freeable)) != 0)
-		return info;
-	    
-	    /* Copy the U-segments to usub[*]. */
-	    if ((info = set_usub(min_mn, j, nseg, segrep, repfnz,
-				 Glu_persist, Glu_freeable)) != 0)
-		return info;
-
-	    pivotL(j, perm_r, &pivrow, Glu_persist, Glu_freeable); 
-
-	    /* Prune columns [0:j-1] using column j. */
-	    pruneL(j, perm_r, pivrow, nseg, segrep, repfnz, xprune,
-		   Glu_persist, Glu_freeable);
-
-	    /* Reset repfnz[*] to prepare for the next column. */
-	    for (i = 0; i < nseg; i++) {
-		irep = segrep[i];
-		repfnz[irep] = EMPTY;
-	    }
-
-	    ++j;
-	} /* else */
-    } /* for j ... */
-
-    countnz_dist(min_mn, xprune, &nnzL, &nnzU, Glu_persist, Glu_freeable);
-
-    /* Apply perm_r to L; Compress LSUB array. */
-    i = fixupL_dist(min_mn, perm_r, Glu_persist, Glu_freeable);
-
-    if ( !pnum && (options->PrintStat == YES)) {
-	printf("\tNonzeros in L       %ld\n", nnzL);
-	printf("\tNonzeros in U       %ld\n", nnzU);
-	printf("\tnonzeros in L+U     %ld\n", nnzL + nnzU - min_mn);
-	printf("\tnonzeros in LSUB    %ld\n", i);
-    }
-    SUPERLU_FREE(iwork);
-
-#if ( PRNTlevel>=3 )
-    PrintInt10("lsub", Glu_freeable->xlsub[n], Glu_freeable->lsub);
-    PrintInt10("xlsub", n+1, Glu_freeable->xlsub);
-    PrintInt10("xprune", n, xprune);
-    PrintInt10("usub", Glu_freeable->xusub[n], Glu_freeable->usub);
-    PrintInt10("xusub", n+1, Glu_freeable->xusub);
-    PrintInt10("supno", n, Glu_persist->supno);
-    PrintInt10("xsup", (Glu_persist->supno[n])+2, Glu_persist->xsup);
-#endif
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(pnum, "Exit symbfact()");
-#endif
-
-    return (-i);
-
-} /* SYMBFACT */
-
-/************************************************************************/
-static void relax_snode
-/************************************************************************/
-(
- const int_t n, /* number of columns in the matrix (input) */
- int_t       *et,   /* column elimination tree (input) */
- const int_t relax, /* max no of columns allowed in a relaxed snode (input) */
- int_t       *desc, /* number of descendants of each etree node. */
- int_t       *relax_end /* last column in a supernode (output) */
- )
-{
-/*
- * Purpose
- * =======
- *   relax_snode() identifies the initial relaxed supernodes, assuming that 
- *   the matrix has been reordered according to an postorder of the etree.
- *
- */ 
-    register int_t j, parent, nsuper;
-    register int_t fsupc; /* beginning of a snode */
-    
-    ifill_dist(relax_end, n, EMPTY);
-    ifill_dist(desc, n+1, 0);
-    nsuper = 0;
-
-    /* Compute the number of descendants of each node in the etree. */
-    for (j = 0; j < n; j++) {
-	parent = et[j];
-	if ( parent != n )  /* not the dummy root */
-	    desc[parent] += desc[j] + 1;
-    }
-
-    /* Identify the relaxed supernodes by postorder traversal of the etree. */
-    for (j = 0; j < n; ) { 
-     	parent = et[j];
-        fsupc = j;
- 	while ( parent != n && desc[parent] < relax ) {
-	    j = parent;
-	    parent = et[j];
-	}
-	/* Found a supernode with j being the last column. */
-	relax_end[fsupc] = j; /* Last column is recorded. */
-	++nsuper;
-	++j;
-	/* Search for a new leaf. */
-	while ( desc[j] != 0 && j < n ) ++j;
-    }
-
-#if ( DEBUGlevel>=1 )
-    printf(".. No of relaxed snodes: %d\trelax: %d\n", nsuper, relax);
-#endif
-} /* RELAX_SNODE */
-
-
-/************************************************************************/
-static int_t snode_dfs
-/************************************************************************/
-(
- SuperMatrix *A,       /* original matrix A permuted by columns (input) */
- const int_t jcol,      /* beginning of the supernode (input) */
- const int_t kcol,      /* end of the supernode (input) */
- int_t       *xprune,   /* pruned location in each adjacency list (output) */
- int_t       *marker,   /* working array of size m */
- Glu_persist_t *Glu_persist,   /* global LU data structures (modified) */
- Glu_freeable_t *Glu_freeable
- )
-{
-/* 
- * Purpose
- * =======
- *    snode_dfs() determines the union of the row structures of those 
- *    columns within the relaxed snode.
- *    Note: The relaxed snodes are leaves of the supernodal etree, therefore, 
- *    the part outside the rectangular supernode must be zero.
- *
- * Return value
- * ============
- *    0   success;
- *   >0   number of bytes allocated when run out of memory.
- *
- */
-    NCPformat *Astore;
-    int_t  *asub, *xa_begin, *xa_end;
-    register int_t i, k, ifrom, ito, nextl, new_next;
-    int_t  nsuper, krow, kmark, mem_error;
-    int_t  *xsup, *supno;
-    int_t  *lsub, *xlsub;
-    int_t  nzlmax, nextu;
-    
-    Astore   = A->Store;
-    asub     = Astore->rowind;
-    xa_begin = Astore->colbeg;
-    xa_end   = Astore->colend;
-    xsup     = Glu_persist->xsup;
-    supno    = Glu_persist->supno;
-    lsub     = Glu_freeable->lsub;
-    xlsub    = Glu_freeable->xlsub;
-    nzlmax   = Glu_freeable->nzlmax;
-    nsuper   = ++supno[jcol]; /* Next available supernode number */
-    nextl    = xlsub[jcol];
-    nextu    = Glu_freeable->xusub[jcol];
-
-    for (i = jcol; i <= kcol; i++) {
-	/* For each nonzero in A[*,i] */
-	for (k = xa_begin[i]; k < xa_end[i]; ++k) {
-	    krow = asub[k];
-	    kmark = marker[krow];
-	    if ( kmark != kcol ) { /* First time visit krow */
-		marker[krow] = kcol;
-		lsub[nextl++] = krow;
-		if ( nextl >= nzlmax ) {
-		    if (mem_error = symbfact_SubXpand(A->ncol, jcol, nextl,
-						      LSUB, &nzlmax,
-						      Glu_freeable))
-			return (mem_error);
-		    lsub = Glu_freeable->lsub;
-		}
-	    }
-    	}
-	supno[i] = nsuper;
-	Glu_freeable->xusub[i+1] = nextu; /* Tidy up the pointers in usub[*]. */
-    }
-
-    /* Supernode > 1, then make a copy of the subscripts for pruning */
-    if ( jcol < kcol ) {
-	new_next = nextl + (nextl - xlsub[jcol]);
-	while ( new_next > nzlmax ) {
-	    if (mem_error = symbfact_SubXpand(A->ncol, jcol, nextl, LSUB,
-					      &nzlmax, Glu_freeable))
-		return (mem_error);
-	    lsub = Glu_freeable->lsub;
-	}
-	ito = nextl;
-	for (ifrom = xlsub[jcol]; ifrom < nextl; )
-	    lsub[ito++] = lsub[ifrom++];	
-        for (i = jcol+1; i <= kcol; i++) xlsub[i] = nextl;
-	nextl = ito;
-    }
-
-    xsup[nsuper+1] = kcol + 1;
-    supno[kcol+1]  = nsuper;
-    xprune[kcol]   = nextl;
-    xlsub[kcol+1]  = nextl;
-#if ( PRNTlevel>=3 )
-    printf(".. snode_dfs(): (%8d:%8d) nextl %d\n", jcol, kcol, nextl);
-#endif
-    return 0;
-} /* SNODE_DFS */
-
-
-
-/************************************************************************/
-static int_t column_dfs
-/************************************************************************/
-(
- SuperMatrix *A,        /* original matrix A permuted by columns (input) */
- const int_t jcol,      /* current column number (input) */
- int_t       *perm_r,   /* row permutation vector (input) */
- int_t       *nseg,     /* number of U-segments in column jcol (output) */
- int_t       *segrep,   /* list of U-segment representatives (output) */
- int_t       *repfnz,   /* list of first nonzeros in the U-segments (output) */
- int_t       *xprune,   /* pruned location in each adjacency list (output) */
- int_t       *marker,   /* working array of size m */
- int_t       *parent,   /* working array of size m */
- int_t       *xplore,   /* working array of size m */
- Glu_persist_t *Glu_persist,   /* global LU data structures (modified) */
- Glu_freeable_t *Glu_freeable
- )
-{
-/* 
- * Purpose
- * =======
- *   column_dfs() performs a symbolic factorization on column jcol, and
- *   detects the supernode boundary. This routine uses the row indices of
- *   A[*,jcol] to start the depth-first search (DFS).
- *
- * Output
- * ======
- *   A supernode representative is the last column of a supernode.
- *   The nonzeros in U[*,j] are segments that end at supernodal
- *   representatives. The routine returns a list of such supernodal 
- *   representatives ( segrep[*] ) in topological order of the DFS that 
- *   generates them. The location of the first nonzero in each such 
- *   supernodal segment is also returned ( repfnz[*] ).
- *
- * Data structure
- * ==============
- *   (lsub, xlsub):
- *      lsub[*] contains the compressed subscripts of the supernodes;
- *      xlsub[j] points to the starting location of the j-th column in
- *               lsub[*]; 
- *	Storage: original row subscripts in A.
- *
- *      During the course of symbolic factorization, we also use
- *	(lsub, xlsub, xprune) for the purpose of symmetric pruning.
- *      For each supernode {s,s+1,...,t=s+r} with first column s and last
- *	column t, there are two subscript sets,  the last column
- *      structures (for pruning) will be removed in the end.
- *        o lsub[j], j = xlsub[s], ..., xlsub[s+1]-1
- *          is the structure of column s (i.e. structure of this supernode).
- *          It is used for the storage of numerical values.
- *	  o lsub[j], j = xlsub[t], ..., xlsub[t+1]-1
- *	    is the structure of the last column t of this supernode.
- *	    It is for the purpose of symmetric pruning. Therefore, the
- *	    structural subscripts can be rearranged without making physical
- *	    interchanges among the numerical values.
- *
- *      (1) if t > s, only the subscript sets for column s and column t
- *          are stored. Column t represents pruned adjacency structure.
- *
- *                  --------------------------------------------
- *          lsub[*]    ... |   col s    |   col t   | ...
- *                  --------------------------------------------
- *                          ^            ^           ^
- *                       xlsub[s]    xlsub[s+1]  xlsub[t+1]
- *                                       :           :
- *                                       :         xprune[t]
- *                                   xlsub[t]      
- *                                   xprune[s]    
- *
- *      (2) if t == s, i.e., a singleton supernode, the same subscript set
- *          is used for both G(L) and pruned graph:
- *
- *                  --------------------------------------
- *          lsub[*]    ... |      s     | ...
- *                  --------------------------------------
- *                          ^            ^   
- *                       xlsub[s]   xlsub[s+1]  
- *                                  xprune[s]
- *
- *       DFS will traverse the second subscript list, i.e., the part of the
- *       pruned graph.
- *
- * Local parameters
- * ================
- *   nseg: no of segments in current U[*,j]
- *   jsuper: jsuper=EMPTY if column j does not belong to the same
- *	supernode as j-1. Otherwise, jsuper=nsuper.
- *
- *   marker: A-row --> A-row/col (0/1)
- *   repfnz: SuperA-col --> PA-row
- *   parent: SuperA-col --> SuperA-col
- *   xplore: SuperA-col --> index to L-structure
- *
- * Return value
- * ============
- *     0  success;
- *   > 0  number of bytes allocated when run out of space.
- *
- */
-    NCPformat *Astore;
-    int_t     *asub, *xa_begin, *xa_end;
-    int_t     jcolp1, jcolm1, jsuper, nsuper, nextl;
-    int_t     k, krep, krow, kmark, kperm;
-    int_t     fsupc; /* first column of a supernode */
-    int_t     myfnz; /* first nonzero column of a U-segment */
-    int_t     chperm, chmark, chrep, kchild;
-    int_t     xdfs, maxdfs, kpar, oldrep;
-    int_t     jptr, jm1ptr;
-    int_t     ito, ifrom, istop;	/* used to compress row subscripts */
-    int_t     *xsup, *supno, *lsub, *xlsub;
-    int_t     nzlmax;
-    static int_t first = 1, maxsuper;
-    int_t     mem_error;
-    
-    /* Initializations */
-    Astore   = A->Store;
-    asub     = Astore->rowind;
-    xa_begin = Astore->colbeg;
-    xa_end   = Astore->colend;
-    xsup     = Glu_persist->xsup;
-    supno    = Glu_persist->supno;
-    lsub     = Glu_freeable->lsub;
-    xlsub    = Glu_freeable->xlsub;
-    nzlmax   = Glu_freeable->nzlmax;
-    jcolp1   = jcol + 1;
-    jcolm1   = jcol - 1;
-    jsuper   = nsuper = supno[jcol];
-    nextl    = xlsub[jcol];
-    if ( first ) {
-	maxsuper = sp_ienv_dist(3);
-	first = 0;
-    }
-    
-    *nseg = 0;
-
-    /* For each nonzero in A[*,jcol] perform depth-first search. */
-    for (k = xa_begin[jcol]; k < xa_end[jcol]; ++k) {
-	krow = asub[k];
-	kmark = marker[krow];
-
-	/* krow was visited before, go to the next nonzero. */
-	if ( kmark == jcol ) continue; 
-	
-	/* 
-	 * For each unmarked neighber krow of jcol ...
-	 */
-	marker[krow] = jcol;
-	kperm = perm_r[krow];
-
-	if ( kperm == EMPTY ) {
-	    /* krow is in L:
-	     * place it in structure of L[*,jcol].
-	     */
-	    lsub[nextl++] = krow; 	/* krow is indexed into A */
-	    if ( nextl >= nzlmax ) {
-		if ( mem_error = symbfact_SubXpand(A->ncol, jcol, nextl, LSUB,
-						   &nzlmax, Glu_freeable) )
-		    return (mem_error);
-		lsub = Glu_freeable->lsub;
-	    }
-	    if ( kmark != jcolm1 ) jsuper = EMPTY; /* Row index subset test */
-	} else {
-	    /* krow is in U:
-	     * If its supernode krep has been explored, update repfnz[*].
-	     */
-	    krep = xsup[supno[kperm]+1] - 1;
-	    myfnz = repfnz[krep];
-	    
-	    if ( myfnz != EMPTY ) { /* krep was visited before */
-		if ( kperm < myfnz ) repfnz[krep] = kperm;
-		/* continue; */
-	    } else {
-		/* Otherwise perform DFS, starting at krep */
-		oldrep = EMPTY;
-		parent[krep] = oldrep;
-		repfnz[krep] = kperm;
-		xdfs = xlsub[krep];
-		maxdfs = xprune[krep];
-		
-		do {
-		    /* 
-		     * For each unmarked kchild of krep 
-		     */
-		    while ( xdfs < maxdfs ) {
-			kchild = lsub[xdfs++];
-			chmark = marker[kchild];
-			
-			if ( chmark != jcol ) { /* Not reached yet */
-			    marker[kchild] = jcol;
-			    chperm = perm_r[kchild];
-			    
-			    /* Case kchild is in L: place it in L[*,k] */
-			    if ( chperm == EMPTY ) {
-				lsub[nextl++] = kchild;
-				if ( nextl >= nzlmax ) {
-				    if ( mem_error =
-					symbfact_SubXpand(A->ncol, jcol, nextl,
-							  LSUB, &nzlmax,
-							  Glu_freeable) )
-					return (mem_error);
-				    lsub = Glu_freeable->lsub;
-				}
-				if ( chmark != jcolm1 ) jsuper = EMPTY;
-			    } else {
-				/* Case kchild is in U: 
-				 * chrep = its supernode-rep. If its rep 
-				 * has been explored, update its repfnz[*].
-				 */
-				chrep = xsup[supno[chperm]+1] - 1;
-				myfnz = repfnz[chrep];
-				if ( myfnz != EMPTY ) {/* Visited before */
-				    if (chperm < myfnz) repfnz[chrep] = chperm;
-				} else {
-				    /* Continue DFS at sup-rep of kchild */
-				    xplore[krep] = xdfs;
-				    oldrep = krep;
-				    krep = chrep; /* Go deeper down G(L') */
-				    parent[krep] = oldrep;
-				    repfnz[krep] = chperm;
-				    xdfs = xlsub[krep];     
-				    maxdfs = xprune[krep];
-				} /* else */
-			    } /* else */
-			} /* if */
-			
-		    } /* while */
-		    
-		    /* krow has no more unexplored neighbors:
-		     *    place supernode-rep krep in postorder DFS;
-		     *    backtrack DFS to its parent.
-		     */
-		    segrep[*nseg] = krep;
-		    ++(*nseg);
-		    kpar = parent[krep]; /* Pop from stack; recurse */
-		    if ( kpar == EMPTY ) break; /* DFS done */
-		    krep = kpar;
-		    xdfs = xplore[krep];
-		    maxdfs = xprune[krep];
-		} while ( kpar != EMPTY ); /* Until empty stack */
-	    } /* else */
-	} /* else */
-    } /* for each nonzero ... */
-    
-    /* Check to see if jcol belongs in the same supernode as jcol-1 */
-    if ( jcol == 0 ) { /* Do nothing for column 0 */
-	nsuper = supno[0] = 0;
-    } else {
-	fsupc = xsup[nsuper];
-	jptr = xlsub[jcol];	/* Not compressed yet */
-	jm1ptr = xlsub[jcolm1];
-	
-#ifdef T2_SUPER
-	if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = EMPTY;
-#endif
-	/* Make sure the number of columns in a supernode doesn't
-	   exceed threshold. */
-	if ( jcol - fsupc >= maxsuper ) jsuper = EMPTY;
-	
-	/* If jcol starts a new supernode, reclaim storage space in
-	 * lsub[*] from the previous supernode. Note we only store
-	 * the subscript set of the first and last columns of
-	 * a supernode. (first for G(L'), last for pruned graph)
-	 */
-	if ( jsuper ==EMPTY ) { /* Starts a new supernode */
-	    if ( (fsupc < jcolm1-1) ) { /* >= 3 columns in nsuper */
-#ifdef CHK_COMPRESS
-		printf("  Compress lsub[] at super %d-%d\n",fsupc,jcolm1);
-#endif
-		ito = xlsub[fsupc+1];
-		xlsub[jcolm1] = ito;
-		istop = ito + jptr - jm1ptr;
-		xprune[jcolm1] = istop; /* Initialize xprune[jcol-1] */
-		xlsub[jcol] = istop;
-		for (ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito)
-		    lsub[ito] = lsub[ifrom];
-		nextl = ito;            /* = istop + length(jcol) */
-	    }
-	    ++nsuper;
-	    supno[jcol] = nsuper;
-	} /* if a new supernode */
-	
-    } /* else: jcol > 0 */ 
-    
-    /* Tidy up the pointers before exit */
-    xsup[nsuper+1] = jcolp1;
-    supno[jcolp1]  = nsuper;
-    xprune[jcol]   = nextl; /* Initialize an upper bound for pruning. */
-    xlsub[jcolp1]  = nextl;
-    return 0;
-} /* COLUMN_DFS */
-
-/************************************************************************/
-static int_t pivotL
-/************************************************************************/
-(
- const int_t jcol,     /* current column number     (input)    */
- int_t       *perm_r,  /* row permutation vector    (output)   */
- int_t       *pivrow,  /* the pivot row index       (output)   */
- Glu_persist_t *Glu_persist,   /* global LU data structures (modified) */
- Glu_freeable_t *Glu_freeable
- )
-{
-/* Purpose
- * =======
- *   pivotL() interchanges row subscripts so that each diagonal block of a
- *   supernode in L has the row subscripts sorted in order of pivots.
- *   The row subscripts in the off-diagonal block are not sorted.
- *
- */
-    int_t  fsupc;	/* first column in the supernode */
-    int_t  nsupc;	/* number of columns in the supernode */
-    int_t  nsupr;       /* number of rows in the supernode */
-    int_t  lptr;	/* point_ts to the first subscript of the supernode */
-    int_t  diag, diagind;
-    int_t  *lsub_ptr;
-    int_t  isub, itemp;
-    int_t  *lsub, *xlsub;
-
-    /* Initialization. */
-    lsub     = Glu_freeable->lsub;
-    xlsub    = Glu_freeable->xlsub;
-    fsupc    = (Glu_persist->xsup)[(Glu_persist->supno)[jcol]];
-    nsupc    = jcol - fsupc; /* excluding jcol; nsupc >= 0 */
-    lptr     = xlsub[fsupc];
-    nsupr    = xlsub[fsupc+1] - lptr;
-    lsub_ptr = &lsub[lptr]; /* start of row indices of the supernode */
-
-    /* Search for diagonal element. */
-    /* diagind = iperm_c[jcol];*/
-    diagind = jcol;
-    diag = EMPTY;
-    for (isub = nsupc; isub < nsupr; ++isub)
-	if ( lsub_ptr[isub] == diagind ) {
-	    diag = isub;
-	    break;
-	}
-
-    /* Diagonal pivot exists? */
-    if ( diag == EMPTY ) {
-	printf("At column %d, ", jcol);
-	ABORT("pivotL() encounters zero diagonal");
-    }
-
-    /* Record pivot row. */
-    *pivrow = lsub_ptr[diag];
-    perm_r[*pivrow] = jcol;  /* perm_r[] should be Identity. */
-    /*assert(*pivrow==jcol);*/
-    
-    /* Interchange row subscripts. */
-    if ( diag != nsupc ) {
-	itemp = lsub_ptr[diag];
-	lsub_ptr[diag] = lsub_ptr[nsupc];
-	lsub_ptr[nsupc] = itemp;
-    }
-
-    return 0;
-} /* PIVOTL */
-
-
-/************************************************************************/
-static int_t set_usub
-/************************************************************************/
-(
- const int_t n,       /* total number of columns (input) */
- const int_t jcol,    /* current column number (input) */
- const int_t nseg,    /* number of supernodal segments in U[*,jcol] (input) */
- int_t       *segrep, /* list of U-segment representatives (output) */
- int_t       *repfnz, /* list of first nonzeros in the U-segments (output) */
- Glu_persist_t *Glu_persist,   /* global LU data structures (modified) */
- Glu_freeable_t *Glu_freeable
- )
-{
-/* 
- * Purpose
- * =======
- *   set_usub() sets up data structure to store supernodal segments in U.
- *   The supernodal segments in each column are stored in topological order.
- *   
- * NOTE
- * ====
- *   For each supernodal segment, we only store the index of the first
- *   nonzero index, rather than the indices of the whole segment, because
- *   those indices can be generated from first nonzero and supnodal
- *   representative.
- *   Therefore, for G(U), we store the "skeleton" of it.
- *   
- */
-    int_t ksub, krep, ksupno;
-    int_t k, kfnz;
-    int_t jsupno, nextu;
-    int_t new_next, mem_error;
-    int_t *supno;
-    int_t *usub, *xusub;
-    int_t nzumax;
-
-    supno   = Glu_persist->supno;
-    usub    = Glu_freeable->usub;
-    xusub   = Glu_freeable->xusub;
-    nzumax  = Glu_freeable->nzumax;
-    jsupno  = supno[jcol];
-    nextu   = xusub[jcol];
-
-    new_next = nextu + nseg;
-    while ( new_next > nzumax ) {
-	if (mem_error = symbfact_SubXpand(n, jcol, nextu, USUB, &nzumax,
-					  Glu_freeable))
-	    return (mem_error);
-	usub = Glu_freeable->usub;
-    }
-
-    /* We store U-segments in topological order. */
-    k = nseg - 1;
-    for (ksub = 0; ksub < nseg; ++ksub) {
-	krep = segrep[k--];
-	ksupno = supno[krep];
-
-	if ( ksupno != jsupno ) { /* Should go into usub[*] */
-	    kfnz = repfnz[krep];
-	    if ( kfnz != EMPTY ) { /* Nonzero U-segment */
-		usub[nextu++] = kfnz;
-
-/*	    	fsupc = xsup[ksupno];
-	        isub = xlsub[fsupc] + kfnz - fsupc;
-		irow = lsub[isub];
-		usub[nextu++] = perm_r[irow];*/
-	    } /* if ... */
-	} /* if ... */
-    } /* for each segment... */
-
-    xusub[jcol + 1] = nextu; /* Close U[*,jcol] */
-    return 0;
-} /* SET_USUB */
-
-
-/************************************************************************/
-static void pruneL
-/************************************************************************/
-(
- const int_t  jcol,    /* in */
- const int_t  *perm_r, /* in */
- const int_t  pivrow,  /* in */
- const int_t  nseg,    /* in */
- const int_t  *segrep, /* in */
- const int_t  *repfnz, /* in */
- int_t  *xprune,       /* out */
- Glu_persist_t *Glu_persist,   /* global LU data structures (modified) */
- Glu_freeable_t *Glu_freeable
- )
-{
-/*
- * Purpose
- * =======
- *   pruneL() prunes the L-structure of supernodes whose L-structure
- *   contains the current pivot row "pivrow".
- *
- */
-    int_t  jsupno, irep, irep1, kmin, kmax, krow;
-    int_t  i, ktemp;
-    int_t  do_prune; /* logical variable */
-    int_t  *supno;
-    int_t  *lsub, *xlsub;
-
-    supno  = Glu_persist->supno;
-    lsub   = Glu_freeable->lsub;
-    xlsub  = Glu_freeable->xlsub;
-    
-    /*
-     * For each supernode-rep irep in U[*,j]
-     */
-    jsupno = supno[jcol];
-    for (i = 0; i < nseg; i++) {
-	irep = segrep[i];
-	irep1 = irep + 1;
-
-	/* Do not prune with a zero U-segment */
- 	if ( repfnz[irep] == EMPTY ) continue;
-
-	/*
-	 * If irep has not been pruned & it has a nonzero in row L[pivrow,i]
-	 */
-	do_prune = FALSE;
-	if ( supno[irep] != jsupno ) {
-	    if ( xprune[irep] >= xlsub[irep1] ) {
-		kmin = xlsub[irep];
-		kmax = xlsub[irep1] - 1;
-		for (krow = kmin; krow <= kmax; ++krow) 
-		    if ( lsub[krow] == pivrow ) {
-			do_prune = TRUE;
-			break;
-		    }
-	    }
-	    
-    	    if ( do_prune ) {
-	     	/* Do a quicksort-type partition. */
-	        while ( kmin <= kmax ) {
-	    	    if ( perm_r[lsub[kmax]] == EMPTY ) 
-			kmax--;
-		    else if ( perm_r[lsub[kmin]] != EMPTY )
-			kmin++;
-		    else { /* kmin below pivrow, and kmax above pivrow: 
-		            * 	   interchange the two subscripts
-			    */
-		        ktemp = lsub[kmin];
-		        lsub[kmin] = lsub[kmax];
-		        lsub[kmax] = ktemp;
-		        kmin++;
-		        kmax--;
-		    }
-	        } /* while */
-	        xprune[irep] = kmin; /* Pruning */
-#if ( DEBUGlevel>=3 )
-		printf(".. pruneL(): use col %d: xprune[%d] = %d\n",
-		       jcol, irep, kmin);
-#endif
-	    } /* if do_prune */
-	} /* if */
-    } /* for each U-segment ... */
-} /* PRUNEL */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/util.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/util.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/util.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/util.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,769 +0,0 @@
-/*
- * -- Distributed SuperLU routine (version 2.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * February 1, 2003
- *
- */
-
-#include <math.h>
-#include "superlu_ddefs.h"
-
-
-/* Deallocate the structure pointing to the actual storage of the matrix. */
-void
-Destroy_SuperMatrix_Store_dist(SuperMatrix *A)
-{
-    SUPERLU_FREE ( A->Store );
-}
-
-void
-Destroy_CompCol_Matrix_dist(SuperMatrix *A)
-{
-    NCformat *Astore = A->Store;
-    SUPERLU_FREE( Astore->rowind );
-    SUPERLU_FREE( Astore->colptr );
-    if ( Astore->nzval ) SUPERLU_FREE( Astore->nzval );
-    SUPERLU_FREE( Astore );
-}
-
-void
-Destroy_CompRowLoc_Matrix_dist(SuperMatrix *A)
-{
-    NRformat_loc *Astore = A->Store;
-    SUPERLU_FREE( Astore->rowptr );
-    SUPERLU_FREE( Astore->colind );
-    SUPERLU_FREE( Astore->nzval );
-    SUPERLU_FREE( Astore );
-}
-
-void
-Destroy_CompRow_Matrix_dist(SuperMatrix *A)
-{
-    SUPERLU_FREE( ((NRformat *)A->Store)->rowptr );
-    SUPERLU_FREE( ((NRformat *)A->Store)->colind );
-    SUPERLU_FREE( ((NRformat *)A->Store)->nzval );
-    SUPERLU_FREE( A->Store );
-}
-
-void
-Destroy_SuperNode_Matrix_dist(SuperMatrix *A)
-{
-    SUPERLU_FREE ( ((SCformat *)A->Store)->rowind );
-    SUPERLU_FREE ( ((SCformat *)A->Store)->rowind_colptr );
-    SUPERLU_FREE ( ((SCformat *)A->Store)->nzval );
-    SUPERLU_FREE ( ((SCformat *)A->Store)->nzval_colptr );
-    SUPERLU_FREE ( ((SCformat *)A->Store)->col_to_sup );
-    SUPERLU_FREE ( ((SCformat *)A->Store)->sup_to_col );
-    SUPERLU_FREE ( A->Store );
-}
-
-/* A is of type Stype==NCP */
-void
-Destroy_CompCol_Permuted_dist(SuperMatrix *A)
-{
-    SUPERLU_FREE ( ((NCPformat *)A->Store)->colbeg );
-    SUPERLU_FREE ( ((NCPformat *)A->Store)->colend );
-    SUPERLU_FREE ( A->Store );
-}
-
-/* A is of type Stype==DN */
-void
-Destroy_Dense_Matrix_dist(SuperMatrix *A)
-{
-    DNformat* Astore = A->Store;
-    SUPERLU_FREE (Astore->nzval);
-    SUPERLU_FREE ( A->Store );
-}
-
-/* Destroy distributed L & U matrices. */
-void
-Destroy_LU(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct)
-{
-    int_t i, nb, nsupers;
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-
-#if ( DEBUGlevel>=1 )
-    int iam;
-    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
-    CHECK_MALLOC(iam, "Enter Destroy_LU()");
-#endif
-
-    nsupers = Glu_persist->supno[n-1] + 1;
-
-    nb = CEILING(nsupers, grid->npcol);
-    for (i = 0; i < nb; ++i) 
-	if ( Llu->Lrowind_bc_ptr[i] ) {
-	    SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
-	    SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
-	}
-    SUPERLU_FREE (Llu->Lrowind_bc_ptr);
-    SUPERLU_FREE (Llu->Lnzval_bc_ptr);
-
-    nb = CEILING(nsupers, grid->nprow);
-    for (i = 0; i < nb; ++i)
-	if ( Llu->Ufstnz_br_ptr[i] ) {
-	    SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
-	    SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
-	}
-    SUPERLU_FREE (Llu->Ufstnz_br_ptr);
-    SUPERLU_FREE (Llu->Unzval_br_ptr);
-
-    /* The following can be freed after factorization. */
-    SUPERLU_FREE(Llu->ToRecv);
-    SUPERLU_FREE(Llu->ToSendD);
-    SUPERLU_FREE(Llu->ToSendR[0]);
-    SUPERLU_FREE(Llu->ToSendR);
-
-    /* The following can be freed only after iterative refinement. */
-    SUPERLU_FREE(Llu->ilsum);
-    SUPERLU_FREE(Llu->fmod);
-    SUPERLU_FREE(Llu->fsendx_plist[0]);
-    SUPERLU_FREE(Llu->fsendx_plist);
-    SUPERLU_FREE(Llu->bmod);
-    SUPERLU_FREE(Llu->bsendx_plist[0]);
-    SUPERLU_FREE(Llu->bsendx_plist);
-
-    SUPERLU_FREE(Glu_persist->xsup);
-    SUPERLU_FREE(Glu_persist->supno);
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit Destroy_LU()");
-#endif
-}
-
-/* Allocate storage in ScalePermstruct */
-void ScalePermstructInit(const int_t m, const int_t n,
-			 ScalePermstruct_t *ScalePermstruct)
-{
-    ScalePermstruct->DiagScale = NOEQUIL;
-    if ( !(ScalePermstruct->perm_r = intMalloc_dist(m)) )
-	ABORT("Malloc fails for perm_r[].");
-    if ( !(ScalePermstruct->perm_c = intMalloc_dist(n)) )
-	ABORT("Malloc fails for perm_c[].");
-}
-
-/* Deallocate ScalePermstruct */
-void ScalePermstructFree(ScalePermstruct_t *ScalePermstruct)
-{
-    SUPERLU_FREE(ScalePermstruct->perm_r);
-    SUPERLU_FREE(ScalePermstruct->perm_c);
-    switch ( ScalePermstruct->DiagScale ) {
-      case ROW:
-	SUPERLU_FREE(ScalePermstruct->R);
-	break;
-      case COL:
-	SUPERLU_FREE(ScalePermstruct->C);
-	break;
-      case BOTH:
-	SUPERLU_FREE(ScalePermstruct->R);
-	SUPERLU_FREE(ScalePermstruct->C);
-	break;
-    }
-}
-
-/* Allocate storage in LUstruct */
-void LUstructInit(const int_t m, const int_t n, LUstruct_t *LUstruct)
-{
-    if ( !(LUstruct->etree = intMalloc_dist(n)) )
-	ABORT("Malloc fails for etree[].");
-    if ( !(LUstruct->Glu_persist = (Glu_persist_t *)
-	   SUPERLU_MALLOC(sizeof(Glu_persist_t))) )
-	ABORT("Malloc fails for Glu_persist_t.");
-    if ( !(LUstruct->Llu = (LocalLU_t *)
-	   SUPERLU_MALLOC(sizeof(LocalLU_t))) )
-	ABORT("Malloc fails for LocalLU_t.");
-}
-
-/* Deallocate LUstruct */
-void LUstructFree(LUstruct_t *LUstruct)
-{
-    SUPERLU_FREE(LUstruct->etree);
-    SUPERLU_FREE(LUstruct->Glu_persist);
-    SUPERLU_FREE(LUstruct->Llu);
-}
-
-/*
- * Count the total number of nonzeros in factors L and U,  and in the 
- * symmetrically reduced L. 
- */
-void
-countnz_dist(const int_t n, int_t *xprune, int_t *nnzL, int_t *nnzU, 
-	     Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable)
-{
-    int_t  fnz, fsupc, i, j, nsuper;
-    int_t  nnzL0, jlen, irep;
-    int_t  *supno, *xsup, *xlsub, *xusub, *usub;
-
-    supno  = Glu_persist->supno;
-    xsup   = Glu_persist->xsup;
-    xlsub  = Glu_freeable->xlsub;
-    xusub  = Glu_freeable->xusub;
-    usub   = Glu_freeable->usub;
-    *nnzL  = 0;
-    *nnzU  = 0;
-    nnzL0  = 0;
-    nsuper = supno[n];
-
-    if ( n <= 0 ) return;
-
-    /* 
-     * For each supernode in L.
-     */
-    for (i = 0; i <= nsuper; i++) {
-	fsupc = xsup[i];
-	jlen = xlsub[fsupc+1] - xlsub[fsupc];
-
-	for (j = fsupc; j < xsup[i+1]; j++) {
-	    *nnzL += jlen;
-	    *nnzU += j - fsupc + 1;
-	    jlen--;
-	}
-	irep = xsup[i+1] - 1;
-	nnzL0 += xprune[irep] - xlsub[irep];
-    }
-    
-    /* printf("\tNo of nonzeros in symm-reduced L = %d\n", nnzL0);*/
-    
-    /* For each column in U. */
-    for (j = 0; j < n; ++j) {
-	for (i = xusub[j]; i < xusub[j+1]; ++i) {
-	    fnz = usub[i];
-	    fsupc = xsup[supno[fnz]+1];
-	    *nnzU += fsupc - fnz;
-	}
-    }
-}
-
-
-
-/*
- * Fix up the data storage lsub for L-subscripts. It removes the subscript
- * sets for structural pruning,	and applies permuation to the remaining
- * subscripts.
- */
-int_t
-fixupL_dist(const int_t n, const int_t *perm_r, 
-	    Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable)
-{
-    register int_t nsuper, fsupc, nextl, i, j, k, jstrt, lsub_size;
-    int_t          *xsup, *lsub, *xlsub;
-
-    if ( n <= 1 ) return 0;
-
-    xsup   = Glu_persist->xsup;
-    lsub   = Glu_freeable->lsub;
-    xlsub  = Glu_freeable->xlsub;
-    nextl  = 0;
-    nsuper = (Glu_persist->supno)[n];
-    lsub_size = xlsub[n];
-    
-    /* 
-     * For each supernode ...
-     */
-    for (i = 0; i <= nsuper; i++) {
-	fsupc = xsup[i];
-	jstrt = xlsub[fsupc];
-	xlsub[fsupc] = nextl;
-	for (j = jstrt; j < xlsub[fsupc+1]; j++) {
-	    lsub[nextl] = perm_r[lsub[j]]; /* Now indexed into P*A */
-	    nextl++;
-  	}
-	for (k = fsupc+1; k < xsup[i+1]; k++) 
-	    	xlsub[k] = nextl;	/* Other columns in supernode i */
-
-    }
-
-    xlsub[n] = nextl;
-    return lsub_size;
-}
-
-/*
- * Set the default values for the options argument.
- */
-void set_default_options_dist(superlu_options_t *options)
-{
-    options->Fact              = DOFACT;
-    options->Equil             = YES;
-    options->ParSymbFact       = NO;
-    options->ColPerm           = MMD_AT_PLUS_A;
-    options->RowPerm           = LargeDiag;
-    options->ReplaceTinyPivot  = YES;
-    options->IterRefine        = DOUBLE;
-    options->Trans             = NOTRANS;
-    options->SolveInitialized  = NO;
-    options->RefineInitialized = NO;
-    options->PrintStat         = YES;
-}
-
-/*
- * Print the options setting.
- */
-void print_options_dist(superlu_options_t *options)
-{
-    printf(".. options:\n");
-    printf("\tFact\t %8d\n", options->Fact);
-    printf("\tEquil\t %8d\n", options->Equil);
-    printf("\tColPerm\t %8d\n", options->ColPerm);
-    printf("\tRowPerm\t %8d\n", options->RowPerm);
-    printf("\tReplaceTinyPivot %4d\n", options->ReplaceTinyPivot);
-    printf("\tTrans\t %8d\n", options->Trans);
-    printf("\tIterRefine\t%4d\n", options->IterRefine);
-    printf("..\n");
-}
-
-int_t
-pxgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row,
-	     int_t perm_r[], int_t perm_c[], gridinfo_t *grid,
-	     Glu_persist_t *Glu_persist, SOLVEstruct_t *SOLVEstruct)
-{
-/*
- * Purpose
- * =======
- *   Set up the communication pattern for the triangular solution.
- * 
- * Arguments
- * =========
- *
- * n      (input) int (global)
- *        The dimension of the linear system.
- *
- * m_loc  (input) int (local)
- *        The local row dimension of the distributed input matrix.
- *
- * nrhs   (input) int (global)
- *        Number of right-hand sides.
- *
- * fst_row (input) int (global)
- *        The row number of matrix B's first row in the global matrix.
- *
- * perm_r (input) int* (global)
- *        The row permutation vector.
- *
- * perm_c (input) int* (global)
- *        The column permutation vector.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh.
- *
- */
-    int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs;
-    int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs;
-    int *itemp, *ptr_to_ibuf, *ptr_to_dbuf;
-    int_t *row_to_proc;
-    int_t i, gbi, k, l, num_diag_procs, *diag_procs;
-    int_t irow, lk, q, knsupc, nsupers, *xsup, *supno;
-    int   iam, p, pkk, procs;
-    pxgstrs_comm_t *gstrs_comm;
-
-    procs = grid->nprow * grid->npcol;
-    iam = grid->iam;
-    gstrs_comm = SOLVEstruct->gstrs_comm;
-    xsup = Glu_persist->xsup;
-    supno = Glu_persist->supno;
-    nsupers = Glu_persist->supno[n-1] + 1;
-    row_to_proc = SOLVEstruct->row_to_proc;
-
-    /* ------------------------------------------------------------
-       SET UP COMMUNICATION PATTERN FOR ReDistribute_B_to_X.
-       ------------------------------------------------------------*/
-    if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) )
-        ABORT("Malloc fails for B_to_X_itemp[].");
-    SendCnt      = itemp;
-    SendCnt_nrhs = itemp +   procs;
-    RecvCnt      = itemp + 2*procs;
-    RecvCnt_nrhs = itemp + 3*procs;
-    sdispls      = itemp + 4*procs;
-    sdispls_nrhs = itemp + 5*procs;
-    rdispls      = itemp + 6*procs;
-    rdispls_nrhs = itemp + 7*procs;
-
-    /* Count the number of elements to be sent to each diagonal process.*/
-    for (p = 0; p < procs; ++p) SendCnt[p] = 0;
-    for (i = 0, l = fst_row; i < m_loc; ++i, ++l) {
-        irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */
-	gbi = BlockNum( irow );
-	p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */
-	++SendCnt[p];
-    }
-  
-    /* Set up the displacements for alltoall. */
-    MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm);
-    sdispls[0] = rdispls[0] = 0;
-    for (p = 1; p < procs; ++p) {
-        sdispls[p] = sdispls[p-1] + SendCnt[p-1];
-        rdispls[p] = rdispls[p-1] + RecvCnt[p-1];
-    }
-    for (p = 0; p < procs; ++p) {
-        SendCnt_nrhs[p] = SendCnt[p] * nrhs;
-	sdispls_nrhs[p] = sdispls[p] * nrhs;
-        RecvCnt_nrhs[p] = RecvCnt[p] * nrhs;
-	rdispls_nrhs[p] = rdispls[p] * nrhs;
-    }
-
-    /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/
-    gstrs_comm->B_to_X_SendCnt = SendCnt;
-
-    /* ------------------------------------------------------------
-       SET UP COMMUNICATION PATTERN FOR ReDistribute_X_to_B.
-       ------------------------------------------------------------*/
-    /* This is freed in pxgstrs_finalize(). */
-    if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) )
-        ABORT("Malloc fails for X_to_B_itemp[].");
-    SendCnt      = itemp;
-    SendCnt_nrhs = itemp +   procs;
-    RecvCnt      = itemp + 2*procs;
-    RecvCnt_nrhs = itemp + 3*procs;
-    sdispls      = itemp + 4*procs;
-    sdispls_nrhs = itemp + 5*procs;
-    rdispls      = itemp + 6*procs;
-    rdispls_nrhs = itemp + 7*procs;
-
-    /* Count the number of X entries to be sent to each process.*/
-    for (p = 0; p < procs; ++p) SendCnt[p] = 0;
-    num_diag_procs = SOLVEstruct->num_diag_procs;
-    diag_procs = SOLVEstruct->diag_procs;
-
-    for (p = 0; p < num_diag_procs; ++p) { /* for all diagonal processes */
-	pkk = diag_procs[p];
-	if ( iam == pkk ) {
-	    for (k = p; k < nsupers; k += num_diag_procs) {
-		knsupc = SuperSize( k );
-		lk = LBi( k, grid ); /* Local block number */
-		irow = FstBlockC( k );
-		for (i = 0; i < knsupc; ++i) {
-#if 0
-		    q = row_to_proc[inv_perm_c[irow]];
-#else
-		    q = row_to_proc[irow];
-#endif
-		    ++SendCnt[q];
-		    ++irow;
-		}
-	    }
-	}
-    }
-
-    MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm);
-    sdispls[0] = rdispls[0] = 0;
-    sdispls_nrhs[0] = rdispls_nrhs[0] = 0;
-    SendCnt_nrhs[0] = SendCnt[0] * nrhs;
-    RecvCnt_nrhs[0] = RecvCnt[0] * nrhs;
-    for (p = 1; p < procs; ++p) {
-        sdispls[p] = sdispls[p-1] + SendCnt[p-1];
-        rdispls[p] = rdispls[p-1] + RecvCnt[p-1];
-        sdispls_nrhs[p] = sdispls[p] * nrhs;
-        rdispls_nrhs[p] = rdispls[p] * nrhs;
-	SendCnt_nrhs[p] = SendCnt[p] * nrhs;
-	RecvCnt_nrhs[p] = RecvCnt[p] * nrhs;
-    }
-
-    /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/
-    gstrs_comm->X_to_B_SendCnt = SendCnt;
-
-    if ( !(ptr_to_ibuf = SUPERLU_MALLOC(2*procs * sizeof(int))) )
-        ABORT("Malloc fails for ptr_to_ibuf[].");
-    gstrs_comm->ptr_to_ibuf = ptr_to_ibuf;
-    gstrs_comm->ptr_to_dbuf = ptr_to_ibuf + procs;
-
-} /* PXGSTRS_INIT */
-
-
-void pxgstrs_finalize(pxgstrs_comm_t *gstrs_comm)
-{
-    SUPERLU_FREE(gstrs_comm->B_to_X_SendCnt);
-    SUPERLU_FREE(gstrs_comm->X_to_B_SendCnt);
-    SUPERLU_FREE(gstrs_comm->ptr_to_ibuf);
-    SUPERLU_FREE(gstrs_comm);
-}
-
-
-/*
- * Diagnostic print of segment info after panel_dfs().
- */
-void print_panel_seg_dist(int_t n, int_t w, int_t jcol, int_t nseg, 
-			  int_t *segrep, int_t *repfnz)
-{
-    int_t j, k;
-    
-    for (j = jcol; j < jcol+w; j++) {
-	printf("\tcol %d:\n", j);
-	for (k = 0; k < nseg; k++)
-	    printf("\t\tseg %d, segrep %d, repfnz %d\n", k, 
-			segrep[k], repfnz[(j-jcol)*n + segrep[k]]);
-    }
-
-}
-
-void
-PStatInit(SuperLUStat_t *stat)
-{
-    register int_t i;
-
-    if ( !(stat->utime = SUPERLU_MALLOC(NPHASES*sizeof(double))) )
-	ABORT("Malloc fails for stat->utime[]");
-    if ( !(stat->ops = (flops_t *) SUPERLU_MALLOC(NPHASES * sizeof(flops_t))) )
-	ABORT("SUPERLU_MALLOC fails for stat->ops[]");
-    for (i = 0; i < NPHASES; ++i) {
-        stat->utime[i] = 0.;
-        stat->ops[i] = 0.;
-    }
-    stat->TinyPivots = stat->RefineSteps = 0;
-}
-
-void
-PStatPrint(superlu_options_t *options, SuperLUStat_t *stat, gridinfo_t *grid)
-{
-    double  *utime = stat->utime;
-    flops_t *ops = stat->ops;
-    int_t   iam = grid->iam;
-    flops_t flopcnt, factflop, solveflop;
-
-    if ( options->PrintStat == NO ) return;
-
-    if ( !iam && options->Fact != FACTORED ) {
-        if ( options->Equil != NO )
-	    printf("\tEQUIL time         %8.2f\n", utime[EQUIL]);
-	if ( options->RowPerm != NOROWPERM )
-	    printf("\tROWPERM time       %8.2f\n", utime[ROWPERM]);
-	if ( options->ColPerm != NATURAL )
-	    printf("\tCOLPERM time       %8.2f\n", utime[COLPERM]);
-        printf("\tSYMBFACT time      %8.2f\n", utime[SYMBFAC]);
-	printf("\tDISTRIBUTE time    %8.2f\n", utime[DIST]);
-
-    }
-
-    MPI_Reduce(&ops[FACT], &flopcnt, 1, MPI_FLOAT, MPI_SUM,
-	       0, grid->comm);
-    factflop = flopcnt;
-    if ( !iam && options->Fact != FACTORED ) {
-	printf("\tFACTOR time        %8.2f\n", utime[FACT]);
-	if ( utime[FACT] != 0.0 )
-	    printf("\tFactor flops\t%e\tMflops \t%8.2f\n",
-		   flopcnt,
-		   flopcnt*1e-6/utime[FACT]);
-    }
-	
-    MPI_Reduce(&ops[SOLVE], &flopcnt, 1, MPI_FLOAT, MPI_SUM, 
-	       0, grid->comm);
-    solveflop = flopcnt;
-    if ( !iam ) {
-	printf("\tSOLVE time         %8.2f\n", utime[SOLVE]);
-	if ( utime[SOLVE] != 0.0 )
-	    printf("\tSolve flops\t%e\tMflops \t%8.2f\n",
-		   flopcnt,
-		   flopcnt*1e-6/utime[SOLVE]);
-    }
-    
-    if ( !iam && options->IterRefine != NOREFINE ) {
-	printf("\tREFINEMENT time    %8.2f\tSteps%8d\n\n",
-	       utime[REFINE], stat->RefineSteps);
-    }
-
-#if ( PROFlevel>=1 )
-    fflush(stdout);
-    MPI_Barrier( grid->comm );
-
-    {
-	int_t i, P = grid->nprow*grid->npcol;
-	flops_t b, maxflop;
-	if ( !iam ) printf("\n.. FACT time breakdown:\tcomm\ttotal\n");
-	for (i = 0; i < P; ++i) {
-	    if ( iam == i) {
-		printf("\t\t(%d)%8.2f%8.2f\n", iam, utime[COMM], utime[FACT]);
-		fflush(stdout);
-	    }
-	    MPI_Barrier( grid->comm );
-	}
-	if ( !iam ) printf("\n.. FACT ops distribution:\n");
-	for (i = 0; i < P; ++i) {
-	    if ( iam == i ) {
-		printf("\t\t(%d)\t%e\n", iam, ops[FACT]);
-		fflush(stdout);
-	    }
-	    MPI_Barrier( grid->comm );
-	}
-	MPI_Reduce(&ops[FACT], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm);
-	if ( !iam ) {
-	    b = factflop/P/maxflop;
-	    printf("\tFACT load balance: %.2f\n", b);
-	}
-	if ( !iam ) printf("\n.. SOLVE ops distribution:\n");
-	for (i = 0; i < P; ++i) {
-	    if ( iam == i ) {
-		printf("\t\t%d\t%e\n", iam, ops[SOLVE]);
-		fflush(stdout);
-	    }
-	    MPI_Barrier( grid->comm );
-	}
-	MPI_Reduce(&ops[SOLVE], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0,grid->comm);
-	if ( !iam ) {
-	    b = solveflop/P/maxflop;
-	    printf("\tSOLVE load balance: %.2f\n", b);
-	}
-    }
-#endif
-
-/*  if ( !iam ) fflush(stdout);  CRASH THE SYSTEM pierre.  */
-}
-
-void
-PStatFree(SuperLUStat_t *stat)
-{
-    SUPERLU_FREE(stat->utime);
-    SUPERLU_FREE(stat->ops);
-}
-
-/* 
- * Fills an integer array with a given value.
- */
-void ifill_dist(int_t *a, int_t alen, int_t ival)
-{
-    register int_t i;
-    for (i = 0; i < alen; i++) a[i] = ival;
-}
-
-
-void
-get_diag_procs(int_t n, Glu_persist_t *Glu_persist, gridinfo_t *grid,
-	       int_t *num_diag_procs, int_t **diag_procs, int_t **diag_len)
-{
-    int_t i, j, k, knsupc, nprow, npcol, nsupers, pkk;
-    int_t *xsup;
-
-    i = j = *num_diag_procs = pkk = 0;
-    nprow = grid->nprow;
-    npcol = grid->npcol;
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-
-    do {
-	++(*num_diag_procs);
-	i = (++i) % nprow;
-	j = (++j) % npcol;
-	pkk = PNUM( i, j, grid );
-    } while ( pkk != 0 ); /* Until wrap back to process 0 */
-    if ( !(*diag_procs = intMalloc_dist(*num_diag_procs)) )
-	ABORT("Malloc fails for diag_procs[]");
-    if ( !(*diag_len = intCalloc_dist(*num_diag_procs)) )
-	ABORT("Calloc fails for diag_len[]");
-    for (i = j = k = 0; k < *num_diag_procs; ++k) {
-	pkk = PNUM( i, j, grid );
-	(*diag_procs)[k] = pkk;
-	i = (++i) % nprow;
-	j = (++j) % npcol;
-    }
-    for (k = 0; k < nsupers; ++k) {
-	knsupc = SuperSize( k );
-	i = k % *num_diag_procs;
-	(*diag_len)[i] += knsupc;
-    }
-}
-
-
-/* 
- * Get the statistics of the supernodes 
- */
-#define NBUCKS 10
-static 	int_t	max_sup_size;
-
-void super_stats_dist(int_t nsuper, int_t *xsup)
-{
-    register int_t nsup1 = 0;
-    int_t          i, isize, whichb, bl, bh;
-    int_t          bucket[NBUCKS];
-
-    max_sup_size = 0;
-
-    for (i = 0; i <= nsuper; i++) {
-	isize = xsup[i+1] - xsup[i];
-	if ( isize == 1 ) nsup1++;
-	if ( max_sup_size < isize ) max_sup_size = isize;	
-    }
-
-    printf("    Supernode statistics:\n\tno of super = %d\n", nsuper+1);
-    printf("\tmax supernode size = %d\n", max_sup_size);
-    printf("\tno of size 1 supernodes = %d\n", nsup1);
-
-    /* Histogram of the supernode sizes */
-    ifill_dist (bucket, NBUCKS, 0);
-
-    for (i = 0; i <= nsuper; i++) {
-        isize = xsup[i+1] - xsup[i];
-        whichb = (float) isize / max_sup_size * NBUCKS;
-        if (whichb >= NBUCKS) whichb = NBUCKS - 1;
-        bucket[whichb]++;
-    }
-    
-    printf("\tHistogram of supernode sizes:\n");
-    for (i = 0; i < NBUCKS; i++) {
-        bl = (float) i * max_sup_size / NBUCKS;
-        bh = (float) (i+1) * max_sup_size / NBUCKS;
-        printf("\tsnode: %d-%d\t\t%d\n", bl+1, bh, bucket[i]);
-    }
-
-}
-
-
-/*
- * Check whether repfnz[] == EMPTY after reset.
- */
-void check_repfnz_dist(int_t n, int_t w, int_t jcol, int_t *repfnz)
-{
-    int_t jj, k;
-
-    for (jj = jcol; jj < jcol+w; jj++) 
-	for (k = 0; k < n; k++)
-	    if ( repfnz[(jj-jcol)*n + k] != EMPTY ) {
-		fprintf(stderr, "col %d, repfnz_col[%d] = %d\n", jj,
-			k, repfnz[(jj-jcol)*n + k]);
-		ABORT("check_repfnz_dist");
-	    }
-}
-
-void PrintInt10(char *name, int_t len, int_t *x)
-{
-    register int_t i;
-    
-    printf("%10s:", name);
-    for (i = 0; i < len; ++i) {
-	if ( i % 10 == 0 ) printf("\n\t[%2d-%2d]", i, i+9);
-	printf("%6d", x[i]);
-    }
-    printf("\n");
-}
-
-int file_PrintInt10(FILE *fp, char *name, int_t len, int_t *x)
-{
-    register int_t i;
-    
-    fprintf(fp, "%10s:", name);
-    for (i = 0; i < len; ++i) {
-	if ( i % 10 == 0 ) fprintf(fp, "\n\t[%2d-%2d]", i, i+9);
-	fprintf(fp, "%6d", x[i]);
-    }
-    fprintf(fp, "\n");
-}
-
-int_t
-CheckZeroDiagonal(int_t n, int_t *rowind, int_t *colbeg, int_t *colcnt)
-{
-    register int_t i, j, zd, numzd = 0;
-
-    for (j = 0; j < n; ++j) {
-	zd = 0;
-	for (i = colbeg[j]; i < colbeg[j]+colcnt[j]; ++i) {
-	    /*if ( iperm[rowind[i]] == j ) zd = 1;*/
-	    if ( rowind[i] == j ) { zd = 1; break; }
-	}
-	if ( zd == 0 ) {
-#if ( PRNTlevel>=2 )
-	    printf(".. Diagonal of column %d is zero.\n", j);
-#endif
-	    ++numzd;
-	}
-    }
-
-    return numzd;
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/util_dist.h hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/util_dist.h
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/util_dist.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/util_dist.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,150 +0,0 @@
-#ifndef __SUPERLU_UTIL /* allow multiple inclusions */
-#define __SUPERLU_UTIL
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <assert.h>
-
-/*
- * Macros
- */
-#ifndef USER_ABORT
-#define USER_ABORT(msg) superlu_abort_and_exit_dist(msg)
-#endif
-
-#define ABORT(err_msg) \
- { char msg[256];\
-   sprintf(msg,"%s at line %d in file %s\n",err_msg,__LINE__, __FILE__);\
-   USER_ABORT(msg); }
-
-
-#ifndef USER_MALLOC
-#define USER_MALLOC(size) superlu_malloc_dist(size)
-#endif
-
-#define SUPERLU_MALLOC(size) USER_MALLOC(size)
-
-#ifndef USER_FREE
-#define USER_FREE(addr) superlu_free_dist(addr)
-#endif
-
-#define SUPERLU_FREE(addr) USER_FREE(addr)
-
-#define CHECK_MALLOC(pnum, where) {                 \
-    extern long int superlu_malloc_total;        \
-    printf("(%d) %s: superlu_malloc_total (KBytes) %.3f\n",     \
-	   pnum, where, superlu_malloc_total*1e-3); \
-}
-
-#define SUPERLU_MAX(x, y) 	( (x) > (y) ? (x) : (y) )
-#define SUPERLU_MIN(x, y) 	( (x) < (y) ? (x) : (y) )
-
-/* 
- * Constants 
- */
-#define EMPTY	(-1)
-#ifndef FALSE
-#define FALSE	(0)
-#endif
-#ifndef TRUE
-#define TRUE	(1)
-#endif
-
-/*
- * Type definitions
- */
-typedef float    flops_t;
-typedef unsigned char Logical;
-
-/*
-#ifdef _CRAY
-#define int short
-#endif
-*/
-
-/* 
- * The following enumerate type is used by the statistics variable 
- * to keep track of flop count and time spent at various stages.
- *
- * Note that not all of the fields are disjoint.
- */
-typedef enum {
-    COLPERM, /* find a column ordering that minimizes fills */
-    ROWPERM, /* find a row ordering maximizes diagonal. */
-    RELAX,   /* find artificial supernodes */
-    ETREE,   /* compute column etree */
-    EQUIL,   /* equilibrate the original matrix */
-    SYMBFAC, /* symbolic factorization. */
-    DIST,    /* distribute matrix. */
-    FACT,    /* perform LU factorization */
-    COMM,    /* communication for factorization */
-    SOL_COMM,/* communication for solve */
-    RCOND,   /* estimate reciprocal condition number */
-    SOLVE,   /* forward and back solves */
-    REFINE,  /* perform iterative refinement */
-    FLOAT,   /* time spent in floating-point operations */
-    TRSV,    /* fraction of FACT spent in xTRSV */
-    GEMV,    /* fraction of FACT spent in xGEMV */
-    FERR,    /* estimate error bounds after iterative refinement */
-    NPHASES  /* total number of phases */
-} PhaseType;
-
-typedef struct {
-    int     *panel_histo; /* histogram of panel size distribution */
-    double  *utime;       /* running time at various phases */
-    flops_t *ops;         /* operation count at various phases */
-    int     TinyPivots;   /* number of tiny pivots */
-    int     RefineSteps;  /* number of iterative refinement steps */
-} SuperLUStat_t;
-
-/* Headers for 2 types of dynamatically managed memory */
-typedef struct e_node {
-    int size;      /* length of the memory that has been used */
-    void *mem;     /* pointer to the new malloc'd store */
-} ExpHeader;
-
-typedef struct {
-    int  size;
-    int  used;
-    int  top1;  /* grow upward, relative to &array[0] */
-    int  top2;  /* grow downward */
-    void *array;
-} LU_stack_t;
-
-/* Constants */
-#define GluIntArray(n)   (5 * (n) + 5)
-#define NO_MEMTYPE  4      /* 0: lusup;
-			      1: ucol;
-			      2: lsub;
-			      3: usub */
-
-/* Macros to manipulate stack */
-#define StackFull(x)         ( x + stack.used >= stack.size )
-#define NotDoubleAlign(addr) ( (long)addr & 7 )
-#define DoubleAlign(addr)    ( ((long)addr + 7) & ~7L )
-#define TempSpace(n, w)      ( (2*w + 4 + NO_MARKER)*m*sizeof(int) + \
-			      (w + 1)*n*sizeof(double) )
-#define Reduce(alpha)        ((alpha + 1) / 2)  /* i.e. (alpha-1)/2 + 1 */
-
-#define FIRSTCOL_OF_SNODE(i)	(xsup[i])
-
-#if ( PROFlevel>=1 )
-#define TIC(t)          t = SuperLU_timer_()
-#define TOC(t2, t1)     t2 = SuperLU_timer_() - t1
-#else
-#define TIC(t)
-#define TOC(t2, t1)
-#endif
-
-/*********************************************************
- * Macros used for easy access of sparse matrix entries. *
- *********************************************************/
-#define L_SUB_START(col)     ( Lstore->rowind_colptr[col] )
-#define L_SUB(ptr)           ( Lstore->rowind[ptr] )
-#define L_NZ_START(col)      ( Lstore->nzval_colptr[col] )
-#define L_FST_SUPC(superno)  ( Lstore->sup_to_col[superno] )
-#define U_NZ_START(col)      ( Ustore->colptr[col] )
-#define U_SUB(ptr)           ( Ustore->rowind[ptr] )
-
-#endif /* __SUPERLU_UTIL */
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/xerbla.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/xerbla.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/xerbla.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/xerbla.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,42 +0,0 @@
-#include "Cnames.h"
-
-/* Subroutine */ int xerbla_(char *srname, int *info)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       September 30, 1994   
-
-
-    Purpose   
-    =======   
-
-    XERBLA  is an error handler for the LAPACK routines.   
-    It is called by an LAPACK routine if an input parameter has an   
-    invalid value.  A message is printed and execution stops.   
-
-    Installers may consider modifying the STOP statement in order to   
-    call system-specific exception-handling facilities.   
-
-    Arguments   
-    =========   
-
-    SRNAME  (input) CHARACTER*6   
-            The name of the routine which called XERBLA.   
-
-    INFO    (input) INT   
-            The position of the invalid parameter in the parameter list   
-
-            of the calling routine.   
-
-   ===================================================================== 
-*/
-
-    printf("** On entry to %6s, parameter number %2d had an illegal value\n",
-		srname, *info);
-
-/*     End of XERBLA */
-
-    return 0;
-} /* xerbla_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zdistribute.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zdistribute.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zdistribute.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zdistribute.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,724 +0,0 @@
-
-#include "superlu_zdefs.h"
-
-int_t
-zdistribute(fact_t fact, int_t n, SuperMatrix *A, 
-            Glu_freeable_t *Glu_freeable,
-	    LUstruct_t *LUstruct, gridinfo_t *grid)
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- *
- * Purpose
- * =======
- *   Distribute the matrix onto the 2D process mesh.
- * 
- * Arguments
- * =========
- * 
- * fact (input) fact_t
- *        Specifies whether or not the L and U structures will be re-used.
- *        = SamePattern_SameRowPerm: L and U structures are input, and
- *                                   unchanged on exit.
- *        = DOFACT or SamePattern: L and U structures are computed and output.
- *
- * n      (input) int
- *        Dimension of the matrix.
- *
- * A      (input) SuperMatrix*
- *	  The original matrix A, permuted by columns, of dimension
- *        (A->nrow, A->ncol). The type of A can be:
- *        Stype = SLU_NCP; Dtype = SLU_Z; Mtype = SLU_GE.
- *
- * LUstruct (input) LUstruct_t*
- *        Data structures for L and U factors.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh.
- *
- * Return value
- * ============
- *   > 0, working storage required (in bytes).
- *
- */
-{
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-    int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, jb, jj, k, 
-          len, len1, nsupc;
-    int_t ljb;  /* local block column number */
-    int_t nrbl; /* number of L blocks in current block column */
-    int_t nrbu; /* number of U blocks in current block column */
-    int_t gb;   /* global block number; 0 < gb <= nsuper */
-    int_t lb;   /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
-    int iam, jbrow, kcol, mycol, myrow, pc, pr;
-    int_t mybufmax[NBUFFERS];
-    NCPformat *Astore;
-    doublecomplex *a;
-    int_t *asub;
-    int_t *xa_begin, *xa_end;
-    int_t *xsup = Glu_persist->xsup;    /* supernode and column mapping */
-    int_t *supno = Glu_persist->supno;   
-    int_t *lsub, *xlsub, *usub, *xusub;
-    int_t nsupers;
-    int_t next_lind;      /* next available position in index[*] */
-    int_t next_lval;      /* next available position in nzval[*] */
-    int_t *index;         /* indices consist of headers and row subscripts */
-    doublecomplex *lusup, *uval; /* nonzero values in L and U */
-    doublecomplex **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
-    int_t  **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
-    doublecomplex **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
-    int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
-
-    /*-- Counts to be used in factorization. --*/
-    int_t  *ToRecv, *ToSendD, **ToSendR;
-
-    /*-- Counts to be used in lower triangular solve. --*/
-    int_t  *fmod;          /* Modification count for L-solve.        */
-    int_t  **fsendx_plist; /* Column process list to send down Xk.   */
-    int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
-    int_t  nfsendx = 0;    /* Number of Xk I will send               */
-    int_t  kseen;
-
-    /*-- Counts to be used in upper triangular solve. --*/
-    int_t  *bmod;          /* Modification count for U-solve.        */
-    int_t  **bsendx_plist; /* Column process list to send down Xk.   */
-    int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
-    int_t  nbsendx = 0;    /* Number of Xk I will send               */
-    int_t  *ilsum;         /* starting position of each supernode in 
-			      the full array (local)                 */
-
-    /*-- Auxiliary arrays; freed on return --*/
-    int_t *rb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
-    int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr)             */
-    int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr)      */
-    int_t *Urb_fstnz;  /* # of fstnz in a block row; size ceil(NSUPERS/Pr)  */
-    int_t *Ucbs;       /* number of column blocks in a block row            */
-    int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr)             */
-    int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr)        */
-    int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr)      */
-    int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr)      */
-    doublecomplex *dense, *dense_col; /* SPA */
-    doublecomplex zero = {0.0, 0.0};
-    int_t  ldaspa;     /* LDA of SPA */
-    int_t mem_use = 0, iword, zword;
-
-#if ( PRNTlevel>=1 )
-    int_t nLblocks = 0, nUblocks = 0;
-#endif
-#if ( PROFlevel>=1 ) 
-    double t, t_u, t_l;
-    int_t u_blks;
-#endif
-
-    /* Initialization. */
-    iam = grid->iam;
-    myrow = MYROW( iam, grid );
-    mycol = MYCOL( iam, grid );
-    for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0;
-    nsupers  = supno[n-1] + 1;
-    Astore   = A->Store;
-    a        = Astore->nzval;
-    asub     = Astore->rowind;
-    xa_begin = Astore->colbeg;
-    xa_end   = Astore->colend;
-#if ( PRNTlevel>=1 )
-    iword = sizeof(int_t);
-    zword = sizeof(doublecomplex);
-#endif
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter zdistribute()");
-#endif
-
-    if ( fact == SamePattern_SameRowPerm ) {
-        /* ---------------------------------------------------------------
-         * REUSE THE L AND U DATA STRUCTURES FROM A PREVIOUS FACTORIZATION.
-         * --------------------------------------------------------------- */
-
-#if ( PROFlevel>=1 )
-	t_l = t_u = 0; u_blks = 0;
-#endif
-	/* We can propagate the new values of A into the existing
-	   L and U data structures.            */
-	ilsum = Llu->ilsum;
-	ldaspa = Llu->ldalsum;
-	if ( !(dense = doublecomplexCalloc_dist(((size_t)ldaspa) * sp_ienv_dist(3))) )
-	    ABORT("Calloc fails for SPA dense[].");
-	nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */
-	if ( !(Urb_length = intCalloc_dist(nrbu)) )
-	    ABORT("Calloc fails for Urb_length[].");
-	if ( !(Urb_indptr = intMalloc_dist(nrbu)) )
-	    ABORT("Malloc fails for Urb_indptr[].");
-	Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-	Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
-	Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
-	Unzval_br_ptr = Llu->Unzval_br_ptr;
-#if ( PRNTlevel>=1 )
-	mem_use += 2*nrbu*iword + ldaspa*sp_ienv_dist(3)*zword;
-#endif
-#if ( PROFlevel>=1 )
-	t = SuperLU_timer_();
-#endif
-
-	/* Initialize Uval to zero. */
-	for (lb = 0; lb < nrbu; ++lb) {
-	    Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
-	    index = Ufstnz_br_ptr[lb];
-	    if ( index ) {
-		uval = Unzval_br_ptr[lb];
-		len = index[1];
-		for (i = 0; i < len; ++i) uval[i] = zero;
-	    } /* if index != NULL */
-	} /* for lb ... */
-
-	for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */
-	    pc = PCOL( jb, grid );
-	    if ( mycol == pc ) { /* Block column jb in my process column */
-		fsupc = FstBlockC( jb );
-		nsupc = SuperSize( jb );
-
- 		/* Scatter A into SPA (for L), or into U directly. */
-		for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) {
-		    for (i = xa_begin[j]; i < xa_end[j]; ++i) {
-			irow = asub[i];
-			gb = BlockNum( irow );
-			if ( myrow == PROW( gb, grid ) ) {
-			    lb = LBi( gb, grid );
- 			    if ( gb < jb ) { /* in U */
- 				index = Ufstnz_br_ptr[lb];
- 				uval = Unzval_br_ptr[lb];
- 				while (  (k = index[Urb_indptr[lb]]) < jb ) {
- 				    /* Skip nonzero values in this block */
- 				    Urb_length[lb] += index[Urb_indptr[lb]+1];
- 				    /* Move pointer to the next block */
- 				    Urb_indptr[lb] += UB_DESCRIPTOR
- 					+ SuperSize( k );
- 				}
- 				/*assert(k == jb);*/
- 				/* start fstnz */
- 				istart = Urb_indptr[lb] + UB_DESCRIPTOR;
- 				len = Urb_length[lb];
- 				fsupc1 = FstBlockC( gb+1 );
- 				k = j - fsupc;
- 				/* Sum the lengths of the leading columns */
- 				for (jj = 0; jj < k; ++jj)
-				    len += fsupc1 - index[istart++];
-				/*assert(irow>=index[istart]);*/
-				uval[len + irow - index[istart]] = a[i];
-			    } else { /* in L; put in SPA first */
-  				irow = ilsum[lb] + irow - FstBlockC( gb );
-  				dense_col[irow] = a[i];
-  			    }
-  			}
-		    } /* for i ... */
-  		    dense_col += ldaspa;
-		} /* for j ... */
-
-#if ( PROFlevel>=1 )
-		t_u += SuperLU_timer_() - t;
-		t = SuperLU_timer_();
-#endif
-
-		/* Gather the values of A from SPA into Lnzval[]. */
-		ljb = LBj( jb, grid ); /* Local block number */
-		index = Lrowind_bc_ptr[ljb];
-		if ( index ) {
-		    nrbl = index[0];   /* Number of row blocks. */
-		    len = index[1];    /* LDA of lusup[]. */
-		    lusup = Lnzval_bc_ptr[ljb];
-		    next_lind = BC_HEADER;
-		    next_lval = 0;
-		    for (jj = 0; jj < nrbl; ++jj) {
-			gb = index[next_lind++];
-			len1 = index[next_lind++]; /* Rows in the block. */
-			lb = LBi( gb, grid );
-			for (bnnz = 0; bnnz < len1; ++bnnz) {
-			    irow = index[next_lind++]; /* Global index. */
-			    irow = ilsum[lb] + irow - FstBlockC( gb );
-			    k = next_lval++;
-			    for (j = 0, dense_col = dense; j < nsupc; ++j) {
-				lusup[k] = dense_col[irow];
-				dense_col[irow] = zero;
-				k += len;
-				dense_col += ldaspa;
-			    }
-			} /* for bnnz ... */
-		    } /* for jj ... */
-		} /* if index ... */
-#if ( PROFlevel>=1 )
-		t_l += SuperLU_timer_() - t;
-#endif
-	    } /* if mycol == pc */
-	} /* for jb ... */
-
-	SUPERLU_FREE(dense);
-	SUPERLU_FREE(Urb_length);
-	SUPERLU_FREE(Urb_indptr);
-#if ( PROFlevel>=1 )
-	if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n",
-			   t_l, t_u, u_blks, nrbu);
-#endif
-
-    } else { 
-        /* --------------------------------------------------
-         * FIRST TIME CREATING THE L AND U DATA STRUCTURE. 
-         * -------------------------------------------------- */
-
-#if ( PROFlevel>=1 )
-	t_l = t_u = 0; u_blks = 0;
-#endif
-	/* No L and U data structures are available yet.
-	   We need to set up the L and U data structures and propagate
-	   the values of A into them.          */
-	lsub = Glu_freeable->lsub;    /* compressed L subscripts */
-	xlsub = Glu_freeable->xlsub;
-	usub = Glu_freeable->usub;    /* compressed U subscripts */
-	xusub = Glu_freeable->xusub;
-    
-	if ( !(ToRecv = intCalloc_dist(nsupers)) )
-	    ABORT("Calloc fails for ToRecv[].");
-
-	k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */
-	if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
-	    ABORT("Malloc fails for ToSendR[].");
-	j = k * grid->npcol;
-	if ( !(index = intMalloc_dist(j)) )
-	    ABORT("Malloc fails for index[].");
-#if ( PRNTlevel>=1 )
-	mem_use += k*sizeof(int_t*) + (j + nsupers)*iword;
-#endif
-	for (i = 0; i < j; ++i) index[i] = EMPTY;
-	for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j];
-	k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
-
-	/* Pointers to the beginning of each block row of U. */
-	if ( !(Unzval_br_ptr = 
-               (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) )
-	    ABORT("Malloc fails for Unzval_br_ptr[].");
-	if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
-	    ABORT("Malloc fails for Ufstnz_br_ptr[].");
-	
-	if ( !(ToSendD = intCalloc_dist(k)) )
-	    ABORT("Malloc fails for ToSendD[].");
-	if ( !(ilsum = intMalloc_dist(k+1)) )
-	    ABORT("Malloc fails for ilsum[].");
-
-	/* Auxiliary arrays used to set up U block data structures.
-	   They are freed on return. */
-	if ( !(rb_marker = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for rb_marker[].");
-	if ( !(Urb_length = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for Urb_length[].");
-	if ( !(Urb_indptr = intMalloc_dist(k)) )
-	    ABORT("Malloc fails for Urb_indptr[].");
-	if ( !(Urb_fstnz = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for Urb_fstnz[].");
-	if ( !(Ucbs = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for Ucbs[].");
-#if ( PRNTlevel>=1 )	
-	mem_use += 2*k*sizeof(int_t*) + (7*k+1)*iword;
-#endif
-	/* Compute ldaspa and ilsum[]. */
-	ldaspa = 0;
-	ilsum[0] = 0;
-	for (gb = 0; gb < nsupers; ++gb) {
-	    if ( myrow == PROW( gb, grid ) ) {
-		i = SuperSize( gb );
-		ldaspa += i;
-		lb = LBi( gb, grid );
-		ilsum[lb + 1] = ilsum[lb] + i;
-	    }
-	}
-	
-            
-	/* ------------------------------------------------------------
-	   COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U.
-	   THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U).
-	   ------------------------------------------------------------*/
-	
-	/* Loop through each supernode column. */
-	for (jb = 0; jb < nsupers; ++jb) {
-	    pc = PCOL( jb, grid );
-	    fsupc = FstBlockC( jb );
-	    nsupc = SuperSize( jb );
-	    /* Loop through each column in the block. */
-	    for (j = fsupc; j < fsupc + nsupc; ++j) {
-		/* usub[*] contains only "first nonzero" in each segment. */
-		for (i = xusub[j]; i < xusub[j+1]; ++i) {
-		    irow = usub[i]; /* First nonzero of the segment. */
-		    gb = BlockNum( irow );
-		    kcol = PCOL( gb, grid );
-		    ljb = LBj( gb, grid );
-		    if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES;
-		    pr = PROW( gb, grid );
-		    lb = LBi( gb, grid );
-		    if ( mycol == pc ) {
-			if  ( myrow == pr ) {
-			    ToSendD[lb] = YES;
-			    /* Count nonzeros in entire block row. */
-			    Urb_length[lb] += FstBlockC( gb+1 ) - irow;
-			    if (rb_marker[lb] <= jb) {/* First see the block */
-				rb_marker[lb] = jb + 1;
-				Urb_fstnz[lb] += nsupc;
-				++Ucbs[lb]; /* Number of column blocks
-					       in block row lb. */
-#if ( PRNTlevel>=1 )
-				++nUblocks;
-#endif
-			    }
-			    ToRecv[gb] = 1;
-			} else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */
-		    }
-		} /* for i ... */
-	    } /* for j ... */
-	} /* for jb ... */
-	
-	/* Set up the initial pointers for each block row in U. */
-	nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-	for (lb = 0; lb < nrbu; ++lb) {
-	    len = Urb_length[lb];
-	    rb_marker[lb] = 0; /* Reset block marker. */
-	    if ( len ) {
-		/* Add room for descriptors */
-		len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR;
-		if ( !(index = intMalloc_dist(len1+1)) )
-		    ABORT("Malloc fails for Uindex[].");
-		Ufstnz_br_ptr[lb] = index;
-		if ( !(Unzval_br_ptr[lb] = doublecomplexMalloc_dist(len)) )
-		    ABORT("Malloc fails for Unzval_br_ptr[*][].");
-		mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
-		mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
-		index[0] = Ucbs[lb]; /* Number of column blocks */
-		index[1] = len;      /* Total length of nzval[] */
-		index[2] = len1;     /* Total length of index[] */
-		index[len1] = -1;    /* End marker */
-	    } else {
-		Ufstnz_br_ptr[lb] = NULL;
-		Unzval_br_ptr[lb] = NULL;
-	    }
-	    Urb_length[lb] = 0; /* Reset block length. */
-	    Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
- 	    Urb_fstnz[lb] = BR_HEADER;
-	} /* for lb ... */
-
-	SUPERLU_FREE(Ucbs);
-
-#if ( PROFlevel>=1 )
-	t = SuperLU_timer_() - t;
-	if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t);
-#endif
-#if ( PRNTlevel>=1 )
-        mem_use -= 2*k * iword;
-#endif
-	/* Auxiliary arrays used to set up L block data structures.
-	   They are freed on return.
-	   k is the number of local row blocks.   */
-	if ( !(Lrb_length = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for Lrb_length[].");
-	if ( !(Lrb_number = intMalloc_dist(k)) )
-	    ABORT("Malloc fails for Lrb_number[].");
-	if ( !(Lrb_indptr = intMalloc_dist(k)) )
-	    ABORT("Malloc fails for Lrb_indptr[].");
-	if ( !(Lrb_valptr = intMalloc_dist(k)) )
-	    ABORT("Malloc fails for Lrb_valptr[].");
-	if (!(dense=doublecomplexCalloc_dist(SUPERLU_MAX(1,((size_t)ldaspa)
-              *sp_ienv_dist(3)))))
-	    ABORT("Calloc fails for SPA dense[].");
-
-	/* These counts will be used for triangular solves. */
-	if ( !(fmod = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for fmod[].");
-	if ( !(bmod = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for bmod[].");
-#if ( PRNTlevel>=1 )	
-	mem_use += 6*k*iword + ldaspa*sp_ienv_dist(3)*zword;
-#endif
-	k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
-
-	/* Pointers to the beginning of each block column of L. */
-	if ( !(Lnzval_bc_ptr = (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) )
-	    ABORT("Malloc fails for Lnzval_bc_ptr[].");
-	if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
-	    ABORT("Malloc fails for Lrowind_bc_ptr[].");
-	Lrowind_bc_ptr[k-1] = NULL;
-
-	/* These lists of processes will be used for triangular solves. */
-	if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
-	    ABORT("Malloc fails for fsendx_plist[].");
-	len = k * grid->nprow;
-	if ( !(index = intMalloc_dist(len)) )
-	    ABORT("Malloc fails for fsendx_plist[0]");
-	for (i = 0; i < len; ++i) index[i] = EMPTY;
-	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
-	    fsendx_plist[i] = &index[j];
-	if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
-	    ABORT("Malloc fails for bsendx_plist[].");
-	if ( !(index = intMalloc_dist(len)) )
-	    ABORT("Malloc fails for bsendx_plist[0]");
-	for (i = 0; i < len; ++i) index[i] = EMPTY;
-	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
-	    bsendx_plist[i] = &index[j];
-#if ( PRNTlevel>=1 )
-	mem_use += 4*k*sizeof(int_t*) + 2*len*iword;
-#endif
-	/*------------------------------------------------------------
-	  PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
-	  THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
-	  ------------------------------------------------------------*/
-
-	for (jb = 0; jb < nsupers; ++jb) {
-	    pc = PCOL( jb, grid );
-	    if ( mycol == pc ) { /* Block column jb in my process column */
-		fsupc = FstBlockC( jb );
-		nsupc = SuperSize( jb );
-		ljb = LBj( jb, grid ); /* Local block number */
-		
-		/* Scatter A into SPA. */
-		for (j = fsupc, dense_col = dense; j < FstBlockC( jb+1 ); ++j){
-		    for (i = xa_begin[j]; i < xa_end[j]; ++i) {
-			irow = asub[i];
-			gb = BlockNum( irow );
-			if ( myrow == PROW( gb, grid ) ) {
-			    lb = LBi( gb, grid );
-			    irow = ilsum[lb] + irow - FstBlockC( gb );
-			    dense_col[irow] = a[i];
-			}
-		    }
-		    dense_col += ldaspa;
-		}
-
-		jbrow = PROW( jb, grid );
-
-#if ( PROFlevel>=1 )
-		t = SuperLU_timer_();
-#endif
-		/*------------------------------------------------
-		 * SET UP U BLOCKS.
-		 *------------------------------------------------*/
-		kseen = 0;
-		dense_col = dense;
-		/* Loop through each column in the block column. */
-		for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
-		    istart = xusub[j];
-		    /* NOTE: Only the first nonzero index of the segment
-		       is stored in usub[]. */
-		    for (i = istart; i < xusub[j+1]; ++i) {
-			irow = usub[i]; /* First nonzero in the segment. */
-			gb = BlockNum( irow );
-			pr = PROW( gb, grid );
-			if ( pr != jbrow &&
-			     myrow == jbrow &&  /* diag. proc. owning jb */
-			     bsendx_plist[ljb][pr] == EMPTY ) {
-			    bsendx_plist[ljb][pr] = YES;
-			    ++nbsendx;
-                        }
-			if ( myrow == pr ) {
-			    lb = LBi( gb, grid ); /* Local block number */
-			    index = Ufstnz_br_ptr[lb];
-			    uval = Unzval_br_ptr[lb];
-			    fsupc1 = FstBlockC( gb+1 );
-			    if (rb_marker[lb] <= jb) { /* First time see 
-							  the block       */
-				rb_marker[lb] = jb + 1;
-				Urb_indptr[lb] = Urb_fstnz[lb];;
-				index[Urb_indptr[lb]] = jb; /* Descriptor */
-				Urb_indptr[lb] += UB_DESCRIPTOR;
-				/* Record the first location in index[] of the
-				   next block */
-				Urb_fstnz[lb] = Urb_indptr[lb] + nsupc;
-				len = Urb_indptr[lb];/* Start fstnz in index */
-				index[len-1] = 0;
-				for (k = 0; k < nsupc; ++k)
-				    index[len+k] = fsupc1;
-				if ( gb != jb )/* Exclude diagonal block. */
-				    ++bmod[lb];/* Mod. count for back solve */
-				if ( kseen == 0 && myrow != jbrow ) {
-				    ++nbrecvx;
-				    kseen = 1;
-				}
-			    } else { /* Already saw the block */
-				len = Urb_indptr[lb];/* Start fstnz in index */
-			    }
-			    jj = j - fsupc;
-			    index[len+jj] = irow;
-			    /* Load the numerical values */
-			    k = fsupc1 - irow; /* No. of nonzeros in segment */
-			    index[len-1] += k; /* Increment block length in
-						  Descriptor */
-			    irow = ilsum[lb] + irow - FstBlockC( gb );
-			    for (ii = 0; ii < k; ++ii) {
-				uval[Urb_length[lb]++] = dense_col[irow + ii];
-				dense_col[irow + ii] = zero;
-			    }
-			} /* if myrow == pr ... */
-		    } /* for i ... */
-                    dense_col += ldaspa;
-		} /* for j ... */
-
-#if ( PROFlevel>=1 )
-		t_u += SuperLU_timer_() - t;
-		t = SuperLU_timer_();
-#endif
-
-		/*------------------------------------------------
-		 * SET UP L BLOCKS.
-		 *------------------------------------------------*/
-
-		/* Count number of blocks and length of each block. */
-		nrbl = 0;
-		len = 0; /* Number of row subscripts I own. */
-		kseen = 0;
-		istart = xlsub[fsupc];
-		for (i = istart; i < xlsub[fsupc+1]; ++i) {
-		    irow = lsub[i];
-		    gb = BlockNum( irow ); /* Global block number */
-		    pr = PROW( gb, grid ); /* Process row owning this block */
-		    if ( pr != jbrow &&
-			 myrow == jbrow &&  /* diag. proc. owning jb */
-			 fsendx_plist[ljb][pr] == EMPTY /* first time */ ) {
-			fsendx_plist[ljb][pr] = YES;
-			++nfsendx;
-                    }
-		    if ( myrow == pr ) {
-			lb = LBi( gb, grid );  /* Local block number */
-			if (rb_marker[lb] <= jb) { /* First see this block */
-			    rb_marker[lb] = jb + 1;
-			    Lrb_length[lb] = 1;
-			    Lrb_number[nrbl++] = gb;
-			    if ( gb != jb ) /* Exclude diagonal block. */
-				++fmod[lb]; /* Mod. count for forward solve */
-			    if ( kseen == 0 && myrow != jbrow ) {
-				++nfrecvx;
-				kseen = 1;
-			    }
-#if ( PRNTlevel>=1 )
-			    ++nLblocks;
-#endif
-			} else {
-			    ++Lrb_length[lb];
-			}
-			++len;
-		    }
-		} /* for i ... */
-
-		if ( nrbl ) { /* Do not ensure the blocks are sorted! */
-		    /* Set up the initial pointers for each block in 
-		       index[] and nzval[]. */
-		    /* Add room for descriptors */
-		    len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
-		    if ( !(index = intMalloc_dist(len1)) ) 
-			ABORT("Malloc fails for index[]");
-		    Lrowind_bc_ptr[ljb] = index;
-		    if (!(Lnzval_bc_ptr[ljb] = doublecomplexMalloc_dist(((size_t)len)*nsupc))) {
-			fprintf(stderr, "col block %d ", jb);
-			ABORT("Malloc fails for Lnzval_bc_ptr[*][]");
-		    }
-		    mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
-		    mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
-		    mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
-		    index[0] = nrbl;  /* Number of row blocks */
-		    index[1] = len;   /* LDA of the nzval[] */
-		    next_lind = BC_HEADER;
-		    next_lval = 0;
-		    for (k = 0; k < nrbl; ++k) {
-			gb = Lrb_number[k];
-			lb = LBi( gb, grid );
-			len = Lrb_length[lb];
-			Lrb_length[lb] = 0;  /* Reset vector of block length */
-			index[next_lind++] = gb; /* Descriptor */
-			index[next_lind++] = len; 
-			Lrb_indptr[lb] = next_lind;
-			Lrb_valptr[lb] = next_lval;
-			next_lind += len;
-			next_lval += len;
-		    }
-		    /* Propagate the compressed row subscripts to Lindex[], and
-		       the initial values of A from SPA into Lnzval[]. */
-		    lusup = Lnzval_bc_ptr[ljb];
-		    len = index[1];  /* LDA of lusup[] */
-		    for (i = istart; i < xlsub[fsupc+1]; ++i) {
-			irow = lsub[i];
-			gb = BlockNum( irow );
-			if ( myrow == PROW( gb, grid ) ) {
-			    lb = LBi( gb, grid );
-			    k = Lrb_indptr[lb]++; /* Random access a block */
-			    index[k] = irow;
-			    k = Lrb_valptr[lb]++;
-			    irow = ilsum[lb] + irow - FstBlockC( gb );
-			    for (j = 0, dense_col = dense; j < nsupc; ++j) {
-				lusup[k] = dense_col[irow];
-				dense_col[irow] = zero;
-				k += len;
-				dense_col += ldaspa;
-			    }
-			}
-		    } /* for i ... */
-		} else {
-		    Lrowind_bc_ptr[ljb] = NULL;
-		    Lnzval_bc_ptr[ljb] = NULL;
-		} /* if nrbl ... */
-#if ( PROFlevel>=1 )
-		t_l += SuperLU_timer_() - t;
-#endif
-	    } /* if mycol == pc */
-
-	} /* for jb ... */
-
-	Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
-	Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
-	Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
-	Llu->Unzval_br_ptr = Unzval_br_ptr;
-	Llu->ToRecv = ToRecv;
-	Llu->ToSendD = ToSendD;
-	Llu->ToSendR = ToSendR;
-	Llu->fmod = fmod;
-	Llu->fsendx_plist = fsendx_plist;
-	Llu->nfrecvx = nfrecvx;
-	Llu->nfsendx = nfsendx;
-	Llu->bmod = bmod;
-	Llu->bsendx_plist = bsendx_plist;
-	Llu->nbrecvx = nbrecvx;
-	Llu->nbsendx = nbsendx;
-	Llu->ilsum = ilsum;
-	Llu->ldalsum = ldaspa;
-	
-#if ( PRNTlevel>=1 )
-	if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n",
-			   nLblocks, nUblocks);
-#endif
-
-	SUPERLU_FREE(rb_marker);
-	SUPERLU_FREE(Urb_fstnz);
-	SUPERLU_FREE(Urb_length);
-	SUPERLU_FREE(Urb_indptr);
-	SUPERLU_FREE(Lrb_length);
-	SUPERLU_FREE(Lrb_number);
-	SUPERLU_FREE(Lrb_indptr);
-	SUPERLU_FREE(Lrb_valptr);
-	SUPERLU_FREE(dense);
-
-	/* Find the maximum buffer size. */
-	MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, 
-		      MPI_MAX, grid->comm);
-
-#if ( PROFlevel>=1 )
-	if ( !iam ) printf(".. 1st distribute time:\n "
-			   "\tL\t%.2f\n\tU\t%.2f\n"
-			   "\tu_blks %d\tnrbu %d\n--------\n",
-  			   t_l, t_u, u_blks, nrbu);
-#endif
-
-    } /* else fact != SamePattern_SameRowPerm */
-
-#if ( DEBUGlevel>=1 )
-    /* Memory allocated but not freed:
-       ilsum, fmod, fsendx_plist, bmod, bsendx_plist  */
-    CHECK_MALLOC(iam, "Exit zdistribute()");
-#endif
-
-    return (mem_use);
-} /* ZDISTRIBUTE */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zdistribute_mark.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zdistribute_mark.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zdistribute_mark.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zdistribute_mark.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,695 +0,0 @@
-#include "superlu_zdefs.h"
-
-/*
- * NOTE zdistribute_mark.c
- * ====
- * This version is faster for Mark Baertschy's matrices, remains to be
- * tested for the other matrices.
- *
- * Main difference: there is no dense SPA involved when distributing A into
- * the U structure. That is, the entries in upper triangle of A are loaded
- * directly into U.
- * 
- * The locations of modifications have XSL comments.
- *
- * Date: Apr 23 09:54:15 PDT 2001
- */
-int_t
-zdistribute(fact_t fact, int_t n, SuperMatrix *A, Glu_freeable_t *Glu_freeable,
-	    LUstruct_t *LUstruct, gridinfo_t *grid)
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- *
- * Purpose
- * =======
- *   Distribute the matrix onto the 2D process mesh.
- * 
- * Arguments
- * =========
- * 
- * fact (input) fact_t
- *        Specifies whether or not the L and U structures will be re-used.
- *        = SamePattern_SameRowPerm: L and U structures are input, and
- *                                   unchanged on exit.
- *        = DOFACT or SamePattern: L and U structures are computed and output.
- *
- * n      (input) int
- *        Dimension of the matrix.
- *
- * A      (input) SuperMatrix*
- *	  The original matrix A, permuted by columns, of dimension
- *        (A->nrow, A->ncol). The type of A can be:
- *        Stype = NCP; Dtype = Z; Mtype = GE.
- *
- * LUstruct (input) LUstruct_t*
- *        Data structures for L and U factors.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh.
- *
- */
-{
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-    int_t bnnz, fsupc, i, irow, istart, j, jb, jj, k, len, len1, nsupc;
-    int_t ljb;  /* local block column number */
-    int_t nrbl; /* number of L blocks in current block column */
-    int_t nrbu; /* number of U blocks in current block column */
-    int_t gb;   /* global block number; 0 < gb <= nsuper */
-    int_t lb;   /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
-    int iam, jbrow, kcol, mycol, myrow, pc, pr;
-    int_t mybufmax[NBUFFERS];
-    NCPformat *Astore;
-    doublecomplex *a;
-    int_t *asub;
-    int_t *xa_begin, *xa_end;
-    int_t *xsup = Glu_persist->xsup;    /* supernode and column mapping */
-    int_t *supno = Glu_persist->supno;   
-    int_t *lsub, *xlsub, *usub, *xusub;
-    int_t nsupers;
-    int_t next_lind;      /* next available position in index[*] */
-    int_t next_lval;      /* next available position in nzval[*] */
-    int_t *index;         /* indices consist of headers and row subscripts */
-    doublecomplex *lusup, *uval; /* nonzero values in L and U */
-    doublecomplex **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
-    int_t  **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
-    doublecomplex **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
-    int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
-
-    /*-- Counts to be used in factorization. --*/
-    int_t  *ToRecv, *ToSendD, **ToSendR;
-
-    /*-- Counts to be used in lower triangular solve. --*/
-    int_t  *fmod;          /* Modification count for L-solve.        */
-    int_t  **fsendx_plist; /* Column process list to send down Xk.   */
-    int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
-    int_t  kseen;
-
-    /*-- Counts to be used in upper triangular solve. --*/
-    int_t  *bmod;          /* Modification count for U-solve.        */
-    int_t  **bsendx_plist; /* Column process list to send down Xk.   */
-    int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
-    int_t  *ilsum;         /* starting position of each supernode in 
-			      the full array (local)                 */
-
-    /*-- Auxiliary arrays; freed on return --*/
-    int_t *rb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
-    int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr)             */
-    int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr)      */
-    int_t *Urb_fstnz;  /* # of fstnz in a block row; size ceil(NSUPERS/Pr)  */
-    int_t *Ucbs;       /* number of column blocks in a block row            */
-    int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr)             */
-    int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr)        */
-    int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr)      */
-    int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr)      */
-    doublecomplex *dense, *dense_col; /* SPA */
-    doublecomplex zero = {0.0, 0.0};
-    int_t  ldaspa;     /* LDA of SPA */
-    int_t mem_use = 0, iword, zword;
-#if ( PRNTlevel>=1 )
-    int_t nLblocks = 0, nUblocks = 0;
-#endif
-
-    /* Initialization. */
-    iam = grid->iam;
-    myrow = MYROW( iam, grid );
-    mycol = MYCOL( iam, grid );
-    for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0;
-    nsupers  = supno[n-1] + 1;
-    Astore   = A->Store;
-    a        = Astore->nzval;
-    asub     = Astore->rowind;
-    xa_begin = Astore->colbeg;
-    xa_end   = Astore->colend;
-#if ( PRNTlevel>=1 )
-    iword = sizeof(int_t);
-    zword = sizeof(doublecomplex);
-#endif
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Enter zdistribute()");
-#endif
-
-    if ( fact == SamePattern_SameRowPerm ) {
-	/* We can propagate the new values of A into the existing
-	   L and U data structures.            */
-	ilsum = Llu->ilsum;
-	ldaspa = Llu->ldalsum;
-	if ( !(dense = doublecomplexCalloc_dist(ldaspa * sp_ienv_dist(3))) )
-	    ABORT("Calloc fails for SPA dense[].");
-	nrbu = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
-	if ( !(Urb_length = intCalloc_dist(nrbu)) )
-	    ABORT("Calloc fails for Urb_length[].");
-	if ( !(Urb_indptr = intMalloc_dist(nrbu)) )
-	    ABORT("Malloc fails for Urb_indptr[].");
-	for (lb = 0; lb < nrbu; ++lb) 
-	    Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
-	Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-	Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
-	Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
-	Unzval_br_ptr = Llu->Unzval_br_ptr;
-#if ( PRNTlevel>=1 )
-	mem_use += 2*nrbu*iword + ldaspa*sp_ienv_dist(3)*zword;
-#endif
-	for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */
-	    pc = PCOL( jb, grid );
-	    if ( mycol == pc ) { /* Block column jb in my process column */
-		fsupc = FstBlockC( jb );
-		nsupc = SuperSize( jb );
-
-		/* Scatter A into SPA. */
-		for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) {
-		    for (i = xa_begin[j]; i < xa_end[j]; ++i) {
-			irow = asub[i];
-			gb = BlockNum( irow );
-			if ( myrow == PROW( gb, grid ) ) {
-			    lb = LBi( gb, grid );
-			    irow = ilsum[lb] + irow - FstBlockC( gb );
-			    dense_col[irow] = a[i];
-			}
-		    }
-		    dense_col += ldaspa;
-		}
-		
-		/* Gather the values of A from SPA into Unzval[]. */
-		for (lb = 0; lb < nrbu; ++lb) {
-		    index = Ufstnz_br_ptr[lb];
-		    if ( index && index[Urb_indptr[lb]] == jb ) {
-			uval = Unzval_br_ptr[lb];
-			len = Urb_indptr[lb] + UB_DESCRIPTOR;
-			gb = lb * grid->nprow + myrow;/* Global block number */
-			k = FstBlockC( gb+1 );
-			irow = ilsum[lb] - FstBlockC( gb );
-			for (jj = 0, dense_col = dense; jj < nsupc; ++jj) {
-			    j = index[len+jj];
-			    for (i = j; i < k; ++i) {
-				uval[Urb_length[lb]++] = dense_col[irow+i];
-				dense_col[irow+i] = zero;
-			    }
-			    dense_col += ldaspa;
-			}
-			Urb_indptr[lb] += UB_DESCRIPTOR + nsupc;
-		    }
-		} /* for lb ... */
-
-		/* Gather the values of A from SPA into Lnzval[]. */
-		ljb = LBj( jb, grid ); /* Local block number */
-		index = Lrowind_bc_ptr[ljb];
-		if ( index ) {
-		    nrbl = index[0];   /* Number of row blocks. */
-		    len = index[1];    /* LDA of lusup[]. */
-		    lusup = Lnzval_bc_ptr[ljb];
-		    next_lind = BC_HEADER;
-		    next_lval = 0;
-		    for (jj = 0; jj < nrbl; ++jj) {
-			gb = index[next_lind++];
-			len1 = index[next_lind++]; /* Rows in the block. */
-			lb = LBi( gb, grid );
-			for (bnnz = 0; bnnz < len1; ++bnnz) {
-			    irow = index[next_lind++]; /* Global index. */
-			    irow = ilsum[lb] + irow - FstBlockC( gb );
-			    k = next_lval++;
-			    for (j = 0, dense_col = dense; j < nsupc; ++j) {
-				lusup[k] = dense_col[irow];
-				dense_col[irow] = zero;
-				k += len;
-				dense_col += ldaspa;
-			    }
-			} /* for bnnz ... */
-		    } /* for jj ... */
-		} /* if index ... */
-
-	    } /* if mycol == pc */
-	} /* for jb ... */
-
-	SUPERLU_FREE(dense);
-	SUPERLU_FREE(Urb_length);
-	SUPERLU_FREE(Urb_indptr);
-
-    } else {
-	/* No L and U data structures are available yet.
-	   We need to set up the L and U data structures and propagate
-	   the values of A into them.          */
-	lsub = Glu_freeable->lsub;    /* compressed L subscripts */
-	xlsub = Glu_freeable->xlsub;
-	usub = Glu_freeable->usub;    /* compressed U subscripts */
-	xusub = Glu_freeable->xusub;
-    
-	if ( !(ToRecv = intCalloc_dist(nsupers)) )
-	    ABORT("Calloc fails for ToRecv[].");
-
-	k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */
-	if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
-	    ABORT("Malloc fails for ToSendR[].");
-	j = k * grid->npcol;
-	if ( !(index = intMalloc_dist(j)) )
-	    ABORT("Malloc fails for index[].");
-#if ( PRNTlevel>=1 )
-	mem_use = k*sizeof(int_t*) + (j + nsupers)*iword;
-#endif
-	for (i = 0; i < j; ++i) index[i] = EMPTY;
-	for (i = 0, j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j];
-
-	k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
-
-	/* Pointers to the beginning of each block row of U. */
-	if ( !(Unzval_br_ptr =
-	       (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) )
-	    ABORT("Malloc fails for Unzval_br_ptr[].");
-	if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
-	    ABORT("Malloc fails for Ufstnz_br_ptr[].");
-	
-	if ( !(ToSendD = intCalloc_dist(k)) )
-	    ABORT("Malloc fails for ToSendD[].");
-	if ( !(ilsum = intMalloc_dist(k+1)) )
-        ABORT("Malloc fails for ilsum[].");
-
-	/* Auxiliary arrays used to set up U block data structures.
-	   They are freed on return. */
-	if ( !(rb_marker = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for rb_marker[].");
-	if ( !(Urb_length = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for Urb_length[].");
-	if ( !(Urb_indptr = intMalloc_dist(k)) )
-	    ABORT("Malloc fails for Urb_indptr[].");
-	if ( !(Urb_fstnz = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for Urb_fstnz[].");
-	if ( !(Ucbs = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for Ucbs[].");
-#if ( PRNTlevel>=1 )	
-	mem_use = 2*k*sizeof(int_t*) + (7*k+1)*iword;
-#endif
-	/* Compute ldaspa and ilsum[]. */
-	ldaspa = 0;
-	ilsum[0] = 0;
-	for (gb = 0; gb < nsupers; ++gb) {
-	    if ( myrow == PROW( gb, grid ) ) {
-		i = SuperSize( gb );
-		ldaspa += i;
-		lb = LBi( gb, grid );
-		ilsum[lb + 1] = ilsum[lb] + i;
-	    }
-	}
-	
-            
-	/* ------------------------------------------------------------
-	   COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U.
-	   THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U).
-	   ------------------------------------------------------------*/
-	
-	/* Loop through each supernode column. */
-	for (jb = 0; jb < nsupers; ++jb) {
-	    pc = PCOL( jb, grid );
-	    fsupc = FstBlockC( jb );
-	    nsupc = SuperSize( jb );
-	    /* Loop through each column in the block. */
-	    for (j = fsupc; j < fsupc + nsupc; ++j) {
-		/* usub[*] contains only "first nonzero" in each segment. */
-		for (i = xusub[j]; i < xusub[j+1]; ++i) {
-		    irow = usub[i]; /* First nonzero of the segment. */
-		    gb = BlockNum( irow );
-		    kcol = PCOL( gb, grid );
-		    ljb = LBj( gb, grid );
-		    if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES;
-		    pr = PROW( gb, grid );
-		    lb = LBi( gb, grid );
-		    if ( mycol == pc ) {
-			if  ( myrow == pr ) {
-			    ToSendD[lb] = YES;
-			    /* Count nonzeros in entire block row. */
-			    Urb_length[lb] += FstBlockC( gb+1 ) - irow;
-			    if (rb_marker[lb] <= jb) {/* First see the block */
-				rb_marker[lb] = jb + 1;
-				Urb_fstnz[lb] += nsupc;
-				++Ucbs[lb]; /* Number of column blocks
-					       in block row lb. */
-#if ( PRNTlevel>=1 )
-				++nUblocks;
-#endif
-			    }
-			    ToRecv[gb] = 1;
-			} else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */
-		    }
-		} /* for i ... */
-	    } /* for j ... */
-	} /* for jb ... */
-	
-	/* Set up the initial pointers for each block row in U. */
-	nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-	for (lb = 0; lb < nrbu; ++lb) {
-	    len = Urb_length[lb];
-	    rb_marker[lb] = 0; /* Reset block marker. */
-	    if ( len ) {
-		/* Add room for descriptors */
-		len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR;
-		if ( !(index = intMalloc_dist(len1+1)) )
-		    ABORT("Malloc fails for Uindex[].");
-		Ufstnz_br_ptr[lb] = index;
-		/* XSL 4-23-01 */
-		if ( !(Unzval_br_ptr[lb] = doublecomplexCalloc_dist(len)) )
-		    ABORT("Calloc fails for Unzval_br_ptr[*][].");
-		mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
-		mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
-		index[0] = Ucbs[lb]; /* Number of column blocks */
-		index[1] = len;      /* Total length of nzval[] */
-		index[2] = len1;     /* Total length of index[] */
-		index[len1] = -1;    /* End marker */
-	    } else {
-		Ufstnz_br_ptr[lb] = NULL;
-		Unzval_br_ptr[lb] = NULL;
-	    }
-	    Urb_length[lb] = 0; /* Reset block length. */
-	    Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
-	} /* for lb ... */
-
-	SUPERLU_FREE(Urb_fstnz);
-	SUPERLU_FREE(Ucbs);
-#if ( PRNTlevel>=1 )
-        mem_use -= 2*k * iword;
-#endif
-	/* Auxiliary arrays used to set up L block data structures.
-	   They are freed on return.
-	   k is the number of local row blocks.   */
-	if ( !(Lrb_length = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for Lrb_length[].");
-	if ( !(Lrb_number = intMalloc_dist(k)) )
-	    ABORT("Malloc fails for Lrb_number[].");
-	if ( !(Lrb_indptr = intMalloc_dist(k)) )
-	    ABORT("Malloc fails for Lrb_indptr[].");
-	if ( !(Lrb_valptr = intMalloc_dist(k)) )
-	    ABORT("Malloc fails for Lrb_valptr[].");
-	if ( !(dense = doublecomplexCalloc_dist(ldaspa * sp_ienv_dist(3))) )
-	    ABORT("Calloc fails for SPA dense[].");
-
-	/* These counts will be used for triangular solves. */
-	if ( !(fmod = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for fmod[].");
-	if ( !(bmod = intCalloc_dist(k)) )
-	    ABORT("Calloc fails for bmod[].");
-#if ( PRNTlevel>=1 )	
-	mem_use += 6*k*iword + ldaspa*sp_ienv_dist(3)*zword;
-#endif
-	k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
-
-	/* Pointers to the beginning of each block column of L. */
-	if ( !(Lnzval_bc_ptr =
-	       (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) )
-	    ABORT("Malloc fails for Lnzval_bc_ptr[].");
-	if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
-	    ABORT("Malloc fails for Lrowind_bc_ptr[].");
-	Lrowind_bc_ptr[k-1] = NULL;
-
-	/* These lists of processes will be used for triangular solves. */
-	if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
-	    ABORT("Malloc fails for fsendx_plist[].");
-	len = k * grid->nprow;
-	if ( !(index = intMalloc_dist(len)) )
-	    ABORT("Malloc fails for fsendx_plist[0]");
-	for (i = 0; i < len; ++i) index[i] = EMPTY;
-	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
-	    fsendx_plist[i] = &index[j];
-	if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
-	    ABORT("Malloc fails for bsendx_plist[].");
-	if ( !(index = intMalloc_dist(len)) )
-	    ABORT("Malloc fails for bsendx_plist[0]");
-	for (i = 0; i < len; ++i) index[i] = EMPTY;
-	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
-	    bsendx_plist[i] = &index[j];
-#if ( PRNTlevel>=1 )
-	mem_use += 4*k*sizeof(int_t*) + 2*len*iword;
-#endif
-
-	/*------------------------------------------------------------
-	  PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
-	  THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
-	  ------------------------------------------------------------*/
-
-	for (jb = 0; jb < nsupers; ++jb) {
-	    pc = PCOL( jb, grid );
-	    if ( mycol == pc ) { /* Block column jb in my process column */
-		fsupc = FstBlockC( jb );
-		nsupc = SuperSize( jb );
-		ljb = LBj( jb, grid ); /* Local block number */
-		
-		/* Scatter A into SPA. */
-		for (j = fsupc, dense_col = dense; j < FstBlockC( jb+1 ); ++j){
-		    for (i = xa_begin[j]; i < xa_end[j]; ++i) {
-			irow = asub[i];
-			if ( irow < fsupc ) continue; /* Skip U. XSL 4-23-01 */
-			gb = BlockNum( irow );
-			if ( myrow == PROW( gb, grid ) ) {
-			    lb = LBi( gb, grid );
-			    irow = ilsum[lb] + irow - FstBlockC( gb );
-			    dense_col[irow] = a[i];
-			}
-		    }
-		    dense_col += ldaspa;
-		}
-
-		jbrow = PROW( jb, grid );
-
-		/*------------------------------------------------
-		 * SET UP U BLOCKS.
-		 *------------------------------------------------*/
-
-		kseen = 0;
-		/* Loop through each column in the block column. */
-		for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
-		    istart = xusub[j];
-		    for (i = istart; i < xusub[j+1]; ++i) {
-			irow = usub[i]; /* First nonzero in the segment. */
-			gb = BlockNum( irow );
-			pr = PROW( gb, grid );
-			if ( pr != jbrow ) 
-			    bsendx_plist[ljb][pr] = YES;
-			if ( myrow == pr ) {
-			    lb = LBi( gb, grid ); /* Local block number */
-			    index = Ufstnz_br_ptr[lb];
-			    if (rb_marker[lb] <= jb) {/* First see the block */
-				rb_marker[lb] = jb + 1;
-				index[Urb_indptr[lb]] = jb; /* Descriptor */
-				/* Initialize block length to 0. XSL 4-23-01 */
-				index[Urb_indptr[lb]+1] = 0;
-				Urb_indptr[lb] += UB_DESCRIPTOR;
-				len = Urb_indptr[lb];
-				for (k = 0; k < nsupc; ++k)
-				    index[len+k] = FstBlockC( gb+1 );
-				if ( gb != jb )/* Exclude diagonal block. */
-				    ++bmod[lb];/* Mod. count for back solve */
-				if ( kseen == 0 && myrow != jbrow ) {
-				    ++nbrecvx;
-				    kseen = 1;
-				}
-			    } else {
-				len = Urb_indptr[lb];/* Start fstnz in index */
-			    }
-			    jj = j - fsupc;
-			    index[len+jj] = irow;
-			} /* if myrow == pr ... */
-		    } /* for i ... */
-		} /* for j ... */
-#if 1
-		/* XSL 4-23-01 */
-		for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
-		    /* Gather the initial values of A directly into Uval.
-		       (No SPA is involved.)    */
-		    for (i = xa_begin[j]; i < xa_end[j]; ++i) {
-			irow = asub[i]; 
-			if ( irow >= fsupc ) continue; /* Skip L */
-			gb = BlockNum( irow );
-			if ( myrow == PROW( gb, grid ) ) {
-			    lb = LBi( gb, grid );
-			    index = Ufstnz_br_ptr[lb];
-			    uval = Unzval_br_ptr[lb];
-			    len = Urb_indptr[lb];
-			    jj = index[len]; /* First nonzero in segment */
-			    uval[Urb_length[lb] + irow - jj] = a[i];
-			}
-		    }
-		    /* Now increment the index pointer for each row block */
-		    for (lb = 0; lb < nrbu; ++lb) {
-			if ( rb_marker[lb] == jb+1 ) { /* Not an empty block */
-			    gb = lb*grid->nprow + myrow; /* Global block # */
-			    index = Ufstnz_br_ptr[lb];
-			    jj = index[Urb_indptr[lb]];
-			    k = FstBlockC( gb+1 ) - jj;
-			    Urb_length[lb] += k;
-			    /* Increment the block length */
-			    index[Urb_indptr[lb]+fsupc-j-1] += k;
-			    Urb_indptr[lb] += 1;
-			}
-		    }
-		} /* for j = fsupc ... */
-#else
-		/* Figure out how many nonzeros in each block, and gather
-		   the initial values of A from SPA into Uval. */
-		for (lb = 0; lb < nrbu; ++lb) {
-		    if ( rb_marker[lb] == jb + 1 ) { /* Not an empty block. */
-			index = Ufstnz_br_ptr[lb];
-			uval = Unzval_br_ptr[lb];
-			len = Urb_indptr[lb];
-			gb = lb * grid->nprow + myrow;/* Global block number */
-			k = FstBlockC( gb+1 );
-			irow = ilsum[lb] - FstBlockC( gb );
-			for (jj=0, bnnz=0, dense_col=dense; jj < nsupc; ++jj) {
-			    j = index[len+jj];  /* First nonzero in segment. */
-			    bnnz += k - j;
-			    for (i = j; i < k; ++i) {
-				uval[Urb_length[lb]++] = dense_col[irow + i];
-				dense_col[irow + i] = zero;
-			    }
-			    dense_col += ldaspa;
-			}
-			index[len-1] = bnnz; /* Set block length in Descriptor */
-			Urb_indptr[lb] += nsupc;
-		    }
-		} /* for lb ... */
-#endif
-
-		/*------------------------------------------------
-		 * SET UP L BLOCKS.
-		 *------------------------------------------------*/
-
-		/* Count number of blocks and length of each block. */
-		nrbl = 0;
-		len = 0; /* Number of row subscripts I own. */
-		kseen = 0;
-		istart = xlsub[fsupc];
-		for (i = istart; i < xlsub[fsupc+1]; ++i) {
-		    irow = lsub[i];
-		    gb = BlockNum( irow ); /* Global block number */
-		    pr = PROW( gb, grid ); /* Process row owning this block */
-		    if ( pr != jbrow )
-			fsendx_plist[ljb][pr] = YES;
-		    if ( myrow == pr ) {
-			lb = LBi( gb, grid );  /* Local block number */
-			if (rb_marker[lb] <= jb) { /* First see this block */
-			    rb_marker[lb] = jb + 1;
-			    Lrb_length[lb] = 1;
-			    Lrb_number[nrbl++] = gb;
-			    if ( gb != jb ) /* Exclude diagonal block. */
-				++fmod[lb]; /* Mod. count for forward solve */
-			    if ( kseen == 0 && myrow != jbrow ) {
-				++nfrecvx;
-				kseen = 1;
-			    }
-#if ( PRNTlevel>=1 )
-			    ++nLblocks;
-#endif
-			} else {
-			    ++Lrb_length[lb];
-			}
-			++len;
-		    }
-		} /* for i ... */
-
-		if ( nrbl ) { /* Do not ensure the blocks are sorted! */
-		    /* Set up the initial pointers for each block in 
-		       index[] and nzval[]. */
-		    /* Add room for descriptors */
-		    len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
-		    if ( !(index = intMalloc_dist(len1)) ) 
-			ABORT("Malloc fails for index[]");
-		    Lrowind_bc_ptr[ljb] = index;
-		    if ( !(Lnzval_bc_ptr[ljb] = 
-			   doublecomplexMalloc_dist(len*nsupc)) ) {
-			fprintf(stderr, "col block %d ", jb);
-			ABORT("Malloc fails for Lnzval_bc_ptr[*][]");
-		    }
-		    mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
-		    mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
-		    mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
-		    index[0] = nrbl;  /* Number of row blocks */
-		    index[1] = len;   /* LDA of the nzval[] */
-		    next_lind = BC_HEADER;
-		    next_lval = 0;
-		    for (k = 0; k < nrbl; ++k) {
-			gb = Lrb_number[k];
-			lb = LBi( gb, grid );
-			len = Lrb_length[lb];
-			Lrb_length[lb] = 0;  /* Reset vector of block length */
-			index[next_lind++] = gb; /* Descriptor */
-			index[next_lind++] = len; 
-			Lrb_indptr[lb] = next_lind;
-			Lrb_valptr[lb] = next_lval;
-			next_lind += len;
-			next_lval += len;
-		    }
-		    /* Propagate the compressed row subscripts to Lindex[], and
-		       the initial values of A from SPA into Lnzval[]. */
-		    lusup = Lnzval_bc_ptr[ljb];
-		    len = index[1];  /* LDA of lusup[] */
-		    for (i = istart; i < xlsub[fsupc+1]; ++i) {
-			irow = lsub[i];
-			gb = BlockNum( irow );
-			if ( myrow == PROW( gb, grid ) ) {
-			    lb = LBi( gb, grid );
-			    k = Lrb_indptr[lb]++; /* Random access a block */
-			    index[k] = irow;
-			    k = Lrb_valptr[lb]++;
-			    irow = ilsum[lb] + irow - FstBlockC( gb );
-			    for (j = 0, dense_col = dense; j < nsupc; ++j) {
-				lusup[k] = dense_col[irow];
-				dense_col[irow] = zero;
-				k += len;
-				dense_col += ldaspa;
-			    }
-			}
-		    } /* for i ... */
-		} else {
-		    Lrowind_bc_ptr[ljb] = NULL;
-		    Lnzval_bc_ptr[ljb] = NULL;
-		} /* if nrbl ... */
-
-	    } /* if mycol == pc */
-
-	} /* for jb ... */
-
-	Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
-	Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
-	Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
-	Llu->Unzval_br_ptr = Unzval_br_ptr;
-	Llu->ToRecv = ToRecv;
-	Llu->ToSendD = ToSendD;
-	Llu->ToSendR = ToSendR;
-	Llu->fmod = fmod;
-	Llu->fsendx_plist = fsendx_plist;
-	Llu->nfrecvx = nfrecvx;
-	Llu->bmod = bmod;
-	Llu->bsendx_plist = bsendx_plist;
-	Llu->nbrecvx = nbrecvx;
-	Llu->ilsum = ilsum;
-	Llu->ldalsum = ldaspa;
-	
-#if ( PRNTlevel>=1 )
-	if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n",
-			   nLblocks, nUblocks);
-#endif
-
-	SUPERLU_FREE(rb_marker);
-	SUPERLU_FREE(Urb_length);
-	SUPERLU_FREE(Urb_indptr);
-	SUPERLU_FREE(Lrb_length);
-	SUPERLU_FREE(Lrb_number);
-	SUPERLU_FREE(Lrb_indptr);
-	SUPERLU_FREE(Lrb_valptr);
-	SUPERLU_FREE(dense);
-
-	/* Find the maximum buffer size. */
-	MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, 
-		      MPI_MAX, grid->comm);
-
-    } /* if fact == SamePattern_SameRowPerm */
-
-#if ( DEBUGlevel>=1 )
-    /* Memory allocated but not freed:
-       ilsum, fmod, fsendx_plist, bmod, bsendx_plist  */
-    CHECK_MALLOC(iam, "Exit zdistribute()");
-#endif
-
-    return (mem_use);
-} /* ZDISTRIBUTE */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zgsequ.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zgsequ.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zgsequ.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zgsequ.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,178 +0,0 @@
-
-
-/*
- * File name:	zgsequ.c
- * History:     Modified from LAPACK routine ZGEEQU
- */
-#include <math.h>
-#include "superlu_zdefs.h"
-
-void
-zgsequ_dist(SuperMatrix *A, double *r, double *c, double *rowcnd,
-	    double *colcnd, double *amax, int_t *info)
-{
-/*    
-    Purpose   
-    =======   
-
-    ZGSEQU_DIST computes row and column scalings intended to equilibrate an   
-    M-by-N sparse matrix A and reduce its condition number. R returns the row
-    scale factors and C the column scale factors, chosen to try to make   
-    the largest element in each row and column of the matrix B with   
-    elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.   
-
-    R(i) and C(j) are restricted to be between SMLNUM = smallest safe   
-    number and BIGNUM = largest safe number.  Use of these scaling   
-    factors is not guaranteed to reduce the condition number of A but   
-    works well in practice.   
-
-    See supermatrix.h for the definition of 'SuperMatrix' structure.
- 
-    Arguments   
-    =========   
-
-    A       (input) SuperMatrix*
-            The matrix of dimension (A->nrow, A->ncol) whose equilibration
-            factors are to be computed. The type of A can be:
-            Stype = SLU_NC; Dtype = SLU_Z; Mtype = SLU_GE.
-	    
-    R       (output) double*, size A->nrow
-            If INFO = 0 or INFO > M, R contains the row scale factors   
-            for A.
-	    
-    C       (output) double*, size A->ncol
-            If INFO = 0,  C contains the column scale factors for A.
-	    
-    ROWCND  (output) double*
-            If INFO = 0 or INFO > M, ROWCND contains the ratio of the   
-            smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and   
-            AMAX is neither too large nor too small, it is not worth   
-            scaling by R.
-	    
-    COLCND  (output) double*
-            If INFO = 0, COLCND contains the ratio of the smallest   
-            C(i) to the largest C(i).  If COLCND >= 0.1, it is not   
-            worth scaling by C.
-	    
-    AMAX    (output) double*
-            Absolute value of largest matrix element.  If AMAX is very   
-            close to overflow or very close to underflow, the matrix   
-            should be scaled.
-	    
-    INFO    (output) int*
-            = 0:  successful exit   
-            < 0:  if INFO = -i, the i-th argument had an illegal value   
-            > 0:  if INFO = i,  and i is   
-                  <= M:  the i-th row of A is exactly zero   
-                  >  M:  the (i-M)-th column of A is exactly zero   
-
-    ===================================================================== 
-*/
-
-    /* Local variables */
-    NCformat *Astore;
-    doublecomplex *Aval;
-    int i, j, irow;
-    double rcmin, rcmax;
-    double bignum, smlnum;
-    extern double dlamch_(char *);
-    
-    /* Test the input parameters. */
-    *info = 0;
-    if ( A->nrow < 0 || A->ncol < 0 ||
-	 A->Stype != SLU_NC || A->Dtype != SLU_Z || A->Mtype != SLU_GE )
-	*info = -1;
-    if (*info != 0) {
-	i = -(*info);
-	xerbla_("zgsequ_dist", &i);
-	return;
-    }
-
-    /* Quick return if possible */
-    if ( A->nrow == 0 || A->ncol == 0 ) {
-	*rowcnd = 1.;
-	*colcnd = 1.;
-	*amax = 0.;
-	return;
-    }
-
-    Astore = (NCformat *) A->Store;
-    Aval = (doublecomplex *) Astore->nzval;
-    
-    /* Get machine constants. */
-    smlnum = dlamch_("S");
-    bignum = 1. / smlnum;
-
-    /* Compute row scale factors. */
-    for (i = 0; i < A->nrow; ++i) r[i] = 0.;
-
-    /* Find the maximum element in each row. */
-    for (j = 0; j < A->ncol; ++j)
-	for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-	    irow = Astore->rowind[i];
-	    r[irow] = SUPERLU_MAX( r[irow], z_abs1(&Aval[i]) );
-	}
-
-    /* Find the maximum and minimum scale factors. */
-    rcmin = bignum;
-    rcmax = 0.;
-    for (i = 0; i < A->nrow; ++i) {
-	rcmax = SUPERLU_MAX(rcmax, r[i]);
-	rcmin = SUPERLU_MIN(rcmin, r[i]);
-    }
-    *amax = rcmax;
-
-    if (rcmin == 0.) {
-	/* Find the first zero scale factor and return an error code. */
-	for (i = 0; i < A->nrow; ++i)
-	    if (r[i] == 0.) {
-		*info = i + 1;
-		return;
-	    }
-    } else {
-	/* Invert the scale factors. */
-	for (i = 0; i < A->nrow; ++i)
-	    r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum );
-	/* Compute ROWCND = min(R(I)) / max(R(I)) */
-	*rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
-    }
-
-    /* Compute column scale factors */
-    for (j = 0; j < A->ncol; ++j) c[j] = 0.;
-
-    /* Find the maximum element in each column, assuming the row
-       scalings computed above. */
-    for (j = 0; j < A->ncol; ++j)
-	for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-	    irow = Astore->rowind[i];
-	    c[j] = SUPERLU_MAX( c[j], z_abs1(&Aval[i]) * r[irow] );
-	}
-
-    /* Find the maximum and minimum scale factors. */
-    rcmin = bignum;
-    rcmax = 0.;
-    for (j = 0; j < A->ncol; ++j) {
-	rcmax = SUPERLU_MAX(rcmax, c[j]);
-	rcmin = SUPERLU_MIN(rcmin, c[j]);
-    }
-
-    if (rcmin == 0.) {
-	/* Find the first zero scale factor and return an error code. */
-	for (j = 0; j < A->ncol; ++j)
-	    if ( c[j] == 0. ) {
-		*info = A->nrow + j + 1;
-		return;
-	    }
-    } else {
-	/* Invert the scale factors. */
-	for (j = 0; j < A->ncol; ++j)
-	    c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum);
-	/* Compute COLCND = min(C(J)) / max(C(J)) */
-	*colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
-    }
-
-    return;
-
-} /* zgsequ_dist */
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zlangs.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zlangs.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zlangs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zlangs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,104 +0,0 @@
-
-
-/*
- * File name:	zlangs.c
- * History:     Modified from lapack routine ZLANGE
- */
-#include <math.h>
-#include "superlu_zdefs.h"
-
-double zlangs_dist(char *norm, SuperMatrix *A)
-{
-/* 
-    Purpose   
-    =======   
-
-    ZLANGS_DIST returns the value of the one norm, or the Frobenius norm, or 
-    the infinity norm, or the element of largest absolute value of a 
-    real matrix A.   
-
-    Description   
-    ===========   
-
-    ZLANGE returns the value   
-
-       ZLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'   
-                (   
-                ( norm1(A),         NORM = '1', 'O' or 'o'   
-                (   
-                ( normI(A),         NORM = 'I' or 'i'   
-                (   
-                ( normF(A),         NORM = 'F', 'f', 'E' or 'e'   
-
-    where  norm1  denotes the  one norm of a matrix (maximum column sum), 
-    normI  denotes the  infinity norm  of a matrix  (maximum row sum) and 
-    normF  denotes the  Frobenius norm of a matrix (square root of sum of 
-    squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.   
-
-    Arguments   
-    =========   
-
-    NORM    (input) CHARACTER*1   
-            Specifies the value to be returned in ZLANGE as described above.   
-    A       (input) SuperMatrix*
-            The M by N sparse matrix A. 
-
-   ===================================================================== 
-*/
-    
-    /* Local variables */
-    NCformat *Astore;
-    doublecomplex   *Aval;
-    int      i, j, irow;
-    double   value=0., sum;
-    double   *rwork;
-
-    Astore = A->Store;
-    Aval   = Astore->nzval;
-    
-    if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) {
-	value = 0.;
-	
-    } else if (lsame_(norm, "M")) {
-	/* Find max(abs(A(i,j))). */
-	value = 0.;
-	for (j = 0; j < A->ncol; ++j)
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++)
-		value = SUPERLU_MAX( value, z_abs( &Aval[i]) );
-	
-    } else if (lsame_(norm, "O") || *(unsigned char *)norm == '1') {
-	/* Find norm1(A). */
-	value = 0.;
-	for (j = 0; j < A->ncol; ++j) {
-	    sum = 0.;
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) 
-		sum += z_abs( &Aval[i] );
-	    value = SUPERLU_MAX(value,sum);
-	}
-	
-    } else if (lsame_(norm, "I")) {
-	/* Find normI(A). */
-	if ( !(rwork = (double *) SUPERLU_MALLOC(A->nrow * sizeof(double))) )
-	    ABORT("SUPERLU_MALLOC fails for rwork.");
-	for (i = 0; i < A->nrow; ++i) rwork[i] = 0.;
-	for (j = 0; j < A->ncol; ++j)
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) {
-		irow = Astore->rowind[i];
-		rwork[irow] += z_abs( &Aval[i] );
-	    }
-	value = 0.;
-	for (i = 0; i < A->nrow; ++i)
-	    value = SUPERLU_MAX(value, rwork[i]);
-	
-	SUPERLU_FREE (rwork);
-	
-    } else if (lsame_(norm, "F") || lsame_(norm, "E")) {
-	/* Find normF(A). */
-	ABORT("Not implemented.");
-    } else
-	ABORT("Illegal norm specified.");
-
-    return (value);
-
-} /* zlangs_dist */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zlaqgs.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zlaqgs.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zlaqgs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zlaqgs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,132 +0,0 @@
-
-
-/*
- * File name:	zlaqgs.c
- * History:     Modified from LAPACK routine ZLAQGE
- */
-#include <math.h>
-#include "superlu_zdefs.h"
-
-void
-zlaqgs_dist(SuperMatrix *A, double *r, double *c, 
-	    double rowcnd, double colcnd, double amax, char *equed)
-{
-/*
-    Purpose   
-    =======   
-
-    ZLAQGS_DIST equilibrates a general sparse M by N matrix A using the row
-    and column scaling factors in the vectors R and C.   
-
-    See supermatrix.h for the definition of 'SuperMatrix' structure.
-
-    Arguments   
-    =========   
-
-    A       (input/output) SuperMatrix*
-            On exit, the equilibrated matrix.  See EQUED for the form of 
-            the equilibrated matrix. The type of A can be:
-	    Stype = SLU_NC; Dtype = SLU_Z; Mtype = SLU_GE.
-	    
-    R       (input) double*, dimension (A->nrow)
-            The row scale factors for A.
-	    
-    C       (input) double*, dimension (A->ncol)
-            The column scale factors for A.
-	    
-    ROWCND  (input) double
-            Ratio of the smallest R(i) to the largest R(i).
-	    
-    COLCND  (input) double
-            Ratio of the smallest C(i) to the largest C(i).
-	    
-    AMAX    (input) double
-            Absolute value of largest matrix entry.
-	    
-    EQUED   (output) char*
-            Specifies the form of equilibration that was done.   
-            = 'N':  No equilibration   
-            = 'R':  Row equilibration, i.e., A has been premultiplied by  
-                    diag(R).   
-            = 'C':  Column equilibration, i.e., A has been postmultiplied  
-                    by diag(C).   
-            = 'B':  Both row and column equilibration, i.e., A has been
-                    replaced by diag(R) * A * diag(C).   
-
-    Internal Parameters   
-    ===================   
-
-    THRESH is a threshold value used to decide if row or column scaling   
-    should be done based on the ratio of the row or column scaling   
-    factors.  If ROWCND < THRESH, row scaling is done, and if   
-    COLCND < THRESH, column scaling is done.   
-
-    LARGE and SMALL are threshold values used to decide if row scaling   
-    should be done based on the absolute size of the largest matrix   
-    element.  If AMAX > LARGE or AMAX < SMALL, row scaling is done.   
-
-    ===================================================================== 
-*/
-
-#define THRESH    (0.1)
-    
-    /* Local variables */
-    NCformat *Astore;
-    doublecomplex   *Aval;
-    int i, j, irow;
-    double large, small, cj;
-    extern double dlamch_(char *);
-    double temp;
-
-
-    /* Quick return if possible */
-    if (A->nrow <= 0 || A->ncol <= 0) {
-	*(unsigned char *)equed = 'N';
-	return;
-    }
-
-    Astore = (NCformat *) A->Store;
-    Aval = (doublecomplex *) Astore->nzval;
-    
-    /* Initialize LARGE and SMALL. */
-    small = dlamch_("Safe minimum") / dlamch_("Precision");
-    large = 1. / small;
-
-    if (rowcnd >= THRESH && amax >= small && amax <= large) {
-	if (colcnd >= THRESH)
-	    *(unsigned char *)equed = 'N';
-	else {
-	    /* Column scaling */
-	    for (j = 0; j < A->ncol; ++j) {
-		cj = c[j];
-		for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-		    zd_mult(&Aval[i], &Aval[i], cj);
-                }
-	    }
-	    *(unsigned char *)equed = 'C';
-	}
-    } else if (colcnd >= THRESH) {
-	/* Row scaling, no column scaling */
-	for (j = 0; j < A->ncol; ++j)
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-		irow = Astore->rowind[i];
-		zd_mult(&Aval[i], &Aval[i], r[irow]);
-	    }
-	*(unsigned char *)equed = 'R';
-    } else {
-	/* Row and column scaling */
-	for (j = 0; j < A->ncol; ++j) {
-	    cj = c[j];
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-		irow = Astore->rowind[i];
-		temp = cj * r[irow];
-		zd_mult(&Aval[i], &Aval[i], temp);
-	    }
-	}
-	*(unsigned char *)equed = 'B';
-    }
-
-    return;
-
-} /* zlaqgs_dist */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zldperm.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zldperm.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zldperm.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zldperm.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,159 +0,0 @@
-
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-
-#include "superlu_zdefs.h"
-
-extern void mc64ad_(int_t*, int_t*, int_t*, int_t [], int_t [], double [],
-		    int_t*, int_t [], int_t*, int_t[], int_t*, double [],
-		    int_t [], int_t []);
-
-void
-zldperm(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[],
-	doublecomplex nzval[], int_t *perm, double u[], double v[])
-{
-/*
- * Purpose
- * =======
- *
- *   ZLDPERM finds a row permutation so that the matrix has large
- *   entries on the diagonal.
- *
- * Arguments
- * =========
- *
- * job    (input) int
- *        Control the action. Possible values for JOB are:
- *        = 1 : Compute a row permutation of the matrix so that the
- *              permuted matrix has as many entries on its diagonal as
- *              possible. The values on the diagonal are of arbitrary size.
- *              HSL subroutine MC21A/AD is used for this.
- *        = 2 : Compute a row permutation of the matrix so that the smallest 
- *              value on the diagonal of the permuted matrix is maximized.
- *        = 3 : Compute a row permutation of the matrix so that the smallest
- *              value on the diagonal of the permuted matrix is maximized.
- *              The algorithm differs from the one used for JOB = 2 and may
- *              have quite a different performance.
- *        = 4 : Compute a row permutation of the matrix so that the sum
- *              of the diagonal entries of the permuted matrix is maximized.
- *        = 5 : Compute a row permutation of the matrix so that the product
- *              of the diagonal entries of the permuted matrix is maximized
- *              and vectors to scale the matrix so that the nonzero diagonal 
- *              entries of the permuted matrix are one in absolute value and 
- *              all the off-diagonal entries are less than or equal to one in 
- *              absolute value.
- *        Restriction: 1 <= JOB <= 5.
- *
- * n      (input) int
- *        The order of the matrix.
- *
- * nnz    (input) int
- *        The number of nonzeros in the matrix.
- *
- * adjncy (input) int*, of size nnz
- *        The adjacency structure of the matrix, which contains the row
- *        indices of the nonzeros.
- *
- * colptr (input) int*, of size n+1
- *        The pointers to the beginning of each column in ADJNCY.
- *
- * nzval  (input) doublecomplex*, of size nnz
- *        The nonzero values of the matrix. nzval[k] is the value of
- *        the entry corresponding to adjncy[k].
- *        It is not used if job = 1.
- *
- * perm   (output) int*, of size n
- *        The permutation vector. perm[i] = j means row i in the
- *        original matrix is in row j of the permuted matrix.
- *
- * u      (output) double*, of size n
- *        If job = 5, the natural logarithms of the row scaling factors. 
- *
- * v      (output) double*, of size n
- *        If job = 5, the natural logarithms of the column scaling factors. 
- *        The scaled matrix B has entries b_ij = a_ij * exp(u_i + v_j).
- */
- 
-    int_t i, liw, ldw, num;
-    int_t *iw, icntl[10], info[10];
-    double *dw;
-    double *nzval_abs = doubleMalloc_dist(nnz);
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(0, "Enter zldperm()");
-#endif
-    liw = 5*n;
-    if ( job == 3 ) liw = 10*n + nnz;
-    if ( !(iw = intMalloc_dist(liw)) ) ABORT("Malloc fails for iw[]");
-    ldw = 3*n + nnz;
-    if ( !(dw = doubleMalloc_dist(ldw)) ) ABORT("Malloc fails for dw[]");
-	    
-    /* Increment one to get 1-based indexing. */
-    for (i = 0; i <= n; ++i) ++colptr[i];
-    for (i = 0; i < nnz; ++i) ++adjncy[i];
-#if ( DEBUGlevel>=2 )
-    printf("LDPERM(): n %d, nnz %d\n", n, nnz);
-    PrintInt10("colptr", n+1, colptr);
-    PrintInt10("adjncy", nnz, adjncy);
-#endif
-	
-    /* 
-     * NOTE:
-     * =====
-     *
-     * MC64AD assumes that column permutation vector is defined as:
-     * perm(i) = j means column i of permuted A is in column j of original A.
-     *
-     * Since a symmetric permutation preserves the diagonal entries. Then
-     * by the following relation:
-     *     P'(A*P')P = P'A
-     * we can apply inverse(perm) to rows of A to get large diagonal entries.
-     * But, since 'perm' defined in MC64AD happens to be the reverse of
-     * SuperLU's definition of permutation vector, therefore, it is already
-     * an inverse for our purpose. We will thus use it directly.
-     *
-     */
-    mc64id_(icntl);
-#if 0
-    /* Suppress error and warning messages. */
-    icntl[0] = -1;
-    icntl[1] = -1;
-#endif
-
-    for (i = 0; i < nnz; ++i) nzval_abs[i] = z_abs1(&nzval[i]);
-    mc64ad_(&job, &n, &nnz, colptr, adjncy, nzval_abs, &num, perm,
-	    &liw, iw, &ldw, dw, icntl, info);
-
-#if ( DEBUGlevel>=2 )
-    PrintInt10("perm", n, perm);
-    printf(".. After MC64AD info %d\tsize of matching %d\n", info[0], num);
-#endif
-    if ( info[0] == 1 ) { /* Structurally singular */
-        printf(".. The last %d permutations:\n", n-num);
-	PrintInt10("perm", n-num, &perm[num]);
-    }
-
-    /* Restore to 0-based indexing. */
-    for (i = 0; i <= n; ++i) --colptr[i];
-    for (i = 0; i < nnz; ++i) --adjncy[i];
-    for (i = 0; i < n; ++i) --perm[i];
-
-    if ( job == 5 )
-        for (i = 0; i < n; ++i) {
-	    u[i] = dw[i];
-	    v[i] = dw[n+i];
-	}
-
-    SUPERLU_FREE(iw);
-    SUPERLU_FREE(dw);
-    SUPERLU_FREE(nzval_abs);
-
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(0, "Exit zldperm()");
-#endif
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zmemory.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zmemory.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zmemory.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zmemory.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,118 +0,0 @@
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-
-#include "superlu_zdefs.h"
-
-
-/* Variables external to this file */
-extern LU_stack_t stack;
-
-
-
-/*
- * mem_usage consists of the following fields:
- *    - for_lu (float)
- *      The amount of space used in bytes for the L\U data structures.
- *    - total (float)
- *      The amount of space needed in bytes to perform factorization.
- *    - expansions (int)
- *      Number of memory expansions during the LU factorization.
- */
-int_t zQuerySpace_dist(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid,
-		       mem_usage_t *mem_usage)
-{
-    register int_t dword, gb, iword, k, maxsup, nb, nsupers;
-    int_t *index, *xsup;
-    int iam, mycol, myrow;
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
-
-    iam = grid->iam;
-    myrow = MYROW( iam, grid );
-    mycol = MYCOL( iam, grid );
-    iword = sizeof(int_t);
-    dword = sizeof(doublecomplex);
-    maxsup = sp_ienv_dist(3);
-    nsupers = Glu_persist->supno[n-1] + 1;
-    xsup = Glu_persist->xsup;
-    mem_usage->for_lu = 0;
-
-    /* For L factor */
-    nb = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */
-    for (k = 0; k < nb; ++k) {
-	gb = k * grid->npcol + mycol; /* Global block number. */
-	if ( gb < nsupers ) {
-	    index = Llu->Lrowind_bc_ptr[k];
-	    if ( index ) {
-		mem_usage->for_lu += (float)
-		    ((BC_HEADER + index[0]*LB_DESCRIPTOR + index[1]) * iword);
-		mem_usage->for_lu += (float)(index[1]*SuperSize( gb )*dword);
-	    }
-	}
-    }
-
-    /* For U factor */
-    nb = CEILING( nsupers, grid->nprow ); /* Number of local row blocks */
-    for (k = 0; k < nb; ++k) {
-	gb = k * grid->nprow + myrow; /* Global block number. */
-	if ( gb < nsupers ) {
-	    index = Llu->Ufstnz_br_ptr[k];
-	    if ( index ) {
-		mem_usage->for_lu += (float)(index[2] * iword);
-		mem_usage->for_lu += (float)(index[1] * dword);
-	    }
-	}
-    }
-
-    /* Working storage to support factorization */
-    mem_usage->total = mem_usage->for_lu;
-    mem_usage->total +=
-	(float)(( Llu->bufmax[0] + Llu->bufmax[2] ) * iword +
-		( Llu->bufmax[1] + Llu->bufmax[3] + maxsup ) * dword );
-    /**** another buffer to use mpi_irecv in pdgstrf_irecv.c ****/
-    mem_usage->total +=
-	(float)( Llu->bufmax[0] * iword +  Llu->bufmax[1] * dword );
-    mem_usage->total += (float)( maxsup * maxsup + maxsup) * iword;
-    k = CEILING( nsupers, grid->nprow );
-    mem_usage->total += (float)(2 * k * iword);
-
-    return 0;
-} /* zQuerySpace_dist */
-
-
-/*
- * Allocate storage for original matrix A
- */
-void
-zallocateA_dist(int_t n, int_t nnz, doublecomplex **a, int_t **asub, int_t **xa)
-{
-    *a    = (doublecomplex *) doublecomplexMalloc_dist(nnz);
-    *asub = (int_t *) intMalloc_dist(nnz);
-    *xa   = (int_t *) intMalloc_dist(n+1);
-}
-
-
-doublecomplex *doublecomplexMalloc_dist(int_t n)
-{
-    doublecomplex *buf;
-    buf = (doublecomplex *) 
-	SUPERLU_MALLOC(SUPERLU_MAX(1, n) * sizeof(doublecomplex)); 
-    return (buf);
-}
-
-doublecomplex *doublecomplexCalloc_dist(int_t n)
-{
-    doublecomplex *buf;
-    register int_t i;
-    doublecomplex zero = {0.0, 0.0};
-    buf = (doublecomplex *) 
-	SUPERLU_MALLOC(SUPERLU_MAX(1, n) * sizeof(doublecomplex));
-    if ( !buf ) return (buf);
-    for (i = 0; i < n; ++i) buf[i] = zero;
-    return (buf);
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zmyblas2.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zmyblas2.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zmyblas2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zmyblas2.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,184 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * File name:		zmyblas2.c
- * Purpose:
- *     Level 2 BLAS operations: solves and matvec, written in C.
- * Note:
- *     This is only used when the system lacks an efficient BLAS library.
- */
-#include "dcomplex.h"
-
-/*
- * Solves a dense UNIT lower triangular system. The unit lower 
- * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). 
- * The solution will be returned in the rhs vector.
- */
-void zlsolve ( int ldm, int ncol, doublecomplex *M, doublecomplex *rhs )
-{
-    int k;
-    doublecomplex x0, x1, x2, x3, temp;
-    doublecomplex *M0;
-    doublecomplex *Mki0, *Mki1, *Mki2, *Mki3;
-    register int firstcol = 0;
-
-    M0 = &M[0];
-
-
-    while ( firstcol < ncol - 3 ) { /* Do 4 columns */
-      	Mki0 = M0 + 1;
-      	Mki1 = Mki0 + ldm + 1;
-      	Mki2 = Mki1 + ldm + 1;
-      	Mki3 = Mki2 + ldm + 1;
-
-      	x0 = rhs[firstcol];
-      	zz_mult(&temp, &x0, Mki0); Mki0++;
-      	z_sub(&x1, &rhs[firstcol+1], &temp);
-      	zz_mult(&temp, &x0, Mki0); Mki0++;
-	z_sub(&x2, &rhs[firstcol+2], &temp);
-	zz_mult(&temp, &x1, Mki1); Mki1++;
-	z_sub(&x2, &x2, &temp);
-      	zz_mult(&temp, &x0, Mki0); Mki0++;
-	z_sub(&x3, &rhs[firstcol+3], &temp);
-	zz_mult(&temp, &x1, Mki1); Mki1++;
-	z_sub(&x3, &x3, &temp);
-	zz_mult(&temp, &x2, Mki2); Mki2++;
-	z_sub(&x3, &x3, &temp);
-
- 	rhs[++firstcol] = x1;
-      	rhs[++firstcol] = x2;
-      	rhs[++firstcol] = x3;
-      	++firstcol;
-    
-      	for (k = firstcol; k < ncol; k++) {
-	    zz_mult(&temp, &x0, Mki0); Mki0++;
-	    z_sub(&rhs[k], &rhs[k], &temp);
-	    zz_mult(&temp, &x1, Mki1); Mki1++;
-	    z_sub(&rhs[k], &rhs[k], &temp);
-	    zz_mult(&temp, &x2, Mki2); Mki2++;
-	    z_sub(&rhs[k], &rhs[k], &temp);
-	    zz_mult(&temp, &x3, Mki3); Mki3++;
-	    z_sub(&rhs[k], &rhs[k], &temp);
-	}
-
-        M0 += 4 * ldm + 4;
-    }
-
-    if ( firstcol < ncol - 1 ) { /* Do 2 columns */
-        Mki0 = M0 + 1;
-        Mki1 = Mki0 + ldm + 1;
-
-        x0 = rhs[firstcol];
-	zz_mult(&temp, &x0, Mki0); Mki0++;
-	z_sub(&x1, &rhs[firstcol+1], &temp);
-
-      	rhs[++firstcol] = x1;
-      	++firstcol;
-    
-      	for (k = firstcol; k < ncol; k++) {
-	    zz_mult(&temp, &x0, Mki0); Mki0++;
-	    z_sub(&rhs[k], &rhs[k], &temp);
-	    zz_mult(&temp, &x1, Mki1); Mki1++;
-	    z_sub(&rhs[k], &rhs[k], &temp);
-	} 
-    }
-    
-}
-
-/*
- * Solves a dense upper triangular system. The upper triangular matrix is
- * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned
- * in the rhs vector.
- */
-void
-zusolve (
-	int ldm,	/* in */
-	int ncol,	/* in */
-	doublecomplex *M,	/* in */
-	doublecomplex *rhs	/* modified */
-)
-{
-    doublecomplex xj, temp;
-    int jcol, j, irow;
-
-    jcol = ncol - 1;
-
-    for (j = 0; j < ncol; j++) {
-
-	z_div(&xj, &rhs[jcol], &M[jcol + jcol*ldm]); /* M(jcol, jcol) */
-	rhs[jcol] = xj;
-	
-	for (irow = 0; irow < jcol; irow++) {
-	    zz_mult(&temp, &xj, &M[irow+jcol*ldm]); /* M(irow, jcol) */
-	    z_sub(&rhs[irow], &rhs[irow], &temp);
-	}
-
-	jcol--;
-
-    }
-}
-
-
-/*
- * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec.
- * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[].
- */
-void zmatvec (
-	int ldm,	/* in -- leading dimension of M */
-	int nrow,	/* in */ 
-	int ncol,	/* in */
-	doublecomplex *M,	/* in */
-	doublecomplex *vec,	/* in */
-	doublecomplex *Mxvec	/* in/out */
-)
-{
-    doublecomplex vi0, vi1, vi2, vi3;
-    doublecomplex *M0, temp;
-    doublecomplex *Mki0, *Mki1, *Mki2, *Mki3;
-    register int firstcol = 0;
-    int k;
-
-    M0 = &M[0];
-
-    while ( firstcol < ncol - 3 ) {	/* Do 4 columns */
-	Mki0 = M0;
-	Mki1 = Mki0 + ldm;
-	Mki2 = Mki1 + ldm;
-	Mki3 = Mki2 + ldm;
-
-	vi0 = vec[firstcol++];
-	vi1 = vec[firstcol++];
-	vi2 = vec[firstcol++];
-	vi3 = vec[firstcol++];	
-	for (k = 0; k < nrow; k++) {
-	    zz_mult(&temp, &vi0, Mki0); Mki0++;
-	    z_add(&Mxvec[k], &Mxvec[k], &temp);
-	    zz_mult(&temp, &vi1, Mki1); Mki1++;
-	    z_add(&Mxvec[k], &Mxvec[k], &temp);
-	    zz_mult(&temp, &vi2, Mki2); Mki2++;
-	    z_add(&Mxvec[k], &Mxvec[k], &temp);
-	    zz_mult(&temp, &vi3, Mki3); Mki3++;
-	    z_add(&Mxvec[k], &Mxvec[k], &temp);
-	}
-
-	M0 += 4 * ldm;
-    }
-
-    while ( firstcol < ncol ) {		/* Do 1 column */
- 	Mki0 = M0;
-	vi0 = vec[firstcol++];
-	for (k = 0; k < nrow; k++) {
-	    zz_mult(&temp, &vi0, Mki0); Mki0++;
-	    z_add(&Mxvec[k], &Mxvec[k], &temp);
-	}
-	M0 += ldm;
-    }
-	
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zreadhb.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zreadhb.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zreadhb.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zreadhb.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,264 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include "dcomplex.h"
-#include "superlu_zdefs.h"
-
-/*
- * Prototypes
- */
-static void ReadVector(FILE *, int_t, int_t *, int_t, int_t);
-static void zReadValues(FILE *, int_t, doublecomplex *, int_t, int_t);
-static int zDumpLine(FILE *);
-static int ParseIntFormat(char *, int_t *, int_t *);
-static int ParseFloatFormat(char *, int_t *, int_t *);
-
-
-void
-zreadhb_dist(int iam, FILE *fp, int_t *nrow, int_t *ncol, int_t *nonz,
-	     doublecomplex **nzval, int_t **rowind, int_t **colptr)
-{
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- *
- * Purpose
- * =======
- * 
- * Read a DOUBLE COMPLEX PRECISION matrix stored in Harwell-Boeing format 
- * as described below.
- * 
- * Line 1 (A72,A8) 
- *  	Col. 1 - 72   Title (TITLE) 
- *	Col. 73 - 80  Key (KEY) 
- * 
- * Line 2 (5I14) 
- * 	Col. 1 - 14   Total number of lines excluding header (TOTCRD) 
- * 	Col. 15 - 28  Number of lines for pointers (PTRCRD) 
- * 	Col. 29 - 42  Number of lines for row (or variable) indices (INDCRD) 
- * 	Col. 43 - 56  Number of lines for numerical values (VALCRD) 
- *	Col. 57 - 70  Number of lines for right-hand sides (RHSCRD) 
- *                    (including starting guesses and solution vectors 
- *		       if present) 
- *           	      (zero indicates no right-hand side data is present) 
- *
- * Line 3 (A3, 11X, 4I14) 
- *   	Col. 1 - 3    Matrix type (see below) (MXTYPE) 
- * 	Col. 15 - 28  Number of rows (or variables) (NROW) 
- * 	Col. 29 - 42  Number of columns (or elements) (NCOL) 
- *	Col. 43 - 56  Number of row (or variable) indices (NNZERO) 
- *	              (equal to number of entries for assembled matrices) 
- * 	Col. 57 - 70  Number of elemental matrix entries (NELTVL) 
- *	              (zero in the case of assembled matrices) 
- * Line 4 (2A16, 2A20) 
- * 	Col. 1 - 16   Format for pointers (PTRFMT) 
- *	Col. 17 - 32  Format for row (or variable) indices (INDFMT) 
- *	Col. 33 - 52  Format for numerical values of coefficient matrix (VALFMT) 
- * 	Col. 53 - 72 Format for numerical values of right-hand sides (RHSFMT) 
- *
- * Line 5 (A3, 11X, 2I14) Only present if there are right-hand sides present 
- *    	Col. 1 	      Right-hand side type: 
- *	         	  F for full storage or M for same format as matrix 
- *    	Col. 2        G if a starting vector(s) (Guess) is supplied. (RHSTYP) 
- *    	Col. 3        X if an exact solution vector(s) is supplied. 
- *	Col. 15 - 28  Number of right-hand sides (NRHS) 
- *	Col. 29 - 42  Number of row indices (NRHSIX) 
- *          	      (ignored in case of unassembled matrices) 
- *
- * The three character type field on line 3 describes the matrix type. 
- * The following table lists the permitted values for each of the three 
- * characters. As an example of the type field, RSA denotes that the matrix 
- * is real, symmetric, and assembled. 
- *
- * First Character: 
- *	R Real matrix 
- *	C Complex matrix 
- *	P Pattern only (no numerical values supplied) 
- *
- * Second Character: 
- *	S Symmetric 
- *	U Unsymmetric 
- *	H Hermitian 
- *	Z Skew symmetric 
- *	R Rectangular 
- *
- * Third Character: 
- *	A Assembled 
- *	E Elemental matrices (unassembled) 
- *
- */
-
-    register int_t i, numer_lines, rhscrd = 0;
-    int_t tmp, colnum, colsize, rownum, rowsize, valnum, valsize;
-    char buf[100], type[4];
-
-    /* Line 1 */
-    fgets(buf, 100, fp);
-/*    fscanf(fp, "%72c", buf); buf[72] = 0;
-    if ( !iam ) printf("Title: %s", buf);
-    fscanf(fp, "%8c", key);  key[8] = 0;
-    if ( !iam ) printf("Key: %s\n", key);
-    zDumpLine(fp);*/
-
-    /* Line 2 */
-    for (i=0; i<5; i++) {
-	fscanf(fp, "%14c", buf); buf[14] = 0;
-	tmp = atoi(buf); /*sscanf(buf, "%d", &tmp);*/
-	if (i == 3) numer_lines = tmp;
-	if (i == 4 && tmp) rhscrd = tmp;
-    }
-    zDumpLine(fp);
-
-    /* Line 3 */
-    fscanf(fp, "%3c", type);
-    fscanf(fp, "%11c", buf); /* pad */
-    type[3] = 0;
-#if ( DEBUGlevel>=1 )
-    if ( !iam ) printf("Matrix type %s\n", type);
-#endif
-    
-    fscanf(fp, "%14c", buf); *nrow = atoi(buf);
-    fscanf(fp, "%14c", buf); *ncol = atoi(buf);
-    fscanf(fp, "%14c", buf); *nonz = atoi(buf);
-    fscanf(fp, "%14c", buf); tmp = atoi(buf);
-    
-    if (tmp != 0)
-	  if ( !iam ) printf("This is not an assembled matrix!\n");
-    if (*nrow != *ncol)
-	if ( !iam ) printf("Matrix is not square.\n");
-    zDumpLine(fp);
-
-    /* Allocate storage for the three arrays ( nzval, rowind, colptr ) */
-    zallocateA_dist(*ncol, *nonz, nzval, rowind, colptr);
-
-    /* Line 4: format statement */
-    fscanf(fp, "%16c", buf);
-    ParseIntFormat(buf, &colnum, &colsize);
-    fscanf(fp, "%16c", buf);
-    ParseIntFormat(buf, &rownum, &rowsize);
-    fscanf(fp, "%20c", buf);
-    ParseFloatFormat(buf, &valnum, &valsize);
-    fscanf(fp, "%20c", buf);
-    zDumpLine(fp);
-
-    /* Line 5: right-hand side */    
-    if ( rhscrd ) zDumpLine(fp); /* skip RHSFMT */
-    
-#if ( DEBUGlevel>=1 )
-    if ( !iam ) {
-	printf("%d rows, %d nonzeros\n", *nrow, *nonz);
-	printf("colnum %d, colsize %d\n", colnum, colsize);
-	printf("rownum %d, rowsize %d\n", rownum, rowsize);
-	printf("valnum %d, valsize %d\n", valnum, valsize);
-    }
-#endif
-    
-    ReadVector(fp, *ncol+1, *colptr, colnum, colsize);
-    ReadVector(fp, *nonz, *rowind, rownum, rowsize);
-    if ( numer_lines ) {
-        zReadValues(fp, *nonz, *nzval, valnum, valsize);
-    }
-    
-    fclose(fp);
-
-}
-
-/* Eat up the rest of the current line */
-static int zDumpLine(FILE *fp)
-{
-    register int c;
-    while ((c = fgetc(fp)) != '\n') ;
-    return 0;
-}
-
-static int ParseIntFormat(char *buf, int_t *num, int_t *size)
-{
-    char *tmp;
-
-    tmp = buf;
-    while (*tmp++ != '(') ;
-    *num = atoi(tmp);
-    while (*tmp != 'I' && *tmp != 'i') ++tmp;
-    ++tmp;
-    *size = atoi(tmp);
-    return 0;
-}
-
-static int ParseFloatFormat(char *buf, int_t *num, int_t *size)
-{
-    char *tmp, *period;
-    
-    tmp = buf;
-    while (*tmp++ != '(') ;
-    *num = atoi(tmp);
-    while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd'
-	   && *tmp != 'F' && *tmp != 'f') {
-       /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the
-           num picked up refers to P, which should be skipped. */
-        if (*tmp=='p' || *tmp=='P') {
-           ++tmp;
-           *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/
-        } else {
-           ++tmp;
-        }
-    }
-    ++tmp;
-    period = tmp;
-    while (*period != '.' && *period != ')') ++period ;
-    *period = '\0';
-    *size = atoi(tmp);
-    return 0;
-}
-
-static void
-ReadVector(FILE *fp, int_t n, int_t *where, int_t perline, int_t persize)
-{
-    register int_t i, j, item;
-    char tmp, buf[100];
-    
-    i = 0;
-    while (i < n) {
-	fgets(buf, 100, fp);    /* read a line at a time */
-	for (j=0; j<perline && i<n; j++) {
-	    tmp = buf[(j+1)*persize];     /* save the char at that place */
-	    buf[(j+1)*persize] = 0;       /* null terminate */
-	    item = atoi(&buf[j*persize]); 
-	    buf[(j+1)*persize] = tmp;     /* recover the char at that place */
-	    where[i++] = item - 1;
-	}
-    }
-}
-
-/* Read complex numbers as pairs of (real, imaginary) */
-static void zReadValues(FILE *fp, int_t n, doublecomplex *destination,
-			int_t perline, int_t persize)
-{
-    register int_t i, j, k, s, pair;
-    register double realpart;
-    char tmp, buf[100];
-    
-    i = pair = 0;
-    while (i < n) {
-	fgets(buf, 100, fp);    /* read a line at a time */
-	for (j=0; j<perline && i<n; j++) {
-	    tmp = buf[(j+1)*persize];     /* save the char at that place */
-	    buf[(j+1)*persize] = 0;       /* null terminate */
-	    s = j*persize;
-	    for (k = 0; k < persize; ++k) /* No D_ format in C */
-		if ( buf[s+k] == 'D' || buf[s+k] == 'd' ) buf[s+k] = 'E';
-	    if ( pair == 0 ) {
-	  	/* The value is real part */
-		realpart = atof(&buf[s]);
-		pair = 1;
-	    } else {
-		/* The value is imaginary part */
-	        destination[i].r = realpart;
-		destination[i++].i = atof(&buf[s]);
-		pair = 0;
-	    }
-	    buf[(j+1)*persize] = tmp;     /* recover the char at that place */
-	}
-    }
-
-}
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zsp_blas2.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zsp_blas2.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zsp_blas2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zsp_blas2.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,493 +0,0 @@
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-
-/*
- * File name:		sp_blas2.c
- * Purpose:		Sparse BLAS 2, using some dense BLAS 2 operations.
- */
-
-#include "superlu_zdefs.h"
-
-
-/* 
- * Function prototypes 
- */
-#ifndef USE_VENDOR_BLAS
-void zusolve(int, int, doublecomplex*, doublecomplex*);
-void zlsolve(int, int, doublecomplex*, doublecomplex*);
-void zmatvec(int, int, int, doublecomplex*, doublecomplex*, doublecomplex*);
-#endif
-
-
-int
-sp_ztrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L, 
-	      SuperMatrix *U, doublecomplex *x, int *info)
-{
-/*
- *   Purpose
- *   =======
- *
- *   sp_ztrsv() solves one of the systems of equations   
- *       A*x = b,   or   A'*x = b,
- *   where b and x are n element vectors and A is a sparse unit , or   
- *   non-unit, upper or lower triangular matrix.   
- *   No test for singularity or near-singularity is included in this   
- *   routine. Such tests must be performed before calling this routine.   
- *
- *   Parameters   
- *   ==========   
- *
- *   uplo   - (input) char*
- *            On entry, uplo specifies whether the matrix is an upper or   
- *             lower triangular matrix as follows:   
- *                uplo = 'U' or 'u'   A is an upper triangular matrix.   
- *                uplo = 'L' or 'l'   A is a lower triangular matrix.   
- *
- *   trans  - (input) char*
- *             On entry, trans specifies the equations to be solved as   
- *             follows:   
- *                trans = 'N' or 'n'   A*x = b.   
- *                trans = 'T' or 't'   A'*x = b.   
- *                trans = 'C' or 'c'   A'*x = b.   
- *
- *   diag   - (input) char*
- *             On entry, diag specifies whether or not A is unit   
- *             triangular as follows:   
- *                diag = 'U' or 'u'   A is assumed to be unit triangular.   
- *                diag = 'N' or 'n'   A is not assumed to be unit   
- *                                    triangular.   
- *	     
- *   L       - (input) SuperMatrix*
- *	       The factor L from the factorization Pr*A*Pc=L*U. Use
- *             compressed row subscripts storage for supernodes,
- *             i.e., L has types: Stype = SC, Dtype = Z, Mtype = TRLU.
- *
- *   U       - (input) SuperMatrix*
- *	        The factor U from the factorization Pr*A*Pc=L*U.
- *	        U has types: Stype = NC, Dtype = Z, Mtype = TRU.
- *    
- *   x       - (input/output) doublecomplex*
- *             Before entry, the incremented array X must contain the n   
- *             element right-hand side vector b. On exit, X is overwritten 
- *             with the solution vector x.
- *
- *   info    - (output) int*
- *             If *info = -i, the i-th argument had an illegal value.
- *
- */
-#ifdef _CRAY
-    _fcd ftcs1 = _cptofcd("L", strlen("L")),
-	 ftcs2 = _cptofcd("N", strlen("N")),
-	 ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-    SCformat *Lstore;
-    NCformat *Ustore;
-    doublecomplex   *Lval, *Uval;
-    int incx = 1, incy = 1;
-    doublecomplex alpha = {1.0, 0.0}, beta = {1.0, 0.0};
-    doublecomplex comp_zero = {0.0, 0.0};
-    int nrow;
-    int fsupc, nsupr, nsupc, luptr, istart, irow;
-    int i, k, iptr, jcol;
-    doublecomplex *work;
-    flops_t solve_ops;
-    /*extern SuperLUStat_t SuperLUStat;*/
-
-    /* Test the input parameters */
-    *info = 0;
-    if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1;
-    else if ( !lsame_(trans, "N") && !lsame_(trans, "T") ) *info = -2;
-    else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3;
-    else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4;
-    else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5;
-    if ( *info ) {
-	i = -(*info);
-	xerbla_("sp_ztrsv", &i);
-	return 0;
-    }
-
-    Lstore = L->Store;
-    Lval = Lstore->nzval;
-    Ustore = U->Store;
-    Uval = Ustore->nzval;
-    solve_ops = 0;
-
-    if ( !(work = doublecomplexCalloc_dist(L->nrow)) )
-	ABORT("Malloc fails for work in sp_ztrsv().");
-    
-    if ( lsame_(trans, "N") ) {	/* Form x := inv(A)*x. */
-	
-	if ( lsame_(uplo, "L") ) {
-	    /* Form x := inv(L)*x */
-    	    if ( L->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = 0; k <= Lstore->nsuper; k++) {
-		fsupc = L_FST_SUPC(k);
-		istart = L_SUB_START(fsupc);
-		nsupr = L_SUB_START(fsupc+1) - istart;
-		nsupc = L_FST_SUPC(k+1) - fsupc;
-		luptr = L_NZ_START(fsupc);
-		nrow = nsupr - nsupc;
-
-	        solve_ops += 4 * nsupc * (nsupc - 1);
-	        solve_ops += 8 * nrow * nsupc;
-
-		if ( nsupc == 1 ) {
-		    for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) {
-			irow = L_SUB(iptr);
-			++luptr;
-			zz_mult(&comp_zero, &x[fsupc], &Lval[luptr]);
-			z_sub(&x[irow], &x[irow], &comp_zero);
-		    }
-		} else {
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		    CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
-		       	&x[fsupc], &incx);
-		
-		    CGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
-		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
-#else
-		    ztrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr,
-		       	&x[fsupc], &incx, 1, 1, 1);
-		
-		    zgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
-		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy, 1);
-#endif		
-#else
-		    zlsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]);
-		
-		    zmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
-			&x[fsupc], &work[0] );
-#endif		
-		
-		    iptr = istart + nsupc;
-		    for (i = 0; i < nrow; ++i, ++iptr) {
-			irow = L_SUB(iptr);
-			z_sub(&x[irow], &x[irow], &work[i]); /* Scatter */
-			work[i] = comp_zero;
-
-		    }
-	 	}
-	    } /* for k ... */
-	    
-	} else {
-	    /* Form x := inv(U)*x */
-	    
-	    if ( U->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = Lstore->nsuper; k >= 0; k--) {
-	    	fsupc = L_FST_SUPC(k);
-	    	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
-	    	nsupc = L_FST_SUPC(k+1) - fsupc;
-	    	luptr = L_NZ_START(fsupc);
-		
-    	        solve_ops += 4 * nsupc * (nsupc + 1);
-
-		if ( nsupc == 1 ) {
-		    z_div(&x[fsupc], &x[fsupc], &Lval[luptr]);
-		    for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) {
-			irow = U_SUB(i);
-			zz_mult(&comp_zero, &x[fsupc], &Uval[i]);
-			z_sub(&x[irow], &x[irow], &comp_zero);
-		    }
-		} else {
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		    CTRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr,
-		       &x[fsupc], &incx);
-#else
-		    ztrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr,
-		       &x[fsupc], &incx, 1, 1, 1);
-#endif
-#else		
-		    zusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] );
-#endif		
-
-		    for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
-		        solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
-		    	for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); 
-				i++) {
-			    irow = U_SUB(i);
-			zz_mult(&comp_zero, &x[jcol], &Uval[i]);
-			z_sub(&x[irow], &x[irow], &comp_zero);
-		    	}
-                    }
-		}
-	    } /* for k ... */
-	    
-	}
-    } else { /* Form x := inv(A')*x */
-	
-	if ( lsame_(uplo, "L") ) {
-	    /* Form x := inv(L')*x */
-    	    if ( L->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = Lstore->nsuper; k >= 0; --k) {
-	    	fsupc = L_FST_SUPC(k);
-	    	istart = L_SUB_START(fsupc);
-	    	nsupr = L_SUB_START(fsupc+1) - istart;
-	    	nsupc = L_FST_SUPC(k+1) - fsupc;
-	    	luptr = L_NZ_START(fsupc);
-
-		solve_ops += 8 * (nsupr - nsupc) * nsupc;
-
-		for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
-		    iptr = istart + nsupc;
-		    for (i = L_NZ_START(jcol) + nsupc; 
-				i < L_NZ_START(jcol+1); i++) {
-			irow = L_SUB(iptr);
-			zz_mult(&comp_zero, &x[irow], &Lval[i]);
-		    	z_sub(&x[jcol], &x[jcol], &comp_zero);
-			iptr++;
-		    }
-		}
-		
-		if ( nsupc > 1 ) {
-		    solve_ops += 4 * nsupc * (nsupc - 1);
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-                    ftcs1 = _cptofcd("L", strlen("L"));
-                    ftcs2 = _cptofcd("T", strlen("T"));
-                    ftcs3 = _cptofcd("U", strlen("U"));
-		    CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
-			&x[fsupc], &incx);
-#else
-		    ztrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
-			&x[fsupc], &incx, 1, 1, 1);
-#endif
-#else
-		    ztrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
-			&x[fsupc], &incx);
-#endif
-		}
-	    }
-	} else {
-	    /* Form x := inv(U')*x */
-	    if ( U->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = 0; k <= Lstore->nsuper; k++) {
-	    	fsupc = L_FST_SUPC(k);
-	    	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
-	    	nsupc = L_FST_SUPC(k+1) - fsupc;
-	    	luptr = L_NZ_START(fsupc);
-
-		for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
-		    solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
-		    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
-			irow = U_SUB(i);
-			zz_mult(&comp_zero, &x[irow], &Uval[i]);
-		    	z_sub(&x[jcol], &x[jcol], &comp_zero);
-		    }
-		}
-
-		solve_ops += 4 * nsupc * (nsupc + 1);
-
-		if ( nsupc == 1 ) {
-		    z_div(&x[fsupc], &x[fsupc], &Lval[luptr]);
-		} else {
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-                    ftcs1 = _cptofcd("U", strlen("U"));
-                    ftcs2 = _cptofcd("T", strlen("T"));
-                    ftcs3 = _cptofcd("N", strlen("N"));
-		    CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
-			    &x[fsupc], &incx);
-#else
-		    ztrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
-			    &x[fsupc], &incx, 1, 1, 1);
-#endif
-#else
-		    ztrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
-			    &x[fsupc], &incx);
-#endif
-		}
-	    } /* for k ... */
-	}
-    }
-
-    /*SuperLUStat.ops[SOLVE] += solve_ops;*/
-    SUPERLU_FREE(work);
-    return 0;
-}
-
-
-
-int
-sp_zgemv_dist(char *trans, doublecomplex alpha, SuperMatrix *A,
-	      doublecomplex *x, int incx, doublecomplex beta,
-	      doublecomplex *y, int incy)
-{
-/*  Purpose   
-    =======   
-
-    sp_zgemv()  performs one of the matrix-vector operations   
-       y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   
-    where alpha and beta are scalars, x and y are vectors and A is a
-    sparse A->nrow by A->ncol matrix.   
-
-    Parameters   
-    ==========   
-
-    TRANS  - (input) char*
-             On entry, TRANS specifies the operation to be performed as   
-             follows:   
-                TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.   
-                TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.   
-                TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.   
-
-    ALPHA  - (input) doublecomplex
-             On entry, ALPHA specifies the scalar alpha.   
-
-    A      - (input) SuperMatrix*
-             Before entry, the leading m by n part of the array A must   
-             contain the matrix of coefficients.   
-
-    X      - (input) doublecomplex*, array of DIMENSION at least   
-             ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'   
-             and at least   
-             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.   
-             Before entry, the incremented array X must contain the   
-             vector x.   
-
-    INCX   - (input) int
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-
-    BETA   - (input) doublecomplex
-             On entry, BETA specifies the scalar beta. When BETA is   
-             supplied as zero then Y need not be set on input.   
-
-    Y      - (output) doublecomplex*,  array of DIMENSION at least   
-             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'   
-             and at least   
-             ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.   
-             Before entry with BETA non-zero, the incremented array Y   
-             must contain the vector y. On exit, Y is overwritten by the 
-             updated vector y.
-	     
-    INCY   - (input) int
-             On entry, INCY specifies the increment for the elements of   
-             Y. INCY must not be zero.   
-
-    ==== Sparse Level 2 Blas routine.   
-*/
-
-    /* Local variables */
-    NCformat *Astore;
-    doublecomplex   *Aval;
-    int info;
-    doublecomplex temp, temp1;
-    int lenx, leny, i, j, irow;
-    int iy, jx, jy, kx, ky;
-    int notran;
-    doublecomplex comp_zero = {0.0, 0.0};
-    doublecomplex comp_one = {1.0, 0.0};
-
-    notran = lsame_(trans, "N");
-    Astore = A->Store;
-    Aval = Astore->nzval;
-    
-    /* Test the input parameters */
-    info = 0;
-    if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1;
-    else if ( A->nrow < 0 || A->ncol < 0 ) info = 3;
-    else if (incx == 0) info = 5;
-    else if (incy == 0)	info = 8;
-    if (info != 0) {
-	xerbla_("sp_zgemv ", &info);
-	return 0;
-    }
-
-    /* Quick return if possible. */
-    if (A->nrow == 0 || A->ncol == 0 || 
-	z_eq(&alpha, &comp_zero) && 
-	z_eq(&beta, &comp_one))
-	return 0;
-
-
-    /* Set  LENX  and  LENY, the lengths of the vectors x and y, and set 
-       up the start points in  X  and  Y. */
-    if (lsame_(trans, "N")) {
-	lenx = A->ncol;
-	leny = A->nrow;
-    } else {
-	lenx = A->nrow;
-	leny = A->ncol;
-    }
-    if (incx > 0) kx = 0;
-    else kx =  - (lenx - 1) * incx;
-    if (incy > 0) ky = 0;
-    else ky =  - (leny - 1) * incy;
-
-    /* Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through A. */
-    /* First form  y := beta*y. */
-    if ( !z_eq(&beta, &comp_one) ) {
-	if (incy == 1) {
-	    if ( z_eq(&beta, &comp_zero) )
-		for (i = 0; i < leny; ++i) y[i] = comp_zero;
-	    else
-		for (i = 0; i < leny; ++i) 
-		  zz_mult(&y[i], &beta, &y[i]);
-	} else {
-	    iy = ky;
-	    if ( z_eq(&beta, &comp_zero) )
-		for (i = 0; i < leny; ++i) {
-		    y[iy] = comp_zero;
-		    iy += incy;
-		}
-	    else
-		for (i = 0; i < leny; ++i) {
-		    zz_mult(&y[iy], &beta, &y[iy]);
-		    iy += incy;
-		}
-	}
-    }
-    
-    if ( z_eq(&alpha, &comp_zero) ) return 0;
-
-    if ( notran ) {
-	/* Form  y := alpha*A*x + y. */
-	jx = kx;
-	if (incy == 1) {
-	    for (j = 0; j < A->ncol; ++j) {
-		if ( !z_eq(&x[jx], &comp_zero) ) {
-		    zz_mult(&temp, &alpha, &x[jx]);
-		    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-			irow = Astore->rowind[i];
-			zz_mult(&temp1, &temp,  &Aval[i]);
-			z_add(&y[irow], &y[irow], &temp1);
-		    }
-		}
-		jx += incx;
-	    }
-	} else {
-	    ABORT("Not implemented.");
-	}
-    } else {
-	/* Form  y := alpha*A'*x + y. */
-	jy = ky;
-	if (incx == 1) {
-	    for (j = 0; j < A->ncol; ++j) {
-		temp = comp_zero;
-		for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-		    irow = Astore->rowind[i];
-		    zz_mult(&temp1, &Aval[i], &x[irow]);
-		    z_add(&temp, &temp, &temp1);
-		}
-		zz_mult(&temp1, &alpha, &temp);
-		z_add(&y[jy], &y[jy], &temp1);
-		jy += incy;
-	    }
-	} else {
-	    ABORT("Not implemented.");
-	}
-    }
-    return 0;
-} /* sp_zgemv */
-
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zsp_blas3.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zsp_blas3.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zsp_blas3.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zsp_blas3.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,119 +0,0 @@
-/*
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */
-
-/*
- * File name:		sp_blas3.c
- * Purpose:		Sparse BLAS3, using some dense BLAS3 operations.
- */
-
-#include "superlu_zdefs.h"
-
-
-int
-sp_zgemm_dist(char *transa, char *transb, int m, int n, int k, 
-	      doublecomplex alpha, SuperMatrix *A, doublecomplex *b, int ldb, 
-	      doublecomplex beta, doublecomplex *c, int ldc)
-{
-/*  Purpose   
-    =======   
-
-    sp_z performs one of the matrix-matrix operations   
-
-       C := alpha*op( A )*op( B ) + beta*C,   
-
-    where  op( X ) is one of 
-
-       op( X ) = X   or   op( X ) = X'   or   op( X ) = conjg( X' ),
-
-    alpha and beta are scalars, and A, B and C are matrices, with op( A ) 
-    an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix. 
-  
-
-    Parameters   
-    ==========   
-
-    TRANSA - (input) char*
-             On entry, TRANSA specifies the form of op( A ) to be used in 
-             the matrix multiplication as follows:   
-                TRANSA = 'N' or 'n',  op( A ) = A.   
-                TRANSA = 'T' or 't',  op( A ) = A'.   
-                TRANSA = 'C' or 'c',  op( A ) = conjg( A' ).   
-             Unchanged on exit.   
-
-    TRANSB - (input) char*
-             On entry, TRANSB specifies the form of op( B ) to be used in 
-             the matrix multiplication as follows:   
-                TRANSB = 'N' or 'n',  op( B ) = B.   
-                TRANSB = 'T' or 't',  op( B ) = B'.   
-                TRANSB = 'C' or 'c',  op( B ) = conjg( B' ).   
-             Unchanged on exit.   
-
-    M      - (input) int   
-             On entry,  M  specifies  the number of rows of the matrix 
-	     op( A ) and of the matrix C.  M must be at least zero. 
-	     Unchanged on exit.   
-
-    N      - (input) int
-             On entry,  N specifies the number of columns of the matrix 
-	     op( B ) and the number of columns of the matrix C. N must be 
-	     at least zero.
-	     Unchanged on exit.   
-
-    K      - (input) int
-             On entry, K specifies the number of columns of the matrix 
-	     op( A ) and the number of rows of the matrix op( B ). K must 
-	     be at least  zero.   
-             Unchanged on exit.
-	     
-    ALPHA  - (input) doublecomplex
-             On entry, ALPHA specifies the scalar alpha.   
-
-    A      - (input) SuperMatrix*
-             Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
-             Currently, the type of A can be:
-                 Stype = NC or NCP; Dtype = Z; Mtype = GE. 
-             In the future, more general A can be handled.
-
-    B      - DOUBLE COMPLEX PRECISION array of DIMENSION ( LDB, kb ), where kb is 
-             n when TRANSB = 'N' or 'n',  and is  k otherwise.   
-             Before entry with  TRANSB = 'N' or 'n',  the leading k by n 
-             part of the array B must contain the matrix B, otherwise 
-             the leading n by k part of the array B must contain the 
-             matrix B.   
-             Unchanged on exit.   
-
-    LDB    - (input) int
-             On entry, LDB specifies the first dimension of B as declared 
-             in the calling (sub) program. LDB must be at least max( 1, n ).  
-             Unchanged on exit.   
-
-    BETA   - (input) doublecomplex
-             On entry, BETA specifies the scalar beta. When BETA is   
-             supplied as zero then C need not be set on input.   
-
-    C      - DOUBLE COMPLEX PRECISION array of DIMENSION ( LDC, n ).   
-             Before entry, the leading m by n part of the array C must 
-             contain the matrix C,  except when beta is zero, in which 
-             case C need not be set on entry.   
-             On exit, the array C is overwritten by the m by n matrix 
-	     ( alpha*op( A )*B + beta*C ).   
-
-    LDC    - (input) int
-             On entry, LDC specifies the first dimension of C as declared 
-             in the calling (sub)program. LDC must be at least max(1,m).   
-             Unchanged on exit.   
-
-    ==== Sparse Level 3 Blas routine.   
-*/
-    int    incx = 1, incy = 1;
-    int    j;
-
-    for (j = 0; j < n; ++j) {
-	sp_zgemv_dist(transa, alpha, A, &b[ldb*j], incx, beta, &c[ldc*j], incy);
-    }
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zutil.c hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zutil.c
--- hypre-2.11.2/src/FEI_mv/DSuperLU/SRC/zutil.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/DSuperLU/SRC/zutil.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,492 +0,0 @@
-
-/*
- * -- Distributed SuperLU routine (version 2.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * March 15, 2003
- *
- */
-
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include <math.h>
-#include "superlu_zdefs.h"
-
-void
-zCreate_CompCol_Matrix_dist(SuperMatrix *A, int_t m, int_t n, int_t nnz, 
-			    doublecomplex *nzval, int_t *rowind, int_t *colptr,
-			    Stype_t stype, Dtype_t dtype, Mtype_t mtype)
-{
-    NCformat *Astore;
-
-    A->Stype = stype;
-    A->Dtype = dtype;
-    A->Mtype = mtype;
-    A->nrow = m;
-    A->ncol = n;
-    A->Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) );
-    if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store");
-    Astore = (NCformat *) A->Store;
-    Astore->nnz = nnz;
-    Astore->nzval = nzval;
-    Astore->rowind = rowind;
-    Astore->colptr = colptr;
-}
-
-void
-zCreate_CompRowLoc_Matrix_dist(SuperMatrix *A, int_t m, int_t n,
-			       int_t nnz_loc, int_t m_loc, int_t fst_row,
-			       doublecomplex *nzval, int_t *colind, int_t *rowptr,
-			       Stype_t stype, Dtype_t dtype, Mtype_t mtype)
-{
-    NRformat_loc *Astore;
-
-    A->Stype = stype;
-    A->Dtype = dtype;
-    A->Mtype = mtype;
-    A->nrow = m;
-    A->ncol = n;
-    A->Store = (void *) SUPERLU_MALLOC( sizeof(NRformat_loc) );
-    if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store");
-    Astore = (NRformat_loc *) A->Store;
-    Astore->nnz_loc = nnz_loc;
-    Astore->fst_row = fst_row;
-    Astore->m_loc = m_loc;
-    Astore->nzval = nzval;
-    Astore->colind = colind;
-    Astore->rowptr = rowptr;
-}
-
-/*
- * Convert a row compressed storage into a column compressed storage.
- */
-void
-zCompRow_to_CompCol_dist(int_t m, int_t n, int_t nnz, 
-                         doublecomplex *a, int_t *colind, int_t *rowptr,
-                         doublecomplex **at, int_t **rowind, int_t **colptr)
-{
-    register int i, j, col, relpos;
-    int_t *marker;
-
-    /* Allocate storage for another copy of the matrix. */
-    *at = (doublecomplex *) doublecomplexMalloc_dist(nnz);
-    *rowind = intMalloc_dist(nnz);
-    *colptr = intMalloc_dist(n+1);
-    marker = intCalloc_dist(n);
-    
-    /* Get counts of each column of A, and set up column pointers */
-    for (i = 0; i < m; ++i)
-	for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]];
-    (*colptr)[0] = 0;
-    for (j = 0; j < n; ++j) {
-	(*colptr)[j+1] = (*colptr)[j] + marker[j];
-	marker[j] = (*colptr)[j];
-    }
-
-    /* Transfer the matrix into the compressed column storage. */
-    for (i = 0; i < m; ++i) {
-	for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
-	    col = colind[j];
-	    relpos = marker[col];
-	    (*rowind)[relpos] = i;
-	    (*at)[relpos] = a[j];
-	    ++marker[col];
-	}
-    }
-
-    SUPERLU_FREE(marker);
-}
-
-/* Copy matrix A into matrix B. */
-void
-zCopy_CompCol_Matrix_dist(SuperMatrix *A, SuperMatrix *B)
-{
-    NCformat *Astore, *Bstore;
-    int      ncol, nnz, i;
-
-    B->Stype = A->Stype;
-    B->Dtype = A->Dtype;
-    B->Mtype = A->Mtype;
-    B->nrow  = A->nrow;;
-    B->ncol  = ncol = A->ncol;
-    Astore   = (NCformat *) A->Store;
-    Bstore   = (NCformat *) B->Store;
-    Bstore->nnz = nnz = Astore->nnz;
-    for (i = 0; i < nnz; ++i)
-	((doublecomplex *)Bstore->nzval)[i] = ((doublecomplex *)Astore->nzval)[i];
-    for (i = 0; i < nnz; ++i) Bstore->rowind[i] = Astore->rowind[i];
-    for (i = 0; i <= ncol; ++i) Bstore->colptr[i] = Astore->colptr[i];
-}
-
-
-void zPrint_CompCol_Matrix_dist(SuperMatrix *A)
-{
-    NCformat     *Astore;
-    register int i;
-    doublecomplex       *dp;
-    
-    printf("\nCompCol matrix: ");
-    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
-    Astore = (NCformat *) A->Store;
-    printf("nrow %d, ncol %d, nnz %d\n", A->nrow,A->ncol,Astore->nnz);
-    if ( (dp = (doublecomplex *) Astore->nzval) != NULL ) {
-        printf("nzval:\n");
-        for (i = 0; i < Astore->nnz; ++i) printf("%f  ", dp[i]);
-    }
-    printf("\nrowind:\n");
-    for (i = 0; i < Astore->nnz; ++i) printf("%d  ", Astore->rowind[i]);
-    printf("\ncolptr:\n");
-    for (i = 0; i <= A->ncol; ++i) printf("%d  ", Astore->colptr[i]);
-    printf("\nend CompCol matrix.\n");
-}
-
-void zPrint_Dense_Matrix_dist(SuperMatrix *A)
-{
-    DNformat     *Astore;
-    register int i;
-    doublecomplex       *dp;
-    
-    printf("\nDense matrix: ");
-    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
-    Astore = (DNformat *) A->Store;
-    dp = (doublecomplex *) Astore->nzval;
-    printf("nrow %d, ncol %d, lda %d\n", A->nrow,A->ncol,Astore->lda);
-    printf("\nnzval: ");
-    for (i = 0; i < A->nrow; ++i) printf("%f  ", dp[i]);
-    printf("\nend Dense matrix.\n");
-}
-
-int zPrint_CompRowLoc_Matrix_dist(SuperMatrix *A)
-{
-    NRformat_loc     *Astore;
-    int_t i, nnz_loc, m_loc;
-    doublecomplex       *dp;
-    
-    printf("\n==== CompRowLoc matrix: ");
-    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
-    Astore = (NRformat_loc *) A->Store;
-    printf("nrow %d, ncol %d\n", A->nrow,A->ncol);
-    nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc;
-    printf("nnz_loc %d, m_loc %d, fst_row %d\n", nnz_loc, m_loc,
-	   Astore->fst_row);
-    PrintInt10("rowptr", m_loc+1, Astore->rowptr);
-    PrintInt10("colind", nnz_loc, Astore->colind);
-    if ( (dp = (doublecomplex *) Astore->nzval) != NULL )
-        PrintDoublecomplex("nzval", nnz_loc, dp);
-    printf("==== end CompRowLoc matrix\n");
-}
-
-int file_zPrint_CompRowLoc_Matrix_dist(FILE *fp, SuperMatrix *A)
-{
-    NRformat_loc     *Astore;
-    int_t i, nnz_loc, m_loc;
-    doublecomplex       *dp;
-    
-    fprintf(fp, "\n==== CompRowLoc matrix: ");
-    fprintf(fp, "Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
-    Astore = (NRformat_loc *) A->Store;
-    fprintf(fp, "nrow %d, ncol %d\n", A->nrow, A->ncol);
-    nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc;
-    fprintf(fp, "nnz_loc %d, m_loc %d, fst_row %d\n", nnz_loc, m_loc,
-	    Astore->fst_row);
-    file_PrintInt10(fp, "rowptr", m_loc+1, Astore->rowptr);
-    file_PrintInt10(fp, "colind", nnz_loc, Astore->colind);
-    if ( (dp = (doublecomplex *) Astore->nzval) != NULL )
-        file_PrintDoublecomplex(fp, "nzval", nnz_loc, dp);
-    fprintf(fp, "==== end CompRowLoc matrix\n");
-}
-
-void
-zCreate_Dense_Matrix_dist(SuperMatrix *X, int_t m, int_t n, doublecomplex *x,
-			  int_t ldx, Stype_t stype, Dtype_t dtype,
-			  Mtype_t mtype)
-{
-    DNformat    *Xstore;
-    
-    X->Stype = stype;
-    X->Dtype = dtype;
-    X->Mtype = mtype;
-    X->nrow = m;
-    X->ncol = n;
-    X->Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) );
-    if ( !(X->Store) ) ABORT("SUPERLU_MALLOC fails for X->Store");
-    Xstore = (DNformat *) X->Store;
-    Xstore->lda = ldx;
-    Xstore->nzval = (doublecomplex *) x;
-}
-
-void
-zCopy_Dense_Matrix_dist(int_t M, int_t N, doublecomplex *X, int_t ldx,
-			doublecomplex *Y, int_t ldy)
-{
-/*
- *  Purpose
- *  =======
- *
- *  Copies a two-dimensional matrix X to another matrix Y.
- */
-    int    i, j;
-    
-    for (j = 0; j < N; ++j)
-        for (i = 0; i < M; ++i)
-            Y[i + j*ldy] = X[i + j*ldx];
-}
-
-void
-zCreate_SuperNode_Matrix_dist(SuperMatrix *L, int_t m, int_t n, int_t nnz, 
-			      doublecomplex *nzval, int_t *nzval_colptr,
-			      int_t *rowind, int_t *rowind_colptr,
-			      int_t *col_to_sup, int_t *sup_to_col,
-			      Stype_t stype, Dtype_t dtype, Mtype_t mtype)
-{
-    SCformat *Lstore;
-
-    L->Stype = stype;
-    L->Dtype = dtype;
-    L->Mtype = mtype;
-    L->nrow = m;
-    L->ncol = n;
-    L->Store = (void *) SUPERLU_MALLOC( sizeof(SCformat) );
-    if ( !(L->Store) ) ABORT("SUPERLU_MALLOC fails for L->Store");
-    Lstore = L->Store;
-    Lstore->nnz = nnz;
-    Lstore->nsuper = col_to_sup[n];
-    Lstore->nzval = nzval;
-    Lstore->nzval_colptr = nzval_colptr;
-    Lstore->rowind = rowind;
-    Lstore->rowind_colptr = rowind_colptr;
-    Lstore->col_to_sup = col_to_sup;
-    Lstore->sup_to_col = sup_to_col;
-
-}
-
-void
-zGenXtrue_dist(int_t n, int_t nrhs, doublecomplex *x, int_t ldx)
-{
-    int  i, j;
-    for (j = 0; j < nrhs; ++j)
-	for (i = 0; i < n; ++i) {
-	    if ( i % 2 ) x[i + j*ldx].r = 1.0;
-	    else x[i + j*ldx].r = 2.0;
-	    x[i + j*ldx].i = 0.0;
-	}
-}
-
-/*
- * Let rhs[i] = sum of i-th row of A, so the solution vector is all 1's
- */
-void
-zFillRHS_dist(char *trans, int_t nrhs, doublecomplex *x, int_t ldx,
-	      SuperMatrix *A, doublecomplex *rhs, int_t ldb)
-{
-    doublecomplex one = {1.0, 0.0};
-    doublecomplex zero = {0.0, 0.0};
-
-    sp_zgemm_dist(trans, "N", A->nrow, nrhs, A->ncol, one, A,
-		  x, ldx, zero, rhs, ldb);
-
-}
-
-/* 
- * Fills a doublecomplex precision array with a given value.
- */
-void 
-zfill_dist(doublecomplex *a, int_t alen, doublecomplex dval)
-{
-    register int_t i;
-    for (i = 0; i < alen; i++) a[i] = dval;
-}
-
-
-
-/* 
- * Check the inf-norm of the error vector 
- */
-void zinf_norm_error_dist(int_t n, int_t nrhs, doublecomplex *x, int_t ldx,
-			  doublecomplex *xtrue, int_t ldxtrue,
-                          gridinfo_t *grid)
-{
-    double err, xnorm;
-    doublecomplex *x_work, *xtrue_work;
-    doublecomplex temp;
-    int i, j;
-
-    for (j = 0; j < nrhs; j++) {
-      x_work = &x[j*ldx];
-      xtrue_work = &xtrue[j*ldxtrue];
-      err = xnorm = 0.0;
-      for (i = 0; i < n; i++) {
-        z_sub(&temp, &x_work[i], &xtrue_work[i]);
-	err = SUPERLU_MAX(err, z_abs(&temp));
-	xnorm = SUPERLU_MAX(xnorm, z_abs(&x_work[i]));
-      }
-      err = err / xnorm;
-      printf("\tRHS %2d: ||X-Xtrue||/||X|| = %e\n", j, err);
-    }
-}
-
-void PrintDoublecomplex(char *name, int_t len, doublecomplex *x)
-{
-    register int_t i;
-    
-    printf("%10s:\tReal\tImag\n", name);
-    for (i = 0; i < len; ++i)
-	printf("\t%d\t%.4f\t%.4f\n", i, x[i].r, x[i].i);
-}
-
-int file_PrintDoublecomplex(FILE *fp, char *name, int_t len, doublecomplex *x)
-{
-    register int_t i;
-    
-    fprintf(fp, "%10s:\tReal\tImag\n", name);
-    for (i = 0; i < len; ++i)
-	fprintf(fp, "\t%d\t%.4f\t%.4f\n", i, x[i].r, x[i].i);
-}
-
-/* 
- * Print the blocks in the factored matrix L.
- */
-void zPrintLblocks(int_t iam, int_t nsupers, gridinfo_t *grid,
-		  Glu_persist_t *Glu_persist, LocalLU_t *Llu)
-{
-    register int_t c, extra, gb, j, lb, nsupc, nsupr, len, nb, ncb;
-    register int_t k, mycol, r;
-    int_t *xsup = Glu_persist->xsup;
-    int_t *index;
-    doublecomplex *nzval;
-
-    printf("\n(%d) L BLOCKS IN COLUMN-MAJOR ORDER -->\n", iam);
-    ncb = nsupers / grid->npcol;
-    extra = nsupers % grid->npcol;
-    mycol = MYCOL( iam, grid );
-    if ( mycol < extra ) ++ncb;
-    for (lb = 0; lb < ncb; ++lb) {
-	index = Llu->Lrowind_bc_ptr[lb];
-	if ( index ) { /* Not an empty column */
-	    nzval = Llu->Lnzval_bc_ptr[lb];
-	    nb = index[0];
-	    nsupr = index[1];
-	    gb = lb * grid->npcol + mycol;
-	    nsupc = SuperSize( gb );
-	    printf("(%d) block column %d (local), # row blocks %d\n",
-		   iam, lb, nb);
-	    for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) {
-		len = index[k+1];
-		printf("(%d) row-block %d: block # %d\tlength %d\n", 
-		       iam, c, index[k], len);
-		PrintInt10("lsub", len, &index[k+LB_DESCRIPTOR]);
-		for (j = 0; j < nsupc; ++j) {
-		    PrintDoublecomplex("nzval", len, &nzval[r + j*nsupr]);
-		}
-		k += LB_DESCRIPTOR + len;
-		r += len;
-	    }
-	}
-	printf("(%d)", iam);
- 	PrintInt10("ToSendR[]", grid->npcol, Llu->ToSendR[lb]);
-	PrintInt10("fsendx_plist[]", grid->nprow, Llu->fsendx_plist[lb]);
-    }
-    printf("nfrecvx %4d\n", Llu->nfrecvx);
-    k = CEILING( nsupers, grid->nprow );
-    PrintInt10("fmod", k, Llu->fmod);
-    
-} /* ZPRINTLBLOCKS */
-
-
-/* 
- * Print the blocks in the factored matrix U.
- */
-void zPrintUblocks(int_t iam, int_t nsupers, gridinfo_t *grid, 
-		  Glu_persist_t *Glu_persist, LocalLU_t *Llu)
-{
-    register int_t c, extra, jb, k, lb, len, nb, nrb, nsupc;
-    register int_t myrow, r;
-    int_t *xsup = Glu_persist->xsup;
-    int_t *index;
-    doublecomplex *nzval;
-
-    printf("\n(%d) U BLOCKS IN ROW-MAJOR ORDER -->\n", iam);
-    nrb = nsupers / grid->nprow;
-    extra = nsupers % grid->nprow;
-    myrow = MYROW( iam, grid );
-    if ( myrow < extra ) ++nrb;
-    for (lb = 0; lb < nrb; ++lb) {
-	index = Llu->Ufstnz_br_ptr[lb];
-	if ( index ) { /* Not an empty row */
-	    nzval = Llu->Unzval_br_ptr[lb];
-	    nb = index[0];
-	    printf("(%d) block row %d (local), # column blocks %d\n",
-		   iam, lb, nb);
-	    r  = 0;
-	    for (c = 0, k = BR_HEADER; c < nb; ++c) {
-		jb = index[k];
-		len = index[k+1];
-		printf("(%d) col-block %d: block # %d\tlength %d\n", 
-		       iam, c, jb, index[k+1]);
-		nsupc = SuperSize( jb );
-		PrintInt10("fstnz", nsupc, &index[k+UB_DESCRIPTOR]);
-		PrintDoublecomplex("nzval", len, &nzval[r]);
-		k += UB_DESCRIPTOR + nsupc;
-		r += len;
-	    }
-
-	    printf("(%d) ToSendD[] %d\n", iam, Llu->ToSendD[lb]);
-	}
-    }
-} /* ZPRINTUBLOCKS */
-
-int
-zprint_gsmv_comm(FILE *fp, int_t m_loc, pzgsmv_comm_t *gsmv_comm,
-                 gridinfo_t *grid)
-{
-  int_t procs = grid->nprow*grid->npcol;
-  fprintf(fp, "TotalIndSend %d\tTotalValSend %d\n", gsmv_comm->TotalIndSend,
-	  gsmv_comm->TotalValSend);
-  file_PrintInt10(fp, "extern_start", m_loc, gsmv_comm->extern_start);
-  file_PrintInt10(fp, "ind_tosend", gsmv_comm->TotalIndSend, gsmv_comm->ind_tosend);
-  file_PrintInt10(fp, "ind_torecv", gsmv_comm->TotalValSend, gsmv_comm->ind_torecv);
-  file_PrintInt10(fp, "ptr_ind_tosend", procs+1, gsmv_comm->ptr_ind_tosend);
-  file_PrintInt10(fp, "ptr_ind_torecv", procs+1, gsmv_comm->ptr_ind_torecv);
-  file_PrintInt10(fp, "SendCounts", procs, gsmv_comm->SendCounts);
-  file_PrintInt10(fp, "RecvCounts", procs, gsmv_comm->RecvCounts);
-}
-
-
-/* cg5.cua
-            b = A*x           y = L\b
-   0        1 + 4.0000i       1.0000 + 4.0000i
-   1        0 + 5.0000i	      1.3529 + 5.4118i
-   2        1 + 4.0000i	      1.0000 + 4.0000i
-   3        2 + 3.0000i	      2.0000 + 3.0000i
-   4        1 + 4.0000i	      3.5882 + 4.3529i
-   5        1 + 4.0000i	      4.1250 + 3.3202i
-   6          + 5.0000i	      4.4640 + 3.8632i
-   7        2 + 3.0000i	      2.0000 + 3.0000i
-   8        2 + 3.0000i	      2.0000 + 3.0000i
-   9        1 + 4.0000i	      1.0000 + 4.0000i
-  10        1 + 4.0000i	      3.5882 + 4.3529i
-  11          + 5.0000i	           0 + 5.0000i
-  12        1 + 4.0000i	      5.1793 + 4.6604i
-  13        2 + 3.0000i	      2.0000 + 3.0000i
-  14        1 + 4.0000i	      1.0000 + 4.0000i
-  15          + 5.0000i	      1.3529 + 5.4118i
-  16        1 + 4.0000i	      4.0045 + 3.8950i
-  17          + 5.0000i	      3.0338 + 4.6248i
-  18        1 + 4.0000i	      5.4495 + 2.2703i
-  19          + 5.0000i	      4.0980 + 3.7290i
-  20          + 5.0000i	      4.2680 + 3.7739i
-  21          + 5.0000i	      5.3514 + 2.9480i
-  22        1 + 4.0000i	      4.4178 + 2.0476i
-  23        1 + 4.0000i	      3.5615 + 2.8322i
-  24          + 5.0000i	      4.7526 + 2.2605i
-*/
diff -Nru hypre-2.11.2/src/FEI_mv/fei-hypre/FEI_HYPRE_Impl.cxx hypre-2.13.0/src/FEI_mv/fei-hypre/FEI_HYPRE_Impl.cxx
--- hypre-2.11.2/src/FEI_mv/fei-hypre/FEI_HYPRE_Impl.cxx	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/fei-hypre/FEI_HYPRE_Impl.cxx	2017-10-20 17:42:22.000000000 +0000
@@ -21,15 +21,6 @@
 #include <assert.h>
 #include <math.h>
 
-#if HAVE_SUPERLU_20
-#include "dsp_defs.h"
-#include "superlu_util.h"
-#endif
-#if HAVE_SUPERLU
-#include "slu_ddefs.h"
-#include "slu_util.h"
-#endif
-
 /*-------------------------------------------------------------------------
  MPI definitions 
  -------------------------------------------------------------------------*/
@@ -44,6 +35,15 @@
 
 #include "FEI_HYPRE_Impl.h"
 
+#ifdef HAVE_SUPERLU_20
+#include "dsp_defs.h"
+#include "superlu_util.h"
+#endif
+#ifdef HAVE_SUPERLU
+#include "slu_ddefs.h"
+#include "slu_util.h"
+#endif
+
 extern "C"
 {
   int HYPRE_LSI_Search(int *, int, int);
@@ -517,7 +517,7 @@
 {
    int  i, olevel;
    char param[256], param1[256];
-#if HAVE_SUPERLU
+#ifdef HAVE_SUPERLU
    int  nprocs;
 #endif
 
@@ -566,7 +566,7 @@
          else if ( ! strcmp(param, "gmres") )   solverID_ = 1;
          else if ( ! strcmp(param, "cgs") )     solverID_ = 2;
          else if ( ! strcmp(param, "bicgstab")) solverID_ = 3;
-#if HAVE_SUPERLU
+#ifdef HAVE_SUPERLU
          else if ( ! strcmp(param, "superlu") ) 
          {
             MPI_Comm_size( mpiComm_, &nprocs );
@@ -3338,13 +3338,14 @@
  -------------------------------------------------------------------------*/
 int FEI_HYPRE_Impl::solveUsingSuperLU()
 {
-#if HAVE_SUPERLU
+#ifdef HAVE_SUPERLU
    int    localNRows, localNnz, *countArray, irow, jcol, *cscIA, *cscJA;
    int    colNum, index, *etree, permcSpec, lwork, panelSize, relax, info;
    int    *permC, *permR;
-   double *cscAA, diagPivotThresh, dropTol, *rVec, rnorm;
+   double *cscAA, diagPivotThresh, *rVec, rnorm;
    superlu_options_t slu_options;
    SuperLUStat_t     slu_stat;
+   GlobalLU_t        Glu;
    trans_t           trans;
    SuperMatrix superLU_Amat;
    SuperMatrix superLU_Lmat;
@@ -3408,7 +3409,6 @@
    slu_options.SymmetricMode = NO;
    sp_preorder(&slu_options, &superLU_Amat, permC, etree, &AC);
    diagPivotThresh = 1.0;
-   dropTol = 0.0;
    panelSize = sp_ienv(1);
    relax = sp_ienv(2);
    StatInit(&slu_stat);
@@ -3417,9 +3417,12 @@
    slu_options.Fact = DOFACT;
    slu_options.DiagPivotThresh = diagPivotThresh;
 
-   dgstrf(&slu_options, &AC, dropTol, relax, panelSize,
+//   dgstrf(&slu_options, &AC, dropTol, relax, panelSize,
+//          etree, NULL, lwork, permC, permR, &superLU_Lmat,
+//          &superLU_Umat, &slu_stat, &info);
+   dgstrf(&slu_options, &AC, relax, panelSize,
           etree, NULL, lwork, permC, permR, &superLU_Lmat,
-          &superLU_Umat, &slu_stat, &info);
+          &superLU_Umat, &Glu, &slu_stat, &info);
 
    Destroy_CompCol_Permuted(&AC);
    Destroy_CompCol_Matrix(&superLU_Amat);
diff -Nru hypre-2.11.2/src/FEI_mv/fei-hypre/HYPRE_LSC_aux.cxx hypre-2.13.0/src/FEI_mv/fei-hypre/HYPRE_LSC_aux.cxx
--- hypre-2.11.2/src/FEI_mv/fei-hypre/HYPRE_LSC_aux.cxx	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/fei-hypre/HYPRE_LSC_aux.cxx	2017-10-20 17:42:22.000000000 +0000
@@ -1376,8 +1376,8 @@
          if ( HYPreconID_ == HYMLI )
             HYPRE_LSI_MLISetParams(HYPrecon_, params[i]); 
 #else
-         if ( (HYOutputLevel_ & HYFEI_SPECIALMASK) >= 2 && mypid_ == 0 )
-            printf("       HYPRE_LSC::MLI SetParams - MLI unavailable.\n");
+//         if ( (HYOutputLevel_ & HYFEI_SPECIALMASK) >= 2 && mypid_ == 0 )
+//            printf("       HYPRE_LSC::MLI SetParams - MLI unavailable.\n");
 #endif
       }
 
@@ -4615,11 +4615,12 @@
 
 double HYPRE_LinSysCore::solveUsingSuperLU(int& status)
 {
+  double             rnorm=-1.0;
 #ifdef HAVE_SUPERLU
    int                i, nnz, nrows, ierr;
    int                rowSize, *colInd, *new_ia, *new_ja, *ind_array;
    int                nz_ptr, *partition, start_row, end_row;
-   double             *colVal, *new_a, rnorm=-1.0;
+   double             *colVal, *new_a;
    HYPRE_ParCSRMatrix A_csr;
    HYPRE_ParVector    r_csr;
    HYPRE_ParVector    b_csr;
@@ -4793,12 +4794,13 @@
 
 double HYPRE_LinSysCore::solveUsingSuperLUX(int& status)
 {
+   double             rnorm=-1.0;
 #ifdef HAVE_SUPERLU
    int                i, nnz, nrows, ierr;
    int                rowSize, *colInd, *new_ia, *new_ja, *ind_array;
    int                nz_ptr;
    int                *partition, start_row, end_row;
-   double             *colVal, *new_a, rnorm=-1.0;
+   double             *colVal, *new_a;
    HYPRE_ParCSRMatrix A_csr;
    HYPRE_ParVector    r_csr;
    HYPRE_ParVector    b_csr;
@@ -4812,6 +4814,7 @@
    double             rpg, rcond;
    void               *work=NULL;
    char               equed[1];
+   GlobalLU_t         Glu;
    mem_usage_t        mem_usage;
    superlu_options_t  slu_options;
    SuperLUStat_t      slu_stat;
@@ -4901,7 +4904,7 @@
    slu_options.Equil        = YES;
    slu_options.Trans        = NOTRANS;
    slu_options.Fact         = DOFACT;
-   slu_options.IterRefine   = DOUBLE;
+   slu_options.IterRefine   = SLU_DOUBLE;
    slu_options.DiagPivotThresh = 1.0;
    slu_options.PivotGrowth = YES;
    slu_options.ConditionNumber = YES;
@@ -4917,9 +4920,12 @@
    // solve
    //-------------------------------------------------------------------
 
+//   dgssvx(&slu_options, &A2, perm_c, perm_r, etree,
+//          equed, R, C, &L, &U, work, lwork, &B, &X, 
+//          &rpg, &rcond, ferr, berr, &mem_usage, &slu_stat, &info);
    dgssvx(&slu_options, &A2, perm_c, perm_r, etree,
           equed, R, C, &L, &U, work, lwork, &B, &X, 
-          &rpg, &rcond, ferr, berr, &mem_usage, &slu_stat, &info);
+          &rpg, &rcond, ferr, berr, &Glu, &mem_usage, &slu_stat, &info);
 
    //-------------------------------------------------------------------
    // print SuperLU internal information at the first step
diff -Nru hypre-2.11.2/src/FEI_mv/fei-hypre/HYPRE_LSI_Dsuperlu.c hypre-2.13.0/src/FEI_mv/fei-hypre/HYPRE_LSI_Dsuperlu.c
--- hypre-2.11.2/src/FEI_mv/fei-hypre/HYPRE_LSI_Dsuperlu.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/fei-hypre/HYPRE_LSI_Dsuperlu.c	2017-10-20 17:42:22.000000000 +0000
@@ -33,13 +33,13 @@
 #include "dsuperlu_include.h"
 
 #ifdef HAVE_DSUPERLU
-#include "../DSuperLU/SRC/superlu_ddefs.h"
+#include "superlu_ddefs.h"
 
 typedef struct HYPRE_LSI_DSuperLU_Struct
 {
    MPI_Comm           comm_;
    HYPRE_ParCSRMatrix Amat_;
-   superlu_options_t  options_;
+   superlu_dist_options_t  options_;
    SuperMatrix        sluAmat_;
    ScalePermstruct_t  ScalePermstruct_;
    SuperLUStat_t      stat_;
@@ -186,8 +186,9 @@
    if (sluPtr->outputLevel_ < 2) sluPtr->options_.PrintStat = NO;
    ScalePermstructInit(sluPtr->globalNRows_, sluPtr->globalNRows_,
                        &(sluPtr->ScalePermstruct_));
-   LUstructInit(sluPtr->globalNRows_, sluPtr->globalNRows_, 
-                &(sluPtr->LUstruct_));
+//   LUstructInit(sluPtr->globalNRows_, sluPtr->globalNRows_, 
+//                &(sluPtr->LUstruct_));
+   LUstructInit(sluPtr->globalNRows_, &(sluPtr->LUstruct_));
    sluPtr->berr_[0] = 0.0;
    PStatInit(&(sluPtr->stat_));
    pdgssvx(&(sluPtr->options_), &(sluPtr->sluAmat_), 
diff -Nru hypre-2.11.2/src/FEI_mv/fei-hypre/LLNL_FEI_Solver.cxx hypre-2.13.0/src/FEI_mv/fei-hypre/LLNL_FEI_Solver.cxx
--- hypre-2.11.2/src/FEI_mv/fei-hypre/LLNL_FEI_Solver.cxx	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/fei-hypre/LLNL_FEI_Solver.cxx	2017-10-20 17:42:22.000000000 +0000
@@ -24,11 +24,11 @@
 #include "HYPRE.h"
 #include "LLNL_FEI_Solver.h"
 
-#if HAVE_SUPERLU_20
+#ifdef HAVE_SUPERLU_20
 #include "dsp_defs.h"
 #include "superlu_util.h"
 #endif
-#if HAVE_SUPERLU
+#ifdef HAVE_SUPERLU
 #include "slu_ddefs.h"
 #include "slu_util.h"
 #endif
@@ -77,7 +77,7 @@
 int LLNL_FEI_Solver::parameters(int numParams, char **paramString)
 {
    int  i, olevel;
-#if HAVE_SUPERLU
+#ifdef HAVE_SUPERLU
    int  nprocs;
 #endif
    char param[256], param1[256];
@@ -124,7 +124,7 @@
          else if ( !strcmp(param, "bicgstab")) solverID_ = 3;
          else if ( !strcmp(param, "superlu") ) 
          {
-#if HAVE_SUPERLU
+#ifdef HAVE_SUPERLU
             MPI_Comm_size( mpiComm_, &nprocs );
             if ( nprocs == 1 ) solverID_ = 4;
             else
@@ -1101,15 +1101,16 @@
  -------------------------------------------------------------------------*/
 int LLNL_FEI_Solver::solveUsingSuperLU()
 {
-#if HAVE_SUPERLU
+#ifdef HAVE_SUPERLU
    int    localNRows, localNnz, *countArray, irow, jcol, *cscIA, *cscJA;
    int    colNum, index, *etree, permcSpec, lwork, panelSize, relax, info;
    int    *permC, *permR, *diagIA, *diagJA;
-   double *cscAA, diagPivotThresh, dropTol, *rVec, rnorm;
+   double *cscAA, diagPivotThresh, *rVec, rnorm;
    double *diagAA;
    trans_t           trans;
    superlu_options_t slu_options;
    SuperLUStat_t     slu_stat;
+   GlobalLU_t        Glu;
    SuperMatrix superLU_Amat;
    SuperMatrix superLU_Lmat;
    SuperMatrix superLU_Umat;
@@ -1172,7 +1173,6 @@
    slu_options.SymmetricMode = NO;
    sp_preorder(&slu_options, &superLU_Amat, permC, etree, &AC);
    diagPivotThresh = 1.0;
-   dropTol = 0.0;
    panelSize = sp_ienv(1);
    relax = sp_ienv(2);
    StatInit(&slu_stat);
@@ -1180,9 +1180,12 @@
    slu_options.ColPerm = MY_PERMC;
    slu_options.DiagPivotThresh = diagPivotThresh;
 
-   dgstrf(&slu_options, &AC, dropTol, relax, panelSize,
+//   dgstrf(&slu_options, &AC, dropTol, relax, panelSize,
+//          etree, NULL, lwork, permC, permR, &superLU_Lmat,
+//          &superLU_Umat, &slu_stat, &info);
+   dgstrf(&slu_options, &AC, relax, panelSize,
           etree, NULL, lwork, permC, permR, &superLU_Lmat,
-          &superLU_Umat, &slu_stat, &info);
+          &superLU_Umat, &Glu, &slu_stat, &info);
 
    Destroy_CompCol_Permuted(&AC);
    Destroy_CompCol_Matrix(&superLU_Amat);
diff -Nru hypre-2.11.2/src/FEI_mv/fei-hypre/Makefile hypre-2.13.0/src/FEI_mv/fei-hypre/Makefile
--- hypre-2.11.2/src/FEI_mv/fei-hypre/Makefile	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/fei-hypre/Makefile	2017-10-20 17:42:22.000000000 +0000
@@ -10,18 +10,17 @@
 # $Revision$
 #EHEADER**********************************************************************
 
-
 include ../../config/Makefile.config
 
 BASE_DIR = ${HYPRE_FEI_BASE_DIR}
 
-SUPERLU_INCLUDE = -I$(srcdir)/../SuperLU/SRC
-SUPERLU_LIB     = -L$(srcdir)/../SuperLU
+#SUPERLU_INCLUDE = -I$(srcdir)/../SuperLU/SRC
+#SUPERLU_LIB     = -L$(srcdir)/../SuperLU
 
 CINCLUDES=${INCLUDES} ${MPIINCLUDE} -I../ml/src/Include
 CXXINCLUDES=${INCLUDES} ${MPIINCLUDE} -I../ml/src/Include
-CDEFS = -DHAVE_SUPERLU -DBOOL_NOT_SUPPORTED -DHAVE_MLI
-CXXDEFS = -DHAVE_SUPERLU -DBOOL_NOT_SUPPORTED -DMPICH_SKIP_MPICXX -DHAVE_MLI
+CDEFS = -DBOOL_NOT_SUPPORTED
+CXXDEFS = ${CDEFS} -DMPICH_SKIP_MPICXX
 
 C_COMPILE_FLAGS = \
  ${CDEFS}\
@@ -41,6 +40,7 @@
  -I$(srcdir)/../../distributed_ls\
  -I$(srcdir)/../fei-base\
  ${SUPERLU_INCLUDE}\
+ ${DSUPERLU_INCLUDE}\
  ${CINCLUDES}
 
 CXX_COMPILE_FLAGS = \
@@ -61,6 +61,7 @@
  -I$(srcdir)/../femli\
  -I$(srcdir)/../fei-base\
  ${SUPERLU_INCLUDE}\
+ ${DSUPERLU_INCLUDE}\
  ${CXXINCLUDES}
 
 HEADERS =\
diff -Nru hypre-2.11.2/src/FEI_mv/femli/Makefile hypre-2.13.0/src/FEI_mv/femli/Makefile
--- hypre-2.11.2/src/FEI_mv/femli/Makefile	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/femli/Makefile	2017-10-20 17:42:22.000000000 +0000
@@ -19,6 +19,7 @@
  -I.\
  -I../..\
  -I$(srcdir)\
+ -I$(srcdir)/../../blas\
  -I$(srcdir)/../../lapack\
  -I$(srcdir)/../../utilities\
  -I$(srcdir)/../../IJ_mv\
@@ -31,7 +32,7 @@
  -I$(srcdir)/../../distributed_ls\
  -I$(srcdir)/../../FEI_mv/fei-hypre\
  -I$(srcdir)/../../FEI_mv/femli\
- -I$(srcdir)/../../FEI_mv/SuperLU/SRC
+ ${SUPERLU_INCLUDE}
 
 C_COMPILE_FLAGS =\
  -DMLI_SUPERLU\
diff -Nru hypre-2.11.2/src/FEI_mv/femli/mli_solver_seqsuperlu.cxx hypre-2.13.0/src/FEI_mv/femli/mli_solver_seqsuperlu.cxx
--- hypre-2.11.2/src/FEI_mv/femli/mli_solver_seqsuperlu.cxx	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/femli/mli_solver_seqsuperlu.cxx	2017-10-20 17:42:22.000000000 +0000
@@ -110,12 +110,13 @@
    int      nrows, iP, startRow, nnz, *csrIA, *csrJA, *cscJA, *cscIA;
    int      irow, icol, *rowArray, *countArray, colNum, index, nSubRows;
    int      *etree, permcSpec, lwork, panelSize, relax, info, rowCnt;
-   double   *csrAA, *cscAA, dropTol;
+   double   *csrAA, *cscAA;
    hypre_ParCSRMatrix  *hypreA;
    hypre_CSRMatrix     *ADiag;
    SuperMatrix         AC, superLU_Amat;
    superlu_options_t   slu_options;
    SuperLUStat_t       slu_stat;
+   GlobalLU_t          Glu;
 
    /* ---------------------------------------------------------------
     * fetch matrix
@@ -281,14 +282,16 @@
          slu_options.Fact = DOFACT;
          slu_options.SymmetricMode = NO;
          sp_preorder(&slu_options, &superLU_Amat, permCs_[iP], etree, &AC);
-         dropTol = 0.0;
          panelSize = sp_ienv(1);
          relax = sp_ienv(2);
          StatInit(&slu_stat);
          lwork = 0;
-         dgstrf(&slu_options, &AC, dropTol, relax, panelSize,
+//         dgstrf(&slu_options, &AC, dropTol, relax, panelSize,
+//                etree,NULL,lwork,permCs_[iP],permRs_[iP],
+//                &(superLU_Lmats[iP]),&(superLU_Umats[iP]),&slu_stat,&info);
+         dgstrf(&slu_options, &AC, relax, panelSize,
                 etree,NULL,lwork,permCs_[iP],permRs_[iP],
-                &(superLU_Lmats[iP]),&(superLU_Umats[iP]),&slu_stat,&info);
+                &(superLU_Lmats[iP]),&(superLU_Umats[iP]),&Glu,&slu_stat,&info);
          Destroy_CompCol_Permuted(&AC);
          Destroy_CompCol_Matrix(&superLU_Amat);
          delete [] etree;
@@ -351,14 +354,16 @@
          slu_options.Fact = DOFACT;
          slu_options.SymmetricMode = NO;
          sp_preorder(&slu_options, &superLU_Amat, permCs_[iP], etree, &AC);
-         dropTol = 0.0;
          panelSize = sp_ienv(1);
          relax = sp_ienv(2);
          StatInit(&slu_stat);
          lwork = 0;
-         dgstrf(&slu_options, &AC, dropTol, relax, panelSize,
+//         dgstrf(&slu_options, &AC, dropTol, relax, panelSize,
+//                etree,NULL,lwork,permRs_[iP],permCs_[iP],&(superLU_Lmats[iP]),
+//                &(superLU_Umats[iP]),&slu_stat,&info);
+         dgstrf(&slu_options, &AC, relax, panelSize,
                 etree,NULL,lwork,permRs_[iP],permCs_[iP],&(superLU_Lmats[iP]),
-                &(superLU_Umats[iP]),&slu_stat,&info);
+                &(superLU_Umats[iP]),&Glu,&slu_stat,&info);
          Destroy_CompCol_Permuted(&AC);
          Destroy_CompCol_Matrix(&superLU_Amat);
          delete [] etree;
diff -Nru hypre-2.11.2/src/FEI_mv/femli/mli_solver_superlu.cxx hypre-2.13.0/src/FEI_mv/femli/mli_solver_superlu.cxx
--- hypre-2.11.2/src/FEI_mv/femli/mli_solver_superlu.cxx	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/femli/mli_solver_superlu.cxx	2017-10-20 17:42:22.000000000 +0000
@@ -59,12 +59,13 @@
    int      nnz, row_num, irow, i, j, rowSize, *cols, *recvCntArray;
    int      *dispArray, itemp, *cntArray, icol, colNum, index;
    int      *etree, permcSpec, lwork, panel_size, relax, info, mypid, nprocs;
-   double   *vals, *csrAA, *gcsrAA, *gcscAA, diagPivotThresh, dropTol;
+   double   *vals, *csrAA, *gcsrAA, *gcscAA, diagPivotThresh;
    MPI_Comm mpiComm;
    hypre_ParCSRMatrix *hypreA;
    SuperMatrix        AC;
    superlu_options_t  slu_options;
    SuperLUStat_t      slu_stat;
+   GlobalLU_t         Glu;
 
    /* ---------------------------------------------------------------
     * fetch matrix
@@ -232,7 +233,6 @@
    slu_options.SymmetricMode = NO;
    sp_preorder(&slu_options, &superLU_Amat, permC_, etree, &AC);
    diagPivotThresh = 1.0;
-   dropTol = 0.0;
    panel_size = sp_ienv(1);
    relax = sp_ienv(2);
    StatInit(&slu_stat);
@@ -240,9 +240,12 @@
    slu_options.ColPerm = MY_PERMC;
    slu_options.DiagPivotThresh = diagPivotThresh;
 
-   dgstrf(&slu_options, &AC, dropTol, relax, panel_size,
+//   dgstrf(&slu_options, &AC, dropTol, relax, panel_size,
+//          etree, NULL, lwork, permC_, permR_, &superLU_Lmat,
+//          &superLU_Umat, &slu_stat, &info);
+   dgstrf(&slu_options, &AC, relax, panel_size,
           etree, NULL, lwork, permC_, permR_, &superLU_Lmat,
-          &superLU_Umat, &slu_stat, &info);
+          &superLU_Umat, &Glu, &slu_stat, &info);
    Destroy_CompCol_Permuted(&AC);
    Destroy_CompCol_Matrix(&superLU_Amat);
    delete [] etree;
diff -Nru hypre-2.11.2/src/FEI_mv/femli/mli_utils.c hypre-2.13.0/src/FEI_mv/femli/mli_utils.c
--- hypre-2.11.2/src/FEI_mv/femli/mli_utils.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/femli/mli_utils.c	2017-10-20 17:42:22.000000000 +0000
@@ -27,9 +27,7 @@
 #include "mli_utils.h"
 #include "HYPRE_IJ_mv.h"
 #include "../fei-hypre/HYPRE_parcsr_fgmres.h"
-#ifdef HYPRE_USING_HYPRE_LAPACK
-#include "hypre_lapack.h"
-#endif
+#include "_hypre_lapack.h"
 
 /*--------------------------------------------------------------------------
  * external function 
@@ -1146,12 +1144,6 @@
 int MLI_Utils_SVD(double *uArray, double *sArray, double *vtArray, 
     double *workArray, int m, int n, int workLen)
 {
-    /* prototype */
-#if 0
-    void hypre_F90_NAME_LAPACK(dgesvd, DGESVD)(char *, char *, integer *, 
-        integer *, doublereal *, int *, doublereal *, doublereal *, integer *, 
-        doublereal *, integer *, doublereal *, integer *, integer *);
-#endif
 #ifndef MIN
 #define MIN(a,b) ((a)<(b)?(a):(b))
 #endif
@@ -1161,16 +1153,12 @@
     int info;
     info = -1;
 #else
-    extern int hypre_F90_NAME_LAPACK(dgesvd, DGESVD)(char *, char *, int *, 
-        int *, double *, int *, double *, double *, int *, 
-        double *, int *, double *, int *, int *);
-
     char jobu  = 'O'; /* overwrite input with U */
     char jobvt = 'S'; /* return rows of V in vtArray */
     int  dim = MIN(m,n);
     int  info;
 
-    hypre_F90_NAME_LAPACK(dgesvd, DGESVD)(&jobu, &jobvt, &m, &n, uArray,
+    hypre_dgesvd(&jobu, &jobvt, &m, &n, uArray,
         &m, sArray, (double *) NULL, &m, vtArray, &dim, workArray, 
         &workLen, &info);
 #endif
@@ -1189,17 +1177,13 @@
 #ifdef HYPRE_USING_ESSL
     info = -1;
 #else
-
     char jobu  = 'O'; /* overwrite input with U */
     char jobvt = 'N';
     double *sArray = (double *) malloc(n*sizeof(double));
     int workLen = 5*n;
     double *workArray = (double *) malloc(workLen*sizeof(double));
-    extern int hypre_F90_NAME_LAPACK(dgesvd, DGESVD)(char *, char *, int *,
-        int *, double *, int *, double *, double *, int *,
-        double *, int *, double *, int *, int *);
 
-    hypre_F90_NAME_LAPACK(dgesvd, DGESVD)(&jobu, &jobvt, &n, &n, uArray,
+    hypre_dgesvd(&jobu, &jobvt, &n, &n, uArray,
         &n, sArray, NULL, &n, NULL, &n, workArray, &workLen, &info);
 
     free(workArray);
diff -Nru hypre-2.11.2/src/FEI_mv/Makefile hypre-2.13.0/src/FEI_mv/Makefile
--- hypre-2.11.2/src/FEI_mv/Makefile	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/Makefile	2017-10-20 17:42:22.000000000 +0000
@@ -10,7 +10,6 @@
 # $Revision$
 #EHEADER**********************************************************************
 
-
 include ../config/Makefile.config
 
 all: 
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/CMakeLists.txt hypre-2.13.0/src/FEI_mv/SuperLU/CMakeLists.txt
--- hypre-2.11.2/src/FEI_mv/SuperLU/CMakeLists.txt	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/CMakeLists.txt	1970-01-01 00:00:00.000000000 +0000
@@ -1,59 +0,0 @@
-
-set(HYPRE_superlu_HEADERS
-  SRC/Cnames.h
-  SRC/colamd.h
-  SRC/old_colamd.h
-  SRC/slu_cdefs.h
-  SRC/slu_Cnames.h
-  SRC/slu_dcomplex.h
-  SRC/slu_ddefs.h
-  SRC/slu_scomplex.h
-  SRC/slu_sdefs.h
-  SRC/slu_util.h
-  SRC/slu_zdefs.h
-  SRC/supermatrix.h
-)
-
-set(HYPRE_superlu_SRCS 
-  SRC/colamd.c 
-  SRC/dcolumn_bmod.c 
-  SRC/dcolumn_dfs.c 
-  SRC/dcopy_to_ucol.c 
-  SRC/dgscon.c 
-  SRC/dgsequ.c 
-  SRC/dgsrfs.c 
-  SRC/dgssv.c 
-  SRC/dgssvx.c 
-  SRC/dgstrf.c 
-  SRC/dgstrs.c 
-  SRC/dlacon.c 
-  SRC/dlangs.c 
-  SRC/dlaqgs.c 
-  SRC/dmemory.c 
-  SRC/dpanel_bmod.c 
-  SRC/dpanel_dfs.c 
-  SRC/dpivotgrowth.c 
-  SRC/dpivotL.c 
-  SRC/dpruneL.c 
-  SRC/dreadhb.c 
-  SRC/dsnode_bmod.c 
-  SRC/dsnode_dfs.c 
-  SRC/dsp_blas2.c 
-  SRC/dsp_blas3.c 
-  SRC/dutil.c 
-  SRC/get_perm_c.c 
-  SRC/heap_relax_snode.c 
-  SRC/memory.c 
-  SRC/mmd.c 
-  SRC/relax_snode.c 
-  SRC/sp_coletree.c 
-  SRC/sp_ienv.c 
-  SRC/sp_preorder.c 
-  SRC/superlu_timer.c 
-  SRC/slu_util.c
-)
-
-add_definitions(-DNO_TIMER)
-add_library(HYPRE_superlu OBJECT ${HYPRE_superlu_SRCS})
-
-install (FILES ${HYPRE_superlu_HEADERS} DESTINATION include)
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/Makefile hypre-2.13.0/src/FEI_mv/SuperLU/Makefile
--- hypre-2.11.2/src/FEI_mv/SuperLU/Makefile	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/Makefile	1970-01-01 00:00:00.000000000 +0000
@@ -1,73 +0,0 @@
-############################################################################
-#
-#  Program:         SuperLU
-#
-#  Module:          Makefile
-#
-#  Purpose:         Top-level Makefile
-#
-#  Creation date:   October 2, 1995
-#
-#  Modified:        February 4, 1997  Version 1.0
-#		    November 15, 1997 Version 1.1
-#		    September 1, 1999 Version 2.0
-#		    October 15, 2003 Version 3.0
-#
-############################################################################
-
-############################
-# for hypre
-############################
-all:
-	( cd SRC; $(MAKE) all)
-
-install:
-	( cd SRC; $(MAKE) install)
-
-clean:
-	( cd SRC; $(MAKE) clean)
-
-distclean: clean
-
-############################
-
-#include make.inc
-
-#all: install lib testing
-
-#lib: superlulib tmglib
-
-#clean: cleanlib cleantesting
-
-#install:
-#	( cd INSTALL; $(MAKE) )
-#	( cd INSTALL; cp lsame.c ../SRC/; \
-#	  cp dlamch.c ../SRC/; cp slamch.c ../SRC/ )
-
-blaslib:
-	( cd CBLAS; $(MAKE) )
-
-superlulib:
-	( cd SRC; $(MAKE) )
-
-tmglib:
-	( cd TESTING/MATGEN; $(MAKE) )
-
-matlabmex:
-	( cd MATLAB; $(MAKE) )
-
-testing:
-	( cd TESTING ; $(MAKE) )
-
-cleanlib:
-	( cd SRC; $(MAKE) clean )
-	( cd TESTING/MATGEN; $(MAKE) clean )
-	( cd CBLAS; $(MAKE) clean )
-
-cleantesting:
-	( cd INSTALL; $(MAKE) clean )
-	( cd TESTING; $(MAKE) clean )
-	( cd MATLAB;  $(MAKE) clean )
-	( cd EXAMPLE; $(MAKE) clean )
-	( cd FORTRAN; $(MAKE) clean )
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/Makefile.superlu hypre-2.13.0/src/FEI_mv/SuperLU/Makefile.superlu
--- hypre-2.11.2/src/FEI_mv/SuperLU/Makefile.superlu	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/Makefile.superlu	1970-01-01 00:00:00.000000000 +0000
@@ -1,56 +0,0 @@
-############################################################################
-#
-#  Program:         SuperLU
-#
-#  Module:          Makefile
-#
-#  Purpose:         Top-level Makefile
-#
-#  Creation date:   October 2, 1995
-#
-#  Modified:        February 4, 1997  Version 1.0
-#		    November 15, 1997 Version 1.1
-#		    September 1, 1999 Version 2.0
-#		    October 15, 2003 Version 3.0
-#
-############################################################################
-
-include make.inc
-
-all: install lib testing
-
-lib: superlulib tmglib
-
-clean: cleanlib cleantesting
-
-install:
-	( cd INSTALL; $(MAKE) )
-#	( cd INSTALL; cp lsame.c ../SRC/; \
-#	  cp dlamch.c ../SRC/; cp slamch.c ../SRC/ )
-
-blaslib:
-	( cd CBLAS; $(MAKE) )
-
-superlulib:
-	( cd SRC; $(MAKE) )
-
-tmglib:
-	( cd TESTING/MATGEN; $(MAKE) )
-
-matlabmex:
-	( cd MATLAB; $(MAKE) )
-
-testing:
-	( cd TESTING ; $(MAKE) )
-
-cleanlib:
-	( cd SRC; $(MAKE) clean )
-	( cd TESTING/MATGEN; $(MAKE) clean )
-	( cd CBLAS; $(MAKE) clean )
-
-cleantesting:
-	( cd INSTALL; $(MAKE) clean )
-	( cd TESTING; $(MAKE) clean )
-	( cd MATLAB;  $(MAKE) clean )
-	( cd EXAMPLE; $(MAKE) clean )
-	( cd FORTRAN; $(MAKE) clean )
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/make.inc hypre-2.13.0/src/FEI_mv/SuperLU/make.inc
--- hypre-2.11.2/src/FEI_mv/SuperLU/make.inc	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/make.inc	1970-01-01 00:00:00.000000000 +0000
@@ -1,46 +0,0 @@
-############################################################################
-#
-#  Program:         SuperLU
-#
-#  Module:          make.inc
-#
-#  Purpose:         Top-level Definitions
-#
-#  Creation date:   October 2, 1995
-#
-#  Modified:	    February 4, 1997  Version 1.0
-#		    November 15, 1997 Version 1.1
-#		    September 1, 1999 Version 2.0
-#
-############################################################################
-#
-#
-#  The name of the libraries to be created/linked to
-#
-TMGLIB       = libtmglib.a
-SUPERLULIB   = libsuperlu_3.0.a
-BLASLIB      = ../libblas.a
-
-#
-#  The archiver and the flag(s) to use when building archive (library)
-#  If your system has no ranlib, set RANLIB = echo.
-#
-ARCH         = ar
-ARCHFLAGS    = cr
-RANLIB       = ranlib
-
-CC           = cc
-CFLAGS       = -xO3 -xcg92
-FORTRAN	     = f77
-FFLAGS       = -O
-LOADER       = cc
-LOADOPTS     = -xO3
-
-#
-#  C preprocessor defs for compilation (-DNoChange, -DAdd_, or -DUpCase)
-#
-CDEFS        = -DAdd_
-#
-# The directory in which Matlab is installed
-#
-MATLAB	     = /usr/sww/pkg/matlab
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/README hypre-2.13.0/src/FEI_mv/SuperLU/README
--- hypre-2.11.2/src/FEI_mv/SuperLU/README	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/README	1970-01-01 00:00:00.000000000 +0000
@@ -1,157 +0,0 @@
-		SuperLU (Version 3.0)
-		=====================
-
-Copyright (c) 2003, The Regents of the University of California, through
-Lawrence Berkeley National Laboratory (subject to receipt of any required 
-approvals from U.S. Dept. of Energy) 
-
-All rights reserved. 
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met: 
-
-(1) Redistributions of source code must retain the above copyright notice,
-this list of conditions and the following disclaimer. 
-(2) Redistributions in binary form must reproduce the above copyright notice,
-this list of conditions and the following disclaimer in the documentation
-and/or other materials provided with the distribution. 
-(3) Neither the name of Lawrence Berkeley National Laboratory, U.S. Dept. of
-Energy nor the names of its contributors may be used to endorse or promote
-products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-  
-
-SuperLU contains a set of subroutines to solve a sparse linear system 
-A*X=B. It uses Gaussian elimination with partial pivoting (GEPP). 
-The columns of A may be preordered before factorization; the 
-preordering for sparsity is completely separate from the factorization.
-
-SuperLU is implemented in ANSI C, and must be compiled with standard 
-ANSI C compilers. It provides functionality for both real and complex
-matrices, in both single and double precision. The file names for the 
-single-precision real version start with letter "s" (such as sgstrf.c);
-the file names for the double-precision real version start with letter "d"
-(such as dgstrf.c); the file names for the single-precision complex
-version start with letter "c" (such as cgstrf.c); the file names
-for the double-precision complex version start with letter "z" 
-(such as zgstrf.c).
-
-
-SuperLU contains the following directory structure:
-
-    SuperLU/README    instructions on installation
-    SuperLU/CBLAS/    needed BLAS routines in C, not necessarily fast
-    SuperLU/EXAMPLE/  example programs
-    SuperLU/FORTRAN/  Fortran interface
-    SuperLU/INSTALL/  test machine dependent parameters; the Users' Guide.
-    SuperLU/MAKE_INC/ sample machine-specific make.inc files
-    SuperLU/MATLAB/   Matlab mex-file interface
-    SuperLU/SRC/      C source code, to be compiled into the superlu.a library
-    SuperLU/TESTING/  driver routines to test correctness
-    SuperLU/Makefile  top level Makefile that does installation and testing
-    SuperLU/make.inc  compiler, compile flags, library definitions and C
-                      preprocessor definitions, included in all Makefiles.
-                      (You may need to edit it to be suitable for your system
-                       before compiling the whole package.)
-
-
-Before installing the package, please examine the three things dependent 
-on your system setup:
-
-1. Edit the make.inc include file.
-   This make include file is referenced inside each of the Makefiles
-   in the various subdirectories. As a result, there is no need to 
-   edit the Makefiles in the subdirectories. All information that is
-   machine specific has been defined in this include file. 
-
-   Example machine-specific make.inc include files are provided 
-   in the MAKE_INC/ directory for several systems, such as 
-   IBM RS/6000, DEC Alpha, SunOS 4.x, SunOS 5.x (Solaris), HP-PA and
-   SGI Iris 4.x.  When you have selected the machine to which you wish 
-   to install SuperLU, copy the appropriate sample include file (if one 
-   is present) into make.inc. For example, if you wish to run 
-   SuperLU on an IBM RS/6000, you can do
-
-       cp MAKE_INC/make.rs6k make.inc
-   
-   For the systems other than listed above, slight modifications to the 
-   make.inc file will need to be made.
-   
-2. The BLAS library.
-   If there is BLAS library available on your machine, you may define
-   the following in the file SuperLU/make.inc:
-        BLASDEF = -DUSE_VENDOR_BLAS
-        BLASLIB = <BLAS library you wish to link with>
-
-   The CBLAS/ subdirectory contains the part of the C BLAS needed by 
-   SuperLU package. However, these codes are intended for use only if there 
-   is no faster implementation of the BLAS already available on your machine.
-   In this case, you should do the following:
-
-    1) In SuperLU/make.inc, undefine (comment out) BLASDEF, and define:
-          BLASLIB = ../blas$(PLAT).a
-
-    2) Go to the SuperLU/ directory, type:
-          make blaslib
-       to make the BLAS library from the routines in the CBLAS/ subdirectory.
-
-3. C preprocessor definition CDEFS.
-   In the header file SRC/slu_Cnames.h, we use macros to determine how
-   C routines should be named so that they are callable by Fortran.
-   (Some vendor-supplied BLAS libraries do not have C interface. So the 
-    re-naming is needed in order for the SuperLU BLAS calls (in C) to 
-    interface with the Fortran-style BLAS.)
-   The possible options for CDEFS are:
-
-       o -DAdd_: Fortran expects a C routine to have an underscore
-		 postfixed to the name;
-       o -DNoChange: Fortran expects a C routine name to be identical to
-		     that compiled by C;
-       o -DUpCase: Fortran expects a C routine name to be all uppercase.
-   
-4. The Matlab MEX-file interface.
-   The MATLAB/ subdirectory includes Matlab C MEX-files, so that 
-   our factor and solve routines can be called as alternatives to those
-   built into Matlab. In the file SuperLU/make.inc, define MATLAB to be the 
-   directory in which Matlab is installed on your system, for example:
-
-       MATLAB = /usr/local/matlab
-
-   At the SuperLU/ directory, type "make matlabmex" to build the MEX-file
-   interface. After you have built the interface, you may go to the MATLAB/ 
-   directory to test the correctness by typing (in Matlab):
-       trysuperlu
-       trylusolve
-
-A Makefile is provided in each subdirectory. The installation can be done
-completely automatically by simply typing "make" at the top level.
-The test results are in the files below:
-       INSTALL/install.out
-       TESTING/stest.out
-       TESTING/dtest.out
-       TESTING/ctest.out
-       TESTING/ztest.out
-
-
-
------------------
-| RELEASE NOTES |
------------------
-* Version 3.0, 10-15-03
-   - add "options" and "stat" argument for the driver routines
-     DGSSV/DGSSVX. This interface is more user-friendly and flexible.
-   - add more examples in EXAMPLE/
-   - add a "symmetric mode" with better performance when the matrix is
-     symmetric, or diagonal dominant, or positive definite, or nearly so.
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ccolumn_bmod.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ccolumn_bmod.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ccolumn_bmod.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ccolumn_bmod.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,361 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "slu_cdefs.h"
-
-/* 
- * Function prototypes 
- */
-void cusolve(int, int, complex*, complex*);
-void clsolve(int, int, complex*, complex*);
-void cmatvec(int, int, int, complex*, complex*, complex*);
-
-
-
-/* Return value:   0 - successful return
- *               > 0 - number of bytes allocated when run out of space
- */
-int
-ccolumn_bmod (
-	     const int  jcol,	  /* in */
-	     const int  nseg,	  /* in */
-	     complex     *dense,	  /* in */
-	     complex     *tempv,	  /* working array */
-	     int        *segrep,  /* in */
-	     int        *repfnz,  /* in */
-	     int        fpanelc,  /* in -- first column in the current panel */
-	     GlobalLU_t *Glu,     /* modified */
-	     SuperLUStat_t *stat  /* output */
-	     )
-{
-/*
- * Purpose:
- * ========
- *    Performs numeric block updates (sup-col) in topological order.
- *    It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
- *    Special processing on the supernodal portion of L\U[*,j]
- *
- */
-#ifdef _CRAY
-    _fcd ftcs1 = _cptofcd("L", strlen("L")),
-         ftcs2 = _cptofcd("N", strlen("N")),
-         ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-    int         incx = 1, incy = 1;
-    complex      alpha, beta;
-    
-    /* krep = representative of current k-th supernode
-     * fsupc = first supernodal column
-     * nsupc = no of columns in supernode
-     * nsupr = no of rows in supernode (used as leading dimension)
-     * luptr = location of supernodal LU-block in storage
-     * kfnz = first nonz in the k-th supernodal segment
-     * no_zeros = no of leading zeros in a supernodal U-segment
-     */
-    complex       ukj, ukj1, ukj2;
-    int          luptr, luptr1, luptr2;
-    int          fsupc, nsupc, nsupr, segsze;
-    int          nrow;	  /* No of rows in the matrix of matrix-vector */
-    int          jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno;
-    register int lptr, kfnz, isub, irow, i;
-    register int no_zeros, new_next; 
-    int          ufirst, nextlu;
-    int          fst_col; /* First column within small LU update */
-    int          d_fsupc; /* Distance between the first column of the current
-			     panel and the first column of the current snode. */
-    int          *xsup, *supno;
-    int          *lsub, *xlsub;
-    complex       *lusup;
-    int          *xlusup;
-    int          nzlumax;
-    complex       *tempv1;
-    complex      zero = {0.0, 0.0};
-    complex      one = {1.0, 0.0};
-    complex      none = {-1.0, 0.0};
-    complex	 comp_temp, comp_temp1;
-    int          mem_error;
-    flops_t      *ops = stat->ops;
-
-    xsup    = Glu->xsup;
-    supno   = Glu->supno;
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    lusup   = Glu->lusup;
-    xlusup  = Glu->xlusup;
-    nzlumax = Glu->nzlumax;
-    jcolp1 = jcol + 1;
-    jsupno = supno[jcol];
-    
-    /* 
-     * For each nonz supernode segment of U[*,j] in topological order 
-     */
-    k = nseg - 1;
-    for (ksub = 0; ksub < nseg; ksub++) {
-
-	krep = segrep[k];
-	k--;
-	ksupno = supno[krep];
-	if ( jsupno != ksupno ) { /* Outside the rectangular supernode */
-
-	    fsupc = xsup[ksupno];
-	    fst_col = SUPERLU_MAX ( fsupc, fpanelc );
-
-  	    /* Distance from the current supernode to the current panel; 
-	       d_fsupc=0 if fsupc > fpanelc. */
-  	    d_fsupc = fst_col - fsupc; 
-
-	    luptr = xlusup[fst_col] + d_fsupc;
-	    lptr = xlsub[fsupc] + d_fsupc;
-
-	    kfnz = repfnz[krep];
-	    kfnz = SUPERLU_MAX ( kfnz, fpanelc );
-
-	    segsze = krep - kfnz + 1;
-	    nsupc = krep - fst_col + 1;
-	    nsupr = xlsub[fsupc+1] - xlsub[fsupc];	/* Leading dimension */
-	    nrow = nsupr - d_fsupc - nsupc;
-	    krep_ind = lptr + nsupc - 1;
-
-
-
-
-	    /* 
-	     * Case 1: Update U-segment of size 1 -- col-col update 
-	     */
-	    if ( segsze == 1 ) {
-	  	ukj = dense[lsub[krep_ind]];
-		luptr += nsupr*(nsupc-1) + nsupc;
-
-		for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-		    irow = lsub[i];
-		    cc_mult(&comp_temp, &ukj, &lusup[luptr]);
-		    c_sub(&dense[irow], &dense[irow], &comp_temp);
-		    luptr++;
-		}
-
-	    } else if ( segsze <= 3 ) {
-		ukj = dense[lsub[krep_ind]];
-		luptr += nsupr*(nsupc-1) + nsupc-1;
-		ukj1 = dense[lsub[krep_ind - 1]];
-		luptr1 = luptr - nsupr;
-
-		if ( segsze == 2 ) { /* Case 2: 2cols-col update */
-		    cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
-		    c_sub(&ukj, &ukj, &comp_temp);
-		    dense[lsub[krep_ind]] = ukj;
-		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-		    	irow = lsub[i];
-		    	luptr++;
-		    	luptr1++;
-			cc_mult(&comp_temp, &ukj, &lusup[luptr]);
-			cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
-			c_add(&comp_temp, &comp_temp, &comp_temp1);
-			c_sub(&dense[irow], &dense[irow], &comp_temp);
-		    }
-		} else { /* Case 3: 3cols-col update */
-		    ukj2 = dense[lsub[krep_ind - 2]];
-		    luptr2 = luptr1 - nsupr;
-  		    cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
-		    c_sub(&ukj1, &ukj1, &comp_temp);
-
-		    cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
-		    cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
-		    c_add(&comp_temp, &comp_temp, &comp_temp1);
-		    c_sub(&ukj, &ukj, &comp_temp);
-
-		    dense[lsub[krep_ind]] = ukj;
-		    dense[lsub[krep_ind-1]] = ukj1;
-		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-		    	irow = lsub[i];
-		    	luptr++;
-		    	luptr1++;
-			luptr2++;
-			cc_mult(&comp_temp, &ukj, &lusup[luptr]);
-			cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
-			c_add(&comp_temp, &comp_temp, &comp_temp1);
-			cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
-			c_add(&comp_temp, &comp_temp, &comp_temp1);
-			c_sub(&dense[irow], &dense[irow], &comp_temp);
-		    }
-		}
-
-
-	    } else {
-	  	/*
-		 * Case: sup-col update
-		 * Perform a triangular solve and block update,
-		 * then scatter the result of sup-col update to dense
-		 */
-
-		no_zeros = kfnz - fst_col;
-
-	        /* Copy U[*,j] segment from dense[*] to tempv[*] */
-	        isub = lptr + no_zeros;
-	        for (i = 0; i < segsze; i++) {
-	  	    irow = lsub[isub];
-		    tempv[i] = dense[irow];
-		    ++isub; 
-	        }
-
-	        /* Dense triangular solve -- start effective triangle */
-		luptr += nsupr * no_zeros + no_zeros; 
-		
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
-		       &nsupr, tempv, &incx );
-#else		
-		ctrsv_( "L", "N", "U", &segsze, &lusup[luptr], 
-		       &nsupr, tempv, &incx );
-#endif		
- 		luptr += segsze;  /* Dense matrix-vector */
-		tempv1 = &tempv[segsze];
-                alpha = one;
-                beta = zero;
-#ifdef _CRAY
-		CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
-		       &nsupr, tempv, &incx, &beta, tempv1, &incy );
-#else
-		cgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], 
-		       &nsupr, tempv, &incx, &beta, tempv1, &incy );
-#endif
-#else
-		clsolve ( nsupr, segsze, &lusup[luptr], tempv );
-
- 		luptr += segsze;  /* Dense matrix-vector */
-		tempv1 = &tempv[segsze];
-		cmatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1);
-#endif
-		
-		
-                /* Scatter tempv[] into SPA dense[] as a temporary storage */
-                isub = lptr + no_zeros;
-                for (i = 0; i < segsze; i++) {
-                    irow = lsub[isub];
-                    dense[irow] = tempv[i];
-                    tempv[i] = zero;
-                    ++isub;
-                }
-
-		/* Scatter tempv1[] into SPA dense[] */
-		for (i = 0; i < nrow; i++) {
-		    irow = lsub[isub];
-		    c_sub(&dense[irow], &dense[irow], &tempv1[i]);
-		    tempv1[i] = zero;
-		    ++isub;
-		}
-	    }
-	    
-	} /* if jsupno ... */
-
-    } /* for each segment... */
-
-    /*
-     *	Process the supernodal portion of L\U[*,j]
-     */
-    nextlu = xlusup[jcol];
-    fsupc = xsup[jsupno];
-
-    /* Copy the SPA dense into L\U[*,j] */
-    new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc];
-    while ( new_next > nzlumax ) {
-	if (mem_error = cLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu))
-	    return (mem_error);
-	lusup = Glu->lusup;
-	lsub = Glu->lsub;
-    }
-
-    for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) {
-  	irow = lsub[isub];
-	lusup[nextlu] = dense[irow];
-        dense[irow] = zero;
-	++nextlu;
-    }
-
-    xlusup[jcolp1] = nextlu;	/* Close L\U[*,jcol] */
-
-    /* For more updates within the panel (also within the current supernode), 
-     * should start from the first column of the panel, or the first column 
-     * of the supernode, whichever is bigger. There are 2 cases:
-     *    1) fsupc < fpanelc, then fst_col := fpanelc
-     *    2) fsupc >= fpanelc, then fst_col := fsupc
-     */
-    fst_col = SUPERLU_MAX ( fsupc, fpanelc );
-
-    if ( fst_col < jcol ) {
-
-  	/* Distance between the current supernode and the current panel.
-	   d_fsupc=0 if fsupc >= fpanelc. */
-  	d_fsupc = fst_col - fsupc;
-
-	lptr = xlsub[fsupc] + d_fsupc;
-	luptr = xlusup[fst_col] + d_fsupc;
-	nsupr = xlsub[fsupc+1] - xlsub[fsupc];	/* Leading dimension */
-	nsupc = jcol - fst_col;	/* Excluding jcol */
-	nrow = nsupr - d_fsupc - nsupc;
-
-	/* Points to the beginning of jcol in snode L\U(jsupno) */
-	ufirst = xlusup[jcol] + d_fsupc;	
-
-	ops[TRSV] += 4 * nsupc * (nsupc - 1);
-	ops[GEMV] += 8 * nrow * nsupc;
-	
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-	CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], 
-	       &nsupr, &lusup[ufirst], &incx );
-#else
-	ctrsv_( "L", "N", "U", &nsupc, &lusup[luptr], 
-	       &nsupr, &lusup[ufirst], &incx );
-#endif
-	
-	alpha = none; beta = one; /* y := beta*y + alpha*A*x */
-
-#ifdef _CRAY
-	CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
-	       &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
-#else
-	cgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
-	       &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
-#endif
-#else
-	clsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );
-
-	cmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc],
-		&lusup[ufirst], tempv );
-	
-        /* Copy updates from tempv[*] into lusup[*] */
-	isub = ufirst + nsupc;
-	for (i = 0; i < nrow; i++) {
-	    c_sub(&lusup[isub], &lusup[isub], &tempv[i]);
-	    tempv[i] = zero;
-	    ++isub;
-	}
-
-#endif
-	
-	
-    } /* if fst_col < jcol ... */ 
-
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ccolumn_dfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ccolumn_dfs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ccolumn_dfs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ccolumn_dfs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,269 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_cdefs.h"
-
-/* What type of supernodes we want */
-#define T2_SUPER
-
-int
-ccolumn_dfs(
-	   const int  m,         /* in - number of rows in the matrix */
-	   const int  jcol,      /* in */
-	   int        *perm_r,   /* in */
-	   int        *nseg,     /* modified - with new segments appended */
-	   int        *lsub_col, /* in - defines the RHS vector to start the dfs */
-	   int        *segrep,   /* modified - with new segments appended */
-	   int        *repfnz,   /* modified */
-	   int        *xprune,   /* modified */
-	   int        *marker,   /* modified */
-	   int        *parent,	 /* working array */
-	   int        *xplore,   /* working array */
-	   GlobalLU_t *Glu       /* modified */
-	   )
-{
-/* 
- * Purpose
- * =======
- *   "column_dfs" performs a symbolic factorization on column jcol, and
- *   decide the supernode boundary.
- *
- *   This routine does not use numeric values, but only use the RHS 
- *   row indices to start the dfs.
- *
- *   A supernode representative is the last column of a supernode.
- *   The nonzeros in U[*,j] are segments that end at supernodal
- *   representatives. The routine returns a list of such supernodal 
- *   representatives in topological order of the dfs that generates them.
- *   The location of the first nonzero in each such supernodal segment
- *   (supernodal entry location) is also returned.
- *
- * Local parameters
- * ================
- *   nseg: no of segments in current U[*,j]
- *   jsuper: jsuper=EMPTY if column j does not belong to the same
- *	supernode as j-1. Otherwise, jsuper=nsuper.
- *
- *   marker2: A-row --> A-row/col (0/1)
- *   repfnz: SuperA-col --> PA-row
- *   parent: SuperA-col --> SuperA-col
- *   xplore: SuperA-col --> index to L-structure
- *
- * Return value
- * ============
- *     0  success;
- *   > 0  number of bytes allocated when run out of space.
- *
- */
-    int     jcolp1, jcolm1, jsuper, nsuper, nextl;
-    int     k, krep, krow, kmark, kperm;
-    int     *marker2;           /* Used for small panel LU */
-    int	    fsupc;		/* First column of a snode */
-    int     myfnz;		/* First nonz column of a U-segment */
-    int	    chperm, chmark, chrep, kchild;
-    int     xdfs, maxdfs, kpar, oldrep;
-    int     jptr, jm1ptr;
-    int     ito, ifrom, istop;	/* Used to compress row subscripts */
-    int     mem_error;
-    int     *xsup, *supno, *lsub, *xlsub;
-    int     nzlmax;
-    static  int  first = 1, maxsuper;
-    
-    xsup    = Glu->xsup;
-    supno   = Glu->supno;
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    nzlmax  = Glu->nzlmax;
-
-    if ( first ) {
-	maxsuper = sp_ienv(3);
-	first = 0;
-    }
-    jcolp1  = jcol + 1;
-    jcolm1  = jcol - 1;
-    nsuper  = supno[jcol];
-    jsuper  = nsuper;
-    nextl   = xlsub[jcol];
-    marker2 = &marker[2*m];
-
-
-    /* For each nonzero in A[*,jcol] do dfs */
-    for (k = 0; lsub_col[k] != EMPTY; k++) {
-
-	krow = lsub_col[k];
-    	lsub_col[k] = EMPTY;
-	kmark = marker2[krow];    	
-
-	/* krow was visited before, go to the next nonz */
-        if ( kmark == jcol ) continue; 
-
-	/* For each unmarked nbr krow of jcol
-	 *	krow is in L: place it in structure of L[*,jcol]
-	 */
-	marker2[krow] = jcol;
-	kperm = perm_r[krow];
-
-   	if ( kperm == EMPTY ) {
-	    lsub[nextl++] = krow; 	/* krow is indexed into A */
-	    if ( nextl >= nzlmax ) {
-		if ( mem_error = cLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) )
-		    return (mem_error);
-		lsub = Glu->lsub;
-	    }
-            if ( kmark != jcolm1 ) jsuper = EMPTY;/* Row index subset testing */
-  	} else {
-	    /*	krow is in U: if its supernode-rep krep
-	     *	has been explored, update repfnz[*]
-	     */
-	    krep = xsup[supno[kperm]+1] - 1;
-	    myfnz = repfnz[krep];
-
-	    if ( myfnz != EMPTY ) {	/* Visited before */
-	    	if ( myfnz > kperm ) repfnz[krep] = kperm;
-		/* continue; */
-	    }
-	    else {
-		/* Otherwise, perform dfs starting at krep */
-		oldrep = EMPTY;
-	 	parent[krep] = oldrep;
-	  	repfnz[krep] = kperm;
-		xdfs = xlsub[krep];
-	  	maxdfs = xprune[krep];
-
-		do {
-		    /* 
-		     * For each unmarked kchild of krep 
-		     */
-		    while ( xdfs < maxdfs ) {
-
-		   	kchild = lsub[xdfs];
-			xdfs++;
-		  	chmark = marker2[kchild];
-
-		   	if ( chmark != jcol ) { /* Not reached yet */
-		   	    marker2[kchild] = jcol;
-		   	    chperm = perm_r[kchild];
-
-		   	    /* Case kchild is in L: place it in L[*,k] */
-		   	    if ( chperm == EMPTY ) {
-			    	lsub[nextl++] = kchild;
-				if ( nextl >= nzlmax ) {
-				    if ( mem_error =
-					 cLUMemXpand(jcol,nextl,LSUB,&nzlmax,Glu) )
-					return (mem_error);
-				    lsub = Glu->lsub;
-				}
-				if ( chmark != jcolm1 ) jsuper = EMPTY;
-			    } else {
-		    	    	/* Case kchild is in U: 
-				 *   chrep = its supernode-rep. If its rep has 
-			         *   been explored, update its repfnz[*]
-			         */
-		   	    	chrep = xsup[supno[chperm]+1] - 1;
-		   		myfnz = repfnz[chrep];
-		   		if ( myfnz != EMPTY ) { /* Visited before */
-				    if ( myfnz > chperm )
-     				  	repfnz[chrep] = chperm;
-				} else {
-		        	    /* Continue dfs at super-rep of kchild */
-		   		    xplore[krep] = xdfs;	
-		   		    oldrep = krep;
-		   		    krep = chrep; /* Go deeper down G(L^t) */
-				    parent[krep] = oldrep;
-		    		    repfnz[krep] = chperm;
-		   		    xdfs = xlsub[krep];     
-				    maxdfs = xprune[krep];
-				} /* else */
-
-			   } /* else */
-
-			} /* if */
-
-		    } /* while */
-
-		    /* krow has no more unexplored nbrs;
-	   	     *    place supernode-rep krep in postorder DFS.
-	   	     *    backtrack dfs to its parent
-		     */
-		    segrep[*nseg] = krep;
-		    ++(*nseg);
-		    kpar = parent[krep]; /* Pop from stack, mimic recursion */
-		    if ( kpar == EMPTY ) break; /* dfs done */
-		    krep = kpar;
-		    xdfs = xplore[krep];
-		    maxdfs = xprune[krep];
-
-		} while ( kpar != EMPTY ); 	/* Until empty stack */
-
-	    } /* else */
-
-	} /* else */
-
-    } /* for each nonzero ... */
-
-    /* Check to see if j belongs in the same supernode as j-1 */
-    if ( jcol == 0 ) { /* Do nothing for column 0 */
-	nsuper = supno[0] = 0;
-    } else {
-   	fsupc = xsup[nsuper];
-	jptr = xlsub[jcol];	/* Not compressed yet */
-	jm1ptr = xlsub[jcolm1];
-
-#ifdef T2_SUPER
-	if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = EMPTY;
-#endif
-	/* Make sure the number of columns in a supernode doesn't
-	   exceed threshold. */
-	if ( jcol - fsupc >= maxsuper ) jsuper = EMPTY;
-
-	/* If jcol starts a new supernode, reclaim storage space in
-	 * lsub from the previous supernode. Note we only store
-	 * the subscript set of the first and last columns of
-   	 * a supernode. (first for num values, last for pruning)
-	 */
-	if ( jsuper == EMPTY ) {	/* starts a new supernode */
-	    if ( (fsupc < jcolm1-1) ) {	/* >= 3 columns in nsuper */
-#ifdef CHK_COMPRESS
-		printf("  Compress lsub[] at super %d-%d\n", fsupc, jcolm1);
-#endif
-	        ito = xlsub[fsupc+1];
-		xlsub[jcolm1] = ito;
-		istop = ito + jptr - jm1ptr;
-		xprune[jcolm1] = istop; /* Initialize xprune[jcol-1] */
-		xlsub[jcol] = istop;
-		for (ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito)
-		    lsub[ito] = lsub[ifrom];
-		nextl = ito;            /* = istop + length(jcol) */
-	    }
-	    nsuper++;
-	    supno[jcol] = nsuper;
-	} /* if a new supernode */
-
-    }	/* else: jcol > 0 */ 
-    
-    /* Tidy up the pointers before exit */
-    xsup[nsuper+1] = jcolp1;
-    supno[jcolp1]  = nsuper;
-    xprune[jcol]   = nextl;	/* Initialize upper bound for pruning */
-    xlsub[jcolp1]  = nextl;
-
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ccopy_to_ucol.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ccopy_to_ucol.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ccopy_to_ucol.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ccopy_to_ucol.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,103 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_cdefs.h"
-
-int
-ccopy_to_ucol(
-	      int        jcol,	  /* in */
-	      int        nseg,	  /* in */
-	      int        *segrep,  /* in */
-	      int        *repfnz,  /* in */
-	      int        *perm_r,  /* in */
-	      complex     *dense,   /* modified - reset to zero on return */
-	      GlobalLU_t *Glu      /* modified */
-	      )
-{
-/* 
- * Gather from SPA dense[*] to global ucol[*].
- */
-    int ksub, krep, ksupno;
-    int i, k, kfnz, segsze;
-    int fsupc, isub, irow;
-    int jsupno, nextu;
-    int new_next, mem_error;
-    int       *xsup, *supno;
-    int       *lsub, *xlsub;
-    complex    *ucol;
-    int       *usub, *xusub;
-    int       nzumax;
-
-    complex zero = {0.0, 0.0};
-
-    xsup    = Glu->xsup;
-    supno   = Glu->supno;
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    ucol    = Glu->ucol;
-    usub    = Glu->usub;
-    xusub   = Glu->xusub;
-    nzumax  = Glu->nzumax;
-    
-    jsupno = supno[jcol];
-    nextu  = xusub[jcol];
-    k = nseg - 1;
-    for (ksub = 0; ksub < nseg; ksub++) {
-	krep = segrep[k--];
-	ksupno = supno[krep];
-
-	if ( ksupno != jsupno ) { /* Should go into ucol[] */
-	    kfnz = repfnz[krep];
-	    if ( kfnz != EMPTY ) {	/* Nonzero U-segment */
-
-	    	fsupc = xsup[ksupno];
-	        isub = xlsub[fsupc] + kfnz - fsupc;
-	        segsze = krep - kfnz + 1;
-
-		new_next = nextu + segsze;
-		while ( new_next > nzumax ) {
-		    if (mem_error = cLUMemXpand(jcol, nextu, UCOL, &nzumax, Glu))
-			return (mem_error);
-		    ucol = Glu->ucol;
-		    if (mem_error = cLUMemXpand(jcol, nextu, USUB, &nzumax, Glu))
-			return (mem_error);
-		    usub = Glu->usub;
-		    lsub = Glu->lsub;
-		}
-		
-		for (i = 0; i < segsze; i++) {
-		    irow = lsub[isub];
-		    usub[nextu] = perm_r[irow];
-		    ucol[nextu] = dense[irow];
-		    dense[irow] = zero;
-		    nextu++;
-		    isub++;
-		} 
-
-	    }
-
-	}
-
-    } /* for each segment... */
-
-    xusub[jcol + 1] = nextu;      /* Close U[*,jcol] */
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgscon.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgscon.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgscon.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgscon.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,143 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
- * File name:	cgscon.c
- * History:     Modified from lapack routines CGECON.
- */
-#include <math.h>
-#include "slu_cdefs.h"
-
-void
-cgscon(char *norm, SuperMatrix *L, SuperMatrix *U,
-       float anorm, float *rcond, SuperLUStat_t *stat, int *info)
-{
-/*
-    Purpose   
-    =======   
-
-    CGSCON estimates the reciprocal of the condition number of a general 
-    real matrix A, in either the 1-norm or the infinity-norm, using   
-    the LU factorization computed by CGETRF.   
-
-    An estimate is obtained for norm(inv(A)), and the reciprocal of the   
-    condition number is computed as   
-       RCOND = 1 / ( norm(A) * norm(inv(A)) ).   
-
-    See supermatrix.h for the definition of 'SuperMatrix' structure.
- 
-    Arguments   
-    =========   
-
-    NORM    (input) char*
-            Specifies whether the 1-norm condition number or the   
-            infinity-norm condition number is required:   
-            = '1' or 'O':  1-norm;   
-            = 'I':         Infinity-norm.
-	    
-    L       (input) SuperMatrix*
-            The factor L from the factorization Pr*A*Pc=L*U as computed by
-            cgstrf(). Use compressed row subscripts storage for supernodes,
-            i.e., L has types: Stype = SLU_SC, Dtype = SLU_C, Mtype = SLU_TRLU.
- 
-    U       (input) SuperMatrix*
-            The factor U from the factorization Pr*A*Pc=L*U as computed by
-            cgstrf(). Use column-wise storage scheme, i.e., U has types:
-            Stype = SLU_NC, Dtype = SLU_C, Mtype = TRU.
-	    
-    ANORM   (input) float
-            If NORM = '1' or 'O', the 1-norm of the original matrix A.   
-            If NORM = 'I', the infinity-norm of the original matrix A.
-	    
-    RCOND   (output) float*
-            The reciprocal of the condition number of the matrix A,   
-            computed as RCOND = 1/(norm(A) * norm(inv(A))).
-	    
-    INFO    (output) int*
-            = 0:  successful exit   
-            < 0:  if INFO = -i, the i-th argument had an illegal value   
-
-    ===================================================================== 
-*/
-
-    /* Local variables */
-    int    kase, kase1, onenrm, i;
-    float ainvnm;
-    complex *work;
-    extern int crscl_(int *, complex *, complex *, int *);
-
-    extern int clacon_(int *, complex *, complex *, float *, int *);
-
-    
-    /* Test the input parameters. */
-    *info = 0;
-    onenrm = *(unsigned char *)norm == '1' || lsame_(norm, "O");
-    if (! onenrm && ! lsame_(norm, "I")) *info = -1;
-    else if (L->nrow < 0 || L->nrow != L->ncol ||
-             L->Stype != SLU_SC || L->Dtype != SLU_C || L->Mtype != SLU_TRLU)
-	 *info = -2;
-    else if (U->nrow < 0 || U->nrow != U->ncol ||
-             U->Stype != SLU_NC || U->Dtype != SLU_C || U->Mtype != SLU_TRU) 
-	*info = -3;
-    if (*info != 0) {
-	i = -(*info);
-	xerbla_("cgscon", &i);
-	return;
-    }
-
-    /* Quick return if possible */
-    *rcond = 0.;
-    if ( L->nrow == 0 || U->nrow == 0) {
-	*rcond = 1.;
-	return;
-    }
-
-    work = complexCalloc( 3*L->nrow );
-
-
-    if ( !work )
-	ABORT("Malloc fails for work arrays in cgscon.");
-    
-    /* Estimate the norm of inv(A). */
-    ainvnm = 0.;
-    if ( onenrm ) kase1 = 1;
-    else kase1 = 2;
-    kase = 0;
-
-    do {
-	clacon_(&L->nrow, &work[L->nrow], &work[0], &ainvnm, &kase);
-
-	if (kase == 0) break;
-
-	if (kase == kase1) {
-	    /* Multiply by inv(L). */
-	    sp_ctrsv("L", "No trans", "Unit", L, U, &work[0], stat, info);
-
-	    /* Multiply by inv(U). */
-	    sp_ctrsv("U", "No trans", "Non-unit", L, U, &work[0], stat, info);
-	    
-	} else {
-
-	    /* Multiply by inv(U'). */
-	    sp_ctrsv("U", "Transpose", "Non-unit", L, U, &work[0], stat, info);
-
-	    /* Multiply by inv(L'). */
-	    sp_ctrsv("L", "Transpose", "Unit", L, U, &work[0], stat, info);
-	    
-	}
-
-    } while ( kase != 0 );
-
-    /* Compute the estimate of the reciprocal condition number. */
-    if (ainvnm != 0.) *rcond = (1. / ainvnm) / anorm;
-
-    SUPERLU_FREE (work);
-    return;
-
-} /* cgscon */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgsequ.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgsequ.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgsequ.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgsequ.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,184 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * File name:	cgsequ.c
- * History:     Modified from LAPACK routine CGEEQU
- */
-#include <math.h>
-#include "slu_cdefs.h"
-
-void
-cgsequ(SuperMatrix *A, float *r, float *c, float *rowcnd,
-	float *colcnd, float *amax, int *info)
-{
-/*    
-    Purpose   
-    =======   
-
-    CGSEQU computes row and column scalings intended to equilibrate an   
-    M-by-N sparse matrix A and reduce its condition number. R returns the row
-    scale factors and C the column scale factors, chosen to try to make   
-    the largest element in each row and column of the matrix B with   
-    elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.   
-
-    R(i) and C(j) are restricted to be between SMLNUM = smallest safe   
-    number and BIGNUM = largest safe number.  Use of these scaling   
-    factors is not guaranteed to reduce the condition number of A but   
-    works well in practice.   
-
-    See supermatrix.h for the definition of 'SuperMatrix' structure.
- 
-    Arguments   
-    =========   
-
-    A       (input) SuperMatrix*
-            The matrix of dimension (A->nrow, A->ncol) whose equilibration
-            factors are to be computed. The type of A can be:
-            Stype = SLU_NC; Dtype = SLU_C; Mtype = SLU_GE.
-	    
-    R       (output) float*, size A->nrow
-            If INFO = 0 or INFO > M, R contains the row scale factors   
-            for A.
-	    
-    C       (output) float*, size A->ncol
-            If INFO = 0,  C contains the column scale factors for A.
-	    
-    ROWCND  (output) float*
-            If INFO = 0 or INFO > M, ROWCND contains the ratio of the   
-            smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and   
-            AMAX is neither too large nor too small, it is not worth   
-            scaling by R.
-	    
-    COLCND  (output) float*
-            If INFO = 0, COLCND contains the ratio of the smallest   
-            C(i) to the largest C(i).  If COLCND >= 0.1, it is not   
-            worth scaling by C.
-	    
-    AMAX    (output) float*
-            Absolute value of largest matrix element.  If AMAX is very   
-            close to overflow or very close to underflow, the matrix   
-            should be scaled.
-	    
-    INFO    (output) int*
-            = 0:  successful exit   
-            < 0:  if INFO = -i, the i-th argument had an illegal value   
-            > 0:  if INFO = i,  and i is   
-                  <= A->nrow:  the i-th row of A is exactly zero   
-                  >  A->ncol:  the (i-M)-th column of A is exactly zero   
-
-    ===================================================================== 
-*/
-
-    /* Local variables */
-    NCformat *Astore;
-    complex   *Aval;
-    int i, j, irow;
-    float rcmin, rcmax;
-    float bignum, smlnum;
-    extern double slamch_(char *);
-    
-    /* Test the input parameters. */
-    *info = 0;
-    if ( A->nrow < 0 || A->ncol < 0 ||
-	 A->Stype != SLU_NC || A->Dtype != SLU_C || A->Mtype != SLU_GE )
-	*info = -1;
-    if (*info != 0) {
-	i = -(*info);
-	xerbla_("cgsequ", &i);
-	return;
-    }
-
-    /* Quick return if possible */
-    if ( A->nrow == 0 || A->ncol == 0 ) {
-	*rowcnd = 1.;
-	*colcnd = 1.;
-	*amax = 0.;
-	return;
-    }
-
-    Astore = A->Store;
-    Aval = Astore->nzval;
-    
-    /* Get machine constants. */
-    smlnum = slamch_("S");
-    bignum = 1. / smlnum;
-
-    /* Compute row scale factors. */
-    for (i = 0; i < A->nrow; ++i) r[i] = 0.;
-
-    /* Find the maximum element in each row. */
-    for (j = 0; j < A->ncol; ++j)
-	for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-	    irow = Astore->rowind[i];
-	    r[irow] = SUPERLU_MAX( r[irow], c_abs1(&Aval[i]) );
-	}
-
-    /* Find the maximum and minimum scale factors. */
-    rcmin = bignum;
-    rcmax = 0.;
-    for (i = 0; i < A->nrow; ++i) {
-	rcmax = SUPERLU_MAX(rcmax, r[i]);
-	rcmin = SUPERLU_MIN(rcmin, r[i]);
-    }
-    *amax = rcmax;
-
-    if (rcmin == 0.) {
-	/* Find the first zero scale factor and return an error code. */
-	for (i = 0; i < A->nrow; ++i)
-	    if (r[i] == 0.) {
-		*info = i + 1;
-		return;
-	    }
-    } else {
-	/* Invert the scale factors. */
-	for (i = 0; i < A->nrow; ++i)
-	    r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum );
-	/* Compute ROWCND = min(R(I)) / max(R(I)) */
-	*rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
-    }
-
-    /* Compute column scale factors */
-    for (j = 0; j < A->ncol; ++j) c[j] = 0.;
-
-    /* Find the maximum element in each column, assuming the row
-       scalings computed above. */
-    for (j = 0; j < A->ncol; ++j)
-	for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-	    irow = Astore->rowind[i];
-	    c[j] = SUPERLU_MAX( c[j], c_abs1(&Aval[i]) * r[irow] );
-	}
-
-    /* Find the maximum and minimum scale factors. */
-    rcmin = bignum;
-    rcmax = 0.;
-    for (j = 0; j < A->ncol; ++j) {
-	rcmax = SUPERLU_MAX(rcmax, c[j]);
-	rcmin = SUPERLU_MIN(rcmin, c[j]);
-    }
-
-    if (rcmin == 0.) {
-	/* Find the first zero scale factor and return an error code. */
-	for (j = 0; j < A->ncol; ++j)
-	    if ( c[j] == 0. ) {
-		*info = A->nrow + j + 1;
-		return;
-	    }
-    } else {
-	/* Invert the scale factors. */
-	for (j = 0; j < A->ncol; ++j)
-	    c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum);
-	/* Compute COLCND = min(C(J)) / max(C(J)) */
-	*colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
-    }
-
-    return;
-
-} /* cgsequ */
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgsrfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgsrfs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgsrfs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgsrfs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,447 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
- * File name:	cgsrfs.c
- * History:     Modified from lapack routine CGERFS
- */
-#include <math.h>
-#include "slu_cdefs.h"
-
-void
-cgsrfs(trans_t trans, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U,
-       int *perm_c, int *perm_r, char *equed, float *R, float *C,
-       SuperMatrix *B, SuperMatrix *X, float *ferr, float *berr,
-       SuperLUStat_t *stat, int *info)
-{
-/*
- *   Purpose   
- *   =======   
- *
- *   CGSRFS improves the computed solution to a system of linear   
- *   equations and provides error bounds and backward error estimates for 
- *   the solution.   
- *
- *   If equilibration was performed, the system becomes:
- *           (diag(R)*A_original*diag(C)) * X = diag(R)*B_original.
- *
- *   See supermatrix.h for the definition of 'SuperMatrix' structure.
- *
- *   Arguments   
- *   =========   
- *
- * trans   (input) trans_t
- *          Specifies the form of the system of equations:
- *          = NOTRANS: A * X = B  (No transpose)
- *          = TRANS:   A'* X = B  (Transpose)
- *          = CONJ:    A**H * X = B  (Conjugate transpose)
- *   
- *   A       (input) SuperMatrix*
- *           The original matrix A in the system, or the scaled A if
- *           equilibration was done. The type of A can be:
- *           Stype = SLU_NC, Dtype = SLU_C, Mtype = SLU_GE.
- *    
- *   L       (input) SuperMatrix*
- *	     The factor L from the factorization Pr*A*Pc=L*U. Use
- *           compressed row subscripts storage for supernodes, 
- *           i.e., L has types: Stype = SLU_SC, Dtype = SLU_C, Mtype = SLU_TRLU.
- * 
- *   U       (input) SuperMatrix*
- *           The factor U from the factorization Pr*A*Pc=L*U as computed by
- *           cgstrf(). Use column-wise storage scheme, 
- *           i.e., U has types: Stype = SLU_NC, Dtype = SLU_C, Mtype = SLU_TRU.
- *
- *   perm_c  (input) int*, dimension (A->ncol)
- *	     Column permutation vector, which defines the 
- *           permutation matrix Pc; perm_c[i] = j means column i of A is 
- *           in position j in A*Pc.
- *
- *   perm_r  (input) int*, dimension (A->nrow)
- *           Row permutation vector, which defines the permutation matrix Pr;
- *           perm_r[i] = j means row i of A is in position j in Pr*A.
- *
- *   equed   (input) Specifies the form of equilibration that was done.
- *           = 'N': No equilibration.
- *           = 'R': Row equilibration, i.e., A was premultiplied by diag(R).
- *           = 'C': Column equilibration, i.e., A was postmultiplied by
- *                  diag(C).
- *           = 'B': Both row and column equilibration, i.e., A was replaced 
- *                  by diag(R)*A*diag(C).
- *
- *   R       (input) float*, dimension (A->nrow)
- *           The row scale factors for A.
- *           If equed = 'R' or 'B', A is premultiplied by diag(R).
- *           If equed = 'N' or 'C', R is not accessed.
- * 
- *   C       (input) float*, dimension (A->ncol)
- *           The column scale factors for A.
- *           If equed = 'C' or 'B', A is postmultiplied by diag(C).
- *           If equed = 'N' or 'R', C is not accessed.
- *
- *   B       (input) SuperMatrix*
- *           B has types: Stype = SLU_DN, Dtype = SLU_C, Mtype = SLU_GE.
- *           The right hand side matrix B.
- *           if equed = 'R' or 'B', B is premultiplied by diag(R).
- *
- *   X       (input/output) SuperMatrix*
- *           X has types: Stype = SLU_DN, Dtype = SLU_C, Mtype = SLU_GE.
- *           On entry, the solution matrix X, as computed by cgstrs().
- *           On exit, the improved solution matrix X.
- *           if *equed = 'C' or 'B', X should be premultiplied by diag(C)
- *               in order to obtain the solution to the original system.
- *
- *   FERR    (output) float*, dimension (B->ncol)   
- *           The estimated forward error bound for each solution vector   
- *           X(j) (the j-th column of the solution matrix X).   
- *           If XTRUE is the true solution corresponding to X(j), FERR(j) 
- *           is an estimated upper bound for the magnitude of the largest 
- *           element in (X(j) - XTRUE) divided by the magnitude of the   
- *           largest element in X(j).  The estimate is as reliable as   
- *           the estimate for RCOND, and is almost always a slight   
- *           overestimate of the true error.
- *
- *   BERR    (output) float*, dimension (B->ncol)   
- *           The componentwise relative backward error of each solution   
- *           vector X(j) (i.e., the smallest relative change in   
- *           any element of A or B that makes X(j) an exact solution).
- *
- *   stat     (output) SuperLUStat_t*
- *            Record the statistics on runtime and floating-point operation count.
- *            See util.h for the definition of 'SuperLUStat_t'.
- *
- *   info    (output) int*   
- *           = 0:  successful exit   
- *            < 0:  if INFO = -i, the i-th argument had an illegal value   
- *
- *    Internal Parameters   
- *    ===================   
- *
- *    ITMAX is the maximum number of steps of iterative refinement.   
- *
- */  
-
-#define ITMAX 5
-    
-    /* Table of constant values */
-    int    ione = 1;
-    complex ndone = {-1., 0.};
-    complex done = {1., 0.};
-    
-    /* Local variables */
-    NCformat *Astore;
-    complex   *Aval;
-    SuperMatrix Bjcol;
-    DNformat *Bstore, *Xstore, *Bjcol_store;
-    complex   *Bmat, *Xmat, *Bptr, *Xptr;
-    int      kase;
-    float   safe1, safe2;
-    int      i, j, k, irow, nz, count, notran, rowequ, colequ;
-    int      ldb, ldx, nrhs;
-    float   s, xk, lstres, eps, safmin;
-    char     transc[1];
-    trans_t  transt;
-    complex   *work;
-    float   *rwork;
-    int      *iwork;
-    extern double slamch_(char *);
-    extern int clacon_(int *, complex *, complex *, float *, int *);
-#ifdef _CRAY
-    extern int CCOPY(int *, complex *, int *, complex *, int *);
-    extern int CSAXPY(int *, complex *, complex *, int *, complex *, int *);
-#else
-    extern int ccopy_(int *, complex *, int *, complex *, int *);
-    extern int caxpy_(int *, complex *, complex *, int *, complex *, int *);
-#endif
-
-    Astore = A->Store;
-    Aval   = Astore->nzval;
-    Bstore = B->Store;
-    Xstore = X->Store;
-    Bmat   = Bstore->nzval;
-    Xmat   = Xstore->nzval;
-    ldb    = Bstore->lda;
-    ldx    = Xstore->lda;
-    nrhs   = B->ncol;
-    
-    /* Test the input parameters */
-    *info = 0;
-    notran = (trans == NOTRANS);
-    if ( !notran && trans != TRANS && trans != CONJ ) *info = -1;
-    else if ( A->nrow != A->ncol || A->nrow < 0 ||
-	      A->Stype != SLU_NC || A->Dtype != SLU_C || A->Mtype != SLU_GE )
-	*info = -2;
-    else if ( L->nrow != L->ncol || L->nrow < 0 ||
- 	      L->Stype != SLU_SC || L->Dtype != SLU_C || L->Mtype != SLU_TRLU )
-	*info = -3;
-    else if ( U->nrow != U->ncol || U->nrow < 0 ||
- 	      U->Stype != SLU_NC || U->Dtype != SLU_C || U->Mtype != SLU_TRU )
-	*info = -4;
-    else if ( ldb < SUPERLU_MAX(0, A->nrow) ||
- 	      B->Stype != SLU_DN || B->Dtype != SLU_C || B->Mtype != SLU_GE )
-        *info = -10;
-    else if ( ldx < SUPERLU_MAX(0, A->nrow) ||
- 	      X->Stype != SLU_DN || X->Dtype != SLU_C || X->Mtype != SLU_GE )
-	*info = -11;
-    if (*info != 0) {
-	i = -(*info);
-	xerbla_("cgsrfs", &i);
-	return;
-    }
-
-    /* Quick return if possible */
-    if ( A->nrow == 0 || nrhs == 0) {
-	for (j = 0; j < nrhs; ++j) {
-	    ferr[j] = 0.;
-	    berr[j] = 0.;
-	}
-	return;
-    }
-
-    rowequ = lsame_(equed, "R") || lsame_(equed, "B");
-    colequ = lsame_(equed, "C") || lsame_(equed, "B");
-    
-    /* Allocate working space */
-    work = complexMalloc(2*A->nrow);
-    rwork = (float *) SUPERLU_MALLOC( A->nrow * sizeof(float) );
-    iwork = intMalloc(A->nrow);
-    if ( !work || !rwork || !iwork ) 
-        ABORT("Malloc fails for work/rwork/iwork.");
-    
-    if ( notran ) {
-	*(unsigned char *)transc = 'N';
-        transt = TRANS;
-    } else {
-	*(unsigned char *)transc = 'T';
-	transt = NOTRANS;
-    }
-
-    /* NZ = maximum number of nonzero elements in each row of A, plus 1 */
-    nz     = A->ncol + 1;
-    eps    = slamch_("Epsilon");
-    safmin = slamch_("Safe minimum");
-    safe1  = nz * safmin;
-    safe2  = safe1 / eps;
-
-    /* Compute the number of nonzeros in each row (or column) of A */
-    for (i = 0; i < A->nrow; ++i) iwork[i] = 0;
-    if ( notran ) {
-	for (k = 0; k < A->ncol; ++k)
-	    for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) 
-		++iwork[Astore->rowind[i]];
-    } else {
-	for (k = 0; k < A->ncol; ++k)
-	    iwork[k] = Astore->colptr[k+1] - Astore->colptr[k];
-    }	
-
-    /* Copy one column of RHS B into Bjcol. */
-    Bjcol.Stype = B->Stype;
-    Bjcol.Dtype = B->Dtype;
-    Bjcol.Mtype = B->Mtype;
-    Bjcol.nrow  = B->nrow;
-    Bjcol.ncol  = 1;
-    Bjcol.Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) );
-    if ( !Bjcol.Store ) ABORT("SUPERLU_MALLOC fails for Bjcol.Store");
-    Bjcol_store = Bjcol.Store;
-    Bjcol_store->lda = ldb;
-    Bjcol_store->nzval = work; /* address aliasing */
-	
-    /* Do for each right hand side ... */
-    for (j = 0; j < nrhs; ++j) {
-	count = 0;
-	lstres = 3.;
-	Bptr = &Bmat[j*ldb];
-	Xptr = &Xmat[j*ldx];
-
-	while (1) { /* Loop until stopping criterion is satisfied. */
-
-	    /* Compute residual R = B - op(A) * X,   
-	       where op(A) = A, A**T, or A**H, depending on TRANS. */
-	    
-#ifdef _CRAY
-	    CCOPY(&A->nrow, Bptr, &ione, work, &ione);
-#else
-	    ccopy_(&A->nrow, Bptr, &ione, work, &ione);
-#endif
-	    sp_cgemv(transc, ndone, A, Xptr, ione, done, work, ione);
-
-	    /* Compute componentwise relative backward error from formula 
-	       max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) )   
-	       where abs(Z) is the componentwise absolute value of the matrix
-	       or vector Z.  If the i-th component of the denominator is less
-	       than SAFE2, then SAFE1 is added to the i-th component of the   
-	       numerator and denominator before dividing. */
-
-	    for (i = 0; i < A->nrow; ++i) rwork[i] = c_abs1( &Bptr[i] );
-	    
-	    /* Compute abs(op(A))*abs(X) + abs(B). */
-	    if (notran) {
-		for (k = 0; k < A->ncol; ++k) {
-		    xk = c_abs1( &Xptr[k] );
-		    for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i)
-			rwork[Astore->rowind[i]] += c_abs1(&Aval[i]) * xk;
-		}
-	    } else {
-		for (k = 0; k < A->ncol; ++k) {
-		    s = 0.;
-		    for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) {
-			irow = Astore->rowind[i];
-			s += c_abs1(&Aval[i]) * c_abs1(&Xptr[irow]);
-		    }
-		    rwork[k] += s;
-		}
-	    }
-	    s = 0.;
-	    for (i = 0; i < A->nrow; ++i) {
-		if (rwork[i] > safe2)
-		    s = SUPERLU_MAX( s, c_abs1(&work[i]) / rwork[i] );
-		else
-		    s = SUPERLU_MAX( s, (c_abs1(&work[i]) + safe1) / 
-				(rwork[i] + safe1) );
-	    }
-	    berr[j] = s;
-
-	    /* Test stopping criterion. Continue iterating if   
-	       1) The residual BERR(J) is larger than machine epsilon, and   
-	       2) BERR(J) decreased by at least a factor of 2 during the   
-	          last iteration, and   
-	       3) At most ITMAX iterations tried. */
-
-	    if (berr[j] > eps && berr[j] * 2. <= lstres && count < ITMAX) {
-		/* Update solution and try again. */
-		cgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info);
-		
-#ifdef _CRAY
-		CAXPY(&A->nrow, &done, work, &ione,
-		       &Xmat[j*ldx], &ione);
-#else
-		caxpy_(&A->nrow, &done, work, &ione,
-		       &Xmat[j*ldx], &ione);
-#endif
-		lstres = berr[j];
-		++count;
-	    } else {
-		break;
-	    }
-        
-	} /* end while */
-
-	stat->RefineSteps = count;
-
-	/* Bound error from formula:
-	   norm(X - XTRUE) / norm(X) .le. FERR = norm( abs(inv(op(A)))*   
-	   ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X)   
-          where   
-            norm(Z) is the magnitude of the largest component of Z   
-            inv(op(A)) is the inverse of op(A)   
-            abs(Z) is the componentwise absolute value of the matrix or
-	       vector Z   
-            NZ is the maximum number of nonzeros in any row of A, plus 1   
-            EPS is machine epsilon   
-
-          The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B))   
-          is incremented by SAFE1 if the i-th component of   
-          abs(op(A))*abs(X) + abs(B) is less than SAFE2.   
-
-          Use CLACON to estimate the infinity-norm of the matrix   
-             inv(op(A)) * diag(W),   
-          where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */
-	
-	for (i = 0; i < A->nrow; ++i) rwork[i] = c_abs1( &Bptr[i] );
-	
-	/* Compute abs(op(A))*abs(X) + abs(B). */
-	if ( notran ) {
-	    for (k = 0; k < A->ncol; ++k) {
-		xk = c_abs1( &Xptr[k] );
-		for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i)
-		    rwork[Astore->rowind[i]] += c_abs1(&Aval[i]) * xk;
-	    }
-	} else {
-	    for (k = 0; k < A->ncol; ++k) {
-		s = 0.;
-		for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) {
-		    irow = Astore->rowind[i];
-		    xk = c_abs1( &Xptr[irow] );
-		    s += c_abs1(&Aval[i]) * xk;
-		}
-		rwork[k] += s;
-	    }
-	}
-	
-	for (i = 0; i < A->nrow; ++i)
-	    if (rwork[i] > safe2)
-		rwork[i] = c_abs(&work[i]) + (iwork[i]+1)*eps*rwork[i];
-	    else
-		rwork[i] = c_abs(&work[i])+(iwork[i]+1)*eps*rwork[i]+safe1;
-	kase = 0;
-
-	do {
-	    clacon_(&A->nrow, &work[A->nrow], work,
-		    &ferr[j], &kase);
-	    if (kase == 0) break;
-
-	    if (kase == 1) {
-		/* Multiply by diag(W)*inv(op(A)**T)*(diag(C) or diag(R)). */
-		if ( notran && colequ )
-		    for (i = 0; i < A->ncol; ++i) {
-		        cs_mult(&work[i], &work[i], C[i]);
-	            }
-		else if ( !notran && rowequ )
-		    for (i = 0; i < A->nrow; ++i) {
-		        cs_mult(&work[i], &work[i], R[i]);
-                    }
-
-		cgstrs (transt, L, U, perm_c, perm_r, &Bjcol, stat, info);
-		
-		for (i = 0; i < A->nrow; ++i) {
-		    cs_mult(&work[i], &work[i], rwork[i]);
-	 	}
-	    } else {
-		/* Multiply by (diag(C) or diag(R))*inv(op(A))*diag(W). */
-		for (i = 0; i < A->nrow; ++i) {
-		    cs_mult(&work[i], &work[i], rwork[i]);
-		}
-		
-		cgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info);
-		
-		if ( notran && colequ )
-		    for (i = 0; i < A->ncol; ++i) {
-		        cs_mult(&work[i], &work[i], C[i]);
-		    }
-		else if ( !notran && rowequ )
-		    for (i = 0; i < A->ncol; ++i) {
-		        cs_mult(&work[i], &work[i], R[i]);  
-		    }
-	    }
-	    
-	} while ( kase != 0 );
-
-	/* Normalize error. */
-	lstres = 0.;
- 	if ( notran && colequ ) {
-	    for (i = 0; i < A->nrow; ++i)
-	    	lstres = SUPERLU_MAX( lstres, C[i] * c_abs1( &Xptr[i]) );
-  	} else if ( !notran && rowequ ) {
-	    for (i = 0; i < A->nrow; ++i)
-	    	lstres = SUPERLU_MAX( lstres, R[i] * c_abs1( &Xptr[i]) );
-	} else {
-	    for (i = 0; i < A->nrow; ++i)
-	    	lstres = SUPERLU_MAX( lstres, c_abs1( &Xptr[i]) );
-	}
-	if ( lstres != 0. )
-	    ferr[j] /= lstres;
-
-    } /* for each RHS j ... */
-    
-    SUPERLU_FREE(work);
-    SUPERLU_FREE(rwork);
-    SUPERLU_FREE(iwork);
-    SUPERLU_FREE(Bjcol.Store);
-
-    return;
-
-} /* cgsrfs */
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgssv.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgssv.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgssv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgssv.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,221 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-#include "slu_cdefs.h"
-
-void
-cgssv(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r,
-      SuperMatrix *L, SuperMatrix *U, SuperMatrix *B,
-      SuperLUStat_t *stat, int *info )
-{
-/*
- * Purpose
- * =======
- *
- * CGSSV solves the system of linear equations A*X=B, using the
- * LU factorization from CGSTRF. It performs the following steps:
- *
- *   1. If A is stored column-wise (A->Stype = SLU_NC):
- *
- *      1.1. Permute the columns of A, forming A*Pc, where Pc
- *           is a permutation matrix. For more details of this step, 
- *           see sp_preorder.c.
- *
- *      1.2. Factor A as Pr*A*Pc=L*U with the permutation Pr determined
- *           by Gaussian elimination with partial pivoting.
- *           L is unit lower triangular with offdiagonal entries
- *           bounded by 1 in magnitude, and U is upper triangular.
- *
- *      1.3. Solve the system of equations A*X=B using the factored
- *           form of A.
- *
- *   2. If A is stored row-wise (A->Stype = SLU_NR), apply the
- *      above algorithm to the transpose of A:
- *
- *      2.1. Permute columns of transpose(A) (rows of A),
- *           forming transpose(A)*Pc, where Pc is a permutation matrix. 
- *           For more details of this step, see sp_preorder.c.
- *
- *      2.2. Factor A as Pr*transpose(A)*Pc=L*U with the permutation Pr
- *           determined by Gaussian elimination with partial pivoting.
- *           L is unit lower triangular with offdiagonal entries
- *           bounded by 1 in magnitude, and U is upper triangular.
- *
- *      2.3. Solve the system of equations A*X=B using the factored
- *           form of A.
- *
- *   See supermatrix.h for the definition of 'SuperMatrix' structure.
- * 
- * Arguments
- * =========
- *
- * options (input) superlu_options_t*
- *         The structure defines the input parameters to control
- *         how the LU decomposition will be performed and how the
- *         system will be solved.
- *
- * A       (input) SuperMatrix*
- *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
- *         of linear equations is A->nrow. Currently, the type of A can be:
- *         Stype = SLU_NC or SLU_NR; Dtype = SLU_C; Mtype = SLU_GE.
- *         In the future, more general A may be handled.
- *
- * perm_c  (input/output) int*
- *         If A->Stype = SLU_NC, column permutation vector of size A->ncol
- *         which defines the permutation matrix Pc; perm_c[i] = j means 
- *         column i of A is in position j in A*Pc.
- *         If A->Stype = SLU_NR, column permutation vector of size A->nrow
- *         which describes permutation of columns of transpose(A) 
- *         (rows of A) as described above.
- * 
- *         If options->ColPerm = MY_PERMC or options->Fact = SamePattern or
- *            options->Fact = SamePattern_SameRowPerm, it is an input argument.
- *            On exit, perm_c may be overwritten by the product of the input
- *            perm_c and a permutation that postorders the elimination tree
- *            of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree
- *            is already in postorder.
- *         Otherwise, it is an output argument.
- * 
- * perm_r  (input/output) int*
- *         If A->Stype = SLU_NC, row permutation vector of size A->nrow, 
- *         which defines the permutation matrix Pr, and is determined 
- *         by partial pivoting.  perm_r[i] = j means row i of A is in 
- *         position j in Pr*A.
- *         If A->Stype = SLU_NR, permutation vector of size A->ncol, which
- *         determines permutation of rows of transpose(A)
- *         (columns of A) as described above.
- *
- *         If options->RowPerm = MY_PERMR or
- *            options->Fact = SamePattern_SameRowPerm, perm_r is an
- *            input argument.
- *         otherwise it is an output argument.
- *
- * L       (output) SuperMatrix*
- *         The factor L from the factorization 
- *             Pr*A*Pc=L*U              (if A->Stype = SLU_NC) or
- *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
- *         Uses compressed row subscripts storage for supernodes, i.e.,
- *         L has types: Stype = SLU_SC, Dtype = SLU_C, Mtype = SLU_TRLU.
- *         
- * U       (output) SuperMatrix*
- *	   The factor U from the factorization 
- *             Pr*A*Pc=L*U              (if A->Stype = SLU_NC) or
- *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
- *         Uses column-wise storage scheme, i.e., U has types:
- *         Stype = SLU_NC, Dtype = SLU_C, Mtype = SLU_TRU.
- *
- * B       (input/output) SuperMatrix*
- *         B has types: Stype = SLU_DN, Dtype = SLU_C, Mtype = SLU_GE.
- *         On entry, the right hand side matrix.
- *         On exit, the solution matrix if info = 0;
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics on runtime and floating-point operation count.
- *        See util.h for the definition of 'SuperLUStat_t'.
- *
- * info    (output) int*
- *	   = 0: successful exit
- *         > 0: if info = i, and i is
- *             <= A->ncol: U(i,i) is exactly zero. The factorization has
- *                been completed, but the factor U is exactly singular,
- *                so the solution could not be computed.
- *             > A->ncol: number of bytes allocated when memory allocation
- *                failure occurred, plus A->ncol.
- *   
- */
-    DNformat *Bstore;
-    SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/
-    SuperMatrix AC; /* Matrix postmultiplied by Pc */
-    int      lwork = 0, *etree, i;
-    
-    /* Set default values for some parameters */
-    float   drop_tol = 0.;
-    int      panel_size;     /* panel size */
-    int      relax;          /* no of columns in a relaxed snodes */
-    int      permc_spec;
-    trans_t  trans = NOTRANS;
-    double   *utime;
-    double   t;	/* Temporary time */
-
-    /* Test the input parameters ... */
-    *info = 0;
-    Bstore = B->Store;
-    if ( options->Fact != DOFACT ) *info = -1;
-    else if ( A->nrow != A->ncol || A->nrow < 0 ||
-	 (A->Stype != SLU_NC && A->Stype != SLU_NR) ||
-	 A->Dtype != SLU_C || A->Mtype != SLU_GE )
-	*info = -2;
-    else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) ||
-	B->Stype != SLU_DN || B->Dtype != SLU_C || B->Mtype != SLU_GE )
-	*info = -7;
-    if ( *info != 0 ) {
-	i = -(*info);
-	xerbla_("cgssv", &i);
-	return;
-    }
-
-    utime = stat->utime;
-
-    /* Convert A to SLU_NC format when necessary. */
-    if ( A->Stype == SLU_NR ) {
-	NRformat *Astore = A->Store;
-	AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
-	cCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, 
-			       Astore->nzval, Astore->colind, Astore->rowptr,
-			       SLU_NC, A->Dtype, A->Mtype);
-	trans = TRANS;
-    } else {
-        if ( A->Stype == SLU_NC ) AA = A;
-    }
-
-    t = SuperLU_timer_();
-    /*
-     * Get column permutation vector perm_c[], according to permc_spec:
-     *   permc_spec = NATURAL:  natural ordering 
-     *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
-     *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
-     *   permc_spec = COLAMD:   approximate minimum degree column ordering
-     *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
-     */
-    permc_spec = options->ColPerm;
-    if ( permc_spec != MY_PERMC && options->Fact == DOFACT )
-      get_perm_c(permc_spec, AA, perm_c);
-    utime[COLPERM] = SuperLU_timer_() - t;
-
-    etree = intMalloc(A->ncol);
-
-    t = SuperLU_timer_();
-    sp_preorder(options, AA, perm_c, etree, &AC);
-    utime[ETREE] = SuperLU_timer_() - t;
-
-    panel_size = sp_ienv(1);
-    relax = sp_ienv(2);
-
-    /*printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", 
-	  relax, panel_size, sp_ienv(3), sp_ienv(4));*/
-    t = SuperLU_timer_(); 
-    /* Compute the LU factorization of A. */
-    cgstrf(options, &AC, drop_tol, relax, panel_size,
-	   etree, NULL, lwork, perm_c, perm_r, L, U, stat, info);
-    utime[FACT] = SuperLU_timer_() - t;
-
-    t = SuperLU_timer_();
-    if ( *info == 0 ) {
-        /* Solve the system A*X=B, overwriting B with X. */
-        cgstrs (trans, L, U, perm_c, perm_r, B, stat, info);
-    }
-    utime[SOLVE] = SuperLU_timer_() - t;
-
-    SUPERLU_FREE (etree);
-    Destroy_CompCol_Permuted(&AC);
-    if ( A->Stype == SLU_NR ) {
-	Destroy_SuperMatrix_Store(AA);
-	SUPERLU_FREE(AA);
-    }
-
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgssvx.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgssvx.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgssvx.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgssvx.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,614 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-#include "slu_cdefs.h"
-
-void
-cgssvx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r,
-       int *etree, char *equed, float *R, float *C,
-       SuperMatrix *L, SuperMatrix *U, void *work, int lwork,
-       SuperMatrix *B, SuperMatrix *X, float *recip_pivot_growth, 
-       float *rcond, float *ferr, float *berr, 
-       mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info )
-{
-/*
- * Purpose
- * =======
- *
- * CGSSVX solves the system of linear equations A*X=B or A'*X=B, using
- * the LU factorization from cgstrf(). Error bounds on the solution and
- * a condition estimate are also provided. It performs the following steps:
- *
- *   1. If A is stored column-wise (A->Stype = SLU_NC):
- *  
- *      1.1. If options->Equil = YES, scaling factors are computed to
- *           equilibrate the system:
- *           options->Trans = NOTRANS:
- *               diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B
- *           options->Trans = TRANS:
- *               (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
- *           options->Trans = CONJ:
- *               (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
- *           Whether or not the system will be equilibrated depends on the
- *           scaling of the matrix A, but if equilibration is used, A is
- *           overwritten by diag(R)*A*diag(C) and B by diag(R)*B
- *           (if options->Trans=NOTRANS) or diag(C)*B (if options->Trans
- *           = TRANS or CONJ).
- *
- *      1.2. Permute columns of A, forming A*Pc, where Pc is a permutation
- *           matrix that usually preserves sparsity.
- *           For more details of this step, see sp_preorder.c.
- *
- *      1.3. If options->Fact != FACTORED, the LU decomposition is used to
- *           factor the matrix A (after equilibration if options->Equil = YES)
- *           as Pr*A*Pc = L*U, with Pr determined by partial pivoting.
- *
- *      1.4. Compute the reciprocal pivot growth factor.
- *
- *      1.5. If some U(i,i) = 0, so that U is exactly singular, then the
- *           routine returns with info = i. Otherwise, the factored form of 
- *           A is used to estimate the condition number of the matrix A. If
- *           the reciprocal of the condition number is less than machine
- *           precision, info = A->ncol+1 is returned as a warning, but the
- *           routine still goes on to solve for X and computes error bounds
- *           as described below.
- *
- *      1.6. The system of equations is solved for X using the factored form
- *           of A.
- *
- *      1.7. If options->IterRefine != NOREFINE, iterative refinement is
- *           applied to improve the computed solution matrix and calculate
- *           error bounds and backward error estimates for it.
- *
- *      1.8. If equilibration was used, the matrix X is premultiplied by
- *           diag(C) (if options->Trans = NOTRANS) or diag(R)
- *           (if options->Trans = TRANS or CONJ) so that it solves the
- *           original system before equilibration.
- *
- *   2. If A is stored row-wise (A->Stype = SLU_NR), apply the above algorithm
- *      to the transpose of A:
- *
- *      2.1. If options->Equil = YES, scaling factors are computed to
- *           equilibrate the system:
- *           options->Trans = NOTRANS:
- *               diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B
- *           options->Trans = TRANS:
- *               (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
- *           options->Trans = CONJ:
- *               (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
- *           Whether or not the system will be equilibrated depends on the
- *           scaling of the matrix A, but if equilibration is used, A' is
- *           overwritten by diag(R)*A'*diag(C) and B by diag(R)*B 
- *           (if trans='N') or diag(C)*B (if trans = 'T' or 'C').
- *
- *      2.2. Permute columns of transpose(A) (rows of A), 
- *           forming transpose(A)*Pc, where Pc is a permutation matrix that 
- *           usually preserves sparsity.
- *           For more details of this step, see sp_preorder.c.
- *
- *      2.3. If options->Fact != FACTORED, the LU decomposition is used to
- *           factor the transpose(A) (after equilibration if 
- *           options->Fact = YES) as Pr*transpose(A)*Pc = L*U with the
- *           permutation Pr determined by partial pivoting.
- *
- *      2.4. Compute the reciprocal pivot growth factor.
- *
- *      2.5. If some U(i,i) = 0, so that U is exactly singular, then the
- *           routine returns with info = i. Otherwise, the factored form 
- *           of transpose(A) is used to estimate the condition number of the
- *           matrix A. If the reciprocal of the condition number
- *           is less than machine precision, info = A->nrow+1 is returned as
- *           a warning, but the routine still goes on to solve for X and
- *           computes error bounds as described below.
- *
- *      2.6. The system of equations is solved for X using the factored form
- *           of transpose(A).
- *
- *      2.7. If options->IterRefine != NOREFINE, iterative refinement is
- *           applied to improve the computed solution matrix and calculate
- *           error bounds and backward error estimates for it.
- *
- *      2.8. If equilibration was used, the matrix X is premultiplied by
- *           diag(C) (if options->Trans = NOTRANS) or diag(R) 
- *           (if options->Trans = TRANS or CONJ) so that it solves the
- *           original system before equilibration.
- *
- *   See supermatrix.h for the definition of 'SuperMatrix' structure.
- *
- * Arguments
- * =========
- *
- * options (input) superlu_options_t*
- *         The structure defines the input parameters to control
- *         how the LU decomposition will be performed and how the
- *         system will be solved.
- *
- * A       (input/output) SuperMatrix*
- *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
- *         of the linear equations is A->nrow. Currently, the type of A can be:
- *         Stype = SLU_NC or SLU_NR, Dtype = SLU_D, Mtype = SLU_GE.
- *         In the future, more general A may be handled.
- *
- *         On entry, If options->Fact = FACTORED and equed is not 'N', 
- *         then A must have been equilibrated by the scaling factors in
- *         R and/or C.  
- *         On exit, A is not modified if options->Equil = NO, or if 
- *         options->Equil = YES but equed = 'N' on exit.
- *         Otherwise, if options->Equil = YES and equed is not 'N',
- *         A is scaled as follows:
- *         If A->Stype = SLU_NC:
- *           equed = 'R':  A := diag(R) * A
- *           equed = 'C':  A := A * diag(C)
- *           equed = 'B':  A := diag(R) * A * diag(C).
- *         If A->Stype = SLU_NR:
- *           equed = 'R':  transpose(A) := diag(R) * transpose(A)
- *           equed = 'C':  transpose(A) := transpose(A) * diag(C)
- *           equed = 'B':  transpose(A) := diag(R) * transpose(A) * diag(C).
- *
- * perm_c  (input/output) int*
- *	   If A->Stype = SLU_NC, Column permutation vector of size A->ncol,
- *         which defines the permutation matrix Pc; perm_c[i] = j means
- *         column i of A is in position j in A*Pc.
- *         On exit, perm_c may be overwritten by the product of the input
- *         perm_c and a permutation that postorders the elimination tree
- *         of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree
- *         is already in postorder.
- *
- *         If A->Stype = SLU_NR, column permutation vector of size A->nrow,
- *         which describes permutation of columns of transpose(A) 
- *         (rows of A) as described above.
- * 
- * perm_r  (input/output) int*
- *         If A->Stype = SLU_NC, row permutation vector of size A->nrow, 
- *         which defines the permutation matrix Pr, and is determined
- *         by partial pivoting.  perm_r[i] = j means row i of A is in 
- *         position j in Pr*A.
- *
- *         If A->Stype = SLU_NR, permutation vector of size A->ncol, which
- *         determines permutation of rows of transpose(A)
- *         (columns of A) as described above.
- *
- *         If options->Fact = SamePattern_SameRowPerm, the pivoting routine
- *         will try to use the input perm_r, unless a certain threshold
- *         criterion is violated. In that case, perm_r is overwritten by a
- *         new permutation determined by partial pivoting or diagonal
- *         threshold pivoting.
- *         Otherwise, perm_r is output argument.
- * 
- * etree   (input/output) int*,  dimension (A->ncol)
- *         Elimination tree of Pc'*A'*A*Pc.
- *         If options->Fact != FACTORED and options->Fact != DOFACT,
- *         etree is an input argument, otherwise it is an output argument.
- *         Note: etree is a vector of parent pointers for a forest whose
- *         vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
- *
- * equed   (input/output) char*
- *         Specifies the form of equilibration that was done.
- *         = 'N': No equilibration.
- *         = 'R': Row equilibration, i.e., A was premultiplied by diag(R).
- *         = 'C': Column equilibration, i.e., A was postmultiplied by diag(C).
- *         = 'B': Both row and column equilibration, i.e., A was replaced 
- *                by diag(R)*A*diag(C).
- *         If options->Fact = FACTORED, equed is an input argument,
- *         otherwise it is an output argument.
- *
- * R       (input/output) float*, dimension (A->nrow)
- *         The row scale factors for A or transpose(A).
- *         If equed = 'R' or 'B', A (if A->Stype = SLU_NC) or transpose(A)
- *             (if A->Stype = SLU_NR) is multiplied on the left by diag(R).
- *         If equed = 'N' or 'C', R is not accessed.
- *         If options->Fact = FACTORED, R is an input argument,
- *             otherwise, R is output.
- *         If options->zFact = FACTORED and equed = 'R' or 'B', each element
- *             of R must be positive.
- * 
- * C       (input/output) float*, dimension (A->ncol)
- *         The column scale factors for A or transpose(A).
- *         If equed = 'C' or 'B', A (if A->Stype = SLU_NC) or transpose(A)
- *             (if A->Stype = SLU_NR) is multiplied on the right by diag(C).
- *         If equed = 'N' or 'R', C is not accessed.
- *         If options->Fact = FACTORED, C is an input argument,
- *             otherwise, C is output.
- *         If options->Fact = FACTORED and equed = 'C' or 'B', each element
- *             of C must be positive.
- *         
- * L       (output) SuperMatrix*
- *	   The factor L from the factorization
- *             Pr*A*Pc=L*U              (if A->Stype SLU_= NC) or
- *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
- *         Uses compressed row subscripts storage for supernodes, i.e.,
- *         L has types: Stype = SLU_SC, Dtype = SLU_C, Mtype = SLU_TRLU.
- *
- * U       (output) SuperMatrix*
- *	   The factor U from the factorization
- *             Pr*A*Pc=L*U              (if A->Stype = SLU_NC) or
- *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
- *         Uses column-wise storage scheme, i.e., U has types:
- *         Stype = SLU_NC, Dtype = SLU_C, Mtype = SLU_TRU.
- *
- * work    (workspace/output) void*, size (lwork) (in bytes)
- *         User supplied workspace, should be large enough
- *         to hold data structures for factors L and U.
- *         On exit, if fact is not 'F', L and U point to this array.
- *
- * lwork   (input) int
- *         Specifies the size of work array in bytes.
- *         = 0:  allocate space internally by system malloc;
- *         > 0:  use user-supplied work array of length lwork in bytes,
- *               returns error if space runs out.
- *         = -1: the routine guesses the amount of space needed without
- *               performing the factorization, and returns it in
- *               mem_usage->total_needed; no other side effects.
- *
- *         See argument 'mem_usage' for memory usage statistics.
- *
- * B       (input/output) SuperMatrix*
- *         B has types: Stype = SLU_DN, Dtype = SLU_C, Mtype = SLU_GE.
- *         On entry, the right hand side matrix.
- *         If B->ncol = 0, only LU decomposition is performed, the triangular
- *                         solve is skipped.
- *         On exit,
- *            if equed = 'N', B is not modified; otherwise
- *            if A->Stype = SLU_NC:
- *               if options->Trans = NOTRANS and equed = 'R' or 'B',
- *                  B is overwritten by diag(R)*B;
- *               if options->Trans = TRANS or CONJ and equed = 'C' of 'B',
- *                  B is overwritten by diag(C)*B;
- *            if A->Stype = SLU_NR:
- *               if options->Trans = NOTRANS and equed = 'C' or 'B',
- *                  B is overwritten by diag(C)*B;
- *               if options->Trans = TRANS or CONJ and equed = 'R' of 'B',
- *                  B is overwritten by diag(R)*B.
- *
- * X       (output) SuperMatrix*
- *         X has types: Stype = SLU_DN, Dtype = SLU_C, Mtype = SLU_GE. 
- *         If info = 0 or info = A->ncol+1, X contains the solution matrix
- *         to the original system of equations. Note that A and B are modified
- *         on exit if equed is not 'N', and the solution to the equilibrated
- *         system is inv(diag(C))*X if options->Trans = NOTRANS and
- *         equed = 'C' or 'B', or inv(diag(R))*X if options->Trans = 'T' or 'C'
- *         and equed = 'R' or 'B'.
- *
- * recip_pivot_growth (output) float*
- *         The reciprocal pivot growth factor max_j( norm(A_j)/norm(U_j) ).
- *         The infinity norm is used. If recip_pivot_growth is much less
- *         than 1, the stability of the LU factorization could be poor.
- *
- * rcond   (output) float*
- *         The estimate of the reciprocal condition number of the matrix A
- *         after equilibration (if done). If rcond is less than the machine
- *         precision (in particular, if rcond = 0), the matrix is singular
- *         to working precision. This condition is indicated by a return
- *         code of info > 0.
- *
- * FERR    (output) float*, dimension (B->ncol)   
- *         The estimated forward error bound for each solution vector   
- *         X(j) (the j-th column of the solution matrix X).   
- *         If XTRUE is the true solution corresponding to X(j), FERR(j) 
- *         is an estimated upper bound for the magnitude of the largest 
- *         element in (X(j) - XTRUE) divided by the magnitude of the   
- *         largest element in X(j).  The estimate is as reliable as   
- *         the estimate for RCOND, and is almost always a slight   
- *         overestimate of the true error.
- *         If options->IterRefine = NOREFINE, ferr = 1.0.
- *
- * BERR    (output) float*, dimension (B->ncol)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
- *         any element of A or B that makes X(j) an exact solution).
- *         If options->IterRefine = NOREFINE, berr = 1.0.
- *
- * mem_usage (output) mem_usage_t*
- *         Record the memory usage statistics, consisting of following fields:
- *         - for_lu (float)
- *           The amount of space used in bytes for L\U data structures.
- *         - total_needed (float)
- *           The amount of space needed in bytes to perform factorization.
- *         - expansions (int)
- *           The number of memory expansions during the LU factorization.
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics on runtime and floating-point operation count.
- *        See util.h for the definition of 'SuperLUStat_t'.
- *
- * info    (output) int*
- *         = 0: successful exit   
- *         < 0: if info = -i, the i-th argument had an illegal value   
- *         > 0: if info = i, and i is   
- *              <= A->ncol: U(i,i) is exactly zero. The factorization has   
- *                    been completed, but the factor U is exactly   
- *                    singular, so the solution and error bounds   
- *                    could not be computed.   
- *              = A->ncol+1: U is nonsingular, but RCOND is less than machine
- *                    precision, meaning that the matrix is singular to
- *                    working precision. Nevertheless, the solution and
- *                    error bounds are computed because there are a number
- *                    of situations where the computed solution can be more
- *                    accurate than the value of RCOND would suggest.   
- *              > A->ncol+1: number of bytes allocated when memory allocation
- *                    failure occurred, plus A->ncol.
- *
- */
-
-    DNformat  *Bstore, *Xstore;
-    complex    *Bmat, *Xmat;
-    int       ldb, ldx, nrhs;
-    SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/
-    SuperMatrix AC; /* Matrix postmultiplied by Pc */
-    int       colequ, equil, nofact, notran, rowequ, permc_spec;
-    trans_t   trant;
-    char      norm[1];
-    int       i, j, info1;
-    float    amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin;
-    int       relax, panel_size;
-    float    diag_pivot_thresh, drop_tol;
-    double    t0;      /* temporary time */
-    double    *utime;
-
-    /* External functions */
-    extern float clangs(char *, SuperMatrix *);
-    extern double slamch_(char *);
-
-    Bstore = B->Store;
-    Xstore = X->Store;
-    Bmat   = Bstore->nzval;
-    Xmat   = Xstore->nzval;
-    ldb    = Bstore->lda;
-    ldx    = Xstore->lda;
-    nrhs   = B->ncol;
-
-    *info = 0;
-    nofact = (options->Fact != FACTORED);
-    equil = (options->Equil == YES);
-    notran = (options->Trans == NOTRANS);
-    if ( nofact ) {
-	*(unsigned char *)equed = 'N';
-	rowequ = FALSE;
-	colequ = FALSE;
-    } else {
-	rowequ = lsame_(equed, "R") || lsame_(equed, "B");
-	colequ = lsame_(equed, "C") || lsame_(equed, "B");
-	smlnum = slamch_("Safe minimum");
-	bignum = 1. / smlnum;
-    }
-
-#if 0
-printf("dgssvx: Fact=%4d, Trans=%4d, equed=%c\n",
-       options->Fact, options->Trans, *equed);
-#endif
-
-    /* Test the input parameters */
-    if (!nofact && options->Fact != DOFACT && options->Fact != SamePattern &&
-	options->Fact != SamePattern_SameRowPerm &&
-	!notran && options->Trans != TRANS && options->Trans != CONJ &&
-	!equil && options->Equil != NO)
-	*info = -1;
-    else if ( A->nrow != A->ncol || A->nrow < 0 ||
-	      (A->Stype != SLU_NC && A->Stype != SLU_NR) ||
-	      A->Dtype != SLU_C || A->Mtype != SLU_GE )
-	*info = -2;
-    else if (options->Fact == FACTORED &&
-	     !(rowequ || colequ || lsame_(equed, "N")))
-	*info = -6;
-    else {
-	if (rowequ) {
-	    rcmin = bignum;
-	    rcmax = 0.;
-	    for (j = 0; j < A->nrow; ++j) {
-		rcmin = SUPERLU_MIN(rcmin, R[j]);
-		rcmax = SUPERLU_MAX(rcmax, R[j]);
-	    }
-	    if (rcmin <= 0.) *info = -7;
-	    else if ( A->nrow > 0)
-		rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum);
-	    else rowcnd = 1.;
-	}
-	if (colequ && *info == 0) {
-	    rcmin = bignum;
-	    rcmax = 0.;
-	    for (j = 0; j < A->nrow; ++j) {
-		rcmin = SUPERLU_MIN(rcmin, C[j]);
-		rcmax = SUPERLU_MAX(rcmax, C[j]);
-	    }
-	    if (rcmin <= 0.) *info = -8;
-	    else if (A->nrow > 0)
-		colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum);
-	    else colcnd = 1.;
-	}
-	if (*info == 0) {
-	    if ( lwork < -1 ) *info = -12;
-	    else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) ||
-		      B->Stype != SLU_DN || B->Dtype != SLU_C || 
-		      B->Mtype != SLU_GE )
-		*info = -13;
-	    else if ( X->ncol < 0 || Xstore->lda < SUPERLU_MAX(0, A->nrow) ||
-		      (B->ncol != 0 && B->ncol != X->ncol) ||
-                      X->Stype != SLU_DN ||
-		      X->Dtype != SLU_C || X->Mtype != SLU_GE )
-		*info = -14;
-	}
-    }
-    if (*info != 0) {
-	i = -(*info);
-	xerbla_("cgssvx", &i);
-	return;
-    }
-    
-    /* Initialization for factor parameters */
-    panel_size = sp_ienv(1);
-    relax      = sp_ienv(2);
-    diag_pivot_thresh = options->DiagPivotThresh;
-    drop_tol   = 0.0;
-
-    utime = stat->utime;
-    
-    /* Convert A to SLU_NC format when necessary. */
-    if ( A->Stype == SLU_NR ) {
-	NRformat *Astore = A->Store;
-	AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
-	cCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, 
-			       Astore->nzval, Astore->colind, Astore->rowptr,
-			       SLU_NC, A->Dtype, A->Mtype);
-	if ( notran ) { /* Reverse the transpose argument. */
-	    trant = TRANS;
-	    notran = 0;
-	} else {
-	    trant = NOTRANS;
-	    notran = 1;
-	}
-    } else { /* A->Stype == SLU_NC */
-	trant = options->Trans;
-	AA = A;
-    }
-
-    if ( nofact && equil ) {
-	t0 = SuperLU_timer_();
-	/* Compute row and column scalings to equilibrate the matrix A. */
-	cgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1);
-	
-	if ( info1 == 0 ) {
-	    /* Equilibrate matrix A. */
-	    claqgs(AA, R, C, rowcnd, colcnd, amax, equed);
-	    rowequ = lsame_(equed, "R") || lsame_(equed, "B");
-	    colequ = lsame_(equed, "C") || lsame_(equed, "B");
-	}
-	utime[EQUIL] = SuperLU_timer_() - t0;
-    }
-
-    if ( nrhs > 0 ) {
-        /* Scale the right hand side if equilibration was performed. */
-        if ( notran ) {
-	    if ( rowequ ) {
-	        for (j = 0; j < nrhs; ++j)
-		    for (i = 0; i < A->nrow; ++i) {
-                        cs_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], R[i]);
-	            }
-	    }
-        } else if ( colequ ) {
-	    for (j = 0; j < nrhs; ++j)
-	        for (i = 0; i < A->nrow; ++i) {
-                    cs_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], C[i]);
-	        }
-        }
-    }
-
-    if ( nofact ) {
-	
-        t0 = SuperLU_timer_();
-	/*
-	 * Gnet column permutation vector perm_c[], according to permc_spec:
-	 *   permc_spec = NATURAL:  natural ordering 
-	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
-	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
-	 *   permc_spec = COLAMD:   approximate minimum degree column ordering
-	 *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
-	 */
-	permc_spec = options->ColPerm;
-	if ( permc_spec != MY_PERMC && options->Fact == DOFACT )
-            get_perm_c(permc_spec, AA, perm_c);
-	utime[COLPERM] = SuperLU_timer_() - t0;
-
-	t0 = SuperLU_timer_();
-	sp_preorder(options, AA, perm_c, etree, &AC);
-	utime[ETREE] = SuperLU_timer_() - t0;
-    
-/*	printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", 
-	       relax, panel_size, sp_ienv(3), sp_ienv(4));
-	fflush(stdout); */
-	
-	/* Compute the LU factorization of A*Pc. */
-	t0 = SuperLU_timer_();
-	cgstrf(options, &AC, drop_tol, relax, panel_size,
-	       etree, work, lwork, perm_c, perm_r, L, U, stat, info);
-	utime[FACT] = SuperLU_timer_() - t0;
-	
-	if ( lwork == -1 ) {
-	    mem_usage->total_needed = *info - A->ncol;
-	    return;
-	}
-    }
-
-    if ( options->PivotGrowth ) {
-        if ( *info > 0 ) {
-	    if ( *info <= A->ncol ) {
-	        /* Compute the reciprocal pivot growth factor of the leading
-	           rank-deficient *info columns of A. */
-	        *recip_pivot_growth = cPivotGrowth(*info, AA, perm_c, L, U);
-	    }
-	    return;
-        }
-
-        /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */
-        *recip_pivot_growth = cPivotGrowth(A->ncol, AA, perm_c, L, U);
-    }
-
-    if ( options->ConditionNumber ) {
-        /* Estimate the reciprocal of the condition number of A. */
-        t0 = SuperLU_timer_();
-        if ( notran ) {
-	    *(unsigned char *)norm = '1';
-        } else {
-	    *(unsigned char *)norm = 'I';
-        }
-        anorm = clangs(norm, AA);
-        cgscon(norm, L, U, anorm, rcond, stat, info);
-        utime[RCOND] = SuperLU_timer_() - t0;
-    }
-    
-    if ( nrhs > 0 ) {
-        /* Compute the solution matrix X. */
-        for (j = 0; j < nrhs; j++)  /* Save a copy of the right hand sides */
-            for (i = 0; i < B->nrow; i++)
-	        Xmat[i + j*ldx] = Bmat[i + j*ldb];
-    
-        t0 = SuperLU_timer_();
-        cgstrs (trant, L, U, perm_c, perm_r, X, stat, info);
-        utime[SOLVE] = SuperLU_timer_() - t0;
-    
-        /* Use iterative refinement to improve the computed solution and compute
-           error bounds and backward error estimates for it. */
-        t0 = SuperLU_timer_();
-        if ( options->IterRefine != NOREFINE ) {
-            cgsrfs(trant, AA, L, U, perm_c, perm_r, equed, R, C, B,
-                   X, ferr, berr, stat, info);
-        } else {
-            for (j = 0; j < nrhs; ++j) ferr[j] = berr[j] = 1.0;
-        }
-        utime[REFINE] = SuperLU_timer_() - t0;
-
-        /* Transform the solution matrix X to a solution of the original system. */
-        if ( notran ) {
-	    if ( colequ ) {
-	        for (j = 0; j < nrhs; ++j)
-		    for (i = 0; i < A->nrow; ++i) {
-                        cs_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], C[i]);
-	            }
-	    }
-        } else if ( rowequ ) {
-	    for (j = 0; j < nrhs; ++j)
-	        for (i = 0; i < A->nrow; ++i) {
-                    cs_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], R[i]);
-                }
-        }
-    } /* end if nrhs > 0 */
-
-    if ( options->ConditionNumber ) {
-        /* Set INFO = A->ncol+1 if the matrix is singular to working precision. */
-        if ( *rcond < slamch_("E") ) *info = A->ncol + 1;
-    }
-
-    if ( nofact ) {
-        cQuerySpace(L, U, mem_usage);
-        Destroy_CompCol_Permuted(&AC);
-    }
-    if ( A->Stype == SLU_NR ) {
-	Destroy_SuperMatrix_Store(AA);
-	SUPERLU_FREE(AA);
-    }
-
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgstrf.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgstrf.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgstrf.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgstrf.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,433 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_cdefs.h"
-
-void
-cgstrf (superlu_options_t *options, SuperMatrix *A, float drop_tol,
-        int relax, int panel_size, int *etree, void *work, int lwork,
-        int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U,
-        SuperLUStat_t *stat, int *info)
-{
-/*
- * Purpose
- * =======
- *
- * CGSTRF computes an LU factorization of a general sparse m-by-n
- * matrix A using partial pivoting with row interchanges.
- * The factorization has the form
- *     Pr * A = L * U
- * where Pr is a row permutation matrix, L is lower triangular with unit
- * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is upper 
- * triangular (upper trapezoidal if A->nrow < A->ncol).
- *
- * See supermatrix.h for the definition of 'SuperMatrix' structure.
- *
- * Arguments
- * =========
- *
- * options (input) superlu_options_t*
- *         The structure defines the input parameters to control
- *         how the LU decomposition will be performed.
- *
- * A        (input) SuperMatrix*
- *	    Original matrix A, permuted by columns, of dimension
- *          (A->nrow, A->ncol). The type of A can be:
- *          Stype = SLU_NCP; Dtype = SLU_C; Mtype = SLU_GE.
- *
- * drop_tol (input) float (NOT IMPLEMENTED)
- *	    Drop tolerance parameter. At step j of the Gaussian elimination,
- *          if abs(A_ij)/(max_i abs(A_ij)) < drop_tol, drop entry A_ij.
- *          0 <= drop_tol <= 1. The default value of drop_tol is 0.
- *
- * relax    (input) int
- *          To control degree of relaxing supernodes. If the number
- *          of nodes (columns) in a subtree of the elimination tree is less
- *          than relax, this subtree is considered as one supernode,
- *          regardless of the row structures of those columns.
- *
- * panel_size (input) int
- *          A panel consists of at most panel_size consecutive columns.
- *
- * etree    (input) int*, dimension (A->ncol)
- *          Elimination tree of A'*A.
- *          Note: etree is a vector of parent pointers for a forest whose
- *          vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
- *          On input, the columns of A should be permuted so that the
- *          etree is in a certain postorder.
- *
- * work     (input/output) void*, size (lwork) (in bytes)
- *          User-supplied work space and space for the output data structures.
- *          Not referenced if lwork = 0;
- *
- * lwork   (input) int
- *         Specifies the size of work array in bytes.
- *         = 0:  allocate space internally by system malloc;
- *         > 0:  use user-supplied work array of length lwork in bytes,
- *               returns error if space runs out.
- *         = -1: the routine guesses the amount of space needed without
- *               performing the factorization, and returns it in
- *               *info; no other side effects.
- *
- * perm_c   (input) int*, dimension (A->ncol)
- *	    Column permutation vector, which defines the 
- *          permutation matrix Pc; perm_c[i] = j means column i of A is 
- *          in position j in A*Pc.
- *          When searching for diagonal, perm_c[*] is applied to the
- *          row subscripts of A, so that diagonal threshold pivoting
- *          can find the diagonal of A, rather than that of A*Pc.
- *
- * perm_r   (input/output) int*, dimension (A->nrow)
- *          Row permutation vector which defines the permutation matrix Pr,
- *          perm_r[i] = j means row i of A is in position j in Pr*A.
- *          If options->Fact = SamePattern_SameRowPerm, the pivoting routine
- *             will try to use the input perm_r, unless a certain threshold
- *             criterion is violated. In that case, perm_r is overwritten by
- *             a new permutation determined by partial pivoting or diagonal
- *             threshold pivoting.
- *          Otherwise, perm_r is output argument;
- *
- * L        (output) SuperMatrix*
- *          The factor L from the factorization Pr*A=L*U; use compressed row 
- *          subscripts storage for supernodes, i.e., L has type: 
- *          Stype = SLU_SC, Dtype = SLU_C, Mtype = SLU_TRLU.
- *
- * U        (output) SuperMatrix*
- *	    The factor U from the factorization Pr*A*Pc=L*U. Use column-wise
- *          storage scheme, i.e., U has types: Stype = SLU_NC, 
- *          Dtype = SLU_C, Mtype = SLU_TRU.
- *
- * stat     (output) SuperLUStat_t*
- *          Record the statistics on runtime and floating-point operation count.
- *          See util.h for the definition of 'SuperLUStat_t'.
- *
- * info     (output) int*
- *          = 0: successful exit
- *          < 0: if info = -i, the i-th argument had an illegal value
- *          > 0: if info = i, and i is
- *             <= A->ncol: U(i,i) is exactly zero. The factorization has
- *                been completed, but the factor U is exactly singular,
- *                and division by zero will occur if it is used to solve a
- *                system of equations.
- *             > A->ncol: number of bytes allocated when memory allocation
- *                failure occurred, plus A->ncol. If lwork = -1, it is
- *                the estimated amount of space needed, plus A->ncol.
- *
- * ======================================================================
- *
- * Local Working Arrays: 
- * ======================
- *   m = number of rows in the matrix
- *   n = number of columns in the matrix
- *
- *   xprune[0:n-1]: xprune[*] points to locations in subscript 
- *	vector lsub[*]. For column i, xprune[i] denotes the point where 
- *	structural pruning begins. I.e. only xlsub[i],..,xprune[i]-1 need 
- *	to be traversed for symbolic factorization.
- *
- *   marker[0:3*m-1]: marker[i] = j means that node i has been 
- *	reached when working on column j.
- *	Storage: relative to original row subscripts
- *	NOTE: There are 3 of them: marker/marker1 are used for panel dfs, 
- *	      see cpanel_dfs.c; marker2 is used for inner-factorization,
- *            see ccolumn_dfs.c.
- *
- *   parent[0:m-1]: parent vector used during dfs
- *      Storage: relative to new row subscripts
- *
- *   xplore[0:m-1]: xplore[i] gives the location of the next (dfs) 
- *	unexplored neighbor of i in lsub[*]
- *
- *   segrep[0:nseg-1]: contains the list of supernodal representatives
- *	in topological order of the dfs. A supernode representative is the 
- *	last column of a supernode.
- *      The maximum size of segrep[] is n.
- *
- *   repfnz[0:W*m-1]: for a nonzero segment U[*,j] that ends at a 
- *	supernodal representative r, repfnz[r] is the location of the first 
- *	nonzero in this segment.  It is also used during the dfs: repfnz[r]>0
- *	indicates the supernode r has been explored.
- *	NOTE: There are W of them, each used for one column of a panel. 
- *
- *   panel_lsub[0:W*m-1]: temporary for the nonzeros row indices below 
- *      the panel diagonal. These are filled in during cpanel_dfs(), and are
- *      used later in the inner LU factorization within the panel.
- *	panel_lsub[]/dense[] pair forms the SPA data structure.
- *	NOTE: There are W of them.
- *
- *   dense[0:W*m-1]: sparse accumulating (SPA) vector for intermediate values;
- *	    	   NOTE: there are W of them.
- *
- *   tempv[0:*]: real temporary used for dense numeric kernels;
- *	The size of this array is defined by NUM_TEMPV() in csp_defs.h.
- *
- */
-    /* Local working arrays */
-    NCPformat *Astore;
-    int       *iperm_r = NULL; /* inverse of perm_r; used when 
-                                  options->Fact == SamePattern_SameRowPerm */
-    int       *iperm_c; /* inverse of perm_c */
-    int       *iwork;
-    complex    *cwork;
-    int	      *segrep, *repfnz, *parent, *xplore;
-    int	      *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */
-    int	      *xprune;
-    int	      *marker;
-    complex    *dense, *tempv;
-    int       *relax_end;
-    complex    *a;
-    int       *asub;
-    int       *xa_begin, *xa_end;
-    int       *xsup, *supno;
-    int       *xlsub, *xlusup, *xusub;
-    int       nzlumax;
-    static GlobalLU_t Glu; /* persistent to facilitate multiple factors. */
-
-    /* Local scalars */
-    fact_t    fact = options->Fact;
-    double    diag_pivot_thresh = options->DiagPivotThresh;
-    int       pivrow;   /* pivotal row number in the original matrix A */
-    int       nseg1;	/* no of segments in U-column above panel row jcol */
-    int       nseg;	/* no of segments in each U-column */
-    register int jcol;	
-    register int kcol;	/* end column of a relaxed snode */
-    register int icol;
-    register int i, k, jj, new_next, iinfo;
-    int       m, n, min_mn, jsupno, fsupc, nextlu, nextu;
-    int       w_def;	/* upper bound on panel width */
-    int       usepr, iperm_r_allocated = 0;
-    int       nnzL, nnzU;
-    int       *panel_histo = stat->panel_histo;
-    flops_t   *ops = stat->ops;
-
-    iinfo    = 0;
-    m        = A->nrow;
-    n        = A->ncol;
-    min_mn   = SUPERLU_MIN(m, n);
-    Astore   = A->Store;
-    a        = Astore->nzval;
-    asub     = Astore->rowind;
-    xa_begin = Astore->colbeg;
-    xa_end   = Astore->colend;
-
-    /* Allocate storage common to the factor routines */
-    *info = cLUMemInit(fact, work, lwork, m, n, Astore->nnz,
-                       panel_size, L, U, &Glu, &iwork, &cwork);
-    if ( *info ) return;
-    
-    xsup    = Glu.xsup;
-    supno   = Glu.supno;
-    xlsub   = Glu.xlsub;
-    xlusup  = Glu.xlusup;
-    xusub   = Glu.xusub;
-    
-    SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore,
-	     &repfnz, &panel_lsub, &xprune, &marker);
-    cSetRWork(m, panel_size, cwork, &dense, &tempv);
-    
-    usepr = (fact == SamePattern_SameRowPerm);
-    if ( usepr ) {
-	/* Compute the inverse of perm_r */
-	iperm_r = (int *) intMalloc(m);
-	for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k;
-	iperm_r_allocated = 1;
-    }
-    iperm_c = (int *) intMalloc(n);
-    for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k;
-
-    /* Identify relaxed snodes */
-    relax_end = (int *) intMalloc(n);
-    if ( options->SymmetricMode == YES ) {
-        heap_relax_snode(n, etree, relax, marker, relax_end); 
-    } else {
-        relax_snode(n, etree, relax, marker, relax_end); 
-    }
-    
-    ifill (perm_r, m, EMPTY);
-    ifill (marker, m * NO_MARKER, EMPTY);
-    supno[0] = -1;
-    xsup[0]  = xlsub[0] = xusub[0] = xlusup[0] = 0;
-    w_def    = panel_size;
-
-    /* 
-     * Work on one "panel" at a time. A panel is one of the following: 
-     *	   (a) a relaxed supernode at the bottom of the etree, or
-     *	   (b) panel_size contiguous columns, defined by the user
-     */
-    for (jcol = 0; jcol < min_mn; ) {
-
-	if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */
-   	    kcol = relax_end[jcol];	  /* end of the relaxed snode */
-	    panel_histo[kcol-jcol+1]++;
-
-	    /* --------------------------------------
-	     * Factorize the relaxed supernode(jcol:kcol) 
-	     * -------------------------------------- */
-	    /* Determine the union of the row structure of the snode */
-	    if ( (*info = csnode_dfs(jcol, kcol, asub, xa_begin, xa_end,
-				    xprune, marker, &Glu)) != 0 )
-		return;
-
-            nextu    = xusub[jcol];
-	    nextlu   = xlusup[jcol];
-	    jsupno   = supno[jcol];
-	    fsupc    = xsup[jsupno];
-	    new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1);
-	    nzlumax = Glu.nzlumax;
-	    while ( new_next > nzlumax ) {
-		if ( (*info = cLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, &Glu)) )
-		    return;
-	    }
-    
-	    for (icol = jcol; icol<= kcol; icol++) {
-		xusub[icol+1] = nextu;
-		
-    		/* Scatter into SPA dense[*] */
-    		for (k = xa_begin[icol]; k < xa_end[icol]; k++)
-        	    dense[asub[k]] = a[k];
-
-	       	/* Numeric update within the snode */
-	        csnode_bmod(icol, jsupno, fsupc, dense, tempv, &Glu, stat);
-
-		if ( (*info = cpivotL(icol, diag_pivot_thresh, &usepr, perm_r,
-				      iperm_r, iperm_c, &pivrow, &Glu, stat)) )
-		    if ( iinfo == 0 ) iinfo = *info;
-		
-#ifdef DEBUG
-		cprint_lu_col("[1]: ", icol, pivrow, xprune, &Glu);
-#endif
-
-	    }
-
-	    jcol = icol;
-
-	} else { /* Work on one panel of panel_size columns */
-	    
-	    /* Adjust panel_size so that a panel won't overlap with the next 
-	     * relaxed snode.
-	     */
-	    panel_size = w_def;
-	    for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++) 
-		if ( relax_end[k] != EMPTY ) {
-		    panel_size = k - jcol;
-		    break;
-		}
-	    if ( k == min_mn ) panel_size = min_mn - jcol;
-	    panel_histo[panel_size]++;
-
-	    /* symbolic factor on a panel of columns */
-	    cpanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1,
-		      dense, panel_lsub, segrep, repfnz, xprune,
-		      marker, parent, xplore, &Glu);
-	    
-	    /* numeric sup-panel updates in topological order */
-	    cpanel_bmod(m, panel_size, jcol, nseg1, dense,
-		        tempv, segrep, repfnz, &Glu, stat);
-	    
-	    /* Sparse LU within the panel, and below panel diagonal */
-    	    for ( jj = jcol; jj < jcol + panel_size; jj++) {
- 		k = (jj - jcol) * m; /* column index for w-wide arrays */
-
-		nseg = nseg1;	/* Begin after all the panel segments */
-
-	    	if ((*info = ccolumn_dfs(m, jj, perm_r, &nseg, &panel_lsub[k],
-					segrep, &repfnz[k], xprune, marker,
-					parent, xplore, &Glu)) != 0) return;
-
-	      	/* Numeric updates */
-	    	if ((*info = ccolumn_bmod(jj, (nseg - nseg1), &dense[k],
-					 tempv, &segrep[nseg1], &repfnz[k],
-					 jcol, &Glu, stat)) != 0) return;
-		
-	        /* Copy the U-segments to ucol[*] */
-		if ((*info = ccopy_to_ucol(jj, nseg, segrep, &repfnz[k],
-					  perm_r, &dense[k], &Glu)) != 0)
-		    return;
-
-	    	if ( (*info = cpivotL(jj, diag_pivot_thresh, &usepr, perm_r,
-				      iperm_r, iperm_c, &pivrow, &Glu, stat)) )
-		    if ( iinfo == 0 ) iinfo = *info;
-
-		/* Prune columns (0:jj-1) using column jj */
-	    	cpruneL(jj, perm_r, pivrow, nseg, segrep,
-                        &repfnz[k], xprune, &Glu);
-
-		/* Reset repfnz[] for this column */
-	    	resetrep_col (nseg, segrep, &repfnz[k]);
-		
-#ifdef DEBUG
-		cprint_lu_col("[2]: ", jj, pivrow, xprune, &Glu);
-#endif
-
-	    }
-
-   	    jcol += panel_size;	/* Move to the next panel */
-
-	} /* else */
-
-    } /* for */
-
-    *info = iinfo;
-    
-    if ( m > n ) {
-	k = 0;
-        for (i = 0; i < m; ++i) 
-            if ( perm_r[i] == EMPTY ) {
-    		perm_r[i] = n + k;
-		++k;
-	    }
-    }
-
-    countnz(min_mn, xprune, &nnzL, &nnzU, &Glu);
-    fixupL(min_mn, perm_r, &Glu);
-
-    cLUWorkFree(iwork, cwork, &Glu); /* Free work space and compress storage */
-
-    if ( fact == SamePattern_SameRowPerm ) {
-        /* L and U structures may have changed due to possibly different
-	   pivoting, even though the storage is available.
-	   There could also be memory expansions, so the array locations
-           may have changed, */
-        ((SCformat *)L->Store)->nnz = nnzL;
-	((SCformat *)L->Store)->nsuper = Glu.supno[n];
-	((SCformat *)L->Store)->nzval = Glu.lusup;
-	((SCformat *)L->Store)->nzval_colptr = Glu.xlusup;
-	((SCformat *)L->Store)->rowind = Glu.lsub;
-	((SCformat *)L->Store)->rowind_colptr = Glu.xlsub;
-	((NCformat *)U->Store)->nnz = nnzU;
-	((NCformat *)U->Store)->nzval = Glu.ucol;
-	((NCformat *)U->Store)->rowind = Glu.usub;
-	((NCformat *)U->Store)->colptr = Glu.xusub;
-    } else {
-        cCreate_SuperNode_Matrix(L, A->nrow, min_mn, nnzL, Glu.lusup, 
-	                         Glu.xlusup, Glu.lsub, Glu.xlsub, Glu.supno,
-			         Glu.xsup, SLU_SC, SLU_C, SLU_TRLU);
-    	cCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu.ucol, 
-			       Glu.usub, Glu.xusub, SLU_NC, SLU_C, SLU_TRU);
-    }
-    
-    ops[FACT] += ops[TRSV] + ops[GEMV];	
-    
-    if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r);
-    SUPERLU_FREE (iperm_c);
-    SUPERLU_FREE (relax_end);
-
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgstrs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgstrs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cgstrs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cgstrs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,345 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_cdefs.h"
-
-
-/* 
- * Function prototypes 
- */
-void cusolve(int, int, complex*, complex*);
-void clsolve(int, int, complex*, complex*);
-void cmatvec(int, int, int, complex*, complex*, complex*);
-
-
-void
-cgstrs (trans_t trans, SuperMatrix *L, SuperMatrix *U,
-        int *perm_c, int *perm_r, SuperMatrix *B,
-        SuperLUStat_t *stat, int *info)
-{
-/*
- * Purpose
- * =======
- *
- * CGSTRS solves a system of linear equations A*X=B or A'*X=B
- * with A sparse and B dense, using the LU factorization computed by
- * CGSTRF.
- *
- * See supermatrix.h for the definition of 'SuperMatrix' structure.
- *
- * Arguments
- * =========
- *
- * trans   (input) trans_t
- *          Specifies the form of the system of equations:
- *          = NOTRANS: A * X = B  (No transpose)
- *          = TRANS:   A'* X = B  (Transpose)
- *          = CONJ:    A**H * X = B  (Conjugate transpose)
- *
- * L       (input) SuperMatrix*
- *         The factor L from the factorization Pr*A*Pc=L*U as computed by
- *         cgstrf(). Use compressed row subscripts storage for supernodes,
- *         i.e., L has types: Stype = SLU_SC, Dtype = SLU_C, Mtype = SLU_TRLU.
- *
- * U       (input) SuperMatrix*
- *         The factor U from the factorization Pr*A*Pc=L*U as computed by
- *         cgstrf(). Use column-wise storage scheme, i.e., U has types:
- *         Stype = SLU_NC, Dtype = SLU_C, Mtype = SLU_TRU.
- *
- * perm_c  (input) int*, dimension (L->ncol)
- *	   Column permutation vector, which defines the 
- *         permutation matrix Pc; perm_c[i] = j means column i of A is 
- *         in position j in A*Pc.
- *
- * perm_r  (input) int*, dimension (L->nrow)
- *         Row permutation vector, which defines the permutation matrix Pr; 
- *         perm_r[i] = j means row i of A is in position j in Pr*A.
- *
- * B       (input/output) SuperMatrix*
- *         B has types: Stype = SLU_DN, Dtype = SLU_C, Mtype = SLU_GE.
- *         On entry, the right hand side matrix.
- *         On exit, the solution matrix if info = 0;
- *
- * stat     (output) SuperLUStat_t*
- *          Record the statistics on runtime and floating-point operation count.
- *          See util.h for the definition of 'SuperLUStat_t'.
- *
- * info    (output) int*
- * 	   = 0: successful exit
- *	   < 0: if info = -i, the i-th argument had an illegal value
- *
- */
-#ifdef _CRAY
-    _fcd ftcs1, ftcs2, ftcs3, ftcs4;
-#endif
-    int      incx = 1, incy = 1;
-#ifdef USE_VENDOR_BLAS
-    complex   alpha = {1.0, 0.0}, beta = {1.0, 0.0};
-    complex   *work_col;
-#endif
-    complex   temp_comp;
-    DNformat *Bstore;
-    complex   *Bmat;
-    SCformat *Lstore;
-    NCformat *Ustore;
-    complex   *Lval, *Uval;
-    int      fsupc, nrow, nsupr, nsupc, luptr, istart, irow;
-    int      i, j, k, iptr, jcol, n, ldb, nrhs;
-    complex   *work, *rhs_work, *soln;
-    flops_t  solve_ops;
-    void cprint_soln();
-
-    /* Test input parameters ... */
-    *info = 0;
-    Bstore = B->Store;
-    ldb = Bstore->lda;
-    nrhs = B->ncol;
-    if ( trans != NOTRANS && trans != TRANS && trans != CONJ ) *info = -1;
-    else if ( L->nrow != L->ncol || L->nrow < 0 ||
-	      L->Stype != SLU_SC || L->Dtype != SLU_C || L->Mtype != SLU_TRLU )
-	*info = -2;
-    else if ( U->nrow != U->ncol || U->nrow < 0 ||
-	      U->Stype != SLU_NC || U->Dtype != SLU_C || U->Mtype != SLU_TRU )
-	*info = -3;
-    else if ( ldb < SUPERLU_MAX(0, L->nrow) ||
-	      B->Stype != SLU_DN || B->Dtype != SLU_C || B->Mtype != SLU_GE )
-	*info = -6;
-    if ( *info ) {
-	i = -(*info);
-	xerbla_("cgstrs", &i);
-	return;
-    }
-
-    n = L->nrow;
-    work = complexCalloc(n * nrhs);
-    if ( !work ) ABORT("Malloc fails for local work[].");
-    soln = complexMalloc(n);
-    if ( !soln ) ABORT("Malloc fails for local soln[].");
-
-    Bmat = Bstore->nzval;
-    Lstore = L->Store;
-    Lval = Lstore->nzval;
-    Ustore = U->Store;
-    Uval = Ustore->nzval;
-    solve_ops = 0;
-    
-    if ( trans == NOTRANS ) {
-	/* Permute right hand sides to form Pr*B */
-	for (i = 0; i < nrhs; i++) {
-	    rhs_work = &Bmat[i*ldb];
-	    for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k];
-	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
-	}
-	
-	/* Forward solve PLy=Pb. */
-	for (k = 0; k <= Lstore->nsuper; k++) {
-	    fsupc = L_FST_SUPC(k);
-	    istart = L_SUB_START(fsupc);
-	    nsupr = L_SUB_START(fsupc+1) - istart;
-	    nsupc = L_FST_SUPC(k+1) - fsupc;
-	    nrow = nsupr - nsupc;
-
-	    solve_ops += 4 * nsupc * (nsupc - 1) * nrhs;
-	    solve_ops += 8 * nrow * nsupc * nrhs;
-	    
-	    if ( nsupc == 1 ) {
-		for (j = 0; j < nrhs; j++) {
-		    rhs_work = &Bmat[j*ldb];
-	    	    luptr = L_NZ_START(fsupc);
-		    for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); iptr++){
-			irow = L_SUB(iptr);
-			++luptr;
-			cc_mult(&temp_comp, &rhs_work[fsupc], &Lval[luptr]);
-			c_sub(&rhs_work[irow], &rhs_work[irow], &temp_comp);
-		    }
-		}
-	    } else {
-	    	luptr = L_NZ_START(fsupc);
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		ftcs1 = _cptofcd("L", strlen("L"));
-		ftcs2 = _cptofcd("N", strlen("N"));
-		ftcs3 = _cptofcd("U", strlen("U"));
-		CTRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha,
-		       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
-		
-		CGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha, 
-			&Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, 
-			&beta, &work[0], &n );
-#else
-		ctrsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha,
-		       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
-		
-		cgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha, 
-			&Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, 
-			&beta, &work[0], &n );
-#endif
-		for (j = 0; j < nrhs; j++) {
-		    rhs_work = &Bmat[j*ldb];
-		    work_col = &work[j*n];
-		    iptr = istart + nsupc;
-		    for (i = 0; i < nrow; i++) {
-			irow = L_SUB(iptr);
-			c_sub(&rhs_work[irow], &rhs_work[irow], &work_col[i]);
-			work_col[i].r = 0.0;
-	                work_col[i].i = 0.0;
-			iptr++;
-		    }
-		}
-#else		
-		for (j = 0; j < nrhs; j++) {
-		    rhs_work = &Bmat[j*ldb];
-		    clsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]);
-		    cmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc],
-			    &rhs_work[fsupc], &work[0] );
-
-		    iptr = istart + nsupc;
-		    for (i = 0; i < nrow; i++) {
-			irow = L_SUB(iptr);
-			c_sub(&rhs_work[irow], &rhs_work[irow], &work[i]);
-			work[i].r = 0.;
-	                work[i].i = 0.;
-			iptr++;
-		    }
-		}
-#endif		    
-	    } /* else ... */
-	} /* for L-solve */
-
-#ifdef DEBUG
-  	printf("After L-solve: y=\n");
-	cprint_soln(n, nrhs, Bmat);
-#endif
-
-	/*
-	 * Back solve Ux=y.
-	 */
-	for (k = Lstore->nsuper; k >= 0; k--) {
-	    fsupc = L_FST_SUPC(k);
-	    istart = L_SUB_START(fsupc);
-	    nsupr = L_SUB_START(fsupc+1) - istart;
-	    nsupc = L_FST_SUPC(k+1) - fsupc;
-	    luptr = L_NZ_START(fsupc);
-
-	    solve_ops += 4 * nsupc * (nsupc + 1) * nrhs;
-
-	    if ( nsupc == 1 ) {
-		rhs_work = &Bmat[0];
-		for (j = 0; j < nrhs; j++) {
-		    c_div(&rhs_work[fsupc], &rhs_work[fsupc], &Lval[luptr]);
-		    rhs_work += ldb;
-		}
-	    } else {
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		ftcs1 = _cptofcd("L", strlen("L"));
-		ftcs2 = _cptofcd("U", strlen("U"));
-		ftcs3 = _cptofcd("N", strlen("N"));
-		CTRSM( ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha,
-		       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
-#else
-		ctrsm_("L", "U", "N", "N", &nsupc, &nrhs, &alpha,
-		       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
-#endif
-#else		
-		for (j = 0; j < nrhs; j++)
-		    cusolve ( nsupr, nsupc, &Lval[luptr], &Bmat[fsupc+j*ldb] );
-#endif		
-	    }
-
-	    for (j = 0; j < nrhs; ++j) {
-		rhs_work = &Bmat[j*ldb];
-		for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) {
-		    solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
-		    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++ ){
-			irow = U_SUB(i);
-			cc_mult(&temp_comp, &rhs_work[jcol], &Uval[i]);
-			c_sub(&rhs_work[irow], &rhs_work[irow], &temp_comp);
-		    }
-		}
-	    }
-	    
-	} /* for U-solve */
-
-#ifdef DEBUG
-  	printf("After U-solve: x=\n");
-	cprint_soln(n, nrhs, Bmat);
-#endif
-
-	/* Compute the final solution X := Pc*X. */
-	for (i = 0; i < nrhs; i++) {
-	    rhs_work = &Bmat[i*ldb];
-	    for (k = 0; k < n; k++) soln[k] = rhs_work[perm_c[k]];
-	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
-	}
-	
-        stat->ops[SOLVE] = solve_ops;
-
-    } else { /* Solve A'*X=B or CONJ(A)*X=B */
-	/* Permute right hand sides to form Pc'*B. */
-	for (i = 0; i < nrhs; i++) {
-	    rhs_work = &Bmat[i*ldb];
-	    for (k = 0; k < n; k++) soln[perm_c[k]] = rhs_work[k];
-	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
-	}
-
-	stat->ops[SOLVE] = 0;
-        if (trans == TRANS) {
-	    for (k = 0; k < nrhs; ++k) {
-	        /* Multiply by inv(U'). */
-	        sp_ctrsv("U", "T", "N", L, U, &Bmat[k*ldb], stat, info);
-	    
-	        /* Multiply by inv(L'). */
-	        sp_ctrsv("L", "T", "U", L, U, &Bmat[k*ldb], stat, info);
-	    }
-         } else { /* trans == CONJ */
-            for (k = 0; k < nrhs; ++k) {                
-                /* Multiply by conj(inv(U')). */
-                sp_ctrsv("U", "C", "N", L, U, &Bmat[k*ldb], stat, info);
-                
-                /* Multiply by conj(inv(L')). */
-                sp_ctrsv("L", "C", "U", L, U, &Bmat[k*ldb], stat, info);
-	    }
-         }
-	/* Compute the final solution X := Pr'*X (=inv(Pr)*X) */
-	for (i = 0; i < nrhs; i++) {
-	    rhs_work = &Bmat[i*ldb];
-	    for (k = 0; k < n; k++) soln[k] = rhs_work[perm_r[k]];
-	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
-	}
-
-    }
-
-    SUPERLU_FREE(work);
-    SUPERLU_FREE(soln);
-}
-
-/*
- * Diagnostic print of the solution vector 
- */
-void
-cprint_soln(int n, int nrhs, complex *soln)
-{
-    int i;
-
-    for (i = 0; i < n; i++) 
-  	printf("\t%d: %.4f\n", i, soln[i]);
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/clacon.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/clacon.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/clacon.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/clacon.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,213 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-#include <math.h>
-#include "slu_Cnames.h"
-#include "slu_scomplex.h"
-
-int
-clacon_(int *n, complex *v, complex *x, float *est, int *kase)
-
-{
-/*
-    Purpose   
-    =======   
-
-    CLACON estimates the 1-norm of a square matrix A.   
-    Reverse communication is used for evaluating matrix-vector products. 
-  
-
-    Arguments   
-    =========   
-
-    N      (input) INT
-           The order of the matrix.  N >= 1.   
-
-    V      (workspace) COMPLEX PRECISION array, dimension (N)   
-           On the final return, V = A*W,  where  EST = norm(V)/norm(W)   
-           (W is not returned).   
-
-    X      (input/output) COMPLEX PRECISION array, dimension (N)   
-           On an intermediate return, X should be overwritten by   
-                 A * X,   if KASE=1,   
-                 A' * X,  if KASE=2,
-           where A' is the conjugate transpose of A,
-           and CLACON must be re-called with all the other parameters   
-           unchanged.   
-
-
-    EST    (output) FLOAT PRECISION   
-           An estimate (a lower bound) for norm(A).   
-
-    KASE   (input/output) INT
-           On the initial call to CLACON, KASE should be 0.   
-           On an intermediate return, KASE will be 1 or 2, indicating   
-           whether X should be overwritten by A * X  or A' * X.   
-           On the final return from CLACON, KASE will again be 0.   
-
-    Further Details   
-    ======= =======   
-
-    Contributed by Nick Higham, University of Manchester.   
-    Originally named CONEST, dated March 16, 1988.   
-
-    Reference: N.J. Higham, "FORTRAN codes for estimating the one-norm of 
-    a real or complex matrix, with applications to condition estimation", 
-    ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988.   
-    ===================================================================== 
-*/
-
-    /* Table of constant values */
-    int c__1 = 1;
-    complex      zero = {0.0, 0.0};
-    complex      one = {1.0, 0.0};
-
-    /* System generated locals */
-    float d__1;
-    
-    /* Local variables */
-    static int iter;
-    static int jump, jlast;
-    static float altsgn, estold;
-    static int i, j;
-    float temp;
-    float safmin;
-    extern double slamch_(char *);
-    extern int icmax1_(int *, complex *, int *);
-    extern double scsum1_(int *, complex *, int *);
-
-    safmin = slamch_("Safe minimum");
-    if ( *kase == 0 ) {
-	for (i = 0; i < *n; ++i) {
-	    x[i].r = 1. / (float) (*n);
-	    x[i].i = 0.;
-	}
-	*kase = 1;
-	jump = 1;
-	return 0;
-    }
-
-    switch (jump) {
-	case 1:  goto L20;
-	case 2:  goto L40;
-	case 3:  goto L70;
-	case 4:  goto L110;
-	case 5:  goto L140;
-    }
-
-    /*     ................ ENTRY   (JUMP = 1)   
-	   FIRST ITERATION.  X HAS BEEN OVERWRITTEN BY A*X. */
-  L20:
-    if (*n == 1) {
-	v[0] = x[0];
-	*est = c_abs(&v[0]);
-	/*        ... QUIT */
-	goto L150;
-    }
-    *est = scsum1_(n, x, &c__1);
-
-    for (i = 0; i < *n; ++i) {
-	d__1 = c_abs(&x[i]);
-	if (d__1 > safmin) {
-	    d__1 = 1 / d__1;
-	    x[i].r *= d__1;
-	    x[i].i *= d__1;
-	} else {
-	    x[i] = one;
-	}
-    }
-    *kase = 2;
-    jump = 2;
-    return 0;
-
-    /*     ................ ENTRY   (JUMP = 2)   
-	   FIRST ITERATION.  X HAS BEEN OVERWRITTEN BY TRANSPOSE(A)*X. */
-L40:
-    j = icmax1_(n, &x[0], &c__1);
-    --j;
-    iter = 2;
-
-    /*     MAIN LOOP - ITERATIONS 2,3,...,ITMAX. */
-L50:
-    for (i = 0; i < *n; ++i) x[i] = zero;
-    x[j] = one;
-    *kase = 1;
-    jump = 3;
-    return 0;
-
-    /*     ................ ENTRY   (JUMP = 3)   
-	   X HAS BEEN OVERWRITTEN BY A*X. */
-L70:
-#ifdef _CRAY
-    CCOPY(n, x, &c__1, v, &c__1);
-#else
-    ccopy_(n, x, &c__1, v, &c__1);
-#endif
-    estold = *est;
-    *est = scsum1_(n, v, &c__1);
-
-
-L90:
-    /*     TEST FOR CYCLING. */
-    if (*est <= estold) goto L120;
-
-    for (i = 0; i < *n; ++i) {
-	d__1 = c_abs(&x[i]);
-	if (d__1 > safmin) {
-	    d__1 = 1 / d__1;
-	    x[i].r *= d__1;
-	    x[i].i *= d__1;
-	} else {
-	    x[i] = one;
-	}
-    }
-    *kase = 2;
-    jump = 4;
-    return 0;
-
-    /*     ................ ENTRY   (JUMP = 4)   
-	   X HAS BEEN OVERWRITTEN BY TRANDPOSE(A)*X. */
-L110:
-    jlast = j;
-    j = icmax1_(n, &x[0], &c__1);
-    --j;
-    if (x[jlast].r != (d__1 = x[j].r, fabs(d__1)) && iter < 5) {
-	++iter;
-	goto L50;
-    }
-
-    /*     ITERATION COMPLETE.  FINAL STAGE. */
-L120:
-    altsgn = 1.;
-    for (i = 1; i <= *n; ++i) {
-	x[i-1].r = altsgn * ((float)(i - 1) / (float)(*n - 1) + 1.);
-	x[i-1].i = 0.;
-	altsgn = -altsgn;
-    }
-    *kase = 1;
-    jump = 5;
-    return 0;
-    
-    /*     ................ ENTRY   (JUMP = 5)   
-	   X HAS BEEN OVERWRITTEN BY A*X. */
-L140:
-    temp = scsum1_(n, x, &c__1) / (float)(*n * 3) * 2.;
-    if (temp > *est) {
-#ifdef _CRAY
-	CCOPY(n, &x[0], &c__1, &v[0], &c__1);
-#else
-	ccopy_(n, &x[0], &c__1, &v[0], &c__1);
-#endif
-	*est = temp;
-    }
-
-L150:
-    *kase = 0;
-    return 0;
-
-} /* clacon_ */
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/clangs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/clangs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/clangs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/clangs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,110 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * File name:	clangs.c
- * History:     Modified from lapack routine CLANGE
- */
-#include <math.h>
-#include "slu_cdefs.h"
-
-float clangs(char *norm, SuperMatrix *A)
-{
-/* 
-    Purpose   
-    =======   
-
-    CLANGS returns the value of the one norm, or the Frobenius norm, or 
-    the infinity norm, or the element of largest absolute value of a 
-    real matrix A.   
-
-    Description   
-    ===========   
-
-    CLANGE returns the value   
-
-       CLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'   
-                (   
-                ( norm1(A),         NORM = '1', 'O' or 'o'   
-                (   
-                ( normI(A),         NORM = 'I' or 'i'   
-                (   
-                ( normF(A),         NORM = 'F', 'f', 'E' or 'e'   
-
-    where  norm1  denotes the  one norm of a matrix (maximum column sum), 
-    normI  denotes the  infinity norm  of a matrix  (maximum row sum) and 
-    normF  denotes the  Frobenius norm of a matrix (square root of sum of 
-    squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.   
-
-    Arguments   
-    =========   
-
-    NORM    (input) CHARACTER*1   
-            Specifies the value to be returned in CLANGE as described above.   
-    A       (input) SuperMatrix*
-            The M by N sparse matrix A. 
-
-   ===================================================================== 
-*/
-    
-    /* Local variables */
-    NCformat *Astore;
-    complex   *Aval;
-    int      i, j, irow;
-    float   value, sum;
-    float   *rwork;
-
-    Astore = A->Store;
-    Aval   = Astore->nzval;
-    
-    if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) {
-	value = 0.;
-	
-    } else if (lsame_(norm, "M")) {
-	/* Find max(abs(A(i,j))). */
-	value = 0.;
-	for (j = 0; j < A->ncol; ++j)
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++)
-		value = SUPERLU_MAX( value, c_abs( &Aval[i]) );
-	
-    } else if (lsame_(norm, "O") || *(unsigned char *)norm == '1') {
-	/* Find norm1(A). */
-	value = 0.;
-	for (j = 0; j < A->ncol; ++j) {
-	    sum = 0.;
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) 
-		sum += c_abs( &Aval[i] );
-	    value = SUPERLU_MAX(value,sum);
-	}
-	
-    } else if (lsame_(norm, "I")) {
-	/* Find normI(A). */
-	if ( !(rwork = (float *) SUPERLU_MALLOC(A->nrow * sizeof(float))) )
-	    ABORT("SUPERLU_MALLOC fails for rwork.");
-	for (i = 0; i < A->nrow; ++i) rwork[i] = 0.;
-	for (j = 0; j < A->ncol; ++j)
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) {
-		irow = Astore->rowind[i];
-		rwork[irow] += c_abs( &Aval[i] );
-	    }
-	value = 0.;
-	for (i = 0; i < A->nrow; ++i)
-	    value = SUPERLU_MAX(value, rwork[i]);
-	
-	SUPERLU_FREE (rwork);
-	
-    } else if (lsame_(norm, "F") || lsame_(norm, "E")) {
-	/* Find normF(A). */
-	ABORT("Not implemented.");
-    } else
-	ABORT("Illegal norm specified.");
-
-    return (value);
-
-} /* clangs */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/claqgs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/claqgs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/claqgs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/claqgs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,138 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * File name:	claqgs.c
- * History:     Modified from LAPACK routine CLAQGE
- */
-#include <math.h>
-#include "slu_cdefs.h"
-
-void
-claqgs(SuperMatrix *A, float *r, float *c, 
-	float rowcnd, float colcnd, float amax, char *equed)
-{
-/*
-    Purpose   
-    =======   
-
-    CLAQGS equilibrates a general sparse M by N matrix A using the row and   
-    scaling factors in the vectors R and C.   
-
-    See supermatrix.h for the definition of 'SuperMatrix' structure.
-
-    Arguments   
-    =========   
-
-    A       (input/output) SuperMatrix*
-            On exit, the equilibrated matrix.  See EQUED for the form of 
-            the equilibrated matrix. The type of A can be:
-	    Stype = NC; Dtype = SLU_C; Mtype = GE.
-	    
-    R       (input) float*, dimension (A->nrow)
-            The row scale factors for A.
-	    
-    C       (input) float*, dimension (A->ncol)
-            The column scale factors for A.
-	    
-    ROWCND  (input) float
-            Ratio of the smallest R(i) to the largest R(i).
-	    
-    COLCND  (input) float
-            Ratio of the smallest C(i) to the largest C(i).
-	    
-    AMAX    (input) float
-            Absolute value of largest matrix entry.
-	    
-    EQUED   (output) char*
-            Specifies the form of equilibration that was done.   
-            = 'N':  No equilibration   
-            = 'R':  Row equilibration, i.e., A has been premultiplied by  
-                    diag(R).   
-            = 'C':  Column equilibration, i.e., A has been postmultiplied  
-                    by diag(C).   
-            = 'B':  Both row and column equilibration, i.e., A has been
-                    replaced by diag(R) * A * diag(C).   
-
-    Internal Parameters   
-    ===================   
-
-    THRESH is a threshold value used to decide if row or column scaling   
-    should be done based on the ratio of the row or column scaling   
-    factors.  If ROWCND < THRESH, row scaling is done, and if   
-    COLCND < THRESH, column scaling is done.   
-
-    LARGE and SMALL are threshold values used to decide if row scaling   
-    should be done based on the absolute size of the largest matrix   
-    element.  If AMAX > LARGE or AMAX < SMALL, row scaling is done.   
-
-    ===================================================================== 
-*/
-
-#define THRESH    (0.1)
-    
-    /* Local variables */
-    NCformat *Astore;
-    complex   *Aval;
-    int i, j, irow;
-    float large, small, cj;
-    extern double slamch_(char *);
-    float temp;
-
-
-    /* Quick return if possible */
-    if (A->nrow <= 0 || A->ncol <= 0) {
-	*(unsigned char *)equed = 'N';
-	return;
-    }
-
-    Astore = A->Store;
-    Aval = Astore->nzval;
-    
-    /* Initialize LARGE and SMALL. */
-    small = slamch_("Safe minimum") / slamch_("Precision");
-    large = 1. / small;
-
-    if (rowcnd >= THRESH && amax >= small && amax <= large) {
-	if (colcnd >= THRESH)
-	    *(unsigned char *)equed = 'N';
-	else {
-	    /* Column scaling */
-	    for (j = 0; j < A->ncol; ++j) {
-		cj = c[j];
-		for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-		    cs_mult(&Aval[i], &Aval[i], cj);
-                }
-	    }
-	    *(unsigned char *)equed = 'C';
-	}
-    } else if (colcnd >= THRESH) {
-	/* Row scaling, no column scaling */
-	for (j = 0; j < A->ncol; ++j)
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-		irow = Astore->rowind[i];
-		cs_mult(&Aval[i], &Aval[i], r[irow]);
-	    }
-	*(unsigned char *)equed = 'R';
-    } else {
-	/* Row and column scaling */
-	for (j = 0; j < A->ncol; ++j) {
-	    cj = c[j];
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-		irow = Astore->rowind[i];
-		temp = cj * r[irow];
-		cs_mult(&Aval[i], &Aval[i], temp);
-	    }
-	}
-	*(unsigned char *)equed = 'B';
-    }
-
-    return;
-
-} /* claqgs */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cmemory.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cmemory.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cmemory.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cmemory.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,680 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-#include "slu_cdefs.h"
-
-/* Constants */
-#define NO_MEMTYPE  4      /* 0: lusup;
-			      1: ucol;
-			      2: lsub;
-			      3: usub */
-#define GluIntArray(n)   (5 * (n) + 5)
-
-/* Internal prototypes */
-void  *cexpand (int *, MemType,int, int, GlobalLU_t *);
-int   cLUWorkInit (int, int, int, int **, complex **, LU_space_t);
-void  copy_mem_complex (int, void *, void *);
-void  cStackCompress (GlobalLU_t *);
-void  cSetupSpace (void *, int, LU_space_t *);
-void  *cuser_malloc (int, int);
-void  cuser_free (int, int);
-
-/* External prototypes (in memory.c - prec-indep) */
-extern void    copy_mem_int    (int, void *, void *);
-extern void    user_bcopy      (char *, char *, int);
-
-/* Headers for 4 types of dynamatically managed memory */
-typedef struct e_node {
-    int size;      /* length of the memory that has been used */
-    void *mem;     /* pointer to the new malloc'd store */
-} ExpHeader;
-
-typedef struct {
-    int  size;
-    int  used;
-    int  top1;  /* grow upward, relative to &array[0] */
-    int  top2;  /* grow downward */
-    void *array;
-} LU_stack_t;
-
-/* Variables local to this file */
-static ExpHeader *expanders = 0; /* Array of pointers to 4 types of memory */
-static LU_stack_t stack;
-static int no_expand;
-
-/* Macros to manipulate stack */
-#define StackFull(x)         ( x + stack.used >= stack.size )
-#define NotDoubleAlign(addr) ( (long int)addr & 7 )
-#define DoubleAlign(addr)    ( ((long int)addr + 7) & ~7L )
-#define TempSpace(m, w)      ( (2*w + 4 + NO_MARKER) * m * sizeof(int) + \
-			      (w + 1) * m * sizeof(complex) )
-#define Reduce(alpha)        ((alpha + 1) / 2)  /* i.e. (alpha-1)/2 + 1 */
-
-
-
-
-/*
- * Setup the memory model to be used for factorization.
- *    lwork = 0: use system malloc;
- *    lwork > 0: use user-supplied work[] space.
- */
-void cSetupSpace(void *work, int lwork, LU_space_t *MemModel)
-{
-    if ( lwork == 0 ) {
-	*MemModel = SYSTEM; /* malloc/free */
-    } else if ( lwork > 0 ) {
-	*MemModel = USER;   /* user provided space */
-	stack.used = 0;
-	stack.top1 = 0;
-	stack.top2 = (lwork/4)*4; /* must be word addressable */
-	stack.size = stack.top2;
-	stack.array = (void *) work;
-    }
-}
-
-
-
-void *cuser_malloc(int bytes, int which_end)
-{
-    void *buf;
-    
-    if ( StackFull(bytes) ) return (NULL);
-
-    if ( which_end == HEAD ) {
-	buf = (char*) stack.array + stack.top1;
-	stack.top1 += bytes;
-    } else {
-	stack.top2 -= bytes;
-	buf = (char*) stack.array + stack.top2;
-    }
-    
-    stack.used += bytes;
-    return buf;
-}
-
-
-void cuser_free(int bytes, int which_end)
-{
-    if ( which_end == HEAD ) {
-	stack.top1 -= bytes;
-    } else {
-	stack.top2 += bytes;
-    }
-    stack.used -= bytes;
-}
-
-
-
-/*
- * mem_usage consists of the following fields:
- *    - for_lu (float)
- *      The amount of space used in bytes for the L\U data structures.
- *    - total_needed (float)
- *      The amount of space needed in bytes to perform factorization.
- *    - expansions (int)
- *      Number of memory expansions during the LU factorization.
- */
-int cQuerySpace(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage)
-{
-    SCformat *Lstore;
-    NCformat *Ustore;
-    register int n, iword, dword, panel_size = sp_ienv(1);
-
-    Lstore = L->Store;
-    Ustore = U->Store;
-    n = L->ncol;
-    iword = sizeof(int);
-    dword = sizeof(complex);
-
-    /* For LU factors */
-    mem_usage->for_lu = (float)( (4*n + 3) * iword + Lstore->nzval_colptr[n] *
-				 dword + Lstore->rowind_colptr[n] * iword );
-    mem_usage->for_lu += (float)( (n + 1) * iword +
-				 Ustore->colptr[n] * (dword + iword) );
-
-    /* Working storage to support factorization */
-    mem_usage->total_needed = mem_usage->for_lu +
-	(float)( (2 * panel_size + 4 + NO_MARKER) * n * iword +
-		(panel_size + 1) * n * dword );
-
-    mem_usage->expansions = --no_expand;
-
-    return 0;
-} /* cQuerySpace */
-
-/*
- * Allocate storage for the data structures common to all factor routines.
- * For those unpredictable size, make a guess as FILL * nnz(A).
- * Return value:
- *     If lwork = -1, return the estimated amount of space required, plus n;
- *     otherwise, return the amount of space actually allocated when
- *     memory allocation failure occurred.
- */
-int
-cLUMemInit(fact_t fact, void *work, int lwork, int m, int n, int annz,
-	  int panel_size, SuperMatrix *L, SuperMatrix *U, GlobalLU_t *Glu,
-	  int **iwork, complex **dwork)
-{
-    int      info, iword, dword;
-    SCformat *Lstore;
-    NCformat *Ustore;
-    int      *xsup, *supno;
-    int      *lsub, *xlsub;
-    complex   *lusup;
-    int      *xlusup;
-    complex   *ucol;
-    int      *usub, *xusub;
-    int      nzlmax, nzumax, nzlumax;
-    int      FILL = sp_ienv(6);
-    
-    Glu->n    = n;
-    no_expand = 0;
-    iword     = sizeof(int);
-    dword     = sizeof(complex);
-
-    if ( !expanders )	
-        expanders = (ExpHeader*)SUPERLU_MALLOC(NO_MEMTYPE * sizeof(ExpHeader));
-    if ( !expanders ) ABORT("SUPERLU_MALLOC fails for expanders");
-    
-    if ( fact != SamePattern_SameRowPerm ) {
-	/* Guess for L\U factors */
-	nzumax = nzlumax = FILL * annz;
-	nzlmax = SUPERLU_MAX(1, FILL/4.) * annz;
-
-	if ( lwork == -1 ) {
-	    return ( GluIntArray(n) * iword + TempSpace(m, panel_size)
-		    + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n );
-        } else {
-	    cSetupSpace(work, lwork, &Glu->MemModel);
-	}
-	
-#if ( PRNTlevel >= 1 )
-	printf("cLUMemInit() called: FILL %ld, nzlmax %ld, nzumax %ld\n", 
-	       FILL, nzlmax, nzumax);
-	fflush(stdout);
-#endif	
-	
-	/* Integer pointers for L\U factors */
-	if ( Glu->MemModel == SYSTEM ) {
-	    xsup   = intMalloc(n+1);
-	    supno  = intMalloc(n+1);
-	    xlsub  = intMalloc(n+1);
-	    xlusup = intMalloc(n+1);
-	    xusub  = intMalloc(n+1);
-	} else {
-	    xsup   = (int *)cuser_malloc((n+1) * iword, HEAD);
-	    supno  = (int *)cuser_malloc((n+1) * iword, HEAD);
-	    xlsub  = (int *)cuser_malloc((n+1) * iword, HEAD);
-	    xlusup = (int *)cuser_malloc((n+1) * iword, HEAD);
-	    xusub  = (int *)cuser_malloc((n+1) * iword, HEAD);
-	}
-
-	lusup = (complex *) cexpand( &nzlumax, LUSUP, 0, 0, Glu );
-	ucol  = (complex *) cexpand( &nzumax, UCOL, 0, 0, Glu );
-	lsub  = (int *)    cexpand( &nzlmax, LSUB, 0, 0, Glu );
-	usub  = (int *)    cexpand( &nzumax, USUB, 0, 1, Glu );
-
-	while ( !lusup || !ucol || !lsub || !usub ) {
-	    if ( Glu->MemModel == SYSTEM ) {
-		SUPERLU_FREE(lusup); 
-		SUPERLU_FREE(ucol); 
-		SUPERLU_FREE(lsub); 
-		SUPERLU_FREE(usub);
-	    } else {
-		cuser_free((nzlumax+nzumax)*dword+(nzlmax+nzumax)*iword, HEAD);
-	    }
-	    nzlumax /= 2;
-	    nzumax /= 2;
-	    nzlmax /= 2;
-	    if ( nzlumax < annz ) {
-		printf("Not enough memory to perform factorization.\n");
-		return (cmemory_usage(nzlmax, nzumax, nzlumax, n) + n);
-	    }
-#if ( PRNTlevel >= 1)
-	    printf("cLUMemInit() reduce size: nzlmax %ld, nzumax %ld\n", 
-		   nzlmax, nzumax);
-	    fflush(stdout);
-#endif
-	    lusup = (complex *) cexpand( &nzlumax, LUSUP, 0, 0, Glu );
-	    ucol  = (complex *) cexpand( &nzumax, UCOL, 0, 0, Glu );
-	    lsub  = (int *)    cexpand( &nzlmax, LSUB, 0, 0, Glu );
-	    usub  = (int *)    cexpand( &nzumax, USUB, 0, 1, Glu );
-	}
-	
-    } else {
-	/* fact == SamePattern_SameRowPerm */
-	Lstore   = L->Store;
-	Ustore   = U->Store;
-	xsup     = Lstore->sup_to_col;
-	supno    = Lstore->col_to_sup;
-	xlsub    = Lstore->rowind_colptr;
-	xlusup   = Lstore->nzval_colptr;
-	xusub    = Ustore->colptr;
-	nzlmax   = Glu->nzlmax;    /* max from previous factorization */
-	nzumax   = Glu->nzumax;
-	nzlumax  = Glu->nzlumax;
-	
-	if ( lwork == -1 ) {
-	    return ( GluIntArray(n) * iword + TempSpace(m, panel_size)
-		    + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n );
-        } else if ( lwork == 0 ) {
-	    Glu->MemModel = SYSTEM;
-	} else {
-	    Glu->MemModel = USER;
-	    stack.top2 = (lwork/4)*4; /* must be word-addressable */
-	    stack.size = stack.top2;
-	}
-	
-	lsub  = expanders[LSUB].mem  = Lstore->rowind;
-	lusup = expanders[LUSUP].mem = Lstore->nzval;
-	usub  = expanders[USUB].mem  = Ustore->rowind;
-	ucol  = expanders[UCOL].mem  = Ustore->nzval;;
-	expanders[LSUB].size         = nzlmax;
-	expanders[LUSUP].size        = nzlumax;
-	expanders[USUB].size         = nzumax;
-	expanders[UCOL].size         = nzumax;	
-    }
-
-    Glu->xsup    = xsup;
-    Glu->supno   = supno;
-    Glu->lsub    = lsub;
-    Glu->xlsub   = xlsub;
-    Glu->lusup   = lusup;
-    Glu->xlusup  = xlusup;
-    Glu->ucol    = ucol;
-    Glu->usub    = usub;
-    Glu->xusub   = xusub;
-    Glu->nzlmax  = nzlmax;
-    Glu->nzumax  = nzumax;
-    Glu->nzlumax = nzlumax;
-    
-    info = cLUWorkInit(m, n, panel_size, iwork, dwork, Glu->MemModel);
-    if ( info )
-	return ( info + cmemory_usage(nzlmax, nzumax, nzlumax, n) + n);
-    
-    ++no_expand;
-    return 0;
-    
-} /* cLUMemInit */
-
-/* Allocate known working storage. Returns 0 if success, otherwise
-   returns the number of bytes allocated so far when failure occurred. */
-int
-cLUWorkInit(int m, int n, int panel_size, int **iworkptr, 
-            complex **dworkptr, LU_space_t MemModel)
-{
-    int    isize, dsize, extra;
-    complex *old_ptr;
-    int    maxsuper = sp_ienv(3),
-           rowblk   = sp_ienv(4);
-
-    isize = ( (2 * panel_size + 3 + NO_MARKER ) * m + n ) * sizeof(int);
-    dsize = (m * panel_size +
-	     NUM_TEMPV(m,panel_size,maxsuper,rowblk)) * sizeof(complex);
-    
-    if ( MemModel == SYSTEM ) 
-	*iworkptr = (int *) intCalloc(isize/sizeof(int));
-    else
-	*iworkptr = (int *) cuser_malloc(isize, TAIL);
-    if ( ! *iworkptr ) {
-	fprintf(stderr, "cLUWorkInit: malloc fails for local iworkptr[]\n");
-	return (isize + n);
-    }
-
-    if ( MemModel == SYSTEM )
-	*dworkptr = (complex *) SUPERLU_MALLOC(dsize);
-    else {
-	*dworkptr = (complex *) cuser_malloc(dsize, TAIL);
-	if ( NotDoubleAlign(*dworkptr) ) {
-	    old_ptr = *dworkptr;
-	    *dworkptr = (complex*) DoubleAlign(*dworkptr);
-	    *dworkptr = (complex*) ((double*)*dworkptr - 1);
-	    extra = (char*)old_ptr - (char*)*dworkptr;
-#ifdef DEBUG	    
-	    printf("cLUWorkInit: not aligned, extra %d\n", extra);
-#endif	    
-	    stack.top2 -= extra;
-	    stack.used += extra;
-	}
-    }
-    if ( ! *dworkptr ) {
-	fprintf(stderr, "malloc fails for local dworkptr[].");
-	return (isize + dsize + n);
-    }
-	
-    return 0;
-}
-
-
-/*
- * Set up pointers for real working arrays.
- */
-void
-cSetRWork(int m, int panel_size, complex *dworkptr,
-	 complex **dense, complex **tempv)
-{
-    complex zero = {0.0, 0.0};
-
-    int maxsuper = sp_ienv(3),
-        rowblk   = sp_ienv(4);
-    *dense = dworkptr;
-    *tempv = *dense + panel_size*m;
-    cfill (*dense, m * panel_size, zero);
-    cfill (*tempv, NUM_TEMPV(m,panel_size,maxsuper,rowblk), zero);     
-}
-	
-/*
- * Free the working storage used by factor routines.
- */
-void cLUWorkFree(int *iwork, complex *dwork, GlobalLU_t *Glu)
-{
-    if ( Glu->MemModel == SYSTEM ) {
-	SUPERLU_FREE (iwork);
-	SUPERLU_FREE (dwork);
-    } else {
-	stack.used -= (stack.size - stack.top2);
-	stack.top2 = stack.size;
-/*	cStackCompress(Glu);  */
-    }
-    
-    SUPERLU_FREE (expanders);	
-    expanders = 0;
-}
-
-/* Expand the data structures for L and U during the factorization.
- * Return value:   0 - successful return
- *               > 0 - number of bytes allocated when run out of space
- */
-int
-cLUMemXpand(int jcol,
-	   int next,          /* number of elements currently in the factors */
-	   MemType mem_type,  /* which type of memory to expand  */
-	   int *maxlen,       /* modified - maximum length of a data structure */
-	   GlobalLU_t *Glu    /* modified - global LU data structures */
-	   )
-{
-    void   *new_mem;
-    
-#ifdef DEBUG    
-    printf("cLUMemXpand(): jcol %d, next %d, maxlen %d, MemType %d\n",
-	   jcol, next, *maxlen, mem_type);
-#endif    
-
-    if (mem_type == USUB) 
-    	new_mem = cexpand(maxlen, mem_type, next, 1, Glu);
-    else
-	new_mem = cexpand(maxlen, mem_type, next, 0, Glu);
-    
-    if ( !new_mem ) {
-	int    nzlmax  = Glu->nzlmax;
-	int    nzumax  = Glu->nzumax;
-	int    nzlumax = Glu->nzlumax;
-    	fprintf(stderr, "Can't expand MemType %d: jcol %d\n", mem_type, jcol);
-    	return (cmemory_usage(nzlmax, nzumax, nzlumax, Glu->n) + Glu->n);
-    }
-
-    switch ( mem_type ) {
-      case LUSUP:
-	Glu->lusup   = (complex *) new_mem;
-	Glu->nzlumax = *maxlen;
-	break;
-      case UCOL:
-	Glu->ucol   = (complex *) new_mem;
-	Glu->nzumax = *maxlen;
-	break;
-      case LSUB:
-	Glu->lsub   = (int *) new_mem;
-	Glu->nzlmax = *maxlen;
-	break;
-      case USUB:
-	Glu->usub   = (int *) new_mem;
-	Glu->nzumax = *maxlen;
-	break;
-    }
-    
-    return 0;
-    
-}
-
-
-
-void
-copy_mem_complex(int howmany, void *old, void *new)
-{
-    register int i;
-    complex *dold = old;
-    complex *dnew = new;
-    for (i = 0; i < howmany; i++) dnew[i] = dold[i];
-}
-
-/*
- * Expand the existing storage to accommodate more fill-ins.
- */
-void
-*cexpand (
-	 int *prev_len,   /* length used from previous call */
-	 MemType type,    /* which part of the memory to expand */
-	 int len_to_copy, /* size of the memory to be copied to new store */
-	 int keep_prev,   /* = 1: use prev_len;
-			     = 0: compute new_len to expand */
-	 GlobalLU_t *Glu  /* modified - global LU data structures */
-	)
-{
-    float    EXPAND = 1.5;
-    float    alpha;
-    void     *new_mem, *old_mem;
-    int      new_len, tries, lword, extra, bytes_to_copy;
-
-    alpha = EXPAND;
-
-    if ( no_expand == 0 || keep_prev ) /* First time allocate requested */
-        new_len = *prev_len;
-    else {
-	new_len = alpha * *prev_len;
-    }
-    
-    if ( type == LSUB || type == USUB ) lword = sizeof(int);
-    else lword = sizeof(complex);
-
-    if ( Glu->MemModel == SYSTEM ) {
-	new_mem = (void *) SUPERLU_MALLOC((size_t)new_len * lword);
-	if ( no_expand != 0 ) {
-	    tries = 0;
-	    if ( keep_prev ) {
-		if ( !new_mem ) return (NULL);
-	    } else {
-		while ( !new_mem ) {
-		    if ( ++tries > 10 ) return (NULL);
-		    alpha = Reduce(alpha);
-		    new_len = alpha * *prev_len;
-		    new_mem = (void *) SUPERLU_MALLOC((size_t)new_len * lword);
-		}
-	    }
-	    if ( type == LSUB || type == USUB ) {
-		copy_mem_int(len_to_copy, expanders[type].mem, new_mem);
-	    } else {
-		copy_mem_complex(len_to_copy, expanders[type].mem, new_mem);
-	    }
-	    SUPERLU_FREE (expanders[type].mem);
-	}
-	expanders[type].mem = (void *) new_mem;
-	
-    } else { /* MemModel == USER */
-	if ( no_expand == 0 ) {
-	    new_mem = cuser_malloc(new_len * lword, HEAD);
-	    if ( NotDoubleAlign(new_mem) &&
-		(type == LUSUP || type == UCOL) ) {
-		old_mem = new_mem;
-		new_mem = (void *)DoubleAlign(new_mem);
-		extra = (char*)new_mem - (char*)old_mem;
-#ifdef DEBUG		
-		printf("expand(): not aligned, extra %d\n", extra);
-#endif		
-		stack.top1 += extra;
-		stack.used += extra;
-	    }
-	    expanders[type].mem = (void *) new_mem;
-	}
-	else {
-	    tries = 0;
-	    extra = (new_len - *prev_len) * lword;
-	    if ( keep_prev ) {
-		if ( StackFull(extra) ) return (NULL);
-	    } else {
-		while ( StackFull(extra) ) {
-		    if ( ++tries > 10 ) return (NULL);
-		    alpha = Reduce(alpha);
-		    new_len = alpha * *prev_len;
-		    extra = (new_len - *prev_len) * lword;	    
-		}
-	    }
-
-	    if ( type != USUB ) {
-		new_mem = (void*)((char*)expanders[type + 1].mem + extra);
-		bytes_to_copy = (char*)stack.array + stack.top1
-		    - (char*)expanders[type + 1].mem;
-		user_bcopy(expanders[type+1].mem, new_mem, bytes_to_copy);
-
-		if ( type < USUB ) {
-		    Glu->usub = expanders[USUB].mem =
-			(void*)((char*)expanders[USUB].mem + extra);
-		}
-		if ( type < LSUB ) {
-		    Glu->lsub = expanders[LSUB].mem =
-			(void*)((char*)expanders[LSUB].mem + extra);
-		}
-		if ( type < UCOL ) {
-		    Glu->ucol = expanders[UCOL].mem =
-			(void*)((char*)expanders[UCOL].mem + extra);
-		}
-		stack.top1 += extra;
-		stack.used += extra;
-		if ( type == UCOL ) {
-		    stack.top1 += extra;   /* Add same amount for USUB */
-		    stack.used += extra;
-		}
-		
-	    } /* if ... */
-
-	} /* else ... */
-    }
-
-    expanders[type].size = new_len;
-    *prev_len = new_len;
-    if ( no_expand ) ++no_expand;
-    
-    return (void *) expanders[type].mem;
-    
-} /* cexpand */
-
-
-/*
- * Compress the work[] array to remove fragmentation.
- */
-void
-cStackCompress(GlobalLU_t *Glu)
-{
-    register int iword, dword, ndim;
-    char    *last, *fragment;
-    int      *ifrom, *ito;
-    complex   *dfrom, *dto;
-    int      *xlsub, *lsub, *xusub, *usub, *xlusup;
-    complex   *ucol, *lusup;
-    
-    iword = sizeof(int);
-    dword = sizeof(complex);
-    ndim = Glu->n;
-
-    xlsub  = Glu->xlsub;
-    lsub   = Glu->lsub;
-    xusub  = Glu->xusub;
-    usub   = Glu->usub;
-    xlusup = Glu->xlusup;
-    ucol   = Glu->ucol;
-    lusup  = Glu->lusup;
-    
-    dfrom = ucol;
-    dto = (complex *)((char*)lusup + xlusup[ndim] * dword);
-    copy_mem_complex(xusub[ndim], dfrom, dto);
-    ucol = dto;
-
-    ifrom = lsub;
-    ito = (int *) ((char*)ucol + xusub[ndim] * iword);
-    copy_mem_int(xlsub[ndim], ifrom, ito);
-    lsub = ito;
-    
-    ifrom = usub;
-    ito = (int *) ((char*)lsub + xlsub[ndim] * iword);
-    copy_mem_int(xusub[ndim], ifrom, ito);
-    usub = ito;
-    
-    last = (char*)usub + xusub[ndim] * iword;
-    fragment = (char*) (((char*)stack.array + stack.top1) - last);
-    stack.used -= (long int) fragment;
-    stack.top1 -= (long int) fragment;
-
-    Glu->ucol = ucol;
-    Glu->lsub = lsub;
-    Glu->usub = usub;
-    
-#ifdef DEBUG
-    printf("cStackCompress: fragment %d\n", fragment);
-    /* for (last = 0; last < ndim; ++last)
-	print_lu_col("After compress:", last, 0);*/
-#endif    
-    
-}
-
-/*
- * Allocate storage for original matrix A
- */
-void
-callocateA(int n, int nnz, complex **a, int **asub, int **xa)
-{
-    *a    = (complex *) complexMalloc(nnz);
-    *asub = (int *) intMalloc(nnz);
-    *xa   = (int *) intMalloc(n+1);
-}
-
-
-complex *complexMalloc(int n)
-{
-    complex *buf;
-    buf = (complex *) SUPERLU_MALLOC((size_t)n * sizeof(complex)); 
-    if ( !buf ) {
-	ABORT("SUPERLU_MALLOC failed for buf in complexMalloc()\n");
-    }
-    return (buf);
-}
-
-complex *complexCalloc(int n)
-{
-    complex *buf;
-    register int i;
-    complex zero = {0.0, 0.0};
-    buf = (complex *) SUPERLU_MALLOC((size_t)n * sizeof(complex));
-    if ( !buf ) {
-	ABORT("SUPERLU_MALLOC failed for buf in complexCalloc()\n");
-    }
-    for (i = 0; i < n; ++i) buf[i] = zero;
-    return (buf);
-}
-
-
-int cmemory_usage(const int nzlmax, const int nzumax, 
-		  const int nzlumax, const int n)
-{
-    register int iword, dword;
-
-    iword   = sizeof(int);
-    dword   = sizeof(complex);
-    
-    return (10 * n * iword +
-	    nzlmax * iword + nzumax * (iword + dword) + nzlumax * dword);
-
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cmyblas2.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cmyblas2.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cmyblas2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cmyblas2.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,182 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * File name:		cmyblas2.c
- * Purpose:
- *     Level 2 BLAS operations: solves and matvec, written in C.
- * Note:
- *     This is only used when the system lacks an efficient BLAS library.
- */
-#include "slu_scomplex.h"
-
-/*
- * Solves a dense UNIT lower triangular system. The unit lower 
- * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). 
- * The solution will be returned in the rhs vector.
- */
-void clsolve ( int ldm, int ncol, complex *M, complex *rhs )
-{
-    int k;
-    complex x0, x1, x2, x3, temp;
-    complex *M0;
-    complex *Mki0, *Mki1, *Mki2, *Mki3;
-    register int firstcol = 0;
-
-    M0 = &M[0];
-
-
-    while ( firstcol < ncol - 3 ) { /* Do 4 columns */
-      	Mki0 = M0 + 1;
-      	Mki1 = Mki0 + ldm + 1;
-      	Mki2 = Mki1 + ldm + 1;
-      	Mki3 = Mki2 + ldm + 1;
-
-      	x0 = rhs[firstcol];
-      	cc_mult(&temp, &x0, Mki0); Mki0++;
-      	c_sub(&x1, &rhs[firstcol+1], &temp);
-      	cc_mult(&temp, &x0, Mki0); Mki0++;
-	c_sub(&x2, &rhs[firstcol+2], &temp);
-	cc_mult(&temp, &x1, Mki1); Mki1++;
-	c_sub(&x2, &x2, &temp);
-      	cc_mult(&temp, &x0, Mki0); Mki0++;
-	c_sub(&x3, &rhs[firstcol+3], &temp);
-	cc_mult(&temp, &x1, Mki1); Mki1++;
-	c_sub(&x3, &x3, &temp);
-	cc_mult(&temp, &x2, Mki2); Mki2++;
-	c_sub(&x3, &x3, &temp);
-
- 	rhs[++firstcol] = x1;
-      	rhs[++firstcol] = x2;
-      	rhs[++firstcol] = x3;
-      	++firstcol;
-    
-      	for (k = firstcol; k < ncol; k++) {
-	    cc_mult(&temp, &x0, Mki0); Mki0++;
-	    c_sub(&rhs[k], &rhs[k], &temp);
-	    cc_mult(&temp, &x1, Mki1); Mki1++;
-	    c_sub(&rhs[k], &rhs[k], &temp);
-	    cc_mult(&temp, &x2, Mki2); Mki2++;
-	    c_sub(&rhs[k], &rhs[k], &temp);
-	    cc_mult(&temp, &x3, Mki3); Mki3++;
-	    c_sub(&rhs[k], &rhs[k], &temp);
-	}
-
-        M0 += 4 * ldm + 4;
-    }
-
-    if ( firstcol < ncol - 1 ) { /* Do 2 columns */
-        Mki0 = M0 + 1;
-        Mki1 = Mki0 + ldm + 1;
-
-        x0 = rhs[firstcol];
-	cc_mult(&temp, &x0, Mki0); Mki0++;
-	c_sub(&x1, &rhs[firstcol+1], &temp);
-
-      	rhs[++firstcol] = x1;
-      	++firstcol;
-    
-      	for (k = firstcol; k < ncol; k++) {
-	    cc_mult(&temp, &x0, Mki0); Mki0++;
-	    c_sub(&rhs[k], &rhs[k], &temp);
-	    cc_mult(&temp, &x1, Mki1); Mki1++;
-	    c_sub(&rhs[k], &rhs[k], &temp);
-	} 
-    }
-    
-}
-
-/*
- * Solves a dense upper triangular system. The upper triangular matrix is
- * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned
- * in the rhs vector.
- */
-void
-cusolve ( ldm, ncol, M, rhs )
-int ldm;	/* in */
-int ncol;	/* in */
-complex *M;	/* in */
-complex *rhs;	/* modified */
-{
-    complex xj, temp;
-    int jcol, j, irow;
-
-    jcol = ncol - 1;
-
-    for (j = 0; j < ncol; j++) {
-
-	c_div(&xj, &rhs[jcol], &M[jcol + jcol*ldm]); /* M(jcol, jcol) */
-	rhs[jcol] = xj;
-	
-	for (irow = 0; irow < jcol; irow++) {
-	    cc_mult(&temp, &xj, &M[irow+jcol*ldm]); /* M(irow, jcol) */
-	    c_sub(&rhs[irow], &rhs[irow], &temp);
-	}
-
-	jcol--;
-
-    }
-}
-
-
-/*
- * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec.
- * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[].
- */
-void cmatvec ( ldm, nrow, ncol, M, vec, Mxvec )
-int ldm;	/* in -- leading dimension of M */
-int nrow;	/* in */ 
-int ncol;	/* in */
-complex *M;	/* in */
-complex *vec;	/* in */
-complex *Mxvec;	/* in/out */
-{
-    complex vi0, vi1, vi2, vi3;
-    complex *M0, temp;
-    complex *Mki0, *Mki1, *Mki2, *Mki3;
-    register int firstcol = 0;
-    int k;
-
-    M0 = &M[0];
-
-    while ( firstcol < ncol - 3 ) {	/* Do 4 columns */
-	Mki0 = M0;
-	Mki1 = Mki0 + ldm;
-	Mki2 = Mki1 + ldm;
-	Mki3 = Mki2 + ldm;
-
-	vi0 = vec[firstcol++];
-	vi1 = vec[firstcol++];
-	vi2 = vec[firstcol++];
-	vi3 = vec[firstcol++];	
-	for (k = 0; k < nrow; k++) {
-	    cc_mult(&temp, &vi0, Mki0); Mki0++;
-	    c_add(&Mxvec[k], &Mxvec[k], &temp);
-	    cc_mult(&temp, &vi1, Mki1); Mki1++;
-	    c_add(&Mxvec[k], &Mxvec[k], &temp);
-	    cc_mult(&temp, &vi2, Mki2); Mki2++;
-	    c_add(&Mxvec[k], &Mxvec[k], &temp);
-	    cc_mult(&temp, &vi3, Mki3); Mki3++;
-	    c_add(&Mxvec[k], &Mxvec[k], &temp);
-	}
-
-	M0 += 4 * ldm;
-    }
-
-    while ( firstcol < ncol ) {		/* Do 1 column */
- 	Mki0 = M0;
-	vi0 = vec[firstcol++];
-	for (k = 0; k < nrow; k++) {
-	    cc_mult(&temp, &vi0, Mki0); Mki0++;
-	    c_add(&Mxvec[k], &Mxvec[k], &temp);
-	}
-	M0 += ldm;
-    }
-	
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/Cnames.h hypre-2.13.0/src/FEI_mv/SuperLU/SRC/Cnames.h
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/Cnames.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/Cnames.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,57 +0,0 @@
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 1, 1997
- *
- * Changes made to this file addressing issue regarding calls to
- * blas/lapack functions (Dec 2003 at LLNL)
- */
-#ifndef __SUPERLU_CNAMES /* allow multiple inclusions */
-#define __SUPERLU_CNAMES
-
-/*
- * These macros define how C routines will be called.  
- * They have been modified to use of the new autoconf Fortran
- * name mangling support, F77_FUNC, which is wrapped in the
- * hypre_F90_NAME_BLAS macro.
- */
-
-/*
- * These defines set up the naming scheme required to have a fortran 77
- * routine call a C routine
- * No redefinition necessary to have following Fortran to C interface:
- *           FORTRAN CALL               C DECLARATION
- *           call dgemm(...)           void hypre_F90_NAME_BLAS(dgemm,DGEMM)(...)
- *
- * This is the default.
- */
-
-#endif
-/***
-#define hypre_F90_NAME_BLAS(dasum,DASUM)    dasum
-#define hypre_F90_NAME_BLAS(idamax,IDAMAX)   idamax
-#define hypre_F90_NAME_BLAS(dcopy,DCOPY)    dcopy
-#define hypre_F90_NAME_BLAS(dscal,DSCAL)    dscal
-#define hypre_F90_NAME_BLAS(dger,DGER)     dger
-#define hypre_F90_NAME_BLAS(dnrm2,DNRM2)    dnrm2
-#define hypre_F90_NAME_BLAS(dsymv,DSYMV)    dsymv
-#define hypre_F90_NAME_BLAS(ddot,DDOT)     ddot
-#define hypre_F90_NAME_BLAS(daxpy,DAXPY)    daxpy
-#define hypre_F90_NAME_BLAS(dsyr2,DSYR2)    dsyr2
-#define hypre_F90_NAME_BLAS(drot,DROT)     drot
-#define hypre_F90_NAME_BLAS(dgemv,DGEMV)    dgemv
-#define hypre_F90_NAME_BLAS(dtrsv,DTRSV)    dtrsv
-#define hypre_F90_NAME_BLAS(dgemm,DGEMM)    dgemm
-#define hypre_F90_NAME_BLAS(dtrsm,DTRSM)    dtrsm
-
-#define hypre_F90_NAME_BLAS(xerbla,XERBLA)  xerbla
-#define hypre_F90_NAME_BLAS(dpotrf,DPOTRF)   dpotrf
-#define hypre_F90_NAME_BLAS(dgels,DGELS)    dgels
-#define hypre_F90_NAME_BLAS(dpotrs,DPOTRS)   dpotrs
-#define hypre_F90_NAME_BLAS(lsame,LSAME)    lsame
-#define hypre_F90_NAME_BLAS(dlamch,DLAMCH)  dlamch
-
-#define c_bridge_dgssv_ c_bridge_dgssv
-***/
-#endif /* __SUPERLU_CNAMES */
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/colamd.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/colamd.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/colamd.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/colamd.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,3412 +0,0 @@
-/* ========================================================================== */
-/* === colamd/symamd - a sparse matrix column ordering algorithm ============ */
-/* ========================================================================== */
-
-/*
-    colamd:  an approximate minimum degree column ordering algorithm,
-    	for LU factorization of symmetric or unsymmetric matrices,
-	QR factorization, least squares, interior point methods for
-	linear programming problems, and other related problems.
-
-    symamd:  an approximate minimum degree ordering algorithm for Cholesky
-    	factorization of symmetric matrices.
-
-    Purpose:
-
-	Colamd computes a permutation Q such that the Cholesky factorization of
-	(AQ)'(AQ) has less fill-in and requires fewer floating point operations
-	than A'A.  This also provides a good ordering for sparse partial
-	pivoting methods, P(AQ) = LU, where Q is computed prior to numerical
-	factorization, and P is computed during numerical factorization via
-	conventional partial pivoting with row interchanges.  Colamd is the
-	column ordering method used in SuperLU, part of the ScaLAPACK library.
-	It is also available as built-in function in MATLAB Version 6,
-	available from MathWorks, Inc. (http://www.mathworks.com).  This
-	routine can be used in place of colmmd in MATLAB.
-
-    	Symamd computes a permutation P of a symmetric matrix A such that the
-	Cholesky factorization of PAP' has less fill-in and requires fewer
-	floating point operations than A.  Symamd constructs a matrix M such
-	that M'M has the same nonzero pattern of A, and then orders the columns
-	of M using colmmd.  The column ordering of M is then returned as the
-	row and column ordering P of A. 
-
-    Authors:
-
-	The authors of the code itself are Stefan I. Larimore and Timothy A.
-	Davis (davis@cise.ufl.edu), University of Florida.  The algorithm was
-	developed in collaboration with John Gilbert, Xerox PARC, and Esmond
-	Ng, Oak Ridge National Laboratory.
-
-    Date:
-
-	September 8, 2003.  Version 2.3.
-
-    Acknowledgements:
-
-	This work was supported by the National Science Foundation, under
-	grants DMS-9504974 and DMS-9803599.
-
-    Copyright and License:
-
-	Copyright (c) 1998-2003 by the University of Florida.
-	All Rights Reserved.
-
-	THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-	EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
-
-	Permission is hereby granted to use, copy, modify, and/or distribute
-	this program, provided that the Copyright, this License, and the
-	Availability of the original version is retained on all copies and made
-	accessible to the end-user of any code or package that includes COLAMD
-	or any modified version of COLAMD. 
-
-    Availability:
-
-	The colamd/symamd library is available at
-
-	    http://www.cise.ufl.edu/research/sparse/colamd/
-
-	This is the http://www.cise.ufl.edu/research/sparse/colamd/colamd.c
-	file.  It requires the colamd.h file.  It is required by the colamdmex.c
-	and symamdmex.c files, for the MATLAB interface to colamd and symamd.
-
-    See the ChangeLog file for changes since Version 1.0.
-
-*/
-
-/* ========================================================================== */
-/* === Description of user-callable routines ================================ */
-/* ========================================================================== */
-
-/*
-    ----------------------------------------------------------------------------
-    colamd_recommended:
-    ----------------------------------------------------------------------------
-
-	C syntax:
-
-	    #include "colamd.h"
-	    int colamd_recommended (int nnz, int n_row, int n_col) ;
-
-	    or as a C macro
-
-	    #include "colamd.h"
-	    Alen = COLAMD_RECOMMENDED (int nnz, int n_row, int n_col) ;
-
-	Purpose:
-
-	    Returns recommended value of Alen for use by colamd.  Returns -1
-	    if any input argument is negative.  The use of this routine
-	    or macro is optional.  Note that the macro uses its arguments
-	    more than once, so be careful for side effects, if you pass
-	    expressions as arguments to COLAMD_RECOMMENDED.  Not needed for
-	    symamd, which dynamically allocates its own memory.
-
-	Arguments (all input arguments):
-
-	    int nnz ;		Number of nonzeros in the matrix A.  This must
-				be the same value as p [n_col] in the call to
-				colamd - otherwise you will get a wrong value
-				of the recommended memory to use.
-
-	    int n_row ;		Number of rows in the matrix A.
-
-	    int n_col ;		Number of columns in the matrix A.
-
-    ----------------------------------------------------------------------------
-    colamd_set_defaults:
-    ----------------------------------------------------------------------------
-
-	C syntax:
-
-	    #include "colamd.h"
-	    colamd_set_defaults (double knobs [COLAMD_KNOBS]) ;
-
-	Purpose:
-
-	    Sets the default parameters.  The use of this routine is optional.
-
-	Arguments:
-
-	    double knobs [COLAMD_KNOBS] ;	Output only.
-
-		Colamd: rows with more than (knobs [COLAMD_DENSE_ROW] * n_col)
-		entries are removed prior to ordering.  Columns with more than
-		(knobs [COLAMD_DENSE_COL] * n_row) entries are removed prior to
-		ordering, and placed last in the output column ordering. 
-
-		Symamd: uses only knobs [COLAMD_DENSE_ROW], which is knobs [0].
-		Rows and columns with more than (knobs [COLAMD_DENSE_ROW] * n)
-		entries are removed prior to ordering, and placed last in the
-		output ordering.
-
-		COLAMD_DENSE_ROW and COLAMD_DENSE_COL are defined as 0 and 1,
-		respectively, in colamd.h.  Default values of these two knobs
-		are both 0.5.  Currently, only knobs [0] and knobs [1] are
-		used, but future versions may use more knobs.  If so, they will
-		be properly set to their defaults by the future version of
-		colamd_set_defaults, so that the code that calls colamd will
-		not need to change, assuming that you either use
-		colamd_set_defaults, or pass a (double *) NULL pointer as the
-		knobs array to colamd or symamd.
-
-    ----------------------------------------------------------------------------
-    colamd:
-    ----------------------------------------------------------------------------
-
-	C syntax:
-
-	    #include "colamd.h"
-	    int colamd (int n_row, int n_col, int Alen, int *A, int *p,
-	    	double knobs [COLAMD_KNOBS], int stats [COLAMD_STATS]) ;
-
-	Purpose:
-
-	    Computes a column ordering (Q) of A such that P(AQ)=LU or
-	    (AQ)'AQ=LL' have less fill-in and require fewer floating point
-	    operations than factorizing the unpermuted matrix A or A'A,
-	    respectively.
-	    
-	Returns:
-
-	    TRUE (1) if successful, FALSE (0) otherwise.
-
-	Arguments:
-
-	    int n_row ;		Input argument.
-
-		Number of rows in the matrix A.
-		Restriction:  n_row >= 0.
-		Colamd returns FALSE if n_row is negative.
-
-	    int n_col ;		Input argument.
-
-		Number of columns in the matrix A.
-		Restriction:  n_col >= 0.
-		Colamd returns FALSE if n_col is negative.
-
-	    int Alen ;		Input argument.
-
-		Restriction (see note):
-		Alen >= 2*nnz + 6*(n_col+1) + 4*(n_row+1) + n_col
-		Colamd returns FALSE if these conditions are not met.
-
-		Note:  this restriction makes an modest assumption regarding
-		the size of the two typedef's structures in colamd.h.
-		We do, however, guarantee that
-
-			Alen >= colamd_recommended (nnz, n_row, n_col)
-		
-		or equivalently as a C preprocessor macro: 
-
-			Alen >= COLAMD_RECOMMENDED (nnz, n_row, n_col)
-
-		will be sufficient.
-
-	    int A [Alen] ;	Input argument, undefined on output.
-
-		A is an integer array of size Alen.  Alen must be at least as
-		large as the bare minimum value given above, but this is very
-		low, and can result in excessive run time.  For best
-		performance, we recommend that Alen be greater than or equal to
-		colamd_recommended (nnz, n_row, n_col), which adds
-		nnz/5 to the bare minimum value given above.
-
-		On input, the row indices of the entries in column c of the
-		matrix are held in A [(p [c]) ... (p [c+1]-1)].  The row indices
-		in a given column c need not be in ascending order, and
-		duplicate row indices may be be present.  However, colamd will
-		work a little faster if both of these conditions are met
-		(Colamd puts the matrix into this format, if it finds that the
-		the conditions are not met).
-
-		The matrix is 0-based.  That is, rows are in the range 0 to
-		n_row-1, and columns are in the range 0 to n_col-1.  Colamd
-		returns FALSE if any row index is out of range.
-
-		The contents of A are modified during ordering, and are
-		undefined on output.
-
-	    int p [n_col+1] ;	Both input and output argument.
-
-		p is an integer array of size n_col+1.  On input, it holds the
-		"pointers" for the column form of the matrix A.  Column c of
-		the matrix A is held in A [(p [c]) ... (p [c+1]-1)].  The first
-		entry, p [0], must be zero, and p [c] <= p [c+1] must hold
-		for all c in the range 0 to n_col-1.  The value p [n_col] is
-		thus the total number of entries in the pattern of the matrix A.
-		Colamd returns FALSE if these conditions are not met.
-
-		On output, if colamd returns TRUE, the array p holds the column
-		permutation (Q, for P(AQ)=LU or (AQ)'(AQ)=LL'), where p [0] is
-		the first column index in the new ordering, and p [n_col-1] is
-		the last.  That is, p [k] = j means that column j of A is the
-		kth pivot column, in AQ, where k is in the range 0 to n_col-1
-		(p [0] = j means that column j of A is the first column in AQ).
-
-		If colamd returns FALSE, then no permutation is returned, and
-		p is undefined on output.
-
-	    double knobs [COLAMD_KNOBS] ;	Input argument.
-
-		See colamd_set_defaults for a description.
-
-	    int stats [COLAMD_STATS] ;		Output argument.
-
-		Statistics on the ordering, and error status.
-		See colamd.h for related definitions.
-		Colamd returns FALSE if stats is not present.
-
-		stats [0]:  number of dense or empty rows ignored.
-
-		stats [1]:  number of dense or empty columns ignored (and
-				ordered last in the output permutation p)
-				Note that a row can become "empty" if it
-				contains only "dense" and/or "empty" columns,
-				and similarly a column can become "empty" if it
-				only contains "dense" and/or "empty" rows.
-
-		stats [2]:  number of garbage collections performed.
-				This can be excessively high if Alen is close
-				to the minimum required value.
-
-		stats [3]:  status code.  < 0 is an error code.
-			    > 1 is a warning or notice.
-
-			0	OK.  Each column of the input matrix contained
-				row indices in increasing order, with no
-				duplicates.
-
-			1	OK, but columns of input matrix were jumbled
-				(unsorted columns or duplicate entries).  Colamd
-				had to do some extra work to sort the matrix
-				first and remove duplicate entries, but it
-				still was able to return a valid permutation
-				(return value of colamd was TRUE).
-
-					stats [4]: highest numbered column that
-						is unsorted or has duplicate
-						entries.
-					stats [5]: last seen duplicate or
-						unsorted row index.
-					stats [6]: number of duplicate or
-						unsorted row indices.
-
-			-1	A is a null pointer
-
-			-2	p is a null pointer
-
-			-3 	n_row is negative
-
-					stats [4]: n_row
-
-			-4	n_col is negative
-
-					stats [4]: n_col
-
-			-5	number of nonzeros in matrix is negative
-
-					stats [4]: number of nonzeros, p [n_col]
-
-			-6	p [0] is nonzero
-
-					stats [4]: p [0]
-
-			-7	A is too small
-
-					stats [4]: required size
-					stats [5]: actual size (Alen)
-
-			-8	a column has a negative number of entries
-
-					stats [4]: column with < 0 entries
-					stats [5]: number of entries in col
-
-			-9	a row index is out of bounds
-
-					stats [4]: column with bad row index
-					stats [5]: bad row index
-					stats [6]: n_row, # of rows of matrx
-
-			-10	(unused; see symamd.c)
-
-			-999	(unused; see symamd.c)
-
-		Future versions may return more statistics in the stats array.
-
-	Example:
-	
-	    See http://www.cise.ufl.edu/research/sparse/colamd/example.c
-	    for a complete example.
-
-	    To order the columns of a 5-by-4 matrix with 11 nonzero entries in
-	    the following nonzero pattern
-
-	    	x 0 x 0
-		x 0 x x
-		0 x x 0
-		0 0 x x
-		x x 0 0
-
-	    with default knobs and no output statistics, do the following:
-
-		#include "colamd.h"
-		#define ALEN COLAMD_RECOMMENDED (11, 5, 4)
-		int A [ALEN] = {1, 2, 5, 3, 5, 1, 2, 3, 4, 2, 4} ;
-		int p [ ] = {0, 3, 5, 9, 11} ;
-		int stats [COLAMD_STATS] ;
-		colamd (5, 4, ALEN, A, p, (double *) NULL, stats) ;
-
-	    The permutation is returned in the array p, and A is destroyed.
-
-    ----------------------------------------------------------------------------
-    symamd:
-    ----------------------------------------------------------------------------
-
-	C syntax:
-
-	    #include "colamd.h"
-	    int symamd (int n, int *A, int *p, int *perm,
-	    	double knobs [COLAMD_KNOBS], int stats [COLAMD_STATS],
-		void (*allocate) (size_t, size_t), void (*release) (void *)) ;
-
-	Purpose:
-
-    	    The symamd routine computes an ordering P of a symmetric sparse
-	    matrix A such that the Cholesky factorization PAP' = LL' remains
-	    sparse.  It is based on a column ordering of a matrix M constructed
-	    so that the nonzero pattern of M'M is the same as A.  The matrix A
-	    is assumed to be symmetric; only the strictly lower triangular part
-	    is accessed.  You must pass your selected memory allocator (usually
-	    calloc/free or mxCalloc/mxFree) to symamd, for it to allocate
-	    memory for the temporary matrix M.
-
-	Returns:
-
-	    TRUE (1) if successful, FALSE (0) otherwise.
-
-	Arguments:
-
-	    int n ;		Input argument.
-
-	    	Number of rows and columns in the symmetrix matrix A.
-		Restriction:  n >= 0.
-		Symamd returns FALSE if n is negative.
-
-	    int A [nnz] ;	Input argument.
-
-	    	A is an integer array of size nnz, where nnz = p [n].
-		
-		The row indices of the entries in column c of the matrix are
-		held in A [(p [c]) ... (p [c+1]-1)].  The row indices in a
-		given column c need not be in ascending order, and duplicate
-		row indices may be present.  However, symamd will run faster
-		if the columns are in sorted order with no duplicate entries. 
-
-		The matrix is 0-based.  That is, rows are in the range 0 to
-		n-1, and columns are in the range 0 to n-1.  Symamd
-		returns FALSE if any row index is out of range.
-
-		The contents of A are not modified.
-
-	    int p [n+1] ;   	Input argument.
-
-		p is an integer array of size n+1.  On input, it holds the
-		"pointers" for the column form of the matrix A.  Column c of
-		the matrix A is held in A [(p [c]) ... (p [c+1]-1)].  The first
-		entry, p [0], must be zero, and p [c] <= p [c+1] must hold
-		for all c in the range 0 to n-1.  The value p [n] is
-		thus the total number of entries in the pattern of the matrix A.
-		Symamd returns FALSE if these conditions are not met.
-
-		The contents of p are not modified.
-
-	    int perm [n+1] ;   	Output argument.
-
-		On output, if symamd returns TRUE, the array perm holds the
-		permutation P, where perm [0] is the first index in the new
-		ordering, and perm [n-1] is the last.  That is, perm [k] = j
-		means that row and column j of A is the kth column in PAP',
-		where k is in the range 0 to n-1 (perm [0] = j means
-		that row and column j of A are the first row and column in
-		PAP').  The array is used as a workspace during the ordering,
-		which is why it must be of length n+1, not just n.
-
-	    double knobs [COLAMD_KNOBS] ;	Input argument.
-
-		See colamd_set_defaults for a description.
-
-	    int stats [COLAMD_STATS] ;		Output argument.
-
-		Statistics on the ordering, and error status.
-		See colamd.h for related definitions.
-		Symamd returns FALSE if stats is not present.
-
-		stats [0]:  number of dense or empty row and columns ignored
-				(and ordered last in the output permutation 
-				perm).  Note that a row/column can become
-				"empty" if it contains only "dense" and/or
-				"empty" columns/rows.
-
-		stats [1]:  (same as stats [0])
-
-		stats [2]:  number of garbage collections performed.
-
-		stats [3]:  status code.  < 0 is an error code.
-			    > 1 is a warning or notice.
-
-			0	OK.  Each column of the input matrix contained
-				row indices in increasing order, with no
-				duplicates.
-
-			1	OK, but columns of input matrix were jumbled
-				(unsorted columns or duplicate entries).  Symamd
-				had to do some extra work to sort the matrix
-				first and remove duplicate entries, but it
-				still was able to return a valid permutation
-				(return value of symamd was TRUE).
-
-					stats [4]: highest numbered column that
-						is unsorted or has duplicate
-						entries.
-					stats [5]: last seen duplicate or
-						unsorted row index.
-					stats [6]: number of duplicate or
-						unsorted row indices.
-
-			-1	A is a null pointer
-
-			-2	p is a null pointer
-
-			-3	(unused, see colamd.c)
-
-			-4 	n is negative
-
-					stats [4]: n
-
-			-5	number of nonzeros in matrix is negative
-
-					stats [4]: # of nonzeros (p [n]).
-
-			-6	p [0] is nonzero
-
-					stats [4]: p [0]
-
-			-7	(unused)
-
-			-8	a column has a negative number of entries
-
-					stats [4]: column with < 0 entries
-					stats [5]: number of entries in col
-
-			-9	a row index is out of bounds
-
-					stats [4]: column with bad row index
-					stats [5]: bad row index
-					stats [6]: n_row, # of rows of matrx
-
-			-10	out of memory (unable to allocate temporary
-				workspace for M or count arrays using the
-				"allocate" routine passed into symamd).
-
-			-999	internal error.  colamd failed to order the
-				matrix M, when it should have succeeded.  This
-				indicates a bug.  If this (and *only* this)
-				error code occurs, please contact the authors.
-				Don't contact the authors if you get any other
-				error code.
-
-		Future versions may return more statistics in the stats array.
-
-	    void * (*allocate) (size_t, size_t)
-
-	    	A pointer to a function providing memory allocation.  The
-		allocated memory must be returned initialized to zero.  For a
-		C application, this argument should normally be a pointer to
-		calloc.  For a MATLAB mexFunction, the routine mxCalloc is
-		passed instead.
-
-	    void (*release) (size_t, size_t)
-
-	    	A pointer to a function that frees memory allocated by the
-		memory allocation routine above.  For a C application, this
-		argument should normally be a pointer to free.  For a MATLAB
-		mexFunction, the routine mxFree is passed instead.
-
-
-    ----------------------------------------------------------------------------
-    colamd_report:
-    ----------------------------------------------------------------------------
-
-	C syntax:
-
-	    #include "colamd.h"
-	    colamd_report (int stats [COLAMD_STATS]) ;
-
-	Purpose:
-
-	    Prints the error status and statistics recorded in the stats
-	    array on the standard error output (for a standard C routine)
-	    or on the MATLAB output (for a mexFunction).
-
-	Arguments:
-
-	    int stats [COLAMD_STATS] ;	Input only.  Statistics from colamd.
-
-
-    ----------------------------------------------------------------------------
-    symamd_report:
-    ----------------------------------------------------------------------------
-
-	C syntax:
-
-	    #include "colamd.h"
-	    symamd_report (int stats [COLAMD_STATS]) ;
-
-	Purpose:
-
-	    Prints the error status and statistics recorded in the stats
-	    array on the standard error output (for a standard C routine)
-	    or on the MATLAB output (for a mexFunction).
-
-	Arguments:
-
-	    int stats [COLAMD_STATS] ;	Input only.  Statistics from symamd.
-
-
-*/
-
-/* ========================================================================== */
-/* === Scaffolding code definitions  ======================================== */
-/* ========================================================================== */
-
-/* Ensure that debugging is turned off: */
-#ifndef NDEBUG
-#define NDEBUG
-#endif /* NDEBUG */
-
-/*
-   Our "scaffolding code" philosophy:  In our opinion, well-written library
-   code should keep its "debugging" code, and just normally have it turned off
-   by the compiler so as not to interfere with performance.  This serves
-   several purposes:
-
-   (1) assertions act as comments to the reader, telling you what the code
-	expects at that point.  All assertions will always be true (unless
-	there really is a bug, of course).
-
-   (2) leaving in the scaffolding code assists anyone who would like to modify
-	the code, or understand the algorithm (by reading the debugging output,
-	one can get a glimpse into what the code is doing).
-
-   (3) (gasp!) for actually finding bugs.  This code has been heavily tested
-	and "should" be fully functional and bug-free ... but you never know...
-
-    To enable debugging, comment out the "#define NDEBUG" above.  For a MATLAB
-    mexFunction, you will also need to modify mexopts.sh to remove the -DNDEBUG
-    definition.  The code will become outrageously slow when debugging is
-    enabled.  To control the level of debugging output, set an environment
-    variable D to 0 (little), 1 (some), 2, 3, or 4 (lots).  When debugging,
-    you should see the following message on the standard output:
-
-    	colamd: debug version, D = 1 (THIS WILL BE SLOW!)
-
-    or a similar message for symamd.  If you don't, then debugging has not
-    been enabled.
-
-*/
-
-/* ========================================================================== */
-/* === Include files ======================================================== */
-/* ========================================================================== */
-
-#include "colamd.h"
-#include <limits.h>
-
-#ifdef MATLAB_MEX_FILE
-#include "mex.h"
-#include "matrix.h"
-#else
-#include <stdio.h>
-#include <assert.h>
-#endif /* MATLAB_MEX_FILE */
-
-/* ========================================================================== */
-/* === Definitions ========================================================== */
-/* ========================================================================== */
-
-/* Routines are either PUBLIC (user-callable) or PRIVATE (not user-callable) */
-#define PUBLIC
-#define PRIVATE static
-
-#define MAX(a,b) (((a) > (b)) ? (a) : (b))
-#define MIN(a,b) (((a) < (b)) ? (a) : (b))
-
-#define ONES_COMPLEMENT(r) (-(r)-1)
-
-/* -------------------------------------------------------------------------- */
-/* Change for version 2.1:  define TRUE and FALSE only if not yet defined */  
-/* -------------------------------------------------------------------------- */
-
-#ifndef TRUE
-#define TRUE (1)
-#endif
-
-#ifndef FALSE
-#define FALSE (0)
-#endif
-
-/* -------------------------------------------------------------------------- */
-
-#define EMPTY	(-1)
-
-/* Row and column status */
-#define ALIVE	(0)
-#define DEAD	(-1)
-
-/* Column status */
-#define DEAD_PRINCIPAL		(-1)
-#define DEAD_NON_PRINCIPAL	(-2)
-
-/* Macros for row and column status update and checking. */
-#define ROW_IS_DEAD(r)			ROW_IS_MARKED_DEAD (Row[r].shared2.mark)
-#define ROW_IS_MARKED_DEAD(row_mark)	(row_mark < ALIVE)
-#define ROW_IS_ALIVE(r)			(Row [r].shared2.mark >= ALIVE)
-#define COL_IS_DEAD(c)			(Col [c].start < ALIVE)
-#define COL_IS_ALIVE(c)			(Col [c].start >= ALIVE)
-#define COL_IS_DEAD_PRINCIPAL(c)	(Col [c].start == DEAD_PRINCIPAL)
-#define KILL_ROW(r)			{ Row [r].shared2.mark = DEAD ; }
-#define KILL_PRINCIPAL_COL(c)		{ Col [c].start = DEAD_PRINCIPAL ; }
-#define KILL_NON_PRINCIPAL_COL(c)	{ Col [c].start = DEAD_NON_PRINCIPAL ; }
-
-/* ========================================================================== */
-/* === Colamd reporting mechanism =========================================== */
-/* ========================================================================== */
-
-#ifdef MATLAB_MEX_FILE
-
-/* use mexPrintf in a MATLAB mexFunction, for debugging and statistics output */
-#define PRINTF mexPrintf
-
-/* In MATLAB, matrices are 1-based to the user, but 0-based internally */
-#define INDEX(i) ((i)+1)
-
-#else
-
-/* Use printf in standard C environment, for debugging and statistics output. */
-/* Output is generated only if debugging is enabled at compile time, or if */
-/* the caller explicitly calls colamd_report or symamd_report. */
-#define PRINTF printf
-
-/* In C, matrices are 0-based and indices are reported as such in *_report */
-#define INDEX(i) (i)
-
-#endif /* MATLAB_MEX_FILE */
-
-/* ========================================================================== */
-/* === Prototypes of PRIVATE routines ======================================= */
-/* ========================================================================== */
-
-PRIVATE int init_rows_cols
-(
-    int n_row,
-    int n_col,
-    Colamd_Row Row [],
-    Colamd_Col Col [],
-    int A [],
-    int p [],
-    int stats [COLAMD_STATS]
-) ;
-
-PRIVATE void init_scoring
-(
-    int n_row,
-    int n_col,
-    Colamd_Row Row [],
-    Colamd_Col Col [],
-    int A [],
-    int head [],
-    double knobs [COLAMD_KNOBS],
-    int *p_n_row2,
-    int *p_n_col2,
-    int *p_max_deg
-) ;
-
-PRIVATE int find_ordering
-(
-    int n_row,
-    int n_col,
-    int Alen,
-    Colamd_Row Row [],
-    Colamd_Col Col [],
-    int A [],
-    int head [],
-    int n_col2,
-    int max_deg,
-    int pfree
-) ;
-
-PRIVATE void order_children
-(
-    int n_col,
-    Colamd_Col Col [],
-    int p []
-) ;
-
-PRIVATE void detect_super_cols
-(
-
-#ifndef NDEBUG
-    int n_col,
-    Colamd_Row Row [],
-#endif /* NDEBUG */
-
-    Colamd_Col Col [],
-    int A [],
-    int head [],
-    int row_start,
-    int row_length
-) ;
-
-PRIVATE int garbage_collection
-(
-    int n_row,
-    int n_col,
-    Colamd_Row Row [],
-    Colamd_Col Col [],
-    int A [],
-    int *pfree
-) ;
-
-PRIVATE int clear_mark
-(
-    int n_row,
-    Colamd_Row Row []
-) ;
-
-PRIVATE void print_report
-(
-    const char *method,
-    int stats [COLAMD_STATS]
-) ;
-
-/* ========================================================================== */
-/* === Debugging prototypes and definitions ================================= */
-/* ========================================================================== */
-
-#ifndef NDEBUG
-
-/* colamd_debug is the *ONLY* global variable, and is only */
-/* present when debugging */
-
-PRIVATE int colamd_debug ;	/* debug print level */
-
-#define DEBUG0(params) { (void) PRINTF params ; }
-#define DEBUG1(params) { if (colamd_debug >= 1) (void) PRINTF params ; }
-#define DEBUG2(params) { if (colamd_debug >= 2) (void) PRINTF params ; }
-#define DEBUG3(params) { if (colamd_debug >= 3) (void) PRINTF params ; }
-#define DEBUG4(params) { if (colamd_debug >= 4) (void) PRINTF params ; }
-
-#ifdef MATLAB_MEX_FILE
-#define ASSERT(expression) (mxAssert ((expression), ""))
-#else
-#define ASSERT(expression) (assert (expression))
-#endif /* MATLAB_MEX_FILE */
-
-PRIVATE void colamd_get_debug	/* gets the debug print level from getenv */
-(
-    char *method
-) ;
-
-PRIVATE void debug_deg_lists
-(
-    int n_row,
-    int n_col,
-    Colamd_Row Row [],
-    Colamd_Col Col [],
-    int head [],
-    int min_score,
-    int should,
-    int max_deg
-) ;
-
-PRIVATE void debug_mark
-(
-    int n_row,
-    Colamd_Row Row [],
-    int tag_mark,
-    int max_mark
-) ;
-
-PRIVATE void debug_matrix
-(
-    int n_row,
-    int n_col,
-    Colamd_Row Row [],
-    Colamd_Col Col [],
-    int A []
-) ;
-
-PRIVATE void debug_structures
-(
-    int n_row,
-    int n_col,
-    Colamd_Row Row [],
-    Colamd_Col Col [],
-    int A [],
-    int n_col2
-) ;
-
-#else /* NDEBUG */
-
-/* === No debugging ========================================================= */
-
-#define DEBUG0(params) ;
-#define DEBUG1(params) ;
-#define DEBUG2(params) ;
-#define DEBUG3(params) ;
-#define DEBUG4(params) ;
-
-#define ASSERT(expression) ((void) 0)
-
-#endif /* NDEBUG */
-
-/* ========================================================================== */
-
-
-
-/* ========================================================================== */
-/* === USER-CALLABLE ROUTINES: ============================================== */
-/* ========================================================================== */
-
-
-/* ========================================================================== */
-/* === colamd_recommended =================================================== */
-/* ========================================================================== */
-
-/*
-    The colamd_recommended routine returns the suggested size for Alen.  This
-    value has been determined to provide good balance between the number of
-    garbage collections and the memory requirements for colamd.  If any
-    argument is negative, a -1 is returned as an error condition.  This
-    function is also available as a macro defined in colamd.h, so that you
-    can use it for a statically-allocated array size.
-*/
-
-PUBLIC int colamd_recommended	/* returns recommended value of Alen. */
-(
-    /* === Parameters ======================================================= */
-
-    int nnz,			/* number of nonzeros in A */
-    int n_row,			/* number of rows in A */
-    int n_col			/* number of columns in A */
-)
-{
-    return (COLAMD_RECOMMENDED (nnz, n_row, n_col)) ; 
-}
-
-
-/* ========================================================================== */
-/* === colamd_set_defaults ================================================== */
-/* ========================================================================== */
-
-/*
-    The colamd_set_defaults routine sets the default values of the user-
-    controllable parameters for colamd:
-
-	knobs [0]	rows with knobs[0]*n_col entries or more are removed
-			prior to ordering in colamd.  Rows and columns with
-			knobs[0]*n_col entries or more are removed prior to
-			ordering in symamd and placed last in the output
-			ordering.
-
-	knobs [1]	columns with knobs[1]*n_row entries or more are removed
-			prior to ordering in colamd, and placed last in the
-			column permutation.  Symamd ignores this knob.
-
-	knobs [2..19]	unused, but future versions might use this
-*/
-
-PUBLIC void colamd_set_defaults
-(
-    /* === Parameters ======================================================= */
-
-    double knobs [COLAMD_KNOBS]		/* knob array */
-)
-{
-    /* === Local variables ================================================== */
-
-    int i ;
-
-    if (!knobs)
-    {
-	return ;			/* no knobs to initialize */
-    }
-    for (i = 0 ; i < COLAMD_KNOBS ; i++)
-    {
-	knobs [i] = 0 ;
-    }
-    knobs [COLAMD_DENSE_ROW] = 0.5 ;	/* ignore rows over 50% dense */
-    knobs [COLAMD_DENSE_COL] = 0.5 ;	/* ignore columns over 50% dense */
-}
-
-
-/* ========================================================================== */
-/* === symamd =============================================================== */
-/* ========================================================================== */
-
-PUBLIC int symamd			/* return TRUE if OK, FALSE otherwise */
-(
-    /* === Parameters ======================================================= */
-
-    int n,				/* number of rows and columns of A */
-    int A [],				/* row indices of A */
-    int p [],				/* column pointers of A */
-    int perm [],			/* output permutation, size n+1 */
-    double knobs [COLAMD_KNOBS],	/* parameters (uses defaults if NULL) */
-    int stats [COLAMD_STATS],		/* output statistics and error codes */
-    void * (*allocate) (size_t, size_t),
-    					/* pointer to calloc (ANSI C) or */
-					/* mxCalloc (for MATLAB mexFunction) */
-    void (*release) (void *)
-    					/* pointer to free (ANSI C) or */
-    					/* mxFree (for MATLAB mexFunction) */
-)
-{
-    /* === Local variables ================================================== */
-
-    int *count ;		/* length of each column of M, and col pointer*/
-    int *mark ;			/* mark array for finding duplicate entries */
-    int *M ;			/* row indices of matrix M */
-    int Mlen ;			/* length of M */
-    int n_row ;			/* number of rows in M */
-    int nnz ;			/* number of entries in A */
-    int i ;			/* row index of A */
-    int j ;			/* column index of A */
-    int k ;			/* row index of M */ 
-    int mnz ;			/* number of nonzeros in M */
-    int pp ;			/* index into a column of A */
-    int last_row ;		/* last row seen in the current column */
-    int length ;		/* number of nonzeros in a column */
-
-    double cknobs [COLAMD_KNOBS] ;		/* knobs for colamd */
-    double default_knobs [COLAMD_KNOBS] ;	/* default knobs for colamd */
-    int cstats [COLAMD_STATS] ;			/* colamd stats */
-
-#ifndef NDEBUG
-    colamd_get_debug ("symamd") ;
-#endif /* NDEBUG */
-
-    /* === Check the input arguments ======================================== */
-
-    if (!stats)
-    {
-	DEBUG0 (("symamd: stats not present\n")) ;
-	return (FALSE) ;
-    }
-    for (i = 0 ; i < COLAMD_STATS ; i++)
-    {
-	stats [i] = 0 ;
-    }
-    stats [COLAMD_STATUS] = COLAMD_OK ;
-    stats [COLAMD_INFO1] = -1 ;
-    stats [COLAMD_INFO2] = -1 ;
-
-    if (!A)
-    {
-    	stats [COLAMD_STATUS] = COLAMD_ERROR_A_not_present ;
-	DEBUG0 (("symamd: A not present\n")) ;
-	return (FALSE) ;
-    }
-
-    if (!p)		/* p is not present */
-    {
-	stats [COLAMD_STATUS] = COLAMD_ERROR_p_not_present ;
-	DEBUG0 (("symamd: p not present\n")) ;
-    	return (FALSE) ;
-    }
-
-    if (n < 0)		/* n must be >= 0 */
-    {
-	stats [COLAMD_STATUS] = COLAMD_ERROR_ncol_negative ;
-	stats [COLAMD_INFO1] = n ;
-	DEBUG0 (("symamd: n negative %d\n", n)) ;
-    	return (FALSE) ;
-    }
-
-    nnz = p [n] ;
-    if (nnz < 0)	/* nnz must be >= 0 */
-    {
-	stats [COLAMD_STATUS] = COLAMD_ERROR_nnz_negative ;
-	stats [COLAMD_INFO1] = nnz ;
-	DEBUG0 (("symamd: number of entries negative %d\n", nnz)) ;
-	return (FALSE) ;
-    }
-
-    if (p [0] != 0)
-    {
-	stats [COLAMD_STATUS] = COLAMD_ERROR_p0_nonzero ;
-	stats [COLAMD_INFO1] = p [0] ;
-	DEBUG0 (("symamd: p[0] not zero %d\n", p [0])) ;
-	return (FALSE) ;
-    }
-
-    /* === If no knobs, set default knobs =================================== */
-
-    if (!knobs)
-    {
-	colamd_set_defaults (default_knobs) ;
-	knobs = default_knobs ;
-    }
-
-    /* === Allocate count and mark ========================================== */
-
-    count = (int *) ((*allocate) (n+1, sizeof (int))) ;
-    if (!count)
-    {
-	stats [COLAMD_STATUS] = COLAMD_ERROR_out_of_memory ;
-	DEBUG0 (("symamd: allocate count (size %d) failed\n", n+1)) ;
-	return (FALSE) ;
-    }
-
-    mark = (int *) ((*allocate) (n+1, sizeof (int))) ;
-    if (!mark)
-    {
-	stats [COLAMD_STATUS] = COLAMD_ERROR_out_of_memory ;
-	(*release) ((void *) count) ;
-	DEBUG0 (("symamd: allocate mark (size %d) failed\n", n+1)) ;
-	return (FALSE) ;
-    }
-
-    /* === Compute column counts of M, check if A is valid ================== */
-
-    stats [COLAMD_INFO3] = 0 ;  /* number of duplicate or unsorted row indices*/
-
-    for (i = 0 ; i < n ; i++)
-    {
-    	mark [i] = -1 ;
-    }
-
-    for (j = 0 ; j < n ; j++)
-    {
-	last_row = -1 ;
-
-	length = p [j+1] - p [j] ;
-	if (length < 0)
-	{
-	    /* column pointers must be non-decreasing */
-	    stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ;
-	    stats [COLAMD_INFO1] = j ;
-	    stats [COLAMD_INFO2] = length ;
-	    (*release) ((void *) count) ;
-	    (*release) ((void *) mark) ;
-	    DEBUG0 (("symamd: col %d negative length %d\n", j, length)) ;
-	    return (FALSE) ;
-	}
-
-	for (pp = p [j] ; pp < p [j+1] ; pp++)
-	{
-	    i = A [pp] ;
-	    if (i < 0 || i >= n)
-	    {
-		/* row index i, in column j, is out of bounds */
-		stats [COLAMD_STATUS] = COLAMD_ERROR_row_index_out_of_bounds ;
-		stats [COLAMD_INFO1] = j ;
-		stats [COLAMD_INFO2] = i ;
-		stats [COLAMD_INFO3] = n ;
-		(*release) ((void *) count) ;
-		(*release) ((void *) mark) ;
-		DEBUG0 (("symamd: row %d col %d out of bounds\n", i, j)) ;
-		return (FALSE) ;
-	    }
-
-	    if (i <= last_row || mark [i] == j)
-	    {
-		/* row index is unsorted or repeated (or both), thus col */
-		/* is jumbled.  This is a notice, not an error condition. */
-		stats [COLAMD_STATUS] = COLAMD_OK_BUT_JUMBLED ;
-		stats [COLAMD_INFO1] = j ;
-		stats [COLAMD_INFO2] = i ;
-		(stats [COLAMD_INFO3]) ++ ;
-		DEBUG1 (("symamd: row %d col %d unsorted/duplicate\n", i, j)) ;
-	    }
-
-	    if (i > j && mark [i] != j)
-	    {
-		/* row k of M will contain column indices i and j */
-		count [i]++ ;
-		count [j]++ ;
-	    }
-
-	    /* mark the row as having been seen in this column */
-	    mark [i] = j ;
-
-	    last_row = i ;
-	}
-    }
-
-    if (stats [COLAMD_STATUS] == COLAMD_OK)
-    {
-	/* if there are no duplicate entries, then mark is no longer needed */
-	(*release) ((void *) mark) ;
-    }
-
-    /* === Compute column pointers of M ===================================== */
-
-    /* use output permutation, perm, for column pointers of M */
-    perm [0] = 0 ;
-    for (j = 1 ; j <= n ; j++)
-    {
-	perm [j] = perm [j-1] + count [j-1] ;
-    }
-    for (j = 0 ; j < n ; j++)
-    {
-	count [j] = perm [j] ;
-    }
-
-    /* === Construct M ====================================================== */
-
-    mnz = perm [n] ;
-    n_row = mnz / 2 ;
-    Mlen = colamd_recommended (mnz, n_row, n) ;
-    M = (int *) ((*allocate) (Mlen, sizeof (int))) ;
-    DEBUG0 (("symamd: M is %d-by-%d with %d entries, Mlen = %d\n",
-    	n_row, n, mnz, Mlen)) ;
-
-    if (!M)
-    {
-	stats [COLAMD_STATUS] = COLAMD_ERROR_out_of_memory ;
-	(*release) ((void *) count) ;
-	(*release) ((void *) mark) ;
-	DEBUG0 (("symamd: allocate M (size %d) failed\n", Mlen)) ;
-	return (FALSE) ;
-    }
-
-    k = 0 ;
-
-    if (stats [COLAMD_STATUS] == COLAMD_OK)
-    {
-	/* Matrix is OK */
-	for (j = 0 ; j < n ; j++)
-	{
-	    ASSERT (p [j+1] - p [j] >= 0) ;
-	    for (pp = p [j] ; pp < p [j+1] ; pp++)
-	    {
-		i = A [pp] ;
-		ASSERT (i >= 0 && i < n) ;
-		if (i > j)
-		{
-		    /* row k of M contains column indices i and j */
-		    M [count [i]++] = k ;
-		    M [count [j]++] = k ;
-		    k++ ;
-		}
-	    }
-	}
-    }
-    else
-    {
-	/* Matrix is jumbled.  Do not add duplicates to M.  Unsorted cols OK. */
-	DEBUG0 (("symamd: Duplicates in A.\n")) ;
-	for (i = 0 ; i < n ; i++)
-	{
-	    mark [i] = -1 ;
-	}
-	for (j = 0 ; j < n ; j++)
-	{
-	    ASSERT (p [j+1] - p [j] >= 0) ;
-	    for (pp = p [j] ; pp < p [j+1] ; pp++)
-	    {
-		i = A [pp] ;
-		ASSERT (i >= 0 && i < n) ;
-		if (i > j && mark [i] != j)
-		{
-		    /* row k of M contains column indices i and j */
-		    M [count [i]++] = k ;
-		    M [count [j]++] = k ;
-		    k++ ;
-		    mark [i] = j ;
-		}
-	    }
-	}
-	(*release) ((void *) mark) ;
-    }
-
-    /* count and mark no longer needed */
-    (*release) ((void *) count) ;
-    ASSERT (k == n_row) ;
-
-    /* === Adjust the knobs for M =========================================== */
-
-    for (i = 0 ; i < COLAMD_KNOBS ; i++)
-    {
-	cknobs [i] = knobs [i] ;
-    }
-
-    /* there are no dense rows in M */
-    cknobs [COLAMD_DENSE_ROW] = 1.0 ;
-
-    if (n_row != 0 && n < n_row)
-    {
-	/* On input, the knob is a fraction of 1..n, the number of rows of A. */
-	/* Convert it to a fraction of 1..n_row, of the number of rows of M. */
-    	cknobs [COLAMD_DENSE_COL] = (knobs [COLAMD_DENSE_ROW] * n) / n_row ;
-    }
-    else
-    {
-	/* no dense columns in M */
-    	cknobs [COLAMD_DENSE_COL] = 1.0 ;
-    }
-
-    DEBUG0 (("symamd: dense col knob for M: %g\n", cknobs [COLAMD_DENSE_COL])) ;
-
-    /* === Order the columns of M =========================================== */
-
-    if (!colamd (n_row, n, Mlen, M, perm, cknobs, cstats))
-    {
-	/* This "cannot" happen, unless there is a bug in the code. */
-	stats [COLAMD_STATUS] = COLAMD_ERROR_internal_error ;
-	(*release) ((void *) M) ;
-	DEBUG0 (("symamd: internal error!\n")) ;
-	return (FALSE) ;
-    }
-
-    /* Note that the output permutation is now in perm */
-
-    /* === get the statistics for symamd from colamd ======================== */
-
-    /* note that a dense column in colamd means a dense row and col in symamd */
-    stats [COLAMD_DENSE_ROW]    = cstats [COLAMD_DENSE_COL] ;
-    stats [COLAMD_DENSE_COL]    = cstats [COLAMD_DENSE_COL] ;
-    stats [COLAMD_DEFRAG_COUNT] = cstats [COLAMD_DEFRAG_COUNT] ;
-
-    /* === Free M =========================================================== */
-
-    (*release) ((void *) M) ;
-    DEBUG0 (("symamd: done.\n")) ;
-    return (TRUE) ;
-
-}
-
-/* ========================================================================== */
-/* === colamd =============================================================== */
-/* ========================================================================== */
-
-/*
-    The colamd routine computes a column ordering Q of a sparse matrix
-    A such that the LU factorization P(AQ) = LU remains sparse, where P is
-    selected via partial pivoting.   The routine can also be viewed as
-    providing a permutation Q such that the Cholesky factorization
-    (AQ)'(AQ) = LL' remains sparse.
-*/
-
-PUBLIC int colamd		/* returns TRUE if successful, FALSE otherwise*/
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,			/* number of rows in A */
-    int n_col,			/* number of columns in A */
-    int Alen,			/* length of A */
-    int A [],			/* row indices of A */
-    int p [],			/* pointers to columns in A */
-    double knobs [COLAMD_KNOBS],/* parameters (uses defaults if NULL) */
-    int stats [COLAMD_STATS]	/* output statistics and error codes */
-)
-{
-    /* === Local variables ================================================== */
-
-    int i ;			/* loop index */
-    int nnz ;			/* nonzeros in A */
-    int Row_size ;		/* size of Row [], in integers */
-    int Col_size ;		/* size of Col [], in integers */
-    int need ;			/* minimum required length of A */
-    Colamd_Row *Row ;		/* pointer into A of Row [0..n_row] array */
-    Colamd_Col *Col ;		/* pointer into A of Col [0..n_col] array */
-    int n_col2 ;		/* number of non-dense, non-empty columns */
-    int n_row2 ;		/* number of non-dense, non-empty rows */
-    int ngarbage ;		/* number of garbage collections performed */
-    int max_deg ;		/* maximum row degree */
-    double default_knobs [COLAMD_KNOBS] ;	/* default knobs array */
-
-#ifndef NDEBUG
-    colamd_get_debug ("colamd") ;
-#endif /* NDEBUG */
-
-    /* === Check the input arguments ======================================== */
-
-    if (!stats)
-    {
-	DEBUG0 (("colamd: stats not present\n")) ;
-	return (FALSE) ;
-    }
-    for (i = 0 ; i < COLAMD_STATS ; i++)
-    {
-	stats [i] = 0 ;
-    }
-    stats [COLAMD_STATUS] = COLAMD_OK ;
-    stats [COLAMD_INFO1] = -1 ;
-    stats [COLAMD_INFO2] = -1 ;
-
-    if (!A)		/* A is not present */
-    {
-	stats [COLAMD_STATUS] = COLAMD_ERROR_A_not_present ;
-	DEBUG0 (("colamd: A not present\n")) ;
-	return (FALSE) ;
-    }
-
-    if (!p)		/* p is not present */
-    {
-	stats [COLAMD_STATUS] = COLAMD_ERROR_p_not_present ;
-	DEBUG0 (("colamd: p not present\n")) ;
-    	return (FALSE) ;
-    }
-
-    if (n_row < 0)	/* n_row must be >= 0 */
-    {
-	stats [COLAMD_STATUS] = COLAMD_ERROR_nrow_negative ;
-	stats [COLAMD_INFO1] = n_row ;
-	DEBUG0 (("colamd: nrow negative %d\n", n_row)) ;
-    	return (FALSE) ;
-    }
-
-    if (n_col < 0)	/* n_col must be >= 0 */
-    {
-	stats [COLAMD_STATUS] = COLAMD_ERROR_ncol_negative ;
-	stats [COLAMD_INFO1] = n_col ;
-	DEBUG0 (("colamd: ncol negative %d\n", n_col)) ;
-    	return (FALSE) ;
-    }
-
-    nnz = p [n_col] ;
-    if (nnz < 0)	/* nnz must be >= 0 */
-    {
-	stats [COLAMD_STATUS] = COLAMD_ERROR_nnz_negative ;
-	stats [COLAMD_INFO1] = nnz ;
-	DEBUG0 (("colamd: number of entries negative %d\n", nnz)) ;
-	return (FALSE) ;
-    }
-
-    if (p [0] != 0)
-    {
-	stats [COLAMD_STATUS] = COLAMD_ERROR_p0_nonzero	;
-	stats [COLAMD_INFO1] = p [0] ;
-	DEBUG0 (("colamd: p[0] not zero %d\n", p [0])) ;
-	return (FALSE) ;
-    }
-
-    /* === If no knobs, set default knobs =================================== */
-
-    if (!knobs)
-    {
-	colamd_set_defaults (default_knobs) ;
-	knobs = default_knobs ;
-    }
-
-    /* === Allocate the Row and Col arrays from array A ===================== */
-
-    Col_size = COLAMD_C (n_col) ;
-    Row_size = COLAMD_R (n_row) ;
-    need = 2*nnz + n_col + Col_size + Row_size ;
-
-    if (need > Alen)
-    {
-	/* not enough space in array A to perform the ordering */
-	stats [COLAMD_STATUS] = COLAMD_ERROR_A_too_small ;
-	stats [COLAMD_INFO1] = need ;
-	stats [COLAMD_INFO2] = Alen ;
-	DEBUG0 (("colamd: Need Alen >= %d, given only Alen = %d\n", need,Alen));
-	return (FALSE) ;
-    }
-
-    Alen -= Col_size + Row_size ;
-    Col = (Colamd_Col *) &A [Alen] ;
-    Row = (Colamd_Row *) &A [Alen + Col_size] ;
-
-    /* === Construct the row and column data structures ===================== */
-
-    if (!init_rows_cols (n_row, n_col, Row, Col, A, p, stats))
-    {
-	/* input matrix is invalid */
-	DEBUG0 (("colamd: Matrix invalid\n")) ;
-	return (FALSE) ;
-    }
-
-    /* === Initialize scores, kill dense rows/columns ======================= */
-
-    init_scoring (n_row, n_col, Row, Col, A, p, knobs,
-	&n_row2, &n_col2, &max_deg) ;
-
-    /* === Order the supercolumns =========================================== */
-
-    ngarbage = find_ordering (n_row, n_col, Alen, Row, Col, A, p,
-	n_col2, max_deg, 2*nnz) ;
-
-    /* === Order the non-principal columns ================================== */
-
-    order_children (n_col, Col, p) ;
-
-    /* === Return statistics in stats ======================================= */
-
-    stats [COLAMD_DENSE_ROW] = n_row - n_row2 ;
-    stats [COLAMD_DENSE_COL] = n_col - n_col2 ;
-    stats [COLAMD_DEFRAG_COUNT] = ngarbage ;
-    DEBUG0 (("colamd: done.\n")) ; 
-    return (TRUE) ;
-}
-
-
-/* ========================================================================== */
-/* === colamd_report ======================================================== */
-/* ========================================================================== */
-
-PUBLIC void colamd_report
-(
-    int stats [COLAMD_STATS]
-)
-{
-    print_report ("colamd", stats) ;
-}
-
-
-/* ========================================================================== */
-/* === symamd_report ======================================================== */
-/* ========================================================================== */
-
-PUBLIC void symamd_report
-(
-    int stats [COLAMD_STATS]
-)
-{
-    print_report ("symamd", stats) ;
-}
-
-
-
-/* ========================================================================== */
-/* === NON-USER-CALLABLE ROUTINES: ========================================== */
-/* ========================================================================== */
-
-/* There are no user-callable routines beyond this point in the file */
-
-
-/* ========================================================================== */
-/* === init_rows_cols ======================================================= */
-/* ========================================================================== */
-
-/*
-    Takes the column form of the matrix in A and creates the row form of the
-    matrix.  Also, row and column attributes are stored in the Col and Row
-    structs.  If the columns are un-sorted or contain duplicate row indices,
-    this routine will also sort and remove duplicate row indices from the
-    column form of the matrix.  Returns FALSE if the matrix is invalid,
-    TRUE otherwise.  Not user-callable.
-*/
-
-PRIVATE int init_rows_cols	/* returns TRUE if OK, or FALSE otherwise */
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,			/* number of rows of A */
-    int n_col,			/* number of columns of A */
-    Colamd_Row Row [],		/* of size n_row+1 */
-    Colamd_Col Col [],		/* of size n_col+1 */
-    int A [],			/* row indices of A, of size Alen */
-    int p [],			/* pointers to columns in A, of size n_col+1 */
-    int stats [COLAMD_STATS]	/* colamd statistics */ 
-)
-{
-    /* === Local variables ================================================== */
-
-    int col ;			/* a column index */
-    int row ;			/* a row index */
-    int *cp ;			/* a column pointer */
-    int *cp_end ;		/* a pointer to the end of a column */
-    int *rp ;			/* a row pointer */
-    int *rp_end ;		/* a pointer to the end of a row */
-    int last_row ;		/* previous row */
-
-    /* === Initialize columns, and check column pointers ==================== */
-
-    for (col = 0 ; col < n_col ; col++)
-    {
-	Col [col].start = p [col] ;
-	Col [col].length = p [col+1] - p [col] ;
-
-	if (Col [col].length < 0)
-	{
-	    /* column pointers must be non-decreasing */
-	    stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ;
-	    stats [COLAMD_INFO1] = col ;
-	    stats [COLAMD_INFO2] = Col [col].length ;
-	    DEBUG0 (("colamd: col %d length %d < 0\n", col, Col [col].length)) ;
-	    return (FALSE) ;
-	}
-
-	Col [col].shared1.thickness = 1 ;
-	Col [col].shared2.score = 0 ;
-	Col [col].shared3.prev = EMPTY ;
-	Col [col].shared4.degree_next = EMPTY ;
-    }
-
-    /* p [0..n_col] no longer needed, used as "head" in subsequent routines */
-
-    /* === Scan columns, compute row degrees, and check row indices ========= */
-
-    stats [COLAMD_INFO3] = 0 ;	/* number of duplicate or unsorted row indices*/
-
-    for (row = 0 ; row < n_row ; row++)
-    {
-	Row [row].length = 0 ;
-	Row [row].shared2.mark = -1 ;
-    }
-
-    for (col = 0 ; col < n_col ; col++)
-    {
-	last_row = -1 ;
-
-	cp = &A [p [col]] ;
-	cp_end = &A [p [col+1]] ;
-
-	while (cp < cp_end)
-	{
-	    row = *cp++ ;
-
-	    /* make sure row indices within range */
-	    if (row < 0 || row >= n_row)
-	    {
-		stats [COLAMD_STATUS] = COLAMD_ERROR_row_index_out_of_bounds ;
-		stats [COLAMD_INFO1] = col ;
-		stats [COLAMD_INFO2] = row ;
-		stats [COLAMD_INFO3] = n_row ;
-		DEBUG0 (("colamd: row %d col %d out of bounds\n", row, col)) ;
-		return (FALSE) ;
-	    }
-
-	    if (row <= last_row || Row [row].shared2.mark == col)
-	    {
-		/* row index are unsorted or repeated (or both), thus col */
-		/* is jumbled.  This is a notice, not an error condition. */
-		stats [COLAMD_STATUS] = COLAMD_OK_BUT_JUMBLED ;
-		stats [COLAMD_INFO1] = col ;
-		stats [COLAMD_INFO2] = row ;
-		(stats [COLAMD_INFO3]) ++ ;
-		DEBUG1 (("colamd: row %d col %d unsorted/duplicate\n",row,col));
-	    }
-
-	    if (Row [row].shared2.mark != col)
-	    {
-		Row [row].length++ ;
-	    }
-	    else
-	    {
-		/* this is a repeated entry in the column, */
-		/* it will be removed */
-		Col [col].length-- ;
-	    }
-
-	    /* mark the row as having been seen in this column */
-	    Row [row].shared2.mark = col ;
-
-	    last_row = row ;
-	}
-    }
-
-    /* === Compute row pointers ============================================= */
-
-    /* row form of the matrix starts directly after the column */
-    /* form of matrix in A */
-    Row [0].start = p [n_col] ;
-    Row [0].shared1.p = Row [0].start ;
-    Row [0].shared2.mark = -1 ;
-    for (row = 1 ; row < n_row ; row++)
-    {
-	Row [row].start = Row [row-1].start + Row [row-1].length ;
-	Row [row].shared1.p = Row [row].start ;
-	Row [row].shared2.mark = -1 ;
-    }
-
-    /* === Create row form ================================================== */
-
-    if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED)
-    {
-	/* if cols jumbled, watch for repeated row indices */
-	for (col = 0 ; col < n_col ; col++)
-	{
-	    cp = &A [p [col]] ;
-	    cp_end = &A [p [col+1]] ;
-	    while (cp < cp_end)
-	    {
-		row = *cp++ ;
-		if (Row [row].shared2.mark != col)
-		{
-		    A [(Row [row].shared1.p)++] = col ;
-		    Row [row].shared2.mark = col ;
-		}
-	    }
-	}
-    }
-    else
-    {
-	/* if cols not jumbled, we don't need the mark (this is faster) */
-	for (col = 0 ; col < n_col ; col++)
-	{
-	    cp = &A [p [col]] ;
-	    cp_end = &A [p [col+1]] ;
-	    while (cp < cp_end)
-	    {
-		A [(Row [*cp++].shared1.p)++] = col ;
-	    }
-	}
-    }
-
-    /* === Clear the row marks and set row degrees ========================== */
-
-    for (row = 0 ; row < n_row ; row++)
-    {
-	Row [row].shared2.mark = 0 ;
-	Row [row].shared1.degree = Row [row].length ;
-    }
-
-    /* === See if we need to re-create columns ============================== */
-
-    if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED)
-    {
-    	DEBUG0 (("colamd: reconstructing column form, matrix jumbled\n")) ;
-
-#ifndef NDEBUG
-	/* make sure column lengths are correct */
-	for (col = 0 ; col < n_col ; col++)
-	{
-	    p [col] = Col [col].length ;
-	}
-	for (row = 0 ; row < n_row ; row++)
-	{
-	    rp = &A [Row [row].start] ;
-	    rp_end = rp + Row [row].length ;
-	    while (rp < rp_end)
-	    {
-		p [*rp++]-- ;
-	    }
-	}
-	for (col = 0 ; col < n_col ; col++)
-	{
-	    ASSERT (p [col] == 0) ;
-	}
-	/* now p is all zero (different than when debugging is turned off) */
-#endif /* NDEBUG */
-
-	/* === Compute col pointers ========================================= */
-
-	/* col form of the matrix starts at A [0]. */
-	/* Note, we may have a gap between the col form and the row */
-	/* form if there were duplicate entries, if so, it will be */
-	/* removed upon the first garbage collection */
-	Col [0].start = 0 ;
-	p [0] = Col [0].start ;
-	for (col = 1 ; col < n_col ; col++)
-	{
-	    /* note that the lengths here are for pruned columns, i.e. */
-	    /* no duplicate row indices will exist for these columns */
-	    Col [col].start = Col [col-1].start + Col [col-1].length ;
-	    p [col] = Col [col].start ;
-	}
-
-	/* === Re-create col form =========================================== */
-
-	for (row = 0 ; row < n_row ; row++)
-	{
-	    rp = &A [Row [row].start] ;
-	    rp_end = rp + Row [row].length ;
-	    while (rp < rp_end)
-	    {
-		A [(p [*rp++])++] = row ;
-	    }
-	}
-    }
-
-    /* === Done.  Matrix is not (or no longer) jumbled ====================== */
-
-    return (TRUE) ;
-}
-
-
-/* ========================================================================== */
-/* === init_scoring ========================================================= */
-/* ========================================================================== */
-
-/*
-    Kills dense or empty columns and rows, calculates an initial score for
-    each column, and places all columns in the degree lists.  Not user-callable.
-*/
-
-PRIVATE void init_scoring
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,			/* number of rows of A */
-    int n_col,			/* number of columns of A */
-    Colamd_Row Row [],		/* of size n_row+1 */
-    Colamd_Col Col [],		/* of size n_col+1 */
-    int A [],			/* column form and row form of A */
-    int head [],		/* of size n_col+1 */
-    double knobs [COLAMD_KNOBS],/* parameters */
-    int *p_n_row2,		/* number of non-dense, non-empty rows */
-    int *p_n_col2,		/* number of non-dense, non-empty columns */
-    int *p_max_deg		/* maximum row degree */
-)
-{
-    /* === Local variables ================================================== */
-
-    int c ;			/* a column index */
-    int r, row ;		/* a row index */
-    int *cp ;			/* a column pointer */
-    int deg ;			/* degree of a row or column */
-    int *cp_end ;		/* a pointer to the end of a column */
-    int *new_cp ;		/* new column pointer */
-    int col_length ;		/* length of pruned column */
-    int score ;			/* current column score */
-    int n_col2 ;		/* number of non-dense, non-empty columns */
-    int n_row2 ;		/* number of non-dense, non-empty rows */
-    int dense_row_count ;	/* remove rows with more entries than this */
-    int dense_col_count ;	/* remove cols with more entries than this */
-    int min_score ;		/* smallest column score */
-    int max_deg ;		/* maximum row degree */
-    int next_col ;		/* Used to add to degree list.*/
-
-#ifndef NDEBUG
-    int debug_count ;		/* debug only. */
-#endif /* NDEBUG */
-
-    /* === Extract knobs ==================================================== */
-
-    dense_row_count = MAX (0, MIN (knobs [COLAMD_DENSE_ROW] * n_col, n_col)) ;
-    dense_col_count = MAX (0, MIN (knobs [COLAMD_DENSE_COL] * n_row, n_row)) ;
-    DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ;
-    max_deg = 0 ;
-    n_col2 = n_col ;
-    n_row2 = n_row ;
-
-    /* === Kill empty columns =============================================== */
-
-    /* Put the empty columns at the end in their natural order, so that LU */
-    /* factorization can proceed as far as possible. */
-    for (c = n_col-1 ; c >= 0 ; c--)
-    {
-	deg = Col [c].length ;
-	if (deg == 0)
-	{
-	    /* this is a empty column, kill and order it last */
-	    Col [c].shared2.order = --n_col2 ;
-	    KILL_PRINCIPAL_COL (c) ;
-	}
-    }
-    DEBUG1 (("colamd: null columns killed: %d\n", n_col - n_col2)) ;
-
-    /* === Kill dense columns =============================================== */
-
-    /* Put the dense columns at the end, in their natural order */
-    for (c = n_col-1 ; c >= 0 ; c--)
-    {
-	/* skip any dead columns */
-	if (COL_IS_DEAD (c))
-	{
-	    continue ;
-	}
-	deg = Col [c].length ;
-	if (deg > dense_col_count)
-	{
-	    /* this is a dense column, kill and order it last */
-	    Col [c].shared2.order = --n_col2 ;
-	    /* decrement the row degrees */
-	    cp = &A [Col [c].start] ;
-	    cp_end = cp + Col [c].length ;
-	    while (cp < cp_end)
-	    {
-		Row [*cp++].shared1.degree-- ;
-	    }
-	    KILL_PRINCIPAL_COL (c) ;
-	}
-    }
-    DEBUG1 (("colamd: Dense and null columns killed: %d\n", n_col - n_col2)) ;
-
-    /* === Kill dense and empty rows ======================================== */
-
-    for (r = 0 ; r < n_row ; r++)
-    {
-	deg = Row [r].shared1.degree ;
-	ASSERT (deg >= 0 && deg <= n_col) ;
-	if (deg > dense_row_count || deg == 0)
-	{
-	    /* kill a dense or empty row */
-	    KILL_ROW (r) ;
-	    --n_row2 ;
-	}
-	else
-	{
-	    /* keep track of max degree of remaining rows */
-	    max_deg = MAX (max_deg, deg) ;
-	}
-    }
-    DEBUG1 (("colamd: Dense and null rows killed: %d\n", n_row - n_row2)) ;
-
-    /* === Compute initial column scores ==================================== */
-
-    /* At this point the row degrees are accurate.  They reflect the number */
-    /* of "live" (non-dense) columns in each row.  No empty rows exist. */
-    /* Some "live" columns may contain only dead rows, however.  These are */
-    /* pruned in the code below. */
-
-    /* now find the initial matlab score for each column */
-    for (c = n_col-1 ; c >= 0 ; c--)
-    {
-	/* skip dead column */
-	if (COL_IS_DEAD (c))
-	{
-	    continue ;
-	}
-	score = 0 ;
-	cp = &A [Col [c].start] ;
-	new_cp = cp ;
-	cp_end = cp + Col [c].length ;
-	while (cp < cp_end)
-	{
-	    /* get a row */
-	    row = *cp++ ;
-	    /* skip if dead */
-	    if (ROW_IS_DEAD (row))
-	    {
-		continue ;
-	    }
-	    /* compact the column */
-	    *new_cp++ = row ;
-	    /* add row's external degree */
-	    score += Row [row].shared1.degree - 1 ;
-	    /* guard against integer overflow */
-	    score = MIN (score, n_col) ;
-	}
-	/* determine pruned column length */
-	col_length = (int) (new_cp - &A [Col [c].start]) ;
-	if (col_length == 0)
-	{
-	    /* a newly-made null column (all rows in this col are "dense" */
-	    /* and have already been killed) */
-	    DEBUG2 (("Newly null killed: %d\n", c)) ;
-	    Col [c].shared2.order = --n_col2 ;
-	    KILL_PRINCIPAL_COL (c) ;
-	}
-	else
-	{
-	    /* set column length and set score */
-	    ASSERT (score >= 0) ;
-	    ASSERT (score <= n_col) ;
-	    Col [c].length = col_length ;
-	    Col [c].shared2.score = score ;
-	}
-    }
-    DEBUG1 (("colamd: Dense, null, and newly-null columns killed: %d\n",
-    	n_col-n_col2)) ;
-
-    /* At this point, all empty rows and columns are dead.  All live columns */
-    /* are "clean" (containing no dead rows) and simplicial (no supercolumns */
-    /* yet).  Rows may contain dead columns, but all live rows contain at */
-    /* least one live column. */
-
-#ifndef NDEBUG
-    debug_structures (n_row, n_col, Row, Col, A, n_col2) ;
-#endif /* NDEBUG */
-
-    /* === Initialize degree lists ========================================== */
-
-#ifndef NDEBUG
-    debug_count = 0 ;
-#endif /* NDEBUG */
-
-    /* clear the hash buckets */
-    for (c = 0 ; c <= n_col ; c++)
-    {
-	head [c] = EMPTY ;
-    }
-    min_score = n_col ;
-    /* place in reverse order, so low column indices are at the front */
-    /* of the lists.  This is to encourage natural tie-breaking */
-    for (c = n_col-1 ; c >= 0 ; c--)
-    {
-	/* only add principal columns to degree lists */
-	if (COL_IS_ALIVE (c))
-	{
-	    DEBUG4 (("place %d score %d minscore %d ncol %d\n",
-		c, Col [c].shared2.score, min_score, n_col)) ;
-
-	    /* === Add columns score to DList =============================== */
-
-	    score = Col [c].shared2.score ;
-
-	    ASSERT (min_score >= 0) ;
-	    ASSERT (min_score <= n_col) ;
-	    ASSERT (score >= 0) ;
-	    ASSERT (score <= n_col) ;
-	    ASSERT (head [score] >= EMPTY) ;
-
-	    /* now add this column to dList at proper score location */
-	    next_col = head [score] ;
-	    Col [c].shared3.prev = EMPTY ;
-	    Col [c].shared4.degree_next = next_col ;
-
-	    /* if there already was a column with the same score, set its */
-	    /* previous pointer to this new column */
-	    if (next_col != EMPTY)
-	    {
-		Col [next_col].shared3.prev = c ;
-	    }
-	    head [score] = c ;
-
-	    /* see if this score is less than current min */
-	    min_score = MIN (min_score, score) ;
-
-#ifndef NDEBUG
-	    debug_count++ ;
-#endif /* NDEBUG */
-
-	}
-    }
-
-#ifndef NDEBUG
-    DEBUG1 (("colamd: Live cols %d out of %d, non-princ: %d\n",
-	debug_count, n_col, n_col-debug_count)) ;
-    ASSERT (debug_count == n_col2) ;
-    debug_deg_lists (n_row, n_col, Row, Col, head, min_score, n_col2, max_deg) ;
-#endif /* NDEBUG */
-
-    /* === Return number of remaining columns, and max row degree =========== */
-
-    *p_n_col2 = n_col2 ;
-    *p_n_row2 = n_row2 ;
-    *p_max_deg = max_deg ;
-}
-
-
-/* ========================================================================== */
-/* === find_ordering ======================================================== */
-/* ========================================================================== */
-
-/*
-    Order the principal columns of the supercolumn form of the matrix
-    (no supercolumns on input).  Uses a minimum approximate column minimum
-    degree ordering method.  Not user-callable.
-*/
-
-PRIVATE int find_ordering	/* return the number of garbage collections */
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,			/* number of rows of A */
-    int n_col,			/* number of columns of A */
-    int Alen,			/* size of A, 2*nnz + n_col or larger */
-    Colamd_Row Row [],		/* of size n_row+1 */
-    Colamd_Col Col [],		/* of size n_col+1 */
-    int A [],			/* column form and row form of A */
-    int head [],		/* of size n_col+1 */
-    int n_col2,			/* Remaining columns to order */
-    int max_deg,		/* Maximum row degree */
-    int pfree			/* index of first free slot (2*nnz on entry) */
-)
-{
-    /* === Local variables ================================================== */
-
-    int k ;			/* current pivot ordering step */
-    int pivot_col ;		/* current pivot column */
-    int *cp ;			/* a column pointer */
-    int *rp ;			/* a row pointer */
-    int pivot_row ;		/* current pivot row */
-    int *new_cp ;		/* modified column pointer */
-    int *new_rp ;		/* modified row pointer */
-    int pivot_row_start ;	/* pointer to start of pivot row */
-    int pivot_row_degree ;	/* number of columns in pivot row */
-    int pivot_row_length ;	/* number of supercolumns in pivot row */
-    int pivot_col_score ;	/* score of pivot column */
-    int needed_memory ;		/* free space needed for pivot row */
-    int *cp_end ;		/* pointer to the end of a column */
-    int *rp_end ;		/* pointer to the end of a row */
-    int row ;			/* a row index */
-    int col ;			/* a column index */
-    int max_score ;		/* maximum possible score */
-    int cur_score ;		/* score of current column */
-    unsigned int hash ;		/* hash value for supernode detection */
-    int head_column ;		/* head of hash bucket */
-    int first_col ;		/* first column in hash bucket */
-    int tag_mark ;		/* marker value for mark array */
-    int row_mark ;		/* Row [row].shared2.mark */
-    int set_difference ;	/* set difference size of row with pivot row */
-    int min_score ;		/* smallest column score */
-    int col_thickness ;		/* "thickness" (no. of columns in a supercol) */
-    int max_mark ;		/* maximum value of tag_mark */
-    int pivot_col_thickness ;	/* number of columns represented by pivot col */
-    int prev_col ;		/* Used by Dlist operations. */
-    int next_col ;		/* Used by Dlist operations. */
-    int ngarbage ;		/* number of garbage collections performed */
-
-#ifndef NDEBUG
-    int debug_d ;		/* debug loop counter */
-    int debug_step = 0 ;	/* debug loop counter */
-#endif /* NDEBUG */
-
-    /* === Initialization and clear mark ==================================== */
-
-    max_mark = INT_MAX - n_col ;	/* INT_MAX defined in <limits.h> */
-    tag_mark = clear_mark (n_row, Row) ;
-    min_score = 0 ;
-    ngarbage = 0 ;
-    DEBUG1 (("colamd: Ordering, n_col2=%d\n", n_col2)) ;
-
-    /* === Order the columns ================================================ */
-
-    for (k = 0 ; k < n_col2 ; /* 'k' is incremented below */)
-    {
-
-#ifndef NDEBUG
-	if (debug_step % 100 == 0)
-	{
-	    DEBUG2 (("\n...       Step k: %d out of n_col2: %d\n", k, n_col2)) ;
-	}
-	else
-	{
-	    DEBUG3 (("\n----------Step k: %d out of n_col2: %d\n", k, n_col2)) ;
-	}
-	debug_step++ ;
-	debug_deg_lists (n_row, n_col, Row, Col, head,
-		min_score, n_col2-k, max_deg) ;
-	debug_matrix (n_row, n_col, Row, Col, A) ;
-#endif /* NDEBUG */
-
-	/* === Select pivot column, and order it ============================ */
-
-	/* make sure degree list isn't empty */
-	ASSERT (min_score >= 0) ;
-	ASSERT (min_score <= n_col) ;
-	ASSERT (head [min_score] >= EMPTY) ;
-
-#ifndef NDEBUG
-	for (debug_d = 0 ; debug_d < min_score ; debug_d++)
-	{
-	    ASSERT (head [debug_d] == EMPTY) ;
-	}
-#endif /* NDEBUG */
-
-	/* get pivot column from head of minimum degree list */
-	while (head [min_score] == EMPTY && min_score < n_col)
-	{
-	    min_score++ ;
-	}
-	pivot_col = head [min_score] ;
-	ASSERT (pivot_col >= 0 && pivot_col <= n_col) ;
-	next_col = Col [pivot_col].shared4.degree_next ;
-	head [min_score] = next_col ;
-	if (next_col != EMPTY)
-	{
-	    Col [next_col].shared3.prev = EMPTY ;
-	}
-
-	ASSERT (COL_IS_ALIVE (pivot_col)) ;
-	DEBUG3 (("Pivot col: %d\n", pivot_col)) ;
-
-	/* remember score for defrag check */
-	pivot_col_score = Col [pivot_col].shared2.score ;
-
-	/* the pivot column is the kth column in the pivot order */
-	Col [pivot_col].shared2.order = k ;
-
-	/* increment order count by column thickness */
-	pivot_col_thickness = Col [pivot_col].shared1.thickness ;
-	k += pivot_col_thickness ;
-	ASSERT (pivot_col_thickness > 0) ;
-
-	/* === Garbage_collection, if necessary ============================= */
-
-	needed_memory = MIN (pivot_col_score, n_col - k) ;
-	if (pfree + needed_memory >= Alen)
-	{
-	    pfree = garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ;
-	    ngarbage++ ;
-	    /* after garbage collection we will have enough */
-	    ASSERT (pfree + needed_memory < Alen) ;
-	    /* garbage collection has wiped out the Row[].shared2.mark array */
-	    tag_mark = clear_mark (n_row, Row) ;
-
-#ifndef NDEBUG
-	    debug_matrix (n_row, n_col, Row, Col, A) ;
-#endif /* NDEBUG */
-	}
-
-	/* === Compute pivot row pattern ==================================== */
-
-	/* get starting location for this new merged row */
-	pivot_row_start = pfree ;
-
-	/* initialize new row counts to zero */
-	pivot_row_degree = 0 ;
-
-	/* tag pivot column as having been visited so it isn't included */
-	/* in merged pivot row */
-	Col [pivot_col].shared1.thickness = -pivot_col_thickness ;
-
-	/* pivot row is the union of all rows in the pivot column pattern */
-	cp = &A [Col [pivot_col].start] ;
-	cp_end = cp + Col [pivot_col].length ;
-	while (cp < cp_end)
-	{
-	    /* get a row */
-	    row = *cp++ ;
-	    DEBUG4 (("Pivot col pattern %d %d\n", ROW_IS_ALIVE (row), row)) ;
-	    /* skip if row is dead */
-	    if (ROW_IS_DEAD (row))
-	    {
-		continue ;
-	    }
-	    rp = &A [Row [row].start] ;
-	    rp_end = rp + Row [row].length ;
-	    while (rp < rp_end)
-	    {
-		/* get a column */
-		col = *rp++ ;
-		/* add the column, if alive and untagged */
-		col_thickness = Col [col].shared1.thickness ;
-		if (col_thickness > 0 && COL_IS_ALIVE (col))
-		{
-		    /* tag column in pivot row */
-		    Col [col].shared1.thickness = -col_thickness ;
-		    ASSERT (pfree < Alen) ;
-		    /* place column in pivot row */
-		    A [pfree++] = col ;
-		    pivot_row_degree += col_thickness ;
-		}
-	    }
-	}
-
-	/* clear tag on pivot column */
-	Col [pivot_col].shared1.thickness = pivot_col_thickness ;
-	max_deg = MAX (max_deg, pivot_row_degree) ;
-
-#ifndef NDEBUG
-	DEBUG3 (("check2\n")) ;
-	debug_mark (n_row, Row, tag_mark, max_mark) ;
-#endif /* NDEBUG */
-
-	/* === Kill all rows used to construct pivot row ==================== */
-
-	/* also kill pivot row, temporarily */
-	cp = &A [Col [pivot_col].start] ;
-	cp_end = cp + Col [pivot_col].length ;
-	while (cp < cp_end)
-	{
-	    /* may be killing an already dead row */
-	    row = *cp++ ;
-	    DEBUG3 (("Kill row in pivot col: %d\n", row)) ;
-	    KILL_ROW (row) ;
-	}
-
-	/* === Select a row index to use as the new pivot row =============== */
-
-	pivot_row_length = pfree - pivot_row_start ;
-	if (pivot_row_length > 0)
-	{
-	    /* pick the "pivot" row arbitrarily (first row in col) */
-	    pivot_row = A [Col [pivot_col].start] ;
-	    DEBUG3 (("Pivotal row is %d\n", pivot_row)) ;
-	}
-	else
-	{
-	    /* there is no pivot row, since it is of zero length */
-	    pivot_row = EMPTY ;
-	    ASSERT (pivot_row_length == 0) ;
-	}
-	ASSERT (Col [pivot_col].length > 0 || pivot_row_length == 0) ;
-
-	/* === Approximate degree computation =============================== */
-
-	/* Here begins the computation of the approximate degree.  The column */
-	/* score is the sum of the pivot row "length", plus the size of the */
-	/* set differences of each row in the column minus the pattern of the */
-	/* pivot row itself.  The column ("thickness") itself is also */
-	/* excluded from the column score (we thus use an approximate */
-	/* external degree). */
-
-	/* The time taken by the following code (compute set differences, and */
-	/* add them up) is proportional to the size of the data structure */
-	/* being scanned - that is, the sum of the sizes of each column in */
-	/* the pivot row.  Thus, the amortized time to compute a column score */
-	/* is proportional to the size of that column (where size, in this */
-	/* context, is the column "length", or the number of row indices */
-	/* in that column).  The number of row indices in a column is */
-	/* monotonically non-decreasing, from the length of the original */
-	/* column on input to colamd. */
-
-	/* === Compute set differences ====================================== */
-
-	DEBUG3 (("** Computing set differences phase. **\n")) ;
-
-	/* pivot row is currently dead - it will be revived later. */
-
-	DEBUG3 (("Pivot row: ")) ;
-	/* for each column in pivot row */
-	rp = &A [pivot_row_start] ;
-	rp_end = rp + pivot_row_length ;
-	while (rp < rp_end)
-	{
-	    col = *rp++ ;
-	    ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ;
-	    DEBUG3 (("Col: %d\n", col)) ;
-
-	    /* clear tags used to construct pivot row pattern */
-	    col_thickness = -Col [col].shared1.thickness ;
-	    ASSERT (col_thickness > 0) ;
-	    Col [col].shared1.thickness = col_thickness ;
-
-	    /* === Remove column from degree list =========================== */
-
-	    cur_score = Col [col].shared2.score ;
-	    prev_col = Col [col].shared3.prev ;
-	    next_col = Col [col].shared4.degree_next ;
-	    ASSERT (cur_score >= 0) ;
-	    ASSERT (cur_score <= n_col) ;
-	    ASSERT (cur_score >= EMPTY) ;
-	    if (prev_col == EMPTY)
-	    {
-		head [cur_score] = next_col ;
-	    }
-	    else
-	    {
-		Col [prev_col].shared4.degree_next = next_col ;
-	    }
-	    if (next_col != EMPTY)
-	    {
-		Col [next_col].shared3.prev = prev_col ;
-	    }
-
-	    /* === Scan the column ========================================== */
-
-	    cp = &A [Col [col].start] ;
-	    cp_end = cp + Col [col].length ;
-	    while (cp < cp_end)
-	    {
-		/* get a row */
-		row = *cp++ ;
-		row_mark = Row [row].shared2.mark ;
-		/* skip if dead */
-		if (ROW_IS_MARKED_DEAD (row_mark))
-		{
-		    continue ;
-		}
-		ASSERT (row != pivot_row) ;
-		set_difference = row_mark - tag_mark ;
-		/* check if the row has been seen yet */
-		if (set_difference < 0)
-		{
-		    ASSERT (Row [row].shared1.degree <= max_deg) ;
-		    set_difference = Row [row].shared1.degree ;
-		}
-		/* subtract column thickness from this row's set difference */
-		set_difference -= col_thickness ;
-		ASSERT (set_difference >= 0) ;
-		/* absorb this row if the set difference becomes zero */
-		if (set_difference == 0)
-		{
-		    DEBUG3 (("aggressive absorption. Row: %d\n", row)) ;
-		    KILL_ROW (row) ;
-		}
-		else
-		{
-		    /* save the new mark */
-		    Row [row].shared2.mark = set_difference + tag_mark ;
-		}
-	    }
-	}
-
-#ifndef NDEBUG
-	debug_deg_lists (n_row, n_col, Row, Col, head,
-		min_score, n_col2-k-pivot_row_degree, max_deg) ;
-#endif /* NDEBUG */
-
-	/* === Add up set differences for each column ======================= */
-
-	DEBUG3 (("** Adding set differences phase. **\n")) ;
-
-	/* for each column in pivot row */
-	rp = &A [pivot_row_start] ;
-	rp_end = rp + pivot_row_length ;
-	while (rp < rp_end)
-	{
-	    /* get a column */
-	    col = *rp++ ;
-	    ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ;
-	    hash = 0 ;
-	    cur_score = 0 ;
-	    cp = &A [Col [col].start] ;
-	    /* compact the column */
-	    new_cp = cp ;
-	    cp_end = cp + Col [col].length ;
-
-	    DEBUG4 (("Adding set diffs for Col: %d.\n", col)) ;
-
-	    while (cp < cp_end)
-	    {
-		/* get a row */
-		row = *cp++ ;
-		ASSERT(row >= 0 && row < n_row) ;
-		row_mark = Row [row].shared2.mark ;
-		/* skip if dead */
-		if (ROW_IS_MARKED_DEAD (row_mark))
-		{
-		    continue ;
-		}
-		ASSERT (row_mark > tag_mark) ;
-		/* compact the column */
-		*new_cp++ = row ;
-		/* compute hash function */
-		hash += row ;
-		/* add set difference */
-		cur_score += row_mark - tag_mark ;
-		/* integer overflow... */
-		cur_score = MIN (cur_score, n_col) ;
-	    }
-
-	    /* recompute the column's length */
-	    Col [col].length = (int) (new_cp - &A [Col [col].start]) ;
-
-	    /* === Further mass elimination ================================= */
-
-	    if (Col [col].length == 0)
-	    {
-		DEBUG4 (("further mass elimination. Col: %d\n", col)) ;
-		/* nothing left but the pivot row in this column */
-		KILL_PRINCIPAL_COL (col) ;
-		pivot_row_degree -= Col [col].shared1.thickness ;
-		ASSERT (pivot_row_degree >= 0) ;
-		/* order it */
-		Col [col].shared2.order = k ;
-		/* increment order count by column thickness */
-		k += Col [col].shared1.thickness ;
-	    }
-	    else
-	    {
-		/* === Prepare for supercolumn detection ==================== */
-
-		DEBUG4 (("Preparing supercol detection for Col: %d.\n", col)) ;
-
-		/* save score so far */
-		Col [col].shared2.score = cur_score ;
-
-		/* add column to hash table, for supercolumn detection */
-		hash %= n_col + 1 ;
-
-		DEBUG4 ((" Hash = %d, n_col = %d.\n", hash, n_col)) ;
-		ASSERT (hash <= n_col) ;
-
-		head_column = head [hash] ;
-		if (head_column > EMPTY)
-		{
-		    /* degree list "hash" is non-empty, use prev (shared3) of */
-		    /* first column in degree list as head of hash bucket */
-		    first_col = Col [head_column].shared3.headhash ;
-		    Col [head_column].shared3.headhash = col ;
-		}
-		else
-		{
-		    /* degree list "hash" is empty, use head as hash bucket */
-		    first_col = - (head_column + 2) ;
-		    head [hash] = - (col + 2) ;
-		}
-		Col [col].shared4.hash_next = first_col ;
-
-		/* save hash function in Col [col].shared3.hash */
-		Col [col].shared3.hash = (int) hash ;
-		ASSERT (COL_IS_ALIVE (col)) ;
-	    }
-	}
-
-	/* The approximate external column degree is now computed.  */
-
-	/* === Supercolumn detection ======================================== */
-
-	DEBUG3 (("** Supercolumn detection phase. **\n")) ;
-
-	detect_super_cols (
-
-#ifndef NDEBUG
-		n_col, Row,
-#endif /* NDEBUG */
-
-		Col, A, head, pivot_row_start, pivot_row_length) ;
-
-	/* === Kill the pivotal column ====================================== */
-
-	KILL_PRINCIPAL_COL (pivot_col) ;
-
-	/* === Clear mark =================================================== */
-
-	tag_mark += (max_deg + 1) ;
-	if (tag_mark >= max_mark)
-	{
-	    DEBUG2 (("clearing tag_mark\n")) ;
-	    tag_mark = clear_mark (n_row, Row) ;
-	}
-
-#ifndef NDEBUG
-	DEBUG3 (("check3\n")) ;
-	debug_mark (n_row, Row, tag_mark, max_mark) ;
-#endif /* NDEBUG */
-
-	/* === Finalize the new pivot row, and column scores ================ */
-
-	DEBUG3 (("** Finalize scores phase. **\n")) ;
-
-	/* for each column in pivot row */
-	rp = &A [pivot_row_start] ;
-	/* compact the pivot row */
-	new_rp = rp ;
-	rp_end = rp + pivot_row_length ;
-	while (rp < rp_end)
-	{
-	    col = *rp++ ;
-	    /* skip dead columns */
-	    if (COL_IS_DEAD (col))
-	    {
-		continue ;
-	    }
-	    *new_rp++ = col ;
-	    /* add new pivot row to column */
-	    A [Col [col].start + (Col [col].length++)] = pivot_row ;
-
-	    /* retrieve score so far and add on pivot row's degree. */
-	    /* (we wait until here for this in case the pivot */
-	    /* row's degree was reduced due to mass elimination). */
-	    cur_score = Col [col].shared2.score + pivot_row_degree ;
-
-	    /* calculate the max possible score as the number of */
-	    /* external columns minus the 'k' value minus the */
-	    /* columns thickness */
-	    max_score = n_col - k - Col [col].shared1.thickness ;
-
-	    /* make the score the external degree of the union-of-rows */
-	    cur_score -= Col [col].shared1.thickness ;
-
-	    /* make sure score is less or equal than the max score */
-	    cur_score = MIN (cur_score, max_score) ;
-	    ASSERT (cur_score >= 0) ;
-
-	    /* store updated score */
-	    Col [col].shared2.score = cur_score ;
-
-	    /* === Place column back in degree list ========================= */
-
-	    ASSERT (min_score >= 0) ;
-	    ASSERT (min_score <= n_col) ;
-	    ASSERT (cur_score >= 0) ;
-	    ASSERT (cur_score <= n_col) ;
-	    ASSERT (head [cur_score] >= EMPTY) ;
-	    next_col = head [cur_score] ;
-	    Col [col].shared4.degree_next = next_col ;
-	    Col [col].shared3.prev = EMPTY ;
-	    if (next_col != EMPTY)
-	    {
-		Col [next_col].shared3.prev = col ;
-	    }
-	    head [cur_score] = col ;
-
-	    /* see if this score is less than current min */
-	    min_score = MIN (min_score, cur_score) ;
-
-	}
-
-#ifndef NDEBUG
-	debug_deg_lists (n_row, n_col, Row, Col, head,
-		min_score, n_col2-k, max_deg) ;
-#endif /* NDEBUG */
-
-	/* === Resurrect the new pivot row ================================== */
-
-	if (pivot_row_degree > 0)
-	{
-	    /* update pivot row length to reflect any cols that were killed */
-	    /* during super-col detection and mass elimination */
-	    Row [pivot_row].start  = pivot_row_start ;
-	    Row [pivot_row].length = (int) (new_rp - &A[pivot_row_start]) ;
-	    Row [pivot_row].shared1.degree = pivot_row_degree ;
-	    Row [pivot_row].shared2.mark = 0 ;
-	    /* pivot row is no longer dead */
-	}
-    }
-
-    /* === All principal columns have now been ordered ====================== */
-
-    return (ngarbage) ;
-}
-
-
-/* ========================================================================== */
-/* === order_children ======================================================= */
-/* ========================================================================== */
-
-/*
-    The find_ordering routine has ordered all of the principal columns (the
-    representatives of the supercolumns).  The non-principal columns have not
-    yet been ordered.  This routine orders those columns by walking up the
-    parent tree (a column is a child of the column which absorbed it).  The
-    final permutation vector is then placed in p [0 ... n_col-1], with p [0]
-    being the first column, and p [n_col-1] being the last.  It doesn't look
-    like it at first glance, but be assured that this routine takes time linear
-    in the number of columns.  Although not immediately obvious, the time
-    taken by this routine is O (n_col), that is, linear in the number of
-    columns.  Not user-callable.
-*/
-
-PRIVATE void order_children
-(
-    /* === Parameters ======================================================= */
-
-    int n_col,			/* number of columns of A */
-    Colamd_Col Col [],		/* of size n_col+1 */
-    int p []			/* p [0 ... n_col-1] is the column permutation*/
-)
-{
-    /* === Local variables ================================================== */
-
-    int i ;			/* loop counter for all columns */
-    int c ;			/* column index */
-    int parent ;		/* index of column's parent */
-    int order ;			/* column's order */
-
-    /* === Order each non-principal column ================================== */
-
-    for (i = 0 ; i < n_col ; i++)
-    {
-	/* find an un-ordered non-principal column */
-	ASSERT (COL_IS_DEAD (i)) ;
-	if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == EMPTY)
-	{
-	    parent = i ;
-	    /* once found, find its principal parent */
-	    do
-	    {
-		parent = Col [parent].shared1.parent ;
-	    } while (!COL_IS_DEAD_PRINCIPAL (parent)) ;
-
-	    /* now, order all un-ordered non-principal columns along path */
-	    /* to this parent.  collapse tree at the same time */
-	    c = i ;
-	    /* get order of parent */
-	    order = Col [parent].shared2.order ;
-
-	    do
-	    {
-		ASSERT (Col [c].shared2.order == EMPTY) ;
-
-		/* order this column */
-		Col [c].shared2.order = order++ ;
-		/* collaps tree */
-		Col [c].shared1.parent = parent ;
-
-		/* get immediate parent of this column */
-		c = Col [c].shared1.parent ;
-
-		/* continue until we hit an ordered column.  There are */
-		/* guarranteed not to be anymore unordered columns */
-		/* above an ordered column */
-	    } while (Col [c].shared2.order == EMPTY) ;
-
-	    /* re-order the super_col parent to largest order for this group */
-	    Col [parent].shared2.order = order ;
-	}
-    }
-
-    /* === Generate the permutation ========================================= */
-
-    for (c = 0 ; c < n_col ; c++)
-    {
-	p [Col [c].shared2.order] = c ;
-    }
-}
-
-
-/* ========================================================================== */
-/* === detect_super_cols ==================================================== */
-/* ========================================================================== */
-
-/*
-    Detects supercolumns by finding matches between columns in the hash buckets.
-    Check amongst columns in the set A [row_start ... row_start + row_length-1].
-    The columns under consideration are currently *not* in the degree lists,
-    and have already been placed in the hash buckets.
-
-    The hash bucket for columns whose hash function is equal to h is stored
-    as follows:
-
-	if head [h] is >= 0, then head [h] contains a degree list, so:
-
-		head [h] is the first column in degree bucket h.
-		Col [head [h]].headhash gives the first column in hash bucket h.
-
-	otherwise, the degree list is empty, and:
-
-		-(head [h] + 2) is the first column in hash bucket h.
-
-    For a column c in a hash bucket, Col [c].shared3.prev is NOT a "previous
-    column" pointer.  Col [c].shared3.hash is used instead as the hash number
-    for that column.  The value of Col [c].shared4.hash_next is the next column
-    in the same hash bucket.
-
-    Assuming no, or "few" hash collisions, the time taken by this routine is
-    linear in the sum of the sizes (lengths) of each column whose score has
-    just been computed in the approximate degree computation.
-    Not user-callable.
-*/
-
-PRIVATE void detect_super_cols
-(
-    /* === Parameters ======================================================= */
-
-#ifndef NDEBUG
-    /* these two parameters are only needed when debugging is enabled: */
-    int n_col,			/* number of columns of A */
-    Colamd_Row Row [],		/* of size n_row+1 */
-#endif /* NDEBUG */
-
-    Colamd_Col Col [],		/* of size n_col+1 */
-    int A [],			/* row indices of A */
-    int head [],		/* head of degree lists and hash buckets */
-    int row_start,		/* pointer to set of columns to check */
-    int row_length		/* number of columns to check */
-)
-{
-    /* === Local variables ================================================== */
-
-    int hash ;			/* hash value for a column */
-    int *rp ;			/* pointer to a row */
-    int c ;			/* a column index */
-    int super_c ;		/* column index of the column to absorb into */
-    int *cp1 ;			/* column pointer for column super_c */
-    int *cp2 ;			/* column pointer for column c */
-    int length ;		/* length of column super_c */
-    int prev_c ;		/* column preceding c in hash bucket */
-    int i ;			/* loop counter */
-    int *rp_end ;		/* pointer to the end of the row */
-    int col ;			/* a column index in the row to check */
-    int head_column ;		/* first column in hash bucket or degree list */
-    int first_col ;		/* first column in hash bucket */
-
-    /* === Consider each column in the row ================================== */
-
-    rp = &A [row_start] ;
-    rp_end = rp + row_length ;
-    while (rp < rp_end)
-    {
-	col = *rp++ ;
-	if (COL_IS_DEAD (col))
-	{
-	    continue ;
-	}
-
-	/* get hash number for this column */
-	hash = Col [col].shared3.hash ;
-	ASSERT (hash <= n_col) ;
-
-	/* === Get the first column in this hash bucket ===================== */
-
-	head_column = head [hash] ;
-	if (head_column > EMPTY)
-	{
-	    first_col = Col [head_column].shared3.headhash ;
-	}
-	else
-	{
-	    first_col = - (head_column + 2) ;
-	}
-
-	/* === Consider each column in the hash bucket ====================== */
-
-	for (super_c = first_col ; super_c != EMPTY ;
-	    super_c = Col [super_c].shared4.hash_next)
-	{
-	    ASSERT (COL_IS_ALIVE (super_c)) ;
-	    ASSERT (Col [super_c].shared3.hash == hash) ;
-	    length = Col [super_c].length ;
-
-	    /* prev_c is the column preceding column c in the hash bucket */
-	    prev_c = super_c ;
-
-	    /* === Compare super_c with all columns after it ================ */
-
-	    for (c = Col [super_c].shared4.hash_next ;
-		 c != EMPTY ; c = Col [c].shared4.hash_next)
-	    {
-		ASSERT (c != super_c) ;
-		ASSERT (COL_IS_ALIVE (c)) ;
-		ASSERT (Col [c].shared3.hash == hash) ;
-
-		/* not identical if lengths or scores are different */
-		if (Col [c].length != length ||
-		    Col [c].shared2.score != Col [super_c].shared2.score)
-		{
-		    prev_c = c ;
-		    continue ;
-		}
-
-		/* compare the two columns */
-		cp1 = &A [Col [super_c].start] ;
-		cp2 = &A [Col [c].start] ;
-
-		for (i = 0 ; i < length ; i++)
-		{
-		    /* the columns are "clean" (no dead rows) */
-		    ASSERT (ROW_IS_ALIVE (*cp1))  ;
-		    ASSERT (ROW_IS_ALIVE (*cp2))  ;
-		    /* row indices will same order for both supercols, */
-		    /* no gather scatter nessasary */
-		    if (*cp1++ != *cp2++)
-		    {
-			break ;
-		    }
-		}
-
-		/* the two columns are different if the for-loop "broke" */
-		if (i != length)
-		{
-		    prev_c = c ;
-		    continue ;
-		}
-
-		/* === Got it!  two columns are identical =================== */
-
-		ASSERT (Col [c].shared2.score == Col [super_c].shared2.score) ;
-
-		Col [super_c].shared1.thickness += Col [c].shared1.thickness ;
-		Col [c].shared1.parent = super_c ;
-		KILL_NON_PRINCIPAL_COL (c) ;
-		/* order c later, in order_children() */
-		Col [c].shared2.order = EMPTY ;
-		/* remove c from hash bucket */
-		Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ;
-	    }
-	}
-
-	/* === Empty this hash bucket ======================================= */
-
-	if (head_column > EMPTY)
-	{
-	    /* corresponding degree list "hash" is not empty */
-	    Col [head_column].shared3.headhash = EMPTY ;
-	}
-	else
-	{
-	    /* corresponding degree list "hash" is empty */
-	    head [hash] = EMPTY ;
-	}
-    }
-}
-
-
-/* ========================================================================== */
-/* === garbage_collection =================================================== */
-/* ========================================================================== */
-
-/*
-    Defragments and compacts columns and rows in the workspace A.  Used when
-    all avaliable memory has been used while performing row merging.  Returns
-    the index of the first free position in A, after garbage collection.  The
-    time taken by this routine is linear is the size of the array A, which is
-    itself linear in the number of nonzeros in the input matrix.
-    Not user-callable.
-*/
-
-PRIVATE int garbage_collection  /* returns the new value of pfree */
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,			/* number of rows */
-    int n_col,			/* number of columns */
-    Colamd_Row Row [],		/* row info */
-    Colamd_Col Col [],		/* column info */
-    int A [],			/* A [0 ... Alen-1] holds the matrix */
-    int *pfree			/* &A [0] ... pfree is in use */
-)
-{
-    /* === Local variables ================================================== */
-
-    int *psrc ;			/* source pointer */
-    int *pdest ;		/* destination pointer */
-    int j ;			/* counter */
-    int r ;			/* a row index */
-    int c ;			/* a column index */
-    int length ;		/* length of a row or column */
-
-#ifndef NDEBUG
-    int debug_rows ;
-    DEBUG2 (("Defrag..\n")) ;
-    for (psrc = &A[0] ; psrc < pfree ; psrc++) ASSERT (*psrc >= 0) ;
-    debug_rows = 0 ;
-#endif /* NDEBUG */
-
-    /* === Defragment the columns =========================================== */
-
-    pdest = &A[0] ;
-    for (c = 0 ; c < n_col ; c++)
-    {
-	if (COL_IS_ALIVE (c))
-	{
-	    psrc = &A [Col [c].start] ;
-
-	    /* move and compact the column */
-	    ASSERT (pdest <= psrc) ;
-	    Col [c].start = (int) (pdest - &A [0]) ;
-	    length = Col [c].length ;
-	    for (j = 0 ; j < length ; j++)
-	    {
-		r = *psrc++ ;
-		if (ROW_IS_ALIVE (r))
-		{
-		    *pdest++ = r ;
-		}
-	    }
-	    Col [c].length = (int) (pdest - &A [Col [c].start]) ;
-	}
-    }
-
-    /* === Prepare to defragment the rows =================================== */
-
-    for (r = 0 ; r < n_row ; r++)
-    {
-	if (ROW_IS_ALIVE (r))
-	{
-	    if (Row [r].length == 0)
-	    {
-		/* this row is of zero length.  cannot compact it, so kill it */
-		DEBUG3 (("Defrag row kill\n")) ;
-		KILL_ROW (r) ;
-	    }
-	    else
-	    {
-		/* save first column index in Row [r].shared2.first_column */
-		psrc = &A [Row [r].start] ;
-		Row [r].shared2.first_column = *psrc ;
-		ASSERT (ROW_IS_ALIVE (r)) ;
-		/* flag the start of the row with the one's complement of row */
-		*psrc = ONES_COMPLEMENT (r) ;
-
-#ifndef NDEBUG
-		debug_rows++ ;
-#endif /* NDEBUG */
-
-	    }
-	}
-    }
-
-    /* === Defragment the rows ============================================== */
-
-    psrc = pdest ;
-    while (psrc < pfree)
-    {
-	/* find a negative number ... the start of a row */
-	if (*psrc++ < 0)
-	{
-	    psrc-- ;
-	    /* get the row index */
-	    r = ONES_COMPLEMENT (*psrc) ;
-	    ASSERT (r >= 0 && r < n_row) ;
-	    /* restore first column index */
-	    *psrc = Row [r].shared2.first_column ;
-	    ASSERT (ROW_IS_ALIVE (r)) ;
-
-	    /* move and compact the row */
-	    ASSERT (pdest <= psrc) ;
-	    Row [r].start = (int) (pdest - &A [0]) ;
-	    length = Row [r].length ;
-	    for (j = 0 ; j < length ; j++)
-	    {
-		c = *psrc++ ;
-		if (COL_IS_ALIVE (c))
-		{
-		    *pdest++ = c ;
-		}
-	    }
-	    Row [r].length = (int) (pdest - &A [Row [r].start]) ;
-
-#ifndef NDEBUG
-	    debug_rows-- ;
-#endif /* NDEBUG */
-
-	}
-    }
-    /* ensure we found all the rows */
-    ASSERT (debug_rows == 0) ;
-
-    /* === Return the new value of pfree ==================================== */
-
-    return ((int) (pdest - &A [0])) ;
-}
-
-
-/* ========================================================================== */
-/* === clear_mark =========================================================== */
-/* ========================================================================== */
-
-/*
-    Clears the Row [].shared2.mark array, and returns the new tag_mark.
-    Return value is the new tag_mark.  Not user-callable.
-*/
-
-PRIVATE int clear_mark	/* return the new value for tag_mark */
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,		/* number of rows in A */
-    Colamd_Row Row []	/* Row [0 ... n_row-1].shared2.mark is set to zero */
-)
-{
-    /* === Local variables ================================================== */
-
-    int r ;
-
-    for (r = 0 ; r < n_row ; r++)
-    {
-	if (ROW_IS_ALIVE (r))
-	{
-	    Row [r].shared2.mark = 0 ;
-	}
-    }
-    return (1) ;
-}
-
-
-/* ========================================================================== */
-/* === print_report ========================================================= */
-/* ========================================================================== */
-
-PRIVATE void print_report
-(
-    const char *method,
-    int stats [COLAMD_STATS]
-)
-{
-
-    int i1, i2, i3 ;
-
-    if (!stats)
-    {
-    	PRINTF ("%s: No statistics available.\n", method) ;
-	return ;
-    }
-
-    i1 = stats [COLAMD_INFO1] ;
-    i2 = stats [COLAMD_INFO2] ;
-    i3 = stats [COLAMD_INFO3] ;
-
-    if (stats [COLAMD_STATUS] >= 0)
-    {
-    	PRINTF ("%s: OK.  ", method) ;
-    }
-    else
-    {
-    	PRINTF ("%s: ERROR.  ", method) ;
-    }
-
-    switch (stats [COLAMD_STATUS])
-    {
-
-	case COLAMD_OK_BUT_JUMBLED:
-
-	    PRINTF ("Matrix has unsorted or duplicate row indices.\n") ;
-
-	    PRINTF ("%s: number of duplicate or out-of-order row indices: %d\n",
-	    method, i3) ;
-
-	    PRINTF ("%s: last seen duplicate or out-of-order row index:   %d\n",
-	    method, INDEX (i2)) ;
-
-	    PRINTF ("%s: last seen in column:                             %d",
-	    method, INDEX (i1)) ;
-
-	    /* no break - fall through to next case instead */
-
-	case COLAMD_OK:
-
-	    PRINTF ("\n") ;
-
- 	    PRINTF ("%s: number of dense or empty rows ignored:           %d\n",
-	    method, stats [COLAMD_DENSE_ROW]) ;
-
-	    PRINTF ("%s: number of dense or empty columns ignored:        %d\n",
-	    method, stats [COLAMD_DENSE_COL]) ;
-
-	    PRINTF ("%s: number of garbage collections performed:         %d\n",
-	    method, stats [COLAMD_DEFRAG_COUNT]) ;
-	    break ;
-
-	case COLAMD_ERROR_A_not_present:
-
-	    PRINTF ("Array A (row indices of matrix) not present.\n") ;
-	    break ;
-
-	case COLAMD_ERROR_p_not_present:
-
-	    PRINTF ("Array p (column pointers for matrix) not present.\n") ;
-	    break ;
-
-	case COLAMD_ERROR_nrow_negative:
-
-	    PRINTF ("Invalid number of rows (%d).\n", i1) ;
-	    break ;
-
-	case COLAMD_ERROR_ncol_negative:
-
-	    PRINTF ("Invalid number of columns (%d).\n", i1) ;
-	    break ;
-
-	case COLAMD_ERROR_nnz_negative:
-
-	    PRINTF ("Invalid number of nonzero entries (%d).\n", i1) ;
-	    break ;
-
-	case COLAMD_ERROR_p0_nonzero:
-
-	    PRINTF ("Invalid column pointer, p [0] = %d, must be zero.\n", i1) ;
-	    break ;
-
-	case COLAMD_ERROR_A_too_small:
-
-	    PRINTF ("Array A too small.\n") ;
-	    PRINTF ("        Need Alen >= %d, but given only Alen = %d.\n",
-	    i1, i2) ;
-	    break ;
-
-	case COLAMD_ERROR_col_length_negative:
-
-	    PRINTF
-	    ("Column %d has a negative number of nonzero entries (%d).\n",
-	    INDEX (i1), i2) ;
-	    break ;
-
-	case COLAMD_ERROR_row_index_out_of_bounds:
-
-	    PRINTF
-	    ("Row index (row %d) out of bounds (%d to %d) in column %d.\n",
-	    INDEX (i2), INDEX (0), INDEX (i3-1), INDEX (i1)) ;
-	    break ;
-
-	case COLAMD_ERROR_out_of_memory:
-
-	    PRINTF ("Out of memory.\n") ;
-	    break ;
-
-	case COLAMD_ERROR_internal_error:
-
-	    /* if this happens, there is a bug in the code */
-	    PRINTF
-	    ("Internal error! Please contact authors (davis@cise.ufl.edu).\n") ;
-	    break ;
-    }
-}
-
-
-
-
-/* ========================================================================== */
-/* === colamd debugging routines ============================================ */
-/* ========================================================================== */
-
-/* When debugging is disabled, the remainder of this file is ignored. */
-
-#ifndef NDEBUG
-
-
-/* ========================================================================== */
-/* === debug_structures ===================================================== */
-/* ========================================================================== */
-
-/*
-    At this point, all empty rows and columns are dead.  All live columns
-    are "clean" (containing no dead rows) and simplicial (no supercolumns
-    yet).  Rows may contain dead columns, but all live rows contain at
-    least one live column.
-*/
-
-PRIVATE void debug_structures
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,
-    int n_col,
-    Colamd_Row Row [],
-    Colamd_Col Col [],
-    int A [],
-    int n_col2
-)
-{
-    /* === Local variables ================================================== */
-
-    int i ;
-    int c ;
-    int *cp ;
-    int *cp_end ;
-    int len ;
-    int score ;
-    int r ;
-    int *rp ;
-    int *rp_end ;
-    int deg ;
-
-    /* === Check A, Row, and Col ============================================ */
-
-    for (c = 0 ; c < n_col ; c++)
-    {
-	if (COL_IS_ALIVE (c))
-	{
-	    len = Col [c].length ;
-	    score = Col [c].shared2.score ;
-	    DEBUG4 (("initial live col %5d %5d %5d\n", c, len, score)) ;
-	    ASSERT (len > 0) ;
-	    ASSERT (score >= 0) ;
-	    ASSERT (Col [c].shared1.thickness == 1) ;
-	    cp = &A [Col [c].start] ;
-	    cp_end = cp + len ;
-	    while (cp < cp_end)
-	    {
-		r = *cp++ ;
-		ASSERT (ROW_IS_ALIVE (r)) ;
-	    }
-	}
-	else
-	{
-	    i = Col [c].shared2.order ;
-	    ASSERT (i >= n_col2 && i < n_col) ;
-	}
-    }
-
-    for (r = 0 ; r < n_row ; r++)
-    {
-	if (ROW_IS_ALIVE (r))
-	{
-	    i = 0 ;
-	    len = Row [r].length ;
-	    deg = Row [r].shared1.degree ;
-	    ASSERT (len > 0) ;
-	    ASSERT (deg > 0) ;
-	    rp = &A [Row [r].start] ;
-	    rp_end = rp + len ;
-	    while (rp < rp_end)
-	    {
-		c = *rp++ ;
-		if (COL_IS_ALIVE (c))
-		{
-		    i++ ;
-		}
-	    }
-	    ASSERT (i > 0) ;
-	}
-    }
-}
-
-
-/* ========================================================================== */
-/* === debug_deg_lists ====================================================== */
-/* ========================================================================== */
-
-/*
-    Prints the contents of the degree lists.  Counts the number of columns
-    in the degree list and compares it to the total it should have.  Also
-    checks the row degrees.
-*/
-
-PRIVATE void debug_deg_lists
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,
-    int n_col,
-    Colamd_Row Row [],
-    Colamd_Col Col [],
-    int head [],
-    int min_score,
-    int should,
-    int max_deg
-)
-{
-    /* === Local variables ================================================== */
-
-    int deg ;
-    int col ;
-    int have ;
-    int row ;
-
-    /* === Check the degree lists =========================================== */
-
-    if (n_col > 10000 && colamd_debug <= 0)
-    {
-	return ;
-    }
-    have = 0 ;
-    DEBUG4 (("Degree lists: %d\n", min_score)) ;
-    for (deg = 0 ; deg <= n_col ; deg++)
-    {
-	col = head [deg] ;
-	if (col == EMPTY)
-	{
-	    continue ;
-	}
-	DEBUG4 (("%d:", deg)) ;
-	while (col != EMPTY)
-	{
-	    DEBUG4 ((" %d", col)) ;
-	    have += Col [col].shared1.thickness ;
-	    ASSERT (COL_IS_ALIVE (col)) ;
-	    col = Col [col].shared4.degree_next ;
-	}
-	DEBUG4 (("\n")) ;
-    }
-    DEBUG4 (("should %d have %d\n", should, have)) ;
-    ASSERT (should == have) ;
-
-    /* === Check the row degrees ============================================ */
-
-    if (n_row > 10000 && colamd_debug <= 0)
-    {
-	return ;
-    }
-    for (row = 0 ; row < n_row ; row++)
-    {
-	if (ROW_IS_ALIVE (row))
-	{
-	    ASSERT (Row [row].shared1.degree <= max_deg) ;
-	}
-    }
-}
-
-
-/* ========================================================================== */
-/* === debug_mark =========================================================== */
-/* ========================================================================== */
-
-/*
-    Ensures that the tag_mark is less that the maximum and also ensures that
-    each entry in the mark array is less than the tag mark.
-*/
-
-PRIVATE void debug_mark
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,
-    Colamd_Row Row [],
-    int tag_mark,
-    int max_mark
-)
-{
-    /* === Local variables ================================================== */
-
-    int r ;
-
-    /* === Check the Row marks ============================================== */
-
-    ASSERT (tag_mark > 0 && tag_mark <= max_mark) ;
-    if (n_row > 10000 && colamd_debug <= 0)
-    {
-	return ;
-    }
-    for (r = 0 ; r < n_row ; r++)
-    {
-	ASSERT (Row [r].shared2.mark < tag_mark) ;
-    }
-}
-
-
-/* ========================================================================== */
-/* === debug_matrix ========================================================= */
-/* ========================================================================== */
-
-/*
-    Prints out the contents of the columns and the rows.
-*/
-
-PRIVATE void debug_matrix
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,
-    int n_col,
-    Colamd_Row Row [],
-    Colamd_Col Col [],
-    int A []
-)
-{
-    /* === Local variables ================================================== */
-
-    int r ;
-    int c ;
-    int *rp ;
-    int *rp_end ;
-    int *cp ;
-    int *cp_end ;
-
-    /* === Dump the rows and columns of the matrix ========================== */
-
-    if (colamd_debug < 3)
-    {
-	return ;
-    }
-    DEBUG3 (("DUMP MATRIX:\n")) ;
-    for (r = 0 ; r < n_row ; r++)
-    {
-	DEBUG3 (("Row %d alive? %d\n", r, ROW_IS_ALIVE (r))) ;
-	if (ROW_IS_DEAD (r))
-	{
-	    continue ;
-	}
-	DEBUG3 (("start %d length %d degree %d\n",
-		Row [r].start, Row [r].length, Row [r].shared1.degree)) ;
-	rp = &A [Row [r].start] ;
-	rp_end = rp + Row [r].length ;
-	while (rp < rp_end)
-	{
-	    c = *rp++ ;
-	    DEBUG4 (("	%d col %d\n", COL_IS_ALIVE (c), c)) ;
-	}
-    }
-
-    for (c = 0 ; c < n_col ; c++)
-    {
-	DEBUG3 (("Col %d alive? %d\n", c, COL_IS_ALIVE (c))) ;
-	if (COL_IS_DEAD (c))
-	{
-	    continue ;
-	}
-	DEBUG3 (("start %d length %d shared1 %d shared2 %d\n",
-		Col [c].start, Col [c].length,
-		Col [c].shared1.thickness, Col [c].shared2.score)) ;
-	cp = &A [Col [c].start] ;
-	cp_end = cp + Col [c].length ;
-	while (cp < cp_end)
-	{
-	    r = *cp++ ;
-	    DEBUG4 (("	%d row %d\n", ROW_IS_ALIVE (r), r)) ;
-	}
-    }
-}
-
-PRIVATE void colamd_get_debug
-(
-    char *method
-)
-{
-    colamd_debug = 0 ;		/* no debug printing */
-
-    /* get "D" environment variable, which gives the debug printing level */
-    if (getenv ("D"))
-    {
-    	colamd_debug = atoi (getenv ("D")) ;
-    }
-
-    DEBUG0 (("%s: debug version, D = %d (THIS WILL BE SLOW!)\n",
-    	method, colamd_debug)) ;
-}
-
-#endif /* NDEBUG */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/colamd.h hypre-2.13.0/src/FEI_mv/SuperLU/SRC/colamd.h
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/colamd.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/colamd.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,246 +0,0 @@
-/* ========================================================================== */
-/* === colamd/symamd prototypes and definitions ============================= */
-/* ========================================================================== */
-
-/*
-    You must include this file (colamd.h) in any routine that uses colamd,
-    symamd, or the related macros and definitions.
-
-    Authors:
-
-	The authors of the code itself are Stefan I. Larimore and Timothy A.
-	Davis (davis@cise.ufl.edu), University of Florida.  The algorithm was
-	developed in collaboration with John Gilbert, Xerox PARC, and Esmond
-	Ng, Oak Ridge National Laboratory.
-
-    Date:
-
-	September 8, 2003.  Version 2.3.
-
-    Acknowledgements:
-
-	This work was supported by the National Science Foundation, under
-	grants DMS-9504974 and DMS-9803599.
-
-    Notice:
-
-	Copyright (c) 1998-2003 by the University of Florida.
-	All Rights Reserved.
-
-	THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-	EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
-
-	Permission is hereby granted to use, copy, modify, and/or distribute
-	this program, provided that the Copyright, this License, and the
-	Availability of the original version is retained on all copies and made
-	accessible to the end-user of any code or package that includes COLAMD
-	or any modified version of COLAMD. 
-
-    Availability:
-
-	The colamd/symamd library is available at
-
-	    http://www.cise.ufl.edu/research/sparse/colamd/
-
-	This is the http://www.cise.ufl.edu/research/sparse/colamd/colamd.h
-	file.  It is required by the colamd.c, colamdmex.c, and symamdmex.c
-	files, and by any C code that calls the routines whose prototypes are
-	listed below, or that uses the colamd/symamd definitions listed below.
-
-*/
-
-#ifndef COLAMD_H
-#define COLAMD_H
-
-/* ========================================================================== */
-/* === Include files ======================================================== */
-/* ========================================================================== */
-
-#include <stdlib.h>
-
-/* ========================================================================== */
-/* === Knob and statistics definitions ====================================== */
-/* ========================================================================== */
-
-/* size of the knobs [ ] array.  Only knobs [0..1] are currently used. */
-#define COLAMD_KNOBS 20
-
-/* number of output statistics.  Only stats [0..6] are currently used. */
-#define COLAMD_STATS 20
-
-/* knobs [0] and stats [0]: dense row knob and output statistic. */
-#define COLAMD_DENSE_ROW 0
-
-/* knobs [1] and stats [1]: dense column knob and output statistic. */
-#define COLAMD_DENSE_COL 1
-
-/* stats [2]: memory defragmentation count output statistic */
-#define COLAMD_DEFRAG_COUNT 2
-
-/* stats [3]: colamd status:  zero OK, > 0 warning or notice, < 0 error */
-#define COLAMD_STATUS 3
-
-/* stats [4..6]: error info, or info on jumbled columns */ 
-#define COLAMD_INFO1 4
-#define COLAMD_INFO2 5
-#define COLAMD_INFO3 6
-
-/* error codes returned in stats [3]: */
-#define COLAMD_OK				(0)
-#define COLAMD_OK_BUT_JUMBLED			(1)
-#define COLAMD_ERROR_A_not_present		(-1)
-#define COLAMD_ERROR_p_not_present		(-2)
-#define COLAMD_ERROR_nrow_negative		(-3)
-#define COLAMD_ERROR_ncol_negative		(-4)
-#define COLAMD_ERROR_nnz_negative		(-5)
-#define COLAMD_ERROR_p0_nonzero			(-6)
-#define COLAMD_ERROR_A_too_small		(-7)
-#define COLAMD_ERROR_col_length_negative	(-8)
-#define COLAMD_ERROR_row_index_out_of_bounds	(-9)
-#define COLAMD_ERROR_out_of_memory		(-10)
-#define COLAMD_ERROR_internal_error		(-999)
-
-/* ========================================================================== */
-/* === Row and Column structures ============================================ */
-/* ========================================================================== */
-
-/* User code that makes use of the colamd/symamd routines need not directly */
-/* reference these structures.  They are used only for the COLAMD_RECOMMENDED */
-/* macro. */
-
-typedef struct Colamd_Col_struct
-{
-    int start ;		/* index for A of first row in this column, or DEAD */
-			/* if column is dead */
-    int length ;	/* number of rows in this column */
-    union
-    {
-	int thickness ;	/* number of original columns represented by this */
-			/* col, if the column is alive */
-	int parent ;	/* parent in parent tree super-column structure, if */
-			/* the column is dead */
-    } shared1 ;
-    union
-    {
-	int score ;	/* the score used to maintain heap, if col is alive */
-	int order ;	/* pivot ordering of this column, if col is dead */
-    } shared2 ;
-    union
-    {
-	int headhash ;	/* head of a hash bucket, if col is at the head of */
-			/* a degree list */
-	int hash ;	/* hash value, if col is not in a degree list */
-	int prev ;	/* previous column in degree list, if col is in a */
-			/* degree list (but not at the head of a degree list) */
-    } shared3 ;
-    union
-    {
-	int degree_next ;	/* next column, if col is in a degree list */
-	int hash_next ;		/* next column, if col is in a hash list */
-    } shared4 ;
-
-} Colamd_Col ;
-
-typedef struct Colamd_Row_struct
-{
-    int start ;		/* index for A of first col in this row */
-    int length ;	/* number of principal columns in this row */
-    union
-    {
-	int degree ;	/* number of principal & non-principal columns in row */
-	int p ;		/* used as a row pointer in init_rows_cols () */
-    } shared1 ;
-    union
-    {
-	int mark ;	/* for computing set differences and marking dead rows*/
-	int first_column ;/* first column in row (used in garbage collection) */
-    } shared2 ;
-
-} Colamd_Row ;
-
-/* ========================================================================== */
-/* === Colamd recommended memory size ======================================= */
-/* ========================================================================== */
-
-/*
-    The recommended length Alen of the array A passed to colamd is given by
-    the COLAMD_RECOMMENDED (nnz, n_row, n_col) macro.  It returns -1 if any
-    argument is negative.  2*nnz space is required for the row and column
-    indices of the matrix. COLAMD_C (n_col) + COLAMD_R (n_row) space is
-    required for the Col and Row arrays, respectively, which are internal to
-    colamd.  An additional n_col space is the minimal amount of "elbow room",
-    and nnz/5 more space is recommended for run time efficiency.
-
-    This macro is not needed when using symamd.
-
-    Explicit typecast to int added Sept. 23, 2002, COLAMD version 2.2, to avoid
-    gcc -pedantic warning messages.
-*/
-
-#define COLAMD_C(n_col) ((int) (((n_col) + 1) * sizeof (Colamd_Col) / sizeof (int)))
-#define COLAMD_R(n_row) ((int) (((n_row) + 1) * sizeof (Colamd_Row) / sizeof (int)))
-
-#define COLAMD_RECOMMENDED(nnz, n_row, n_col)                                 \
-(                                                                             \
-((nnz) < 0 || (n_row) < 0 || (n_col) < 0)                                     \
-?                                                                             \
-    (-1)                                                                      \
-:                                                                             \
-    (2 * (nnz) + COLAMD_C (n_col) + COLAMD_R (n_row) + (n_col) + ((nnz) / 5)) \
-)
-
-/* ========================================================================== */
-/* === Prototypes of user-callable routines ================================= */
-/* ========================================================================== */
-
-int colamd_recommended		/* returns recommended value of Alen, */
-				/* or (-1) if input arguments are erroneous */
-(
-    int nnz,			/* nonzeros in A */
-    int n_row,			/* number of rows in A */
-    int n_col			/* number of columns in A */
-) ;
-
-void colamd_set_defaults	/* sets default parameters */
-(				/* knobs argument is modified on output */
-    double knobs [COLAMD_KNOBS]	/* parameter settings for colamd */
-) ;
-
-int colamd			/* returns (1) if successful, (0) otherwise*/
-(				/* A and p arguments are modified on output */
-    int n_row,			/* number of rows in A */
-    int n_col,			/* number of columns in A */
-    int Alen,			/* size of the array A */
-    int A [],			/* row indices of A, of size Alen */
-    int p [],			/* column pointers of A, of size n_col+1 */
-    double knobs [COLAMD_KNOBS],/* parameter settings for colamd */
-    int stats [COLAMD_STATS]	/* colamd output statistics and error codes */
-) ;
-
-int symamd				/* return (1) if OK, (0) otherwise */
-(
-    int n,				/* number of rows and columns of A */
-    int A [],				/* row indices of A */
-    int p [],				/* column pointers of A */
-    int perm [],			/* output permutation, size n_col+1 */
-    double knobs [COLAMD_KNOBS],	/* parameters (uses defaults if NULL) */
-    int stats [COLAMD_STATS],		/* output statistics and error codes */
-    void * (*allocate) (size_t, size_t),
-    					/* pointer to calloc (ANSI C) or */
-					/* mxCalloc (for MATLAB mexFunction) */
-    void (*release) (void *)
-    					/* pointer to free (ANSI C) or */
-    					/* mxFree (for MATLAB mexFunction) */
-) ;
-
-void colamd_report
-(
-    int stats [COLAMD_STATS]
-) ;
-
-void symamd_report
-(
-    int stats [COLAMD_STATS]
-) ;
-
-#endif /* COLAMD_H */
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cpanel_bmod.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cpanel_bmod.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cpanel_bmod.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cpanel_bmod.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,478 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "slu_cdefs.h"
-
-/* 
- * Function prototypes 
- */
-void clsolve(int, int, complex *, complex *);
-void cmatvec(int, int, int, complex *, complex *, complex *);
-extern void ccheck_tempv();
-
-void
-cpanel_bmod (
-	    const int  m,          /* in - number of rows in the matrix */
-	    const int  w,          /* in */
-	    const int  jcol,       /* in */
-	    const int  nseg,       /* in */
-	    complex     *dense,     /* out, of size n by w */
-	    complex     *tempv,     /* working array */
-	    int        *segrep,    /* in */
-	    int        *repfnz,    /* in, of size n by w */
-	    GlobalLU_t *Glu,       /* modified */
-	    SuperLUStat_t *stat    /* output */
-	    )
-{
-/* 
- * Purpose
- * =======
- *
- *    Performs numeric block updates (sup-panel) in topological order.
- *    It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
- *    Special processing on the supernodal portion of L\U[*,j]
- *
- *    Before entering this routine, the original nonzeros in the panel 
- *    were already copied into the spa[m,w].
- *
- *    Updated/Output parameters-
- *	dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned 
- *      collectively in the m-by-w vector dense[*]. 
- *
- */
-
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-    _fcd ftcs1 = _cptofcd("L", strlen("L")),
-         ftcs2 = _cptofcd("N", strlen("N")),
-         ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-    int          incx = 1, incy = 1;
-    complex       alpha, beta;
-#endif
-
-    register int k, ksub;
-    int          fsupc, nsupc, nsupr, nrow;
-    int          krep, krep_ind;
-    complex       ukj, ukj1, ukj2;
-    int          luptr, luptr1, luptr2;
-    int          segsze;
-    int          block_nrow;  /* no of rows in a block row */
-    register int lptr;	      /* Points to the row subscripts of a supernode */
-    int          kfnz, irow, no_zeros; 
-    register int isub, isub1, i;
-    register int jj;	      /* Index through each column in the panel */
-    int          *xsup, *supno;
-    int          *lsub, *xlsub;
-    complex       *lusup;
-    int          *xlusup;
-    int          *repfnz_col; /* repfnz[] for a column in the panel */
-    complex       *dense_col;  /* dense[] for a column in the panel */
-    complex       *tempv1;             /* Used in 1-D update */
-    complex       *TriTmp, *MatvecTmp; /* used in 2-D update */
-    complex      zero = {0.0, 0.0};
-    complex      one = {1.0, 0.0};
-    complex      comp_temp, comp_temp1;
-    register int ldaTmp;
-    register int r_ind, r_hi;
-    static   int first = 1, maxsuper, rowblk, colblk;
-    flops_t  *ops = stat->ops;
-    
-    xsup    = Glu->xsup;
-    supno   = Glu->supno;
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    lusup   = Glu->lusup;
-    xlusup  = Glu->xlusup;
-    
-    if ( first ) {
-	maxsuper = sp_ienv(3);
-	rowblk   = sp_ienv(4);
-	colblk   = sp_ienv(5);
-	first = 0;
-    }
-    ldaTmp = maxsuper + rowblk;
-
-    /* 
-     * For each nonz supernode segment of U[*,j] in topological order 
-     */
-    k = nseg - 1;
-    for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */
-
-	/* krep = representative of current k-th supernode
-	 * fsupc = first supernodal column
-	 * nsupc = no of columns in a supernode
-	 * nsupr = no of rows in a supernode
-	 */
-        krep = segrep[k--];
-	fsupc = xsup[supno[krep]];
-	nsupc = krep - fsupc + 1;
-	nsupr = xlsub[fsupc+1] - xlsub[fsupc];
-	nrow = nsupr - nsupc;
-	lptr = xlsub[fsupc];
-	krep_ind = lptr + nsupc - 1;
-
-	repfnz_col = repfnz;
-	dense_col = dense;
-	
-	if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */
-
-	    TriTmp = tempv;
-	
-	    /* Sequence through each column in panel -- triangular solves */
-	    for (jj = jcol; jj < jcol + w; jj++,
-		 repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) {
-
-		kfnz = repfnz_col[krep];
-		if ( kfnz == EMPTY ) continue;	/* Skip any zero segment */
-	    
-		segsze = krep - kfnz + 1;
-		luptr = xlusup[fsupc];
-
-		ops[TRSV] += 4 * segsze * (segsze - 1);
-		ops[GEMV] += 8 * nrow * segsze;
-	
-		/* Case 1: Update U-segment of size 1 -- col-col update */
-		if ( segsze == 1 ) {
-		    ukj = dense_col[lsub[krep_ind]];
-		    luptr += nsupr*(nsupc-1) + nsupc;
-
-		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
-			irow = lsub[i];
-	    	        cc_mult(&comp_temp, &ukj, &lusup[luptr]);
-		        c_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
-			++luptr;
-		    }
-
-		} else if ( segsze <= 3 ) {
-		    ukj = dense_col[lsub[krep_ind]];
-		    ukj1 = dense_col[lsub[krep_ind - 1]];
-		    luptr += nsupr*(nsupc-1) + nsupc-1;
-		    luptr1 = luptr - nsupr;
-
-		    if ( segsze == 2 ) {
-		        cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
-		        c_sub(&ukj, &ukj, &comp_temp);
-			dense_col[lsub[krep_ind]] = ukj;
-			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-			    irow = lsub[i];
-			    luptr++; luptr1++;
-			    cc_mult(&comp_temp, &ukj, &lusup[luptr]);
-			    cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
-			    c_add(&comp_temp, &comp_temp, &comp_temp1);
-			    c_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
-			}
-		    } else {
-			ukj2 = dense_col[lsub[krep_ind - 2]];
-			luptr2 = luptr1 - nsupr;
-  		        cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
-		        c_sub(&ukj1, &ukj1, &comp_temp);
-
-		        cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
-        		cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
-		        c_add(&comp_temp, &comp_temp, &comp_temp1);
-		        c_sub(&ukj, &ukj, &comp_temp);
-			dense_col[lsub[krep_ind]] = ukj;
-			dense_col[lsub[krep_ind-1]] = ukj1;
-			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-			    irow = lsub[i];
-			    luptr++; luptr1++; luptr2++;
-			    cc_mult(&comp_temp, &ukj, &lusup[luptr]);
-			    cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
-			    c_add(&comp_temp, &comp_temp, &comp_temp1);
-			    cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
-			    c_add(&comp_temp, &comp_temp, &comp_temp1);
-			    c_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
-			}
-		    }
-
-		} else  {	/* segsze >= 4 */
-		    
-		    /* Copy U[*,j] segment from dense[*] to TriTmp[*], which
-		       holds the result of triangular solves.    */
-		    no_zeros = kfnz - fsupc;
-		    isub = lptr + no_zeros;
-		    for (i = 0; i < segsze; ++i) {
-			irow = lsub[isub];
-			TriTmp[i] = dense_col[irow]; /* Gather */
-			++isub;
-		    }
-		    
-		    /* start effective triangle */
-		    luptr += nsupr * no_zeros + no_zeros;
-
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		    CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
-			   &nsupr, TriTmp, &incx );
-#else
-		    ctrsv_( "L", "N", "U", &segsze, &lusup[luptr], 
-			   &nsupr, TriTmp, &incx );
-#endif
-#else		
-		    clsolve ( nsupr, segsze, &lusup[luptr], TriTmp );
-#endif
-		    
-
-		} /* else ... */
-	    
-	    }  /* for jj ... end tri-solves */
-
-	    /* Block row updates; push all the way into dense[*] block */
-	    for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) {
-		
-		r_hi = SUPERLU_MIN(nrow, r_ind + rowblk);
-		block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind);
-		luptr = xlusup[fsupc] + nsupc + r_ind;
-		isub1 = lptr + nsupc + r_ind;
-		
-		repfnz_col = repfnz;
-		TriTmp = tempv;
-		dense_col = dense;
-		
-		/* Sequence through each column in panel -- matrix-vector */
-		for (jj = jcol; jj < jcol + w; jj++,
-		     repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
-		    
-		    kfnz = repfnz_col[krep];
-		    if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
-		    
-		    segsze = krep - kfnz + 1;
-		    if ( segsze <= 3 ) continue;   /* skip unrolled cases */
-		    
-		    /* Perform a block update, and scatter the result of
-		       matrix-vector to dense[].		 */
-		    no_zeros = kfnz - fsupc;
-		    luptr1 = luptr + nsupr * no_zeros;
-		    MatvecTmp = &TriTmp[maxsuper];
-		    
-#ifdef USE_VENDOR_BLAS
-		    alpha = one; 
-                    beta = zero;
-#ifdef _CRAY
-		    CGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1], 
-			   &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
-#else
-		    cgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1], 
-			   &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
-#endif
-#else
-		    cmatvec(nsupr, block_nrow, segsze, &lusup[luptr1],
-			   TriTmp, MatvecTmp);
-#endif
-		    
-		    /* Scatter MatvecTmp[*] into SPA dense[*] temporarily
-		     * such that MatvecTmp[*] can be re-used for the
-		     * the next blok row update. dense[] will be copied into 
-		     * global store after the whole panel has been finished.
-		     */
-		    isub = isub1;
-		    for (i = 0; i < block_nrow; i++) {
-			irow = lsub[isub];
-		        c_sub(&dense_col[irow], &dense_col[irow], 
-                              &MatvecTmp[i]);
-			MatvecTmp[i] = zero;
-			++isub;
-		    }
-		    
-		} /* for jj ... */
-		
-	    } /* for each block row ... */
-	    
-	    /* Scatter the triangular solves into SPA dense[*] */
-	    repfnz_col = repfnz;
-	    TriTmp = tempv;
-	    dense_col = dense;
-	    
-	    for (jj = jcol; jj < jcol + w; jj++,
-		 repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
-		kfnz = repfnz_col[krep];
-		if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
-		
-		segsze = krep - kfnz + 1;
-		if ( segsze <= 3 ) continue; /* skip unrolled cases */
-		
-		no_zeros = kfnz - fsupc;		
-		isub = lptr + no_zeros;
-		for (i = 0; i < segsze; i++) {
-		    irow = lsub[isub];
-		    dense_col[irow] = TriTmp[i];
-		    TriTmp[i] = zero;
-		    ++isub;
-		}
-		
-	    } /* for jj ... */
-	    
-	} else { /* 1-D block modification */
-	    
-	    
-	    /* Sequence through each column in the panel */
-	    for (jj = jcol; jj < jcol + w; jj++,
-		 repfnz_col += m, dense_col += m) {
-		
-		kfnz = repfnz_col[krep];
-		if ( kfnz == EMPTY ) continue;	/* Skip any zero segment */
-		
-		segsze = krep - kfnz + 1;
-		luptr = xlusup[fsupc];
-
-		ops[TRSV] += 4 * segsze * (segsze - 1);
-		ops[GEMV] += 8 * nrow * segsze;
-		
-		/* Case 1: Update U-segment of size 1 -- col-col update */
-		if ( segsze == 1 ) {
-		    ukj = dense_col[lsub[krep_ind]];
-		    luptr += nsupr*(nsupc-1) + nsupc;
-
-		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
-			irow = lsub[i];
-	    	        cc_mult(&comp_temp, &ukj, &lusup[luptr]);
-		        c_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
-			++luptr;
-		    }
-
-		} else if ( segsze <= 3 ) {
-		    ukj = dense_col[lsub[krep_ind]];
-		    luptr += nsupr*(nsupc-1) + nsupc-1;
-		    ukj1 = dense_col[lsub[krep_ind - 1]];
-		    luptr1 = luptr - nsupr;
-
-		    if ( segsze == 2 ) {
-		        cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
-		        c_sub(&ukj, &ukj, &comp_temp);
-			dense_col[lsub[krep_ind]] = ukj;
-			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-			    irow = lsub[i];
-			    ++luptr;  ++luptr1;
-			    cc_mult(&comp_temp, &ukj, &lusup[luptr]);
-			    cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
-			    c_add(&comp_temp, &comp_temp, &comp_temp1);
-			    c_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
-			}
-		    } else {
-			ukj2 = dense_col[lsub[krep_ind - 2]];
-			luptr2 = luptr1 - nsupr;
-  		        cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
-		        c_sub(&ukj1, &ukj1, &comp_temp);
-
-		        cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
-        		cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
-		        c_add(&comp_temp, &comp_temp, &comp_temp1);
-		        c_sub(&ukj, &ukj, &comp_temp);
-			dense_col[lsub[krep_ind]] = ukj;
-			dense_col[lsub[krep_ind-1]] = ukj1;
-			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-			    irow = lsub[i];
-			    ++luptr; ++luptr1; ++luptr2;
-			    cc_mult(&comp_temp, &ukj, &lusup[luptr]);
-			    cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
-			    c_add(&comp_temp, &comp_temp, &comp_temp1);
-			    cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
-			    c_add(&comp_temp, &comp_temp, &comp_temp1);
-			    c_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
-			}
-		    }
-
-		} else  { /* segsze >= 4 */
-		    /* 
-		     * Perform a triangular solve and block update,
-		     * then scatter the result of sup-col update to dense[].
-		     */
-		    no_zeros = kfnz - fsupc;
-		    
-		    /* Copy U[*,j] segment from dense[*] to tempv[*]: 
-		     *    The result of triangular solve is in tempv[*];
-		     *    The result of matrix vector update is in dense_col[*]
-		     */
-		    isub = lptr + no_zeros;
-		    for (i = 0; i < segsze; ++i) {
-			irow = lsub[isub];
-			tempv[i] = dense_col[irow]; /* Gather */
-			++isub;
-		    }
-		    
-		    /* start effective triangle */
-		    luptr += nsupr * no_zeros + no_zeros;
-		    
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		    CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
-			   &nsupr, tempv, &incx );
-#else
-		    ctrsv_( "L", "N", "U", &segsze, &lusup[luptr], 
-			   &nsupr, tempv, &incx );
-#endif
-		    
-		    luptr += segsze;	/* Dense matrix-vector */
-		    tempv1 = &tempv[segsze];
-                    alpha = one;
-                    beta = zero;
-#ifdef _CRAY
-		    CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
-			   &nsupr, tempv, &incx, &beta, tempv1, &incy );
-#else
-		    cgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], 
-			   &nsupr, tempv, &incx, &beta, tempv1, &incy );
-#endif
-#else
-		    clsolve ( nsupr, segsze, &lusup[luptr], tempv );
-		    
-		    luptr += segsze;        /* Dense matrix-vector */
-		    tempv1 = &tempv[segsze];
-		    cmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1);
-#endif
-		    
-		    /* Scatter tempv[*] into SPA dense[*] temporarily, such
-		     * that tempv[*] can be used for the triangular solve of
-		     * the next column of the panel. They will be copied into 
-		     * ucol[*] after the whole panel has been finished.
-		     */
-		    isub = lptr + no_zeros;
-		    for (i = 0; i < segsze; i++) {
-			irow = lsub[isub];
-			dense_col[irow] = tempv[i];
-			tempv[i] = zero;
-			isub++;
-		    }
-		    
-		    /* Scatter the update from tempv1[*] into SPA dense[*] */
-		    /* Start dense rectangular L */
-		    for (i = 0; i < nrow; i++) {
-			irow = lsub[isub];
-		        c_sub(&dense_col[irow], &dense_col[irow], &tempv1[i]);
-			tempv1[i] = zero;
-			++isub;	
-		    }
-		    
-		} /* else segsze>=4 ... */
-		
-	    } /* for each column in the panel... */
-	    
-	} /* else 1-D update ... */
-
-    } /* for each updating supernode ... */
-
-}
-
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cpanel_dfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cpanel_dfs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cpanel_dfs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cpanel_dfs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,247 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_cdefs.h"
-
-void
-cpanel_dfs (
-	   const int  m,           /* in - number of rows in the matrix */
-	   const int  w,           /* in */
-	   const int  jcol,        /* in */
-	   SuperMatrix *A,       /* in - original matrix */
-	   int        *perm_r,     /* in */
-	   int        *nseg,	   /* out */
-	   complex     *dense,      /* out */
-	   int        *panel_lsub, /* out */
-	   int        *segrep,     /* out */
-	   int        *repfnz,     /* out */
-	   int        *xprune,     /* out */
-	   int        *marker,     /* out */     
-	   int        *parent,     /* working array */
-	   int        *xplore,     /* working array */
-	   GlobalLU_t *Glu         /* modified */
-	   )
-{
-/*
- * Purpose
- * =======
- *
- *   Performs a symbolic factorization on a panel of columns [jcol, jcol+w).
- *
- *   A supernode representative is the last column of a supernode.
- *   The nonzeros in U[*,j] are segments that end at supernodal
- *   representatives.
- *
- *   The routine returns one list of the supernodal representatives
- *   in topological order of the dfs that generates them. This list is
- *   a superset of the topological order of each individual column within
- *   the panel. 
- *   The location of the first nonzero in each supernodal segment
- *   (supernodal entry location) is also returned. Each column has a 
- *   separate list for this purpose.
- *
- *   Two marker arrays are used for dfs:
- *     marker[i] == jj, if i was visited during dfs of current column jj;
- *     marker1[i] >= jcol, if i was visited by earlier columns in this panel;
- *
- *   marker: A-row --> A-row/col (0/1)
- *   repfnz: SuperA-col --> PA-row
- *   parent: SuperA-col --> SuperA-col
- *   xplore: SuperA-col --> index to L-structure
- *
- */
-    NCPformat *Astore;
-    complex    *a;
-    int       *asub;
-    int       *xa_begin, *xa_end;
-    int	      krep, chperm, chmark, chrep, oldrep, kchild, myfnz;
-    int       k, krow, kmark, kperm;
-    int       xdfs, maxdfs, kpar;
-    int       jj;	   /* index through each column in the panel */
-    int       *marker1;	   /* marker1[jj] >= jcol if vertex jj was visited 
-			      by a previous column within this panel.   */
-    int       *repfnz_col; /* start of each column in the panel */
-    complex    *dense_col;  /* start of each column in the panel */
-    int       nextl_col;   /* next available position in panel_lsub[*,jj] */
-    int       *xsup, *supno;
-    int       *lsub, *xlsub;
-
-    /* Initialize pointers */
-    Astore     = A->Store;
-    a          = Astore->nzval;
-    asub       = Astore->rowind;
-    xa_begin   = Astore->colbeg;
-    xa_end     = Astore->colend;
-    marker1    = marker + m;
-    repfnz_col = repfnz;
-    dense_col  = dense;
-    *nseg      = 0;
-    xsup       = Glu->xsup;
-    supno      = Glu->supno;
-    lsub       = Glu->lsub;
-    xlsub      = Glu->xlsub;
-
-    /* For each column in the panel */
-    for (jj = jcol; jj < jcol + w; jj++) {
-	nextl_col = (jj - jcol) * m;
-
-#ifdef CHK_DFS
-	printf("\npanel col %d: ", jj);
-#endif
-
-	/* For each nonz in A[*,jj] do dfs */
-	for (k = xa_begin[jj]; k < xa_end[jj]; k++) {
-	    krow = asub[k];
-            dense_col[krow] = a[k];
-	    kmark = marker[krow];    	
-	    if ( kmark == jj ) 
-		continue;     /* krow visited before, go to the next nonzero */
-
-	    /* For each unmarked nbr krow of jj
-	     * krow is in L: place it in structure of L[*,jj]
-	     */
-	    marker[krow] = jj;
-	    kperm = perm_r[krow];
-	    
-	    if ( kperm == EMPTY ) {
-		panel_lsub[nextl_col++] = krow; /* krow is indexed into A */
-	    }
-	    /* 
-	     * krow is in U: if its supernode-rep krep
-	     * has been explored, update repfnz[*]
-	     */
-	    else {
-		
-		krep = xsup[supno[kperm]+1] - 1;
-		myfnz = repfnz_col[krep];
-		
-#ifdef CHK_DFS
-		printf("krep %d, myfnz %d, perm_r[%d] %d\n", krep, myfnz, krow, kperm);
-#endif
-		if ( myfnz != EMPTY ) {	/* Representative visited before */
-		    if ( myfnz > kperm ) repfnz_col[krep] = kperm;
-		    /* continue; */
-		}
-		else {
-		    /* Otherwise, perform dfs starting at krep */
-		    oldrep = EMPTY;
-		    parent[krep] = oldrep;
-		    repfnz_col[krep] = kperm;
-		    xdfs = xlsub[krep];
-		    maxdfs = xprune[krep];
-		    
-#ifdef CHK_DFS 
-		    printf("  xdfs %d, maxdfs %d: ", xdfs, maxdfs);
-		    for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]);
-		    printf("\n");
-#endif
-		    do {
-			/* 
-			 * For each unmarked kchild of krep 
-			 */
-			while ( xdfs < maxdfs ) {
-			    
-			    kchild = lsub[xdfs];
-			    xdfs++;
-			    chmark = marker[kchild];
-			    
-			    if ( chmark != jj ) { /* Not reached yet */
-				marker[kchild] = jj;
-				chperm = perm_r[kchild];
-			      
-				/* Case kchild is in L: place it in L[*,j] */
-				if ( chperm == EMPTY ) {
-				    panel_lsub[nextl_col++] = kchild;
-				} 
-				/* Case kchild is in U: 
-				 *   chrep = its supernode-rep. If its rep has 
-				 *   been explored, update its repfnz[*]
-				 */
-				else {
-				    
-				    chrep = xsup[supno[chperm]+1] - 1;
-				    myfnz = repfnz_col[chrep];
-#ifdef CHK_DFS
-				    printf("chrep %d,myfnz %d,perm_r[%d] %d\n",chrep,myfnz,kchild,chperm);
-#endif
-				    if ( myfnz != EMPTY ) { /* Visited before */
-					if ( myfnz > chperm )
-					    repfnz_col[chrep] = chperm;
-				    }
-				    else {
-					/* Cont. dfs at snode-rep of kchild */
-					xplore[krep] = xdfs;	
-					oldrep = krep;
-					krep = chrep; /* Go deeper down G(L) */
-					parent[krep] = oldrep;
-					repfnz_col[krep] = chperm;
-					xdfs = xlsub[krep];     
-					maxdfs = xprune[krep];
-#ifdef CHK_DFS 
-					printf("  xdfs %d, maxdfs %d: ", xdfs, maxdfs);
-					for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]);	
-					printf("\n");
-#endif
-				    } /* else */
-				  
-				} /* else */
-			      
-			    } /* if... */
-			    
-			} /* while xdfs < maxdfs */
-			
-			/* krow has no more unexplored nbrs:
-			 *    Place snode-rep krep in postorder DFS, if this 
-			 *    segment is seen for the first time. (Note that
-			 *    "repfnz[krep]" may change later.)
-			 *    Backtrack dfs to its parent.
-			 */
-			if ( marker1[krep] < jcol ) {
-			    segrep[*nseg] = krep;
-			    ++(*nseg);
-			    marker1[krep] = jj;
-			}
-			
-			kpar = parent[krep]; /* Pop stack, mimic recursion */
-			if ( kpar == EMPTY ) break; /* dfs done */
-			krep = kpar;
-			xdfs = xplore[krep];
-			maxdfs = xprune[krep];
-			
-#ifdef CHK_DFS 
-			printf("  pop stack: krep %d,xdfs %d,maxdfs %d: ", krep,xdfs,maxdfs);
-			for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]);
-			printf("\n");
-#endif
-		    } while ( kpar != EMPTY ); /* do-while - until empty stack */
-		    
-		} /* else */
-		
-	    } /* else */
-	    
-	} /* for each nonz in A[*,jj] */
-	
-	repfnz_col += m;    /* Move to next column */
-        dense_col += m;
-	
-    } /* for jj ... */
-    
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cpivotgrowth.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cpivotgrowth.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cpivotgrowth.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cpivotgrowth.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,108 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-#include <math.h>
-#include "slu_cdefs.h"
-
-float
-cPivotGrowth(int ncols, SuperMatrix *A, int *perm_c, 
-             SuperMatrix *L, SuperMatrix *U)
-{
-/*
- * Purpose
- * =======
- *
- * Compute the reciprocal pivot growth factor of the leading ncols columns
- * of the matrix, using the formula:
- *     min_j ( max_i(abs(A_ij)) / max_i(abs(U_ij)) )
- *
- * Arguments
- * =========
- *
- * ncols    (input) int
- *          The number of columns of matrices A, L and U.
- *
- * A        (input) SuperMatrix*
- *	    Original matrix A, permuted by columns, of dimension
- *          (A->nrow, A->ncol). The type of A can be:
- *          Stype = NC; Dtype = SLU_C; Mtype = GE.
- *
- * L        (output) SuperMatrix*
- *          The factor L from the factorization Pr*A=L*U; use compressed row 
- *          subscripts storage for supernodes, i.e., L has type: 
- *          Stype = SC; Dtype = SLU_C; Mtype = TRLU.
- *
- * U        (output) SuperMatrix*
- *	    The factor U from the factorization Pr*A*Pc=L*U. Use column-wise
- *          storage scheme, i.e., U has types: Stype = NC;
- *          Dtype = SLU_C; Mtype = TRU.
- *
- */
-    NCformat *Astore;
-    SCformat *Lstore;
-    NCformat *Ustore;
-    complex  *Aval, *Lval, *Uval;
-    int      fsupc, nsupr, luptr, nz_in_U;
-    int      i, j, k, oldcol;
-    int      *inv_perm_c;
-    float   rpg, maxaj, maxuj;
-    extern   double slamch_(char *);
-    float   smlnum;
-    complex   *luval;
-    complex   temp_comp;
-   
-    /* Get machine constants. */
-    smlnum = slamch_("S");
-    rpg = 1. / smlnum;
-
-    Astore = A->Store;
-    Lstore = L->Store;
-    Ustore = U->Store;
-    Aval = Astore->nzval;
-    Lval = Lstore->nzval;
-    Uval = Ustore->nzval;
-    
-    inv_perm_c = (int *) SUPERLU_MALLOC(A->ncol*sizeof(int));
-    for (j = 0; j < A->ncol; ++j) inv_perm_c[perm_c[j]] = j;
-
-    for (k = 0; k <= Lstore->nsuper; ++k) {
-	fsupc = L_FST_SUPC(k);
-	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
-	luptr = L_NZ_START(fsupc);
-	luval = &Lval[luptr];
-	nz_in_U = 1;
-	
-	for (j = fsupc; j < L_FST_SUPC(k+1) && j < ncols; ++j) {
-	    maxaj = 0.;
-            oldcol = inv_perm_c[j];
-	    for (i = Astore->colptr[oldcol]; i < Astore->colptr[oldcol+1]; ++i)
-		maxaj = SUPERLU_MAX( maxaj, c_abs1( &Aval[i]) );
-	
-	    maxuj = 0.;
-	    for (i = Ustore->colptr[j]; i < Ustore->colptr[j+1]; i++)
-		maxuj = SUPERLU_MAX( maxuj, c_abs1( &Uval[i]) );
-	    
-	    /* Supernode */
-	    for (i = 0; i < nz_in_U; ++i)
-		maxuj = SUPERLU_MAX( maxuj, c_abs1( &luval[i]) );
-
-	    ++nz_in_U;
-	    luval += nsupr;
-
-	    if ( maxuj == 0. )
-		rpg = SUPERLU_MIN( rpg, 1.);
-	    else
-		rpg = SUPERLU_MIN( rpg, maxaj / maxuj );
-	}
-	
-	if ( j >= ncols ) break;
-    }
-
-    SUPERLU_FREE(inv_perm_c);
-    return (rpg);
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cpivotL.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cpivotL.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cpivotL.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cpivotL.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,174 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include <math.h>
-#include <stdlib.h>
-#include "slu_cdefs.h"
-
-#undef DEBUG
-
-int
-cpivotL(
-        const int  jcol,     /* in */
-        const float u,      /* in - diagonal pivoting threshold */
-        int        *usepr,   /* re-use the pivot sequence given by perm_r/iperm_r */
-        int        *perm_r,  /* may be modified */
-        int        *iperm_r, /* in - inverse of perm_r */
-        int        *iperm_c, /* in - used to find diagonal of Pc*A*Pc' */
-        int        *pivrow,  /* out */
-        GlobalLU_t *Glu,     /* modified - global LU data structures */
-	SuperLUStat_t *stat  /* output */
-       )
-{
-/*
- * Purpose
- * =======
- *   Performs the numerical pivoting on the current column of L,
- *   and the CDIV operation.
- *
- *   Pivot policy:
- *   (1) Compute thresh = u * max_(i>=j) abs(A_ij);
- *   (2) IF user specifies pivot row k and abs(A_kj) >= thresh THEN
- *           pivot row = k;
- *       ELSE IF abs(A_jj) >= thresh THEN
- *           pivot row = j;
- *       ELSE
- *           pivot row = m;
- * 
- *   Note: If you absolutely want to use a given pivot order, then set u=0.0.
- *
- *   Return value: 0      success;
- *                 i > 0  U(i,i) is exactly zero.
- *
- */
-    complex one = {1.0, 0.0};
-    int          fsupc;	    /* first column in the supernode */
-    int          nsupc;	    /* no of columns in the supernode */
-    int          nsupr;     /* no of rows in the supernode */
-    int          lptr;	    /* points to the starting subscript of the supernode */
-    int          pivptr, old_pivptr, diag, diagind;
-    float       pivmax, rtemp, thresh;
-    complex       temp;
-    complex       *lu_sup_ptr; 
-    complex       *lu_col_ptr;
-    int          *lsub_ptr;
-    int          isub, icol, k, itemp;
-    int          *lsub, *xlsub;
-    complex       *lusup;
-    int          *xlusup;
-    flops_t      *ops = stat->ops;
-
-    /* Initialize pointers */
-    lsub       = Glu->lsub;
-    xlsub      = Glu->xlsub;
-    lusup      = Glu->lusup;
-    xlusup     = Glu->xlusup;
-    fsupc      = (Glu->xsup)[(Glu->supno)[jcol]];
-    nsupc      = jcol - fsupc;	        /* excluding jcol; nsupc >= 0 */
-    lptr       = xlsub[fsupc];
-    nsupr      = xlsub[fsupc+1] - lptr;
-    lu_sup_ptr = &lusup[xlusup[fsupc]];	/* start of the current supernode */
-    lu_col_ptr = &lusup[xlusup[jcol]];	/* start of jcol in the supernode */
-    lsub_ptr   = &lsub[lptr];	/* start of row indices of the supernode */
-
-#ifdef DEBUG
-if ( jcol == MIN_COL ) {
-    printf("Before cdiv: col %d\n", jcol);
-    for (k = nsupc; k < nsupr; k++) 
-	printf("  lu[%d] %f\n", lsub_ptr[k], lu_col_ptr[k]);
-}
-#endif
-    
-    /* Determine the largest abs numerical value for partial pivoting;
-       Also search for user-specified pivot, and diagonal element. */
-    if ( *usepr ) *pivrow = iperm_r[jcol];
-    diagind = iperm_c[jcol];
-    pivmax = 0.0;
-    pivptr = nsupc;
-    diag = EMPTY;
-    old_pivptr = nsupc;
-    for (isub = nsupc; isub < nsupr; ++isub) {
-        rtemp = c_abs1 (&lu_col_ptr[isub]);
-	if ( rtemp > pivmax ) {
-	    pivmax = rtemp;
-	    pivptr = isub;
-	}
-	if ( *usepr && lsub_ptr[isub] == *pivrow ) old_pivptr = isub;
-	if ( lsub_ptr[isub] == diagind ) diag = isub;
-    }
-
-    /* Test for singularity */
-    if ( pivmax == 0.0 ) {
-	*pivrow = lsub_ptr[pivptr];
-	perm_r[*pivrow] = jcol;
-	*usepr = 0;
-	return (jcol+1);
-    }
-
-    thresh = u * pivmax;
-    
-    /* Choose appropriate pivotal element by our policy. */
-    if ( *usepr ) {
-        rtemp = c_abs1 (&lu_col_ptr[old_pivptr]);
-	if ( rtemp != 0.0 && rtemp >= thresh )
-	    pivptr = old_pivptr;
-	else
-	    *usepr = 0;
-    }
-    if ( *usepr == 0 ) {
-	/* Use diagonal pivot? */
-	if ( diag >= 0 ) { /* diagonal exists */
-            rtemp = c_abs1 (&lu_col_ptr[diag]);
-	    if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = diag;
-        }
-	*pivrow = lsub_ptr[pivptr];
-    }
-    
-    /* Record pivot row */
-    perm_r[*pivrow] = jcol;
-    
-    /* Interchange row subscripts */
-    if ( pivptr != nsupc ) {
-	itemp = lsub_ptr[pivptr];
-	lsub_ptr[pivptr] = lsub_ptr[nsupc];
-	lsub_ptr[nsupc] = itemp;
-
-	/* Interchange numerical values as well, for the whole snode, such 
-	 * that L is indexed the same way as A.
- 	 */
-	for (icol = 0; icol <= nsupc; icol++) {
-	    itemp = pivptr + icol * nsupr;
-	    temp = lu_sup_ptr[itemp];
-	    lu_sup_ptr[itemp] = lu_sup_ptr[nsupc + icol*nsupr];
-	    lu_sup_ptr[nsupc + icol*nsupr] = temp;
-	}
-    } /* if */
-
-    /* cdiv operation */
-    ops[FACT] += 10 * (nsupr - nsupc);
-
-    c_div(&temp, &one, &lu_col_ptr[nsupc]);
-    for (k = nsupc+1; k < nsupr; k++) 
-	cc_mult(&lu_col_ptr[k], &lu_col_ptr[k], &temp);
-
-    return 0;
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cpruneL.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cpruneL.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cpruneL.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cpruneL.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,147 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_cdefs.h"
-
-void
-cpruneL(
-       const int  jcol,	     /* in */
-       const int  *perm_r,   /* in */
-       const int  pivrow,    /* in */
-       const int  nseg,	     /* in */
-       const int  *segrep,   /* in */
-       const int  *repfnz,   /* in */
-       int        *xprune,   /* out */
-       GlobalLU_t *Glu       /* modified - global LU data structures */
-       )
-{
-/*
- * Purpose
- * =======
- *   Prunes the L-structure of supernodes whose L-structure
- *   contains the current pivot row "pivrow"
- *
- */
-    complex     utemp;
-    int        jsupno, irep, irep1, kmin, kmax, krow, movnum;
-    int        i, ktemp, minloc, maxloc;
-    int        do_prune; /* logical variable */
-    int        *xsup, *supno;
-    int        *lsub, *xlsub;
-    complex     *lusup;
-    int        *xlusup;
-
-    xsup       = Glu->xsup;
-    supno      = Glu->supno;
-    lsub       = Glu->lsub;
-    xlsub      = Glu->xlsub;
-    lusup      = Glu->lusup;
-    xlusup     = Glu->xlusup;
-    
-    /*
-     * For each supernode-rep irep in U[*,j]
-     */
-    jsupno = supno[jcol];
-    for (i = 0; i < nseg; i++) {
-
-	irep = segrep[i];
-	irep1 = irep + 1;
-	do_prune = FALSE;
-
-	/* Don't prune with a zero U-segment */
- 	if ( repfnz[irep] == EMPTY )
-		continue;
-
-     	/* If a snode overlaps with the next panel, then the U-segment 
-   	 * is fragmented into two parts -- irep and irep1. We should let
-	 * pruning occur at the rep-column in irep1's snode. 
-	 */
-	if ( supno[irep] == supno[irep1] ) 	/* Don't prune */
-		continue;
-
-	/*
-	 * If it has not been pruned & it has a nonz in row L[pivrow,i]
-	 */
-	if ( supno[irep] != jsupno ) {
-	    if ( xprune[irep] >= xlsub[irep1] ) {
-		kmin = xlsub[irep];
-		kmax = xlsub[irep1] - 1;
-		for (krow = kmin; krow <= kmax; krow++) 
-		    if ( lsub[krow] == pivrow ) {
-			do_prune = TRUE;
-			break;
-		    }
-	    }
-	    
-    	    if ( do_prune ) {
-
-	     	/* Do a quicksort-type partition
-	     	 * movnum=TRUE means that the num values have to be exchanged.
-	     	 */
-	        movnum = FALSE;
-	        if ( irep == xsup[supno[irep]] ) /* Snode of size 1 */
-			movnum = TRUE;
-
-	        while ( kmin <= kmax ) {
-
-	    	    if ( perm_r[lsub[kmax]] == EMPTY ) 
-			kmax--;
-		    else if ( perm_r[lsub[kmin]] != EMPTY )
-			kmin++;
-		    else { /* kmin below pivrow, and kmax above pivrow: 
-		            * 	interchange the two subscripts
-			    */
-		        ktemp = lsub[kmin];
-		        lsub[kmin] = lsub[kmax];
-		        lsub[kmax] = ktemp;
-
-			/* If the supernode has only one column, then we
- 			 * only keep one set of subscripts. For any subscript 
-			 * interchange performed, similar interchange must be 
-			 * done on the numerical values.
- 			 */
-		        if ( movnum ) {
-		    	    minloc = xlusup[irep] + (kmin - xlsub[irep]);
-		    	    maxloc = xlusup[irep] + (kmax - xlsub[irep]);
-			    utemp = lusup[minloc];
-		  	    lusup[minloc] = lusup[maxloc];
-			    lusup[maxloc] = utemp;
-		        }
-
-		        kmin++;
-		        kmax--;
-
-		    }
-
-	        } /* while */
-
-	        xprune[irep] = kmin;	/* Pruning */
-
-#ifdef CHK_PRUNE
-	printf("    After cpruneL(),using col %d:  xprune[%d] = %d\n", 
-			jcol, irep, kmin);
-#endif
-	    } /* if do_prune */
-
-	} /* if */
-
-    } /* for each U-segment... */
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/creadhb.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/creadhb.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/creadhb.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/creadhb.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,265 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include "slu_cdefs.h"
-
-
-/* Eat up the rest of the current line */
-int cDumpLine(FILE *fp)
-{
-    register int c;
-    while ((c = fgetc(fp)) != '\n') ;
-    return 0;
-}
-
-int cParseIntFormat(char *buf, int *num, int *size)
-{
-    char *tmp;
-
-    tmp = buf;
-    while (*tmp++ != '(') ;
-    sscanf(tmp, "%d", num);
-    while (*tmp != 'I' && *tmp != 'i') ++tmp;
-    ++tmp;
-    sscanf(tmp, "%d", size);
-    return 0;
-}
-
-int cParseFloatFormat(char *buf, int *num, int *size)
-{
-    char *tmp, *period;
-    
-    tmp = buf;
-    while (*tmp++ != '(') ;
-    *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/
-    while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd'
-	   && *tmp != 'F' && *tmp != 'f') {
-        /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the
-           num picked up refers to P, which should be skipped. */
-        if (*tmp=='p' || *tmp=='P') {
-           ++tmp;
-           *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/
-        } else {
-           ++tmp;
-        }
-    }
-    ++tmp;
-    period = tmp;
-    while (*period != '.' && *period != ')') ++period ;
-    *period = '\0';
-    *size = atoi(tmp); /*sscanf(tmp, "%2d", size);*/
-
-    return 0;
-}
-
-int cReadVector(FILE *fp, int n, int *where, int perline, int persize)
-{
-    register int i, j, item;
-    char tmp, buf[100];
-    
-    i = 0;
-    while (i < n) {
-	fgets(buf, 100, fp);    /* read a line at a time */
-	for (j=0; j<perline && i<n; j++) {
-	    tmp = buf[(j+1)*persize];     /* save the char at that place */
-	    buf[(j+1)*persize] = 0;       /* null terminate */
-	    item = atoi(&buf[j*persize]); 
-	    buf[(j+1)*persize] = tmp;     /* recover the char at that place */
-	    where[i++] = item - 1;
-	}
-    }
-
-    return 0;
-}
-
-/* Read complex numbers as pairs of (real, imaginary) */
-int cReadValues(FILE *fp, int n, complex *destination, int perline, int persize)
-{
-    register int i, j, k, s, pair;
-    register float realpart;
-    char tmp, buf[100];
-    
-    i = pair = 0;
-    while (i < n) {
-	fgets(buf, 100, fp);    /* read a line at a time */
-	for (j=0; j<perline && i<n; j++) {
-	    tmp = buf[(j+1)*persize];     /* save the char at that place */
-	    buf[(j+1)*persize] = 0;       /* null terminate */
-	    s = j*persize;
-	    for (k = 0; k < persize; ++k) /* No D_ format in C */
-		if ( buf[s+k] == 'D' || buf[s+k] == 'd' ) buf[s+k] = 'E';
-	    if ( pair == 0 ) {
-	  	/* The value is real part */
-		realpart = atof(&buf[s]);
-		pair = 1;
-	    } else {
-		/* The value is imaginary part */
-	        destination[i].r = realpart;
-		destination[i++].i = atof(&buf[s]);
-		pair = 0;
-	    }
-	    buf[(j+1)*persize] = tmp;     /* recover the char at that place */
-	}
-    }
-
-    return 0;
-}
-
-
-void
-creadhb(int *nrow, int *ncol, int *nonz,
-	complex **nzval, int **rowind, int **colptr)
-{
-/* 
- * Purpose
- * =======
- * 
- * Read a COMPLEX PRECISION matrix stored in Harwell-Boeing format 
- * as described below.
- * 
- * Line 1 (A72,A8) 
- *  	Col. 1 - 72   Title (TITLE) 
- *	Col. 73 - 80  Key (KEY) 
- * 
- * Line 2 (5I14) 
- * 	Col. 1 - 14   Total number of lines excluding header (TOTCRD) 
- * 	Col. 15 - 28  Number of lines for pointers (PTRCRD) 
- * 	Col. 29 - 42  Number of lines for row (or variable) indices (INDCRD) 
- * 	Col. 43 - 56  Number of lines for numerical values (VALCRD) 
- *	Col. 57 - 70  Number of lines for right-hand sides (RHSCRD) 
- *                    (including starting guesses and solution vectors 
- *		       if present) 
- *           	      (zero indicates no right-hand side data is present) 
- *
- * Line 3 (A3, 11X, 4I14) 
- *   	Col. 1 - 3    Matrix type (see below) (MXTYPE) 
- * 	Col. 15 - 28  Number of rows (or variables) (NROW) 
- * 	Col. 29 - 42  Number of columns (or elements) (NCOL) 
- *	Col. 43 - 56  Number of row (or variable) indices (NNZERO) 
- *	              (equal to number of entries for assembled matrices) 
- * 	Col. 57 - 70  Number of elemental matrix entries (NELTVL) 
- *	              (zero in the case of assembled matrices) 
- * Line 4 (2A16, 2A20) 
- * 	Col. 1 - 16   Format for pointers (PTRFMT) 
- *	Col. 17 - 32  Format for row (or variable) indices (INDFMT) 
- *	Col. 33 - 52  Format for numerical values of coefficient matrix (VALFMT) 
- * 	Col. 53 - 72 Format for numerical values of right-hand sides (RHSFMT) 
- *
- * Line 5 (A3, 11X, 2I14) Only present if there are right-hand sides present 
- *    	Col. 1 	      Right-hand side type: 
- *	         	  F for full storage or M for same format as matrix 
- *    	Col. 2        G if a starting vector(s) (Guess) is supplied. (RHSTYP) 
- *    	Col. 3        X if an exact solution vector(s) is supplied. 
- *	Col. 15 - 28  Number of right-hand sides (NRHS) 
- *	Col. 29 - 42  Number of row indices (NRHSIX) 
- *          	      (ignored in case of unassembled matrices) 
- *
- * The three character type field on line 3 describes the matrix type. 
- * The following table lists the permitted values for each of the three 
- * characters. As an example of the type field, RSA denotes that the matrix 
- * is real, symmetric, and assembled. 
- *
- * First Character: 
- *	R Real matrix 
- *	C Complex matrix 
- *	P Pattern only (no numerical values supplied) 
- *
- * Second Character: 
- *	S Symmetric 
- *	U Unsymmetric 
- *	H Hermitian 
- *	Z Skew symmetric 
- *	R Rectangular 
- *
- * Third Character: 
- *	A Assembled 
- *	E Elemental matrices (unassembled) 
- *
- */
-
-    register int i, numer_lines = 0, rhscrd = 0;
-    int tmp, colnum, colsize, rownum, rowsize, valnum, valsize;
-    char buf[100], type[4], key[10];
-    FILE *fp;
-
-    fp = stdin;
-
-    /* Line 1 */
-    fgets(buf, 100, fp);
-    fputs(buf, stdout);
-#if 0
-    fscanf(fp, "%72c", buf); buf[72] = 0;
-    printf("Title: %s", buf);
-    fscanf(fp, "%8c", key);  key[8] = 0;
-    printf("Key: %s\n", key);
-    cDumpLine(fp);
-#endif
-
-    /* Line 2 */
-    for (i=0; i<5; i++) {
-	fscanf(fp, "%14c", buf); buf[14] = 0;
-	sscanf(buf, "%d", &tmp);
-	if (i == 3) numer_lines = tmp;
-	if (i == 4 && tmp) rhscrd = tmp;
-    }
-    cDumpLine(fp);
-
-    /* Line 3 */
-    fscanf(fp, "%3c", type);
-    fscanf(fp, "%11c", buf); /* pad */
-    type[3] = 0;
-#ifdef DEBUG
-    printf("Matrix type %s\n", type);
-#endif
-    
-    fscanf(fp, "%14c", buf); sscanf(buf, "%d", nrow);
-    fscanf(fp, "%14c", buf); sscanf(buf, "%d", ncol);
-    fscanf(fp, "%14c", buf); sscanf(buf, "%d", nonz);
-    fscanf(fp, "%14c", buf); sscanf(buf, "%d", &tmp);
-    
-    if (tmp != 0)
-	  printf("This is not an assembled matrix!\n");
-    if (*nrow != *ncol)
-	printf("Matrix is not square.\n");
-    cDumpLine(fp);
-
-    /* Allocate storage for the three arrays ( nzval, rowind, colptr ) */
-    callocateA(*ncol, *nonz, nzval, rowind, colptr);
-
-    /* Line 4: format statement */
-    fscanf(fp, "%16c", buf);
-    cParseIntFormat(buf, &colnum, &colsize);
-    fscanf(fp, "%16c", buf);
-    cParseIntFormat(buf, &rownum, &rowsize);
-    fscanf(fp, "%20c", buf);
-    cParseFloatFormat(buf, &valnum, &valsize);
-    fscanf(fp, "%20c", buf);
-    cDumpLine(fp);
-
-    /* Line 5: right-hand side */    
-    if ( rhscrd ) cDumpLine(fp); /* skip RHSFMT */
-    
-#ifdef DEBUG
-    printf("%d rows, %d nonzeros\n", *nrow, *nonz);
-    printf("colnum %d, colsize %d\n", colnum, colsize);
-    printf("rownum %d, rowsize %d\n", rownum, rowsize);
-    printf("valnum %d, valsize %d\n", valnum, valsize);
-#endif
-    
-    cReadVector(fp, *ncol+1, *colptr, colnum, colsize);
-    cReadVector(fp, *nonz, *rowind, rownum, rowsize);
-    if ( numer_lines ) {
-        cReadValues(fp, *nonz, *nzval, valnum, valsize);
-    }
-    
-    fclose(fp);
-
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/csnode_bmod.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/csnode_bmod.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/csnode_bmod.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/csnode_bmod.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,118 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_cdefs.h"
-
-
-/*
- * Performs numeric block updates within the relaxed snode. 
- */
-int
-csnode_bmod (
-	    const int  jcol,	  /* in */
-	    const int  jsupno,    /* in */
-	    const int  fsupc,     /* in */
-	    complex     *dense,    /* in */
-	    complex     *tempv,    /* working array */
-	    GlobalLU_t *Glu,      /* modified */
-	    SuperLUStat_t *stat   /* output */
-	    )
-{
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-    _fcd ftcs1 = _cptofcd("L", strlen("L")),
-	 ftcs2 = _cptofcd("N", strlen("N")),
-	 ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-    int            incx = 1, incy = 1;
-    complex         alpha = {-1.0, 0.0},  beta = {1.0, 0.0};
-#endif
-
-    complex   comp_zero = {0.0, 0.0};
-    int            luptr, nsupc, nsupr, nrow;
-    int            isub, irow, i, iptr; 
-    register int   ufirst, nextlu;
-    int            *lsub, *xlsub;
-    complex         *lusup;
-    int            *xlusup;
-    flops_t *ops = stat->ops;
-
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    lusup   = Glu->lusup;
-    xlusup  = Glu->xlusup;
-
-    nextlu = xlusup[jcol];
-    
-    /*
-     *	Process the supernodal portion of L\U[*,j]
-     */
-    for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) {
-  	irow = lsub[isub];
-	lusup[nextlu] = dense[irow];
-        dense[irow] = comp_zero;
-	++nextlu;
-    }
-
-    xlusup[jcol + 1] = nextlu;	/* Initialize xlusup for next column */
-    
-    if ( fsupc < jcol ) {
-
-	luptr = xlusup[fsupc];
-	nsupr = xlsub[fsupc+1] - xlsub[fsupc];
-	nsupc = jcol - fsupc;	/* Excluding jcol */
-	ufirst = xlusup[jcol];	/* Points to the beginning of column
-				   jcol in supernode L\U(jsupno). */
-	nrow = nsupr - nsupc;
-
-	ops[TRSV] += 4 * nsupc * (nsupc - 1);
-	ops[GEMV] += 8 * nrow * nsupc;
-
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-	CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, 
-	      &lusup[ufirst], &incx );
-	CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, 
-		&lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
-#else
-	ctrsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, 
-	      &lusup[ufirst], &incx );
-	cgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, 
-		&lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
-#endif
-#else
-	clsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );
-	cmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], 
-			&lusup[ufirst], &tempv[0] );
-
-        /* Scatter tempv[*] into lusup[*] */
-	iptr = ufirst + nsupc;
-	for (i = 0; i < nrow; i++) {
-	    c_sub(&lusup[iptr], &lusup[iptr], &tempv[i]);
-            ++iptr;
-	    tempv[i] = comp_zero;
-	}
-#endif
-
-    }
-
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/csnode_dfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/csnode_dfs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/csnode_dfs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/csnode_dfs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,104 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_cdefs.h"
-
-int
-csnode_dfs (
-	   const int  jcol,	    /* in - start of the supernode */
-	   const int  kcol, 	    /* in - end of the supernode */
-	   const int  *asub,        /* in */
-	   const int  *xa_begin,    /* in */
-	   const int  *xa_end,      /* in */
-	   int        *xprune,      /* out */
-	   int        *marker,      /* modified */
-	   GlobalLU_t *Glu          /* modified */
-	   )
-{
-/* Purpose
- * =======
- *    csnode_dfs() - Determine the union of the row structures of those 
- *    columns within the relaxed snode.
- *    Note: The relaxed snodes are leaves of the supernodal etree, therefore, 
- *    the portion outside the rectangular supernode must be zero.
- *
- * Return value
- * ============
- *     0   success;
- *    >0   number of bytes allocated when run out of memory.
- *
- */
-    register int i, k, ifrom, ito, nextl, new_next;
-    int          nsuper, krow, kmark, mem_error;
-    int          *xsup, *supno;
-    int          *lsub, *xlsub;
-    int          nzlmax;
-    
-    xsup    = Glu->xsup;
-    supno   = Glu->supno;
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    nzlmax  = Glu->nzlmax;
-
-    nsuper = ++supno[jcol];	/* Next available supernode number */
-    nextl = xlsub[jcol];
-
-    for (i = jcol; i <= kcol; i++) {
-	/* For each nonzero in A[*,i] */
-	for (k = xa_begin[i]; k < xa_end[i]; k++) {	
-	    krow = asub[k];
-	    kmark = marker[krow];
-	    if ( kmark != kcol ) { /* First time visit krow */
-		marker[krow] = kcol;
-		lsub[nextl++] = krow;
-		if ( nextl >= nzlmax ) {
-		    if ( mem_error = cLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) )
-			return (mem_error);
-		    lsub = Glu->lsub;
-		}
-	    }
-    	}
-	supno[i] = nsuper;
-    }
-
-    /* Supernode > 1, then make a copy of the subscripts for pruning */
-    if ( jcol < kcol ) {
-	new_next = nextl + (nextl - xlsub[jcol]);
-	while ( new_next > nzlmax ) {
-	    if ( mem_error = cLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) )
-		return (mem_error);
-	    lsub = Glu->lsub;
-	}
-	ito = nextl;
-	for (ifrom = xlsub[jcol]; ifrom < nextl; )
-	    lsub[ito++] = lsub[ifrom++];	
-        for (i = jcol+1; i <= kcol; i++) xlsub[i] = nextl;
-	nextl = ito;
-    }
-
-    xsup[nsuper+1] = kcol + 1;
-    supno[kcol+1]  = nsuper;
-    xprune[kcol]   = nextl;
-    xlsub[kcol+1]  = nextl;
-
-    return 0;
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/csp_blas2.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/csp_blas2.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/csp_blas2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/csp_blas2.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,565 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
- * File name:		csp_blas2.c
- * Purpose:		Sparse BLAS 2, using some dense BLAS 2 operations.
- */
-
-#include "slu_cdefs.h"
-
-/* 
- * Function prototypes 
- */
-void cusolve(int, int, complex*, complex*);
-void clsolve(int, int, complex*, complex*);
-void cmatvec(int, int, int, complex*, complex*, complex*);
-
-
-int
-sp_ctrsv(char *uplo, char *trans, char *diag, SuperMatrix *L, 
-         SuperMatrix *U, complex *x, SuperLUStat_t *stat, int *info)
-{
-/*
- *   Purpose
- *   =======
- *
- *   sp_ctrsv() solves one of the systems of equations   
- *       A*x = b,   or   A'*x = b,
- *   where b and x are n element vectors and A is a sparse unit , or   
- *   non-unit, upper or lower triangular matrix.   
- *   No test for singularity or near-singularity is included in this   
- *   routine. Such tests must be performed before calling this routine.   
- *
- *   Parameters   
- *   ==========   
- *
- *   uplo   - (input) char*
- *            On entry, uplo specifies whether the matrix is an upper or   
- *             lower triangular matrix as follows:   
- *                uplo = 'U' or 'u'   A is an upper triangular matrix.   
- *                uplo = 'L' or 'l'   A is a lower triangular matrix.   
- *
- *   trans  - (input) char*
- *             On entry, trans specifies the equations to be solved as   
- *             follows:   
- *                trans = 'N' or 'n'   A*x = b.   
- *                trans = 'T' or 't'   A'*x = b.
- *                trans = 'C' or 'c'   A^H*x = b.   
- *
- *   diag   - (input) char*
- *             On entry, diag specifies whether or not A is unit   
- *             triangular as follows:   
- *                diag = 'U' or 'u'   A is assumed to be unit triangular.   
- *                diag = 'N' or 'n'   A is not assumed to be unit   
- *                                    triangular.   
- *	     
- *   L       - (input) SuperMatrix*
- *	       The factor L from the factorization Pr*A*Pc=L*U. Use
- *             compressed row subscripts storage for supernodes,
- *             i.e., L has types: Stype = SC, Dtype = SLU_C, Mtype = TRLU.
- *
- *   U       - (input) SuperMatrix*
- *	        The factor U from the factorization Pr*A*Pc=L*U.
- *	        U has types: Stype = NC, Dtype = SLU_C, Mtype = TRU.
- *    
- *   x       - (input/output) complex*
- *             Before entry, the incremented array X must contain the n   
- *             element right-hand side vector b. On exit, X is overwritten 
- *             with the solution vector x.
- *
- *   info    - (output) int*
- *             If *info = -i, the i-th argument had an illegal value.
- *
- */
-#ifdef _CRAY
-    _fcd ftcs1 = _cptofcd("L", strlen("L")),
-	 ftcs2 = _cptofcd("N", strlen("N")),
-	 ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-    SCformat *Lstore;
-    NCformat *Ustore;
-    complex   *Lval, *Uval;
-    int incx = 1, incy = 1;
-    complex temp;
-    complex alpha = {1.0, 0.0}, beta = {1.0, 0.0};
-    complex comp_zero = {0.0, 0.0};
-    int nrow;
-    int fsupc, nsupr, nsupc, luptr, istart, irow;
-    int i, k, iptr, jcol;
-    complex *work;
-    flops_t solve_ops;
-
-    /* Test the input parameters */
-    *info = 0;
-    if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1;
-    else if ( !lsame_(trans, "N") && !lsame_(trans, "T") && 
-              !lsame_(trans, "C")) *info = -2;
-    else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3;
-    else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4;
-    else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5;
-    if ( *info ) {
-	i = -(*info);
-	xerbla_("sp_ctrsv", &i);
-	return 0;
-    }
-
-    Lstore = L->Store;
-    Lval = Lstore->nzval;
-    Ustore = U->Store;
-    Uval = Ustore->nzval;
-    solve_ops = 0;
-
-    if ( !(work = complexCalloc(L->nrow)) )
-	ABORT("Malloc fails for work in sp_ctrsv().");
-    
-    if ( lsame_(trans, "N") ) {	/* Form x := inv(A)*x. */
-	
-	if ( lsame_(uplo, "L") ) {
-	    /* Form x := inv(L)*x */
-    	    if ( L->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = 0; k <= Lstore->nsuper; k++) {
-		fsupc = L_FST_SUPC(k);
-		istart = L_SUB_START(fsupc);
-		nsupr = L_SUB_START(fsupc+1) - istart;
-		nsupc = L_FST_SUPC(k+1) - fsupc;
-		luptr = L_NZ_START(fsupc);
-		nrow = nsupr - nsupc;
-
-                /* 1 c_div costs 10 flops */
-	        solve_ops += 4 * nsupc * (nsupc - 1) + 10 * nsupc;
-	        solve_ops += 8 * nrow * nsupc;
-
-		if ( nsupc == 1 ) {
-		    for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) {
-			irow = L_SUB(iptr);
-			++luptr;
-			cc_mult(&comp_zero, &x[fsupc], &Lval[luptr]);
-			c_sub(&x[irow], &x[irow], &comp_zero);
-		    }
-		} else {
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		    CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
-		       	&x[fsupc], &incx);
-		
-		    CGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
-		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
-#else
-		    ctrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr,
-		       	&x[fsupc], &incx);
-		
-		    cgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
-		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
-#endif
-#else
-		    clsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]);
-		
-		    cmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
-                             &x[fsupc], &work[0] );
-#endif		
-		
-		    iptr = istart + nsupc;
-		    for (i = 0; i < nrow; ++i, ++iptr) {
-			irow = L_SUB(iptr);
-			c_sub(&x[irow], &x[irow], &work[i]); /* Scatter */
-			work[i] = comp_zero;
-
-		    }
-	 	}
-	    } /* for k ... */
-	    
-	} else {
-	    /* Form x := inv(U)*x */
-	    
-	    if ( U->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = Lstore->nsuper; k >= 0; k--) {
-	    	fsupc = L_FST_SUPC(k);
-	    	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
-	    	nsupc = L_FST_SUPC(k+1) - fsupc;
-	    	luptr = L_NZ_START(fsupc);
-		
-                /* 1 c_div costs 10 flops */
-    	        solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc;
-
-		if ( nsupc == 1 ) {
-		    c_div(&x[fsupc], &x[fsupc], &Lval[luptr]);
-		    for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) {
-			irow = U_SUB(i);
-			cc_mult(&comp_zero, &x[fsupc], &Uval[i]);
-			c_sub(&x[irow], &x[irow], &comp_zero);
-		    }
-		} else {
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		    CTRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr,
-		       &x[fsupc], &incx);
-#else
-		    ctrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr,
-                           &x[fsupc], &incx);
-#endif
-#else		
-		    cusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] );
-#endif		
-
-		    for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
-		        solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
-		    	for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); 
-				i++) {
-			    irow = U_SUB(i);
-			cc_mult(&comp_zero, &x[jcol], &Uval[i]);
-			c_sub(&x[irow], &x[irow], &comp_zero);
-		    	}
-                    }
-		}
-	    } /* for k ... */
-	    
-	}
-    } else if ( lsame_(trans, "T") ) { /* Form x := inv(A')*x */
-	
-	if ( lsame_(uplo, "L") ) {
-	    /* Form x := inv(L')*x */
-    	    if ( L->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = Lstore->nsuper; k >= 0; --k) {
-	    	fsupc = L_FST_SUPC(k);
-	    	istart = L_SUB_START(fsupc);
-	    	nsupr = L_SUB_START(fsupc+1) - istart;
-	    	nsupc = L_FST_SUPC(k+1) - fsupc;
-	    	luptr = L_NZ_START(fsupc);
-
-		solve_ops += 8 * (nsupr - nsupc) * nsupc;
-
-		for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
-		    iptr = istart + nsupc;
-		    for (i = L_NZ_START(jcol) + nsupc; 
-				i < L_NZ_START(jcol+1); i++) {
-			irow = L_SUB(iptr);
-			cc_mult(&comp_zero, &x[irow], &Lval[i]);
-		    	c_sub(&x[jcol], &x[jcol], &comp_zero);
-			iptr++;
-		    }
-		}
-		
-		if ( nsupc > 1 ) {
-		    solve_ops += 4 * nsupc * (nsupc - 1);
-#ifdef _CRAY
-                    ftcs1 = _cptofcd("L", strlen("L"));
-                    ftcs2 = _cptofcd("T", strlen("T"));
-                    ftcs3 = _cptofcd("U", strlen("U"));
-		    CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
-			&x[fsupc], &incx);
-#else
-		    ctrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
-			&x[fsupc], &incx);
-#endif
-		}
-	    }
-	} else {
-	    /* Form x := inv(U')*x */
-	    if ( U->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = 0; k <= Lstore->nsuper; k++) {
-	    	fsupc = L_FST_SUPC(k);
-	    	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
-	    	nsupc = L_FST_SUPC(k+1) - fsupc;
-	    	luptr = L_NZ_START(fsupc);
-
-		for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
-		    solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
-		    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
-			irow = U_SUB(i);
-			cc_mult(&comp_zero, &x[irow], &Uval[i]);
-		    	c_sub(&x[jcol], &x[jcol], &comp_zero);
-		    }
-		}
-
-                /* 1 c_div costs 10 flops */
-		solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc;
-
-		if ( nsupc == 1 ) {
-		    c_div(&x[fsupc], &x[fsupc], &Lval[luptr]);
-		} else {
-#ifdef _CRAY
-                    ftcs1 = _cptofcd("U", strlen("U"));
-                    ftcs2 = _cptofcd("T", strlen("T"));
-                    ftcs3 = _cptofcd("N", strlen("N"));
-		    CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
-			    &x[fsupc], &incx);
-#else
-		    ctrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
-			    &x[fsupc], &incx);
-#endif
-		}
-	    } /* for k ... */
-	}
-    } else { /* Form x := conj(inv(A'))*x */
-	
-	if ( lsame_(uplo, "L") ) {
-	    /* Form x := conj(inv(L'))*x */
-    	    if ( L->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = Lstore->nsuper; k >= 0; --k) {
-	    	fsupc = L_FST_SUPC(k);
-	    	istart = L_SUB_START(fsupc);
-	    	nsupr = L_SUB_START(fsupc+1) - istart;
-	    	nsupc = L_FST_SUPC(k+1) - fsupc;
-	    	luptr = L_NZ_START(fsupc);
-
-		solve_ops += 8 * (nsupr - nsupc) * nsupc;
-
-		for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
-		    iptr = istart + nsupc;
-		    for (i = L_NZ_START(jcol) + nsupc; 
-				i < L_NZ_START(jcol+1); i++) {
-			irow = L_SUB(iptr);
-                        cc_conj(&temp, &Lval[i]);
-			cc_mult(&comp_zero, &x[irow], &temp);
-		    	c_sub(&x[jcol], &x[jcol], &comp_zero);
-			iptr++;
-		    }
- 		}
- 		
- 		if ( nsupc > 1 ) {
-		    solve_ops += 4 * nsupc * (nsupc - 1);
-#ifdef _CRAY
-                    ftcs1 = _cptofcd("L", strlen("L"));
-                    ftcs2 = _cptofcd(trans, strlen("T"));
-                    ftcs3 = _cptofcd("U", strlen("U"));
-		    CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
-			&x[fsupc], &incx);
-#else
-                    ctrsv_("L", trans, "U", &nsupc, &Lval[luptr], &nsupr,
-                           &x[fsupc], &incx);
-#endif
-		}
-	    }
-	} else {
-	    /* Form x := conj(inv(U'))*x */
-	    if ( U->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = 0; k <= Lstore->nsuper; k++) {
-	    	fsupc = L_FST_SUPC(k);
-	    	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
-	    	nsupc = L_FST_SUPC(k+1) - fsupc;
-	    	luptr = L_NZ_START(fsupc);
-
-		for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
-		    solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
-		    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
-			irow = U_SUB(i);
-                        cc_conj(&temp, &Uval[i]);
-			cc_mult(&comp_zero, &x[irow], &temp);
-		    	c_sub(&x[jcol], &x[jcol], &comp_zero);
-		    }
-		}
-
-                /* 1 c_div costs 10 flops */
-		solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc;
- 
-		if ( nsupc == 1 ) {
-                    cc_conj(&temp, &Lval[luptr]);
-		    c_div(&x[fsupc], &x[fsupc], &temp);
-		} else {
-#ifdef _CRAY
-                    ftcs1 = _cptofcd("U", strlen("U"));
-                    ftcs2 = _cptofcd(trans, strlen("T"));
-                    ftcs3 = _cptofcd("N", strlen("N"));
-		    CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
-			    &x[fsupc], &incx);
-#else
-                    ctrsv_("U", trans, "N", &nsupc, &Lval[luptr], &nsupr,
-                               &x[fsupc], &incx);
-#endif
-  		}
-  	    } /* for k ... */
-  	}
-    }
-
-    stat->ops[SOLVE] += solve_ops;
-    SUPERLU_FREE(work);
-    return 0;
-}
-
-
-
-int
-sp_cgemv(char *trans, complex alpha, SuperMatrix *A, complex *x, 
-	 int incx, complex beta, complex *y, int incy)
-{
-/*  Purpose   
-    =======   
-
-    sp_cgemv()  performs one of the matrix-vector operations   
-       y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   
-    where alpha and beta are scalars, x and y are vectors and A is a
-    sparse A->nrow by A->ncol matrix.   
-
-    Parameters   
-    ==========   
-
-    TRANS  - (input) char*
-             On entry, TRANS specifies the operation to be performed as   
-             follows:   
-                TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.   
-                TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.   
-                TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.   
-
-    ALPHA  - (input) complex
-             On entry, ALPHA specifies the scalar alpha.   
-
-    A      - (input) SuperMatrix*
-             Before entry, the leading m by n part of the array A must   
-             contain the matrix of coefficients.   
-
-    X      - (input) complex*, array of DIMENSION at least   
-             ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'   
-             and at least   
-             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.   
-             Before entry, the incremented array X must contain the   
-             vector x.   
-
-    INCX   - (input) int
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-
-    BETA   - (input) complex
-             On entry, BETA specifies the scalar beta. When BETA is   
-             supplied as zero then Y need not be set on input.   
-
-    Y      - (output) complex*,  array of DIMENSION at least   
-             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'   
-             and at least   
-             ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.   
-             Before entry with BETA non-zero, the incremented array Y   
-             must contain the vector y. On exit, Y is overwritten by the 
-             updated vector y.
-	     
-    INCY   - (input) int
-             On entry, INCY specifies the increment for the elements of   
-             Y. INCY must not be zero.   
-
-    ==== Sparse Level 2 Blas routine.   
-*/
-
-    /* Local variables */
-    NCformat *Astore;
-    complex   *Aval;
-    int info;
-    complex temp, temp1;
-    int lenx, leny, i, j, irow;
-    int iy, jx, jy, kx, ky;
-    int notran;
-    complex comp_zero = {0.0, 0.0};
-    complex comp_one = {1.0, 0.0};
-
-    notran = lsame_(trans, "N");
-    Astore = A->Store;
-    Aval = Astore->nzval;
-    
-    /* Test the input parameters */
-    info = 0;
-    if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1;
-    else if ( A->nrow < 0 || A->ncol < 0 ) info = 3;
-    else if (incx == 0) info = 5;
-    else if (incy == 0)	info = 8;
-    if (info != 0) {
-	xerbla_("sp_cgemv ", &info);
-	return 0;
-    }
-
-    /* Quick return if possible. */
-    if (A->nrow == 0 || A->ncol == 0 || 
-	c_eq(&alpha, &comp_zero) && 
-	c_eq(&beta, &comp_one))
-	return 0;
-
-
-    /* Set  LENX  and  LENY, the lengths of the vectors x and y, and set 
-       up the start points in  X  and  Y. */
-    if (lsame_(trans, "N")) {
-	lenx = A->ncol;
-	leny = A->nrow;
-    } else {
-	lenx = A->nrow;
-	leny = A->ncol;
-    }
-    if (incx > 0) kx = 0;
-    else kx =  - (lenx - 1) * incx;
-    if (incy > 0) ky = 0;
-    else ky =  - (leny - 1) * incy;
-
-    /* Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through A. */
-    /* First form  y := beta*y. */
-    if ( !c_eq(&beta, &comp_one) ) {
-	if (incy == 1) {
-	    if ( c_eq(&beta, &comp_zero) )
-		for (i = 0; i < leny; ++i) y[i] = comp_zero;
-	    else
-		for (i = 0; i < leny; ++i) 
-		  cc_mult(&y[i], &beta, &y[i]);
-	} else {
-	    iy = ky;
-	    if ( c_eq(&beta, &comp_zero) )
-		for (i = 0; i < leny; ++i) {
-		    y[iy] = comp_zero;
-		    iy += incy;
-		}
-	    else
-		for (i = 0; i < leny; ++i) {
-		    cc_mult(&y[iy], &beta, &y[iy]);
-		    iy += incy;
-		}
-	}
-    }
-    
-    if ( c_eq(&alpha, &comp_zero) ) return 0;
-
-    if ( notran ) {
-	/* Form  y := alpha*A*x + y. */
-	jx = kx;
-	if (incy == 1) {
-	    for (j = 0; j < A->ncol; ++j) {
-		if ( !c_eq(&x[jx], &comp_zero) ) {
-		    cc_mult(&temp, &alpha, &x[jx]);
-		    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-			irow = Astore->rowind[i];
-			cc_mult(&temp1, &temp,  &Aval[i]);
-			c_add(&y[irow], &y[irow], &temp1);
-		    }
-		}
-		jx += incx;
-	    }
-	} else {
-	    ABORT("Not implemented.");
-	}
-    } else {
-	/* Form  y := alpha*A'*x + y. */
-	jy = ky;
-	if (incx == 1) {
-	    for (j = 0; j < A->ncol; ++j) {
-		temp = comp_zero;
-		for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-		    irow = Astore->rowind[i];
-		    cc_mult(&temp1, &Aval[i], &x[irow]);
-		    c_add(&temp, &temp, &temp1);
-		}
-		cc_mult(&temp1, &alpha, &temp);
-		c_add(&y[jy], &y[jy], &temp1);
-		jy += incy;
-	    }
-	} else {
-	    ABORT("Not implemented.");
-	}
-    }
-    return 0;    
-} /* sp_cgemv */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/csp_blas3.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/csp_blas3.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/csp_blas3.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/csp_blas3.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,119 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * File name:		sp_blas3.c
- * Purpose:		Sparse BLAS3, using some dense BLAS3 operations.
- */
-
-#include "slu_cdefs.h"
-
-int
-sp_cgemm(char *transa, char *transb, int m, int n, int k, 
-         complex alpha, SuperMatrix *A, complex *b, int ldb, 
-         complex beta, complex *c, int ldc)
-{
-/*  Purpose   
-    =======   
-
-    sp_c performs one of the matrix-matrix operations   
-
-       C := alpha*op( A )*op( B ) + beta*C,   
-
-    where  op( X ) is one of 
-
-       op( X ) = X   or   op( X ) = X'   or   op( X ) = conjg( X' ),
-
-    alpha and beta are scalars, and A, B and C are matrices, with op( A ) 
-    an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix. 
-  
-
-    Parameters   
-    ==========   
-
-    TRANSA - (input) char*
-             On entry, TRANSA specifies the form of op( A ) to be used in 
-             the matrix multiplication as follows:   
-                TRANSA = 'N' or 'n',  op( A ) = A.   
-                TRANSA = 'T' or 't',  op( A ) = A'.   
-                TRANSA = 'C' or 'c',  op( A ) = conjg( A' ).   
-             Unchanged on exit.   
-
-    TRANSB - (input) char*
-             On entry, TRANSB specifies the form of op( B ) to be used in 
-             the matrix multiplication as follows:   
-                TRANSB = 'N' or 'n',  op( B ) = B.   
-                TRANSB = 'T' or 't',  op( B ) = B'.   
-                TRANSB = 'C' or 'c',  op( B ) = conjg( B' ).   
-             Unchanged on exit.   
-
-    M      - (input) int   
-             On entry,  M  specifies  the number of rows of the matrix 
-	     op( A ) and of the matrix C.  M must be at least zero. 
-	     Unchanged on exit.   
-
-    N      - (input) int
-             On entry,  N specifies the number of columns of the matrix 
-	     op( B ) and the number of columns of the matrix C. N must be 
-	     at least zero.
-	     Unchanged on exit.   
-
-    K      - (input) int
-             On entry, K specifies the number of columns of the matrix 
-	     op( A ) and the number of rows of the matrix op( B ). K must 
-	     be at least  zero.   
-             Unchanged on exit.
-	     
-    ALPHA  - (input) complex
-             On entry, ALPHA specifies the scalar alpha.   
-
-    A      - (input) SuperMatrix*
-             Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
-             Currently, the type of A can be:
-                 Stype = NC or NCP; Dtype = SLU_C; Mtype = GE. 
-             In the future, more general A can be handled.
-
-    B      - COMPLEX PRECISION array of DIMENSION ( LDB, kb ), where kb is 
-             n when TRANSB = 'N' or 'n',  and is  k otherwise.   
-             Before entry with  TRANSB = 'N' or 'n',  the leading k by n 
-             part of the array B must contain the matrix B, otherwise 
-             the leading n by k part of the array B must contain the 
-             matrix B.   
-             Unchanged on exit.   
-
-    LDB    - (input) int
-             On entry, LDB specifies the first dimension of B as declared 
-             in the calling (sub) program. LDB must be at least max( 1, n ).  
-             Unchanged on exit.   
-
-    BETA   - (input) complex
-             On entry, BETA specifies the scalar beta. When BETA is   
-             supplied as zero then C need not be set on input.   
-
-    C      - COMPLEX PRECISION array of DIMENSION ( LDC, n ).   
-             Before entry, the leading m by n part of the array C must 
-             contain the matrix C,  except when beta is zero, in which 
-             case C need not be set on entry.   
-             On exit, the array C is overwritten by the m by n matrix 
-	     ( alpha*op( A )*B + beta*C ).   
-
-    LDC    - (input) int
-             On entry, LDC specifies the first dimension of C as declared 
-             in the calling (sub)program. LDC must be at least max(1,m).   
-             Unchanged on exit.   
-
-    ==== Sparse Level 3 Blas routine.   
-*/
-    int    incx = 1, incy = 1;
-    int    j;
-
-    for (j = 0; j < n; ++j) {
-	sp_cgemv(transa, alpha, A, &b[ldb*j], incx, beta, &c[ldc*j], incy);
-    }
-    return 0;    
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cutil.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cutil.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/cutil.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/cutil.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,485 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include <math.h>
-#include "slu_cdefs.h"
-
-void
-cCreate_CompCol_Matrix(SuperMatrix *A, int m, int n, int nnz, 
-		       complex *nzval, int *rowind, int *colptr,
-		       Stype_t stype, Dtype_t dtype, Mtype_t mtype)
-{
-    NCformat *Astore;
-
-    A->Stype = stype;
-    A->Dtype = dtype;
-    A->Mtype = mtype;
-    A->nrow = m;
-    A->ncol = n;
-    A->Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) );
-    if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store");
-    Astore = A->Store;
-    Astore->nnz = nnz;
-    Astore->nzval = nzval;
-    Astore->rowind = rowind;
-    Astore->colptr = colptr;
-}
-
-void
-cCreate_CompRow_Matrix(SuperMatrix *A, int m, int n, int nnz, 
-		       complex *nzval, int *colind, int *rowptr,
-		       Stype_t stype, Dtype_t dtype, Mtype_t mtype)
-{
-    NRformat *Astore;
-
-    A->Stype = stype;
-    A->Dtype = dtype;
-    A->Mtype = mtype;
-    A->nrow = m;
-    A->ncol = n;
-    A->Store = (void *) SUPERLU_MALLOC( sizeof(NRformat) );
-    if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store");
-    Astore = A->Store;
-    Astore->nnz = nnz;
-    Astore->nzval = nzval;
-    Astore->colind = colind;
-    Astore->rowptr = rowptr;
-}
-
-/* Copy matrix A into matrix B. */
-void
-cCopy_CompCol_Matrix(SuperMatrix *A, SuperMatrix *B)
-{
-    NCformat *Astore, *Bstore;
-    int      ncol, nnz, i;
-
-    B->Stype = A->Stype;
-    B->Dtype = A->Dtype;
-    B->Mtype = A->Mtype;
-    B->nrow  = A->nrow;;
-    B->ncol  = ncol = A->ncol;
-    Astore   = (NCformat *) A->Store;
-    Bstore   = (NCformat *) B->Store;
-    Bstore->nnz = nnz = Astore->nnz;
-    for (i = 0; i < nnz; ++i)
-	((complex *)Bstore->nzval)[i] = ((complex *)Astore->nzval)[i];
-    for (i = 0; i < nnz; ++i) Bstore->rowind[i] = Astore->rowind[i];
-    for (i = 0; i <= ncol; ++i) Bstore->colptr[i] = Astore->colptr[i];
-}
-
-
-void
-cCreate_Dense_Matrix(SuperMatrix *X, int m, int n, complex *x, int ldx,
-		    Stype_t stype, Dtype_t dtype, Mtype_t mtype)
-{
-    DNformat    *Xstore;
-    
-    X->Stype = stype;
-    X->Dtype = dtype;
-    X->Mtype = mtype;
-    X->nrow = m;
-    X->ncol = n;
-    X->Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) );
-    if ( !(X->Store) ) ABORT("SUPERLU_MALLOC fails for X->Store");
-    Xstore = (DNformat *) X->Store;
-    Xstore->lda = ldx;
-    Xstore->nzval = (complex *) x;
-}
-
-void
-cCopy_Dense_Matrix(int M, int N, complex *X, int ldx,
-			complex *Y, int ldy)
-{
-/*
- *
- *  Purpose
- *  =======
- *
- *  Copies a two-dimensional matrix X to another matrix Y.
- */
-    int    i, j;
-    
-    for (j = 0; j < N; ++j)
-        for (i = 0; i < M; ++i)
-            Y[i + j*ldy] = X[i + j*ldx];
-}
-
-void
-cCreate_SuperNode_Matrix(SuperMatrix *L, int m, int n, int nnz, 
-			complex *nzval, int *nzval_colptr, int *rowind,
-			int *rowind_colptr, int *col_to_sup, int *sup_to_col,
-			Stype_t stype, Dtype_t dtype, Mtype_t mtype)
-{
-    SCformat *Lstore;
-
-    L->Stype = stype;
-    L->Dtype = dtype;
-    L->Mtype = mtype;
-    L->nrow = m;
-    L->ncol = n;
-    L->Store = (void *) SUPERLU_MALLOC( sizeof(SCformat) );
-    if ( !(L->Store) ) ABORT("SUPERLU_MALLOC fails for L->Store");
-    Lstore = L->Store;
-    Lstore->nnz = nnz;
-    Lstore->nsuper = col_to_sup[n];
-    Lstore->nzval = nzval;
-    Lstore->nzval_colptr = nzval_colptr;
-    Lstore->rowind = rowind;
-    Lstore->rowind_colptr = rowind_colptr;
-    Lstore->col_to_sup = col_to_sup;
-    Lstore->sup_to_col = sup_to_col;
-
-}
-
-
-/*
- * Convert a row compressed storage into a column compressed storage.
- */
-void
-cCompRow_to_CompCol(int m, int n, int nnz, 
-		    complex *a, int *colind, int *rowptr,
-		    complex **at, int **rowind, int **colptr)
-{
-    register int i, j, col, relpos;
-    int *marker;
-
-    /* Allocate storage for another copy of the matrix. */
-    *at = (complex *) complexMalloc(nnz);
-    *rowind = (int *) intMalloc(nnz);
-    *colptr = (int *) intMalloc(n+1);
-    marker = (int *) intCalloc(n);
-    
-    /* Get counts of each column of A, and set up column pointers */
-    for (i = 0; i < m; ++i)
-	for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]];
-    (*colptr)[0] = 0;
-    for (j = 0; j < n; ++j) {
-	(*colptr)[j+1] = (*colptr)[j] + marker[j];
-	marker[j] = (*colptr)[j];
-    }
-
-    /* Transfer the matrix into the compressed column storage. */
-    for (i = 0; i < m; ++i) {
-	for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
-	    col = colind[j];
-	    relpos = marker[col];
-	    (*rowind)[relpos] = i;
-	    (*at)[relpos] = a[j];
-	    ++marker[col];
-	}
-    }
-
-    SUPERLU_FREE(marker);
-}
-
-
-void
-cPrint_CompCol_Matrix(char *what, SuperMatrix *A)
-{
-    NCformat     *Astore;
-    register int i,n;
-    float       *dp;
-    
-    printf("\nCompCol matrix %s:\n", what);
-    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
-    n = A->ncol;
-    Astore = (NCformat *) A->Store;
-    dp = (float *) Astore->nzval;
-    printf("nrow %d, ncol %d, nnz %d\n", A->nrow,A->ncol,Astore->nnz);
-    printf("nzval: ");
-    for (i = 0; i < 2*Astore->colptr[n]; ++i) printf("%f  ", dp[i]);
-    printf("\nrowind: ");
-    for (i = 0; i < Astore->colptr[n]; ++i) printf("%d  ", Astore->rowind[i]);
-    printf("\ncolptr: ");
-    for (i = 0; i <= n; ++i) printf("%d  ", Astore->colptr[i]);
-    printf("\n");
-    fflush(stdout);
-}
-
-void
-cPrint_SuperNode_Matrix(char *what, SuperMatrix *A)
-{
-    SCformat     *Astore;
-    register int i, j, k, c, d, n, nsup;
-    float       *dp;
-    int *col_to_sup, *sup_to_col, *rowind, *rowind_colptr;
-    
-    printf("\nSuperNode matrix %s:\n", what);
-    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
-    n = A->ncol;
-    Astore = (SCformat *) A->Store;
-    dp = (float *) Astore->nzval;
-    col_to_sup = Astore->col_to_sup;
-    sup_to_col = Astore->sup_to_col;
-    rowind_colptr = Astore->rowind_colptr;
-    rowind = Astore->rowind;
-    printf("nrow %d, ncol %d, nnz %d, nsuper %d\n", 
-	   A->nrow,A->ncol,Astore->nnz,Astore->nsuper);
-    printf("nzval:\n");
-    for (k = 0; k <= Astore->nsuper; ++k) {
-      c = sup_to_col[k];
-      nsup = sup_to_col[k+1] - c;
-      for (j = c; j < c + nsup; ++j) {
-	d = Astore->nzval_colptr[j];
-	for (i = rowind_colptr[c]; i < rowind_colptr[c+1]; ++i) {
-	  printf("%d\t%d\t%e\t%e\n", rowind[i], j, dp[d], dp[d+1]);
-          d += 2;	
-	}
-      }
-    }
-#if 0
-    for (i = 0; i < 2*Astore->nzval_colptr[n]; ++i) printf("%f  ", dp[i]);
-#endif
-    printf("\nnzval_colptr: ");
-    for (i = 0; i <= n; ++i) printf("%d  ", Astore->nzval_colptr[i]);
-    printf("\nrowind: ");
-    for (i = 0; i < Astore->rowind_colptr[n]; ++i) 
-        printf("%d  ", Astore->rowind[i]);
-    printf("\nrowind_colptr: ");
-    for (i = 0; i <= n; ++i) printf("%d  ", Astore->rowind_colptr[i]);
-    printf("\ncol_to_sup: ");
-    for (i = 0; i < n; ++i) printf("%d  ", col_to_sup[i]);
-    printf("\nsup_to_col: ");
-    for (i = 0; i <= Astore->nsuper+1; ++i) 
-        printf("%d  ", sup_to_col[i]);
-    printf("\n");
-    fflush(stdout);
-}
-
-void
-cPrint_Dense_Matrix(char *what, SuperMatrix *A)
-{
-    DNformat     *Astore;
-    register int i, j, lda = Astore->lda;
-    float       *dp;
-    
-    printf("\nDense matrix %s:\n", what);
-    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
-    Astore = (DNformat *) A->Store;
-    dp = (float *) Astore->nzval;
-    printf("nrow %d, ncol %d, lda %d\n", A->nrow,A->ncol,lda);
-    printf("\nnzval: ");
-    for (j = 0; j < A->ncol; ++j) {
-        for (i = 0; i < 2*A->nrow; ++i) printf("%f  ", dp[i + j*2*lda]);
-        printf("\n");
-    }
-    printf("\n");
-    fflush(stdout);
-}
-
-/*
- * Diagnostic print of column "jcol" in the U/L factor.
- */
-void
-cprint_lu_col(char *msg, int jcol, int pivrow, int *xprune, GlobalLU_t *Glu)
-{
-    int     i, k, fsupc;
-    int     *xsup, *supno;
-    int     *xlsub, *lsub;
-    complex  *lusup;
-    int     *xlusup;
-    complex  *ucol;
-    int     *usub, *xusub;
-
-    xsup    = Glu->xsup;
-    supno   = Glu->supno;
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    lusup   = Glu->lusup;
-    xlusup  = Glu->xlusup;
-    ucol    = Glu->ucol;
-    usub    = Glu->usub;
-    xusub   = Glu->xusub;
-    
-    printf("%s", msg);
-    printf("col %d: pivrow %d, supno %d, xprune %d\n", 
-	   jcol, pivrow, supno[jcol], xprune[jcol]);
-    
-    printf("\tU-col:\n");
-    for (i = xusub[jcol]; i < xusub[jcol+1]; i++)
-	printf("\t%d%10.4f, %10.4f\n", usub[i], ucol[i].r, ucol[i].i);
-    printf("\tL-col in rectangular snode:\n");
-    fsupc = xsup[supno[jcol]];	/* first col of the snode */
-    i = xlsub[fsupc];
-    k = xlusup[jcol];
-    while ( i < xlsub[fsupc+1] && k < xlusup[jcol+1] ) {
-	printf("\t%d\t%10.4f, %10.4f\n", lsub[i], lusup[k].r, lusup[k].i);
-	i++; k++;
-    }
-    fflush(stdout);
-}
-
-
-/*
- * Check whether tempv[] == 0. This should be true before and after 
- * calling any numeric routines, i.e., "panel_bmod" and "column_bmod". 
- */
-void ccheck_tempv(int n, complex *tempv)
-{
-    int i;
-	
-    for (i = 0; i < n; i++) {
-	if ((tempv[i].r != 0.0) || (tempv[i].i != 0.0))
-	{
-	    fprintf(stderr,"tempv[%d] = {%f, %f}\n", i, tempv[i].r, tempv[i].i);
-	    ABORT("ccheck_tempv");
-	}
-    }
-}
-
-
-void
-cGenXtrue(int n, int nrhs, complex *x, int ldx)
-{
-    int  i, j;
-    for (j = 0; j < nrhs; ++j)
-	for (i = 0; i < n; ++i) {
-	    x[i + j*ldx].r = 1.0;
-	    x[i + j*ldx].i = 0.0;
-	}
-}
-
-/*
- * Let rhs[i] = sum of i-th row of A, so the solution vector is all 1's
- */
-void
-cFillRHS(trans_t trans, int nrhs, complex *x, int ldx,
-         SuperMatrix *A, SuperMatrix *B)
-{
-    NCformat *Astore;
-    complex   *Aval;
-    DNformat *Bstore;
-    complex   *rhs;
-    complex one = {1.0, 0.0};
-    complex zero = {0.0, 0.0};
-    int      ldc;
-    char transc[1];
-
-    Astore = A->Store;
-    Aval   = (complex *) Astore->nzval;
-    Bstore = B->Store;
-    rhs    = Bstore->nzval;
-    ldc    = Bstore->lda;
-    
-    if ( trans == NOTRANS ) *(unsigned char *)transc = 'N';
-    else *(unsigned char *)transc = 'T';
-
-    sp_cgemm(transc, "N", A->nrow, nrhs, A->ncol, one, A,
-	     x, ldx, zero, rhs, ldc);
-
-}
-
-/* 
- * Fills a complex precision array with a given value.
- */
-void 
-cfill(complex *a, int alen, complex dval)
-{
-    register int i;
-    for (i = 0; i < alen; i++) a[i] = dval;
-}
-
-
-
-/* 
- * Check the inf-norm of the error vector 
- */
-void cinf_norm_error(int nrhs, SuperMatrix *X, complex *xtrue)
-{
-    DNformat *Xstore;
-    float err, xnorm;
-    complex *Xmat, *soln_work;
-    complex temp;
-    int i, j;
-
-    Xstore = X->Store;
-    Xmat = Xstore->nzval;
-
-    for (j = 0; j < nrhs; j++) {
-      soln_work = &Xmat[j*Xstore->lda];
-      err = xnorm = 0.0;
-      for (i = 0; i < X->nrow; i++) {
-        c_sub(&temp, &soln_work[i], &xtrue[i]);
-	err = SUPERLU_MAX(err, c_abs(&temp));
-	xnorm = SUPERLU_MAX(xnorm, c_abs(&soln_work[i]));
-      }
-      err = err / xnorm;
-      printf("||X - Xtrue||/||X|| = %e\n", err);
-    }
-}
-
-
-
-/* Print performance of the code. */
-void
-cPrintPerf(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage,
-           float rpg, float rcond, float *ferr,
-           float *berr, char *equed, SuperLUStat_t *stat)
-{
-    SCformat *Lstore;
-    NCformat *Ustore;
-    double   *utime;
-    flops_t  *ops;
-    
-    utime = stat->utime;
-    ops   = stat->ops;
-    
-    if ( utime[FACT] != 0. )
-	printf("Factor flops = %e\tMflops = %8.2f\n", ops[FACT],
-	       ops[FACT]*1e-6/utime[FACT]);
-    printf("Identify relaxed snodes	= %8.2f\n", utime[RELAX]);
-    if ( utime[SOLVE] != 0. )
-	printf("Solve flops = %.0f, Mflops = %8.2f\n", ops[SOLVE],
-	       ops[SOLVE]*1e-6/utime[SOLVE]);
-    
-    Lstore = (SCformat *) L->Store;
-    Ustore = (NCformat *) U->Store;
-    printf("\tNo of nonzeros in factor L = %d\n", Lstore->nnz);
-    printf("\tNo of nonzeros in factor U = %d\n", Ustore->nnz);
-    printf("\tNo of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz);
-	
-    printf("L\\U MB %.3f\ttotal MB needed %.3f\texpansions %d\n",
-	   mem_usage->for_lu/1e6, mem_usage->total_needed/1e6,
-	   mem_usage->expansions);
-	
-    printf("\tFactor\tMflops\tSolve\tMflops\tEtree\tEquil\tRcond\tRefine\n");
-    printf("PERF:%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f\n",
-	   utime[FACT], ops[FACT]*1e-6/utime[FACT],
-	   utime[SOLVE], ops[SOLVE]*1e-6/utime[SOLVE],
-	   utime[ETREE], utime[EQUIL], utime[RCOND], utime[REFINE]);
-    
-    printf("\tRpg\t\tRcond\t\tFerr\t\tBerr\t\tEquil?\n");
-    printf("NUM:\t%e\t%e\t%e\t%e\t%s\n",
-	   rpg, rcond, ferr[0], berr[0], equed);
-    
-}
-
-
-
-
-print_complex_vec(char *what, int n, complex *vec)
-{
-    int i;
-    printf("%s: n %d\n", what, n);
-    for (i = 0; i < n; ++i) printf("%d\t%f%f\n", i, vec[i].r, vec[i].i);
-    return 0;
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dcolumn_bmod.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dcolumn_bmod.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dcolumn_bmod.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dcolumn_bmod.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,361 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-/*
-  This file has been modified to be compatible with the HYPRE
-  linear solver 
-*/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "slu_ddefs.h"
-
-#ifndef HYPRE_USING_HYPRE_BLAS
-#define USE_VENDOR_BLAS
-#endif
-
-/*
- * Function prototypes
-void sludusolve(int, int, double*, double*);
- */
-void sludlsolve(int, int, double*, double*);
-void sludmatvec(int, int, int, double*, double*, double*);
-
-
-/* Return value:   0 - successful return
- *               > 0 - number of bytes allocated when run out of space
- */
-int
-dcolumn_bmod (
-	     const int  jcol,	  /* in */
-	     const int  nseg,	  /* in */
-	     double     *dense,	  /* in */
-	     double     *tempv,	  /* working array */
-	     int        *segrep,  /* in */
-	     int        *repfnz,  /* in */
-	     int        fpanelc,  /* in -- first column in the current panel */
-	     GlobalLU_t *Glu,     /* modified */
-	     SuperLUStat_t *stat  /* output */
-	     )
-{
-/*
- * Purpose:
- * ========
- *    Performs numeric block updates (sup-col) in topological order.
- *    It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
- *    Special processing on the supernodal portion of L\U[*,j]
- *
- */
-#ifdef _CRAY
-    _fcd ftcs1 = _cptofcd("L", strlen("L")),
-         ftcs2 = _cptofcd("N", strlen("N")),
-         ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-#ifdef USE_VENDOR_BLAS
-    int         incx = 1, incy = 1;
-    double      alpha, beta;
-#endif
-    
-    /* krep = representative of current k-th supernode
-     * fsupc = first supernodal column
-     * nsupc = no of columns in supernode
-     * nsupr = no of rows in supernode (used as leading dimension)
-     * luptr = location of supernodal LU-block in storage
-     * kfnz = first nonz in the k-th supernodal segment
-     * no_zeros = no of leading zeros in a supernodal U-segment
-     */
-    double       ukj, ukj1, ukj2;
-    int          luptr, luptr1, luptr2;
-    int          fsupc, nsupc, nsupr, segsze;
-    int          nrow;	  /* No of rows in the matrix of matrix-vector */
-    int          jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno;
-    register int lptr, kfnz, isub, irow, i;
-    register int no_zeros, new_next; 
-    int          ufirst, nextlu;
-    int          fst_col; /* First column within small LU update */
-    int          d_fsupc; /* Distance between the first column of the current
-			     panel and the first column of the current snode. */
-    int          *xsup, *supno;
-    int          *lsub, *xlsub;
-    double       *lusup;
-    int          *xlusup;
-    int          nzlumax;
-    double       *tempv1;
-    double      zero = 0.0;
-#ifdef USE_VENDOR_BLAS
-    double      one = 1.0;
-    double      none = -1.0;
-#endif
-    int          mem_error;
-    flops_t      *ops = stat->ops;
-
-    xsup    = Glu->xsup;
-    supno   = Glu->supno;
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    lusup   = Glu->lusup;
-    xlusup  = Glu->xlusup;
-    nzlumax = Glu->nzlumax;
-    jcolp1 = jcol + 1;
-    jsupno = supno[jcol];
-    
-    /* 
-     * For each nonz supernode segment of U[*,j] in topological order 
-     */
-    k = nseg - 1;
-    for (ksub = 0; ksub < nseg; ksub++) {
-
-	krep = segrep[k];
-	k--;
-	ksupno = supno[krep];
-	if ( jsupno != ksupno ) { /* Outside the rectangular supernode */
-
-	    fsupc = xsup[ksupno];
-	    fst_col = SUPERLU_MAX ( fsupc, fpanelc );
-
-  	    /* Distance from the current supernode to the current panel; 
-	       d_fsupc=0 if fsupc > fpanelc. */
-  	    d_fsupc = fst_col - fsupc; 
-
-	    luptr = xlusup[fst_col] + d_fsupc;
-	    lptr = xlsub[fsupc] + d_fsupc;
-
-	    kfnz = repfnz[krep];
-	    kfnz = SUPERLU_MAX ( kfnz, fpanelc );
-
-	    segsze = krep - kfnz + 1;
-	    nsupc = krep - fst_col + 1;
-	    nsupr = xlsub[fsupc+1] - xlsub[fsupc];	/* Leading dimension */
-	    nrow = nsupr - d_fsupc - nsupc;
-	    krep_ind = lptr + nsupc - 1;
-
-	    ops[TRSV] += segsze * (segsze - 1);
-	    ops[GEMV] += 2 * nrow * segsze;
-
-
-	    /* 
-	     * Case 1: Update U-segment of size 1 -- col-col update 
-	     */
-	    if ( segsze == 1 ) {
-	  	ukj = dense[lsub[krep_ind]];
-		luptr += nsupr*(nsupc-1) + nsupc;
-
-		for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-		    irow = lsub[i];
-		    dense[irow] -=  ukj*lusup[luptr];
-		    luptr++;
-		}
-
-	    } else if ( segsze <= 3 ) {
-		ukj = dense[lsub[krep_ind]];
-		luptr += nsupr*(nsupc-1) + nsupc-1;
-		ukj1 = dense[lsub[krep_ind - 1]];
-		luptr1 = luptr - nsupr;
-
-		if ( segsze == 2 ) { /* Case 2: 2cols-col update */
-		    ukj -= ukj1 * lusup[luptr1];
-		    dense[lsub[krep_ind]] = ukj;
-		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-		    	irow = lsub[i];
-		    	luptr++;
-		    	luptr1++;
-		    	dense[irow] -= ( ukj*lusup[luptr]
-					+ ukj1*lusup[luptr1] );
-		    }
-		} else { /* Case 3: 3cols-col update */
-		    ukj2 = dense[lsub[krep_ind - 2]];
-		    luptr2 = luptr1 - nsupr;
-		    ukj1 -= ukj2 * lusup[luptr2-1];
-		    ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
-		    dense[lsub[krep_ind]] = ukj;
-		    dense[lsub[krep_ind-1]] = ukj1;
-		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-		    	irow = lsub[i];
-		    	luptr++;
-		    	luptr1++;
-			luptr2++;
-		    	dense[irow] -= ( ukj*lusup[luptr]
-			     + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
-		    }
-		}
-
-
-
-	    } else {
-	  	/*
-		 * Case: sup-col update
-		 * Perform a triangular solve and block update,
-		 * then scatter the result of sup-col update to dense
-		 */
-
-		no_zeros = kfnz - fst_col;
-
-	        /* Copy U[*,j] segment from dense[*] to tempv[*] */
-	        isub = lptr + no_zeros;
-	        for (i = 0; i < segsze; i++) {
-	  	    irow = lsub[isub];
-		    tempv[i] = dense[irow];
-		    ++isub; 
-	        }
-
-	        /* Dense triangular solve -- start effective triangle */
-		luptr += nsupr * no_zeros + no_zeros; 
-		
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
-		       &nsupr, tempv, &incx );
-#else		
-                hypre_F90_NAME_BLAS(dtrsv,DTRSV)("L","N","U",&segsze, 
-                       &lusup[luptr], &nsupr, tempv, &incx );
-#endif		
- 		luptr += segsze;  /* Dense matrix-vector */
-		tempv1 = &tempv[segsze];
-                alpha = one;
-                beta = zero;
-#ifdef _CRAY
-		SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
-		       &nsupr, tempv, &incx, &beta, tempv1, &incy );
-#else
-                hypre_F90_NAME_BLAS(dgemv,DGEMV)("N",&nrow,&segsze,&alpha, 
-                       &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, 
-                       &incy );
-#endif
-#else
-		sludlsolve ( nsupr, segsze, &lusup[luptr], tempv );
-
- 		luptr += segsze;  /* Dense matrix-vector */
-		tempv1 = &tempv[segsze];
-		sludmatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1);
-#endif
-		
-		
-                /* Scatter tempv[] into SPA dense[] as a temporary storage */
-                isub = lptr + no_zeros;
-                for (i = 0; i < segsze; i++) {
-                    irow = lsub[isub];
-                    dense[irow] = tempv[i];
-                    tempv[i] = zero;
-                    ++isub;
-                }
-
-		/* Scatter tempv1[] into SPA dense[] */
-		for (i = 0; i < nrow; i++) {
-		    irow = lsub[isub];
-		    dense[irow] -= tempv1[i];
-		    tempv1[i] = zero;
-		    ++isub;
-		}
-	    }
-	    
-	} /* if jsupno ... */
-
-    } /* for each segment... */
-
-    /*
-     *	Process the supernodal portion of L\U[*,j]
-     */
-    nextlu = xlusup[jcol];
-    fsupc = xsup[jsupno];
-
-    /* Copy the SPA dense into L\U[*,j] */
-    new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc];
-    while ( new_next > nzlumax ) {
-	if ((mem_error = dLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu)))
-	    return (mem_error);
-	lusup = Glu->lusup;
-	lsub = Glu->lsub;
-    }
-
-    for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) {
-  	irow = lsub[isub];
-	lusup[nextlu] = dense[irow];
-        dense[irow] = zero;
-	++nextlu;
-    }
-
-    xlusup[jcolp1] = nextlu;	/* Close L\U[*,jcol] */
-
-    /* For more updates within the panel (also within the current supernode), 
-     * should start from the first column of the panel, or the first column 
-     * of the supernode, whichever is bigger. There are 2 cases:
-     *    1) fsupc < fpanelc, then fst_col := fpanelc
-     *    2) fsupc >= fpanelc, then fst_col := fsupc
-     */
-    fst_col = SUPERLU_MAX ( fsupc, fpanelc );
-
-    if ( fst_col < jcol ) {
-
-  	/* Distance between the current supernode and the current panel.
-	   d_fsupc=0 if fsupc >= fpanelc. */
-  	d_fsupc = fst_col - fsupc;
-
-	lptr = xlsub[fsupc] + d_fsupc;
-	luptr = xlusup[fst_col] + d_fsupc;
-	nsupr = xlsub[fsupc+1] - xlsub[fsupc];	/* Leading dimension */
-	nsupc = jcol - fst_col;	/* Excluding jcol */
-	nrow = nsupr - d_fsupc - nsupc;
-
-	/* Points to the beginning of jcol in snode L\U(jsupno) */
-	ufirst = xlusup[jcol] + d_fsupc;	
-
-	ops[TRSV] += nsupc * (nsupc - 1);
-	ops[GEMV] += 2 * nrow * nsupc;
-	
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-	STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], 
-	       &nsupr, &lusup[ufirst], &incx );
-#else
-        hypre_F90_NAME_BLAS(dtrsv,DTRSV)( "L", "N", "U", &nsupc, 
-               &lusup[luptr], &nsupr, &lusup[ufirst], &incx );
-#endif
-	
-	alpha = none; beta = one; /* y := beta*y + alpha*A*x */
-
-#ifdef _CRAY
-	SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
-	       &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
-#else
-        hypre_F90_NAME_BLAS(dgemv,DGEMV)("N", &nrow, &nsupc, &alpha, 
-               &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, 
-               &beta, &lusup[ufirst+nsupc], &incy );
-#endif
-#else
-	sludlsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );
-
-	sludmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc],
-		&lusup[ufirst], tempv );
-	
-        /* Copy updates from tempv[*] into lusup[*] */
-	isub = ufirst + nsupc;
-	for (i = 0; i < nrow; i++) {
-	    lusup[isub] -= tempv[i];
-	    tempv[i] = 0.0;
-	    ++isub;
-	}
-
-#endif
-	
-	
-    } /* if fst_col < jcol ... */ 
-
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dcolumn_dfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dcolumn_dfs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dcolumn_dfs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dcolumn_dfs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,269 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_ddefs.h"
-
-/* What type of supernodes we want */
-#define T2_SUPER
-
-int
-dcolumn_dfs(
-	   const int  m,         /* in - number of rows in the matrix */
-	   const int  jcol,      /* in */
-	   int        *perm_r,   /* in */
-	   int        *nseg,     /* modified - with new segments appended */
-	   int        *lsub_col, /* in - defines the RHS vector to start the dfs */
-	   int        *segrep,   /* modified - with new segments appended */
-	   int        *repfnz,   /* modified */
-	   int        *xprune,   /* modified */
-	   int        *marker,   /* modified */
-	   int        *parent,	 /* working array */
-	   int        *xplore,   /* working array */
-	   GlobalLU_t *Glu       /* modified */
-	   )
-{
-/* 
- * Purpose
- * =======
- *   "column_dfs" performs a symbolic factorization on column jcol, and
- *   decide the supernode boundary.
- *
- *   This routine does not use numeric values, but only use the RHS 
- *   row indices to start the dfs.
- *
- *   A supernode representative is the last column of a supernode.
- *   The nonzeros in U[*,j] are segments that end at supernodal
- *   representatives. The routine returns a list of such supernodal 
- *   representatives in topological order of the dfs that generates them.
- *   The location of the first nonzero in each such supernodal segment
- *   (supernodal entry location) is also returned.
- *
- * Local parameters
- * ================
- *   nseg: no of segments in current U[*,j]
- *   jsuper: jsuper=EMPTY if column j does not belong to the same
- *	supernode as j-1. Otherwise, jsuper=nsuper.
- *
- *   marker2: A-row --> A-row/col (0/1)
- *   repfnz: SuperA-col --> PA-row
- *   parent: SuperA-col --> SuperA-col
- *   xplore: SuperA-col --> index to L-structure
- *
- * Return value
- * ============
- *     0  success;
- *   > 0  number of bytes allocated when run out of space.
- *
- */
-    int     jcolp1, jcolm1, jsuper, nsuper, nextl;
-    int     k, krep, krow, kmark, kperm;
-    int     *marker2;           /* Used for small panel LU */
-    int	    fsupc;		/* First column of a snode */
-    int     myfnz;		/* First nonz column of a U-segment */
-    int	    chperm, chmark, chrep, kchild;
-    int     xdfs, maxdfs, kpar, oldrep;
-    int     jptr, jm1ptr;
-    int     ito, ifrom, istop;	/* Used to compress row subscripts */
-    int     mem_error;
-    int     *xsup, *supno, *lsub, *xlsub;
-    int     nzlmax;
-    static  int  first = 1, maxsuper;
-    
-    xsup    = Glu->xsup;
-    supno   = Glu->supno;
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    nzlmax  = Glu->nzlmax;
-
-    if ( first ) {
-	maxsuper = sp_ienv(3);
-	first = 0;
-    }
-    jcolp1  = jcol + 1;
-    jcolm1  = jcol - 1;
-    nsuper  = supno[jcol];
-    jsuper  = nsuper;
-    nextl   = xlsub[jcol];
-    marker2 = &marker[2*m];
-
-
-    /* For each nonzero in A[*,jcol] do dfs */
-    for (k = 0; lsub_col[k] != EMPTY; k++) {
-
-	krow = lsub_col[k];
-    	lsub_col[k] = EMPTY;
-	kmark = marker2[krow];    	
-
-	/* krow was visited before, go to the next nonz */
-        if ( kmark == jcol ) continue; 
-
-	/* For each unmarked nbr krow of jcol
-	 *	krow is in L: place it in structure of L[*,jcol]
-	 */
-	marker2[krow] = jcol;
-	kperm = perm_r[krow];
-
-   	if ( kperm == EMPTY ) {
-	    lsub[nextl++] = krow; 	/* krow is indexed into A */
-	    if ( nextl >= nzlmax ) {
-		if ( (mem_error = dLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu)) )
-		    return (mem_error);
-		lsub = Glu->lsub;
-	    }
-            if ( kmark != jcolm1 ) jsuper = EMPTY;/* Row index subset testing */
-  	} else {
-	    /*	krow is in U: if its supernode-rep krep
-	     *	has been explored, update repfnz[*]
-	     */
-	    krep = xsup[supno[kperm]+1] - 1;
-	    myfnz = repfnz[krep];
-
-	    if ( myfnz != EMPTY ) {	/* Visited before */
-	    	if ( myfnz > kperm ) repfnz[krep] = kperm;
-		/* continue; */
-	    }
-	    else {
-		/* Otherwise, perform dfs starting at krep */
-		oldrep = EMPTY;
-	 	parent[krep] = oldrep;
-	  	repfnz[krep] = kperm;
-		xdfs = xlsub[krep];
-	  	maxdfs = xprune[krep];
-
-		do {
-		    /* 
-		     * For each unmarked kchild of krep 
-		     */
-		    while ( xdfs < maxdfs ) {
-
-		   	kchild = lsub[xdfs];
-			xdfs++;
-		  	chmark = marker2[kchild];
-
-		   	if ( chmark != jcol ) { /* Not reached yet */
-		   	    marker2[kchild] = jcol;
-		   	    chperm = perm_r[kchild];
-
-		   	    /* Case kchild is in L: place it in L[*,k] */
-		   	    if ( chperm == EMPTY ) {
-			    	lsub[nextl++] = kchild;
-				if ( nextl >= nzlmax ) {
-				    if ((mem_error =
-					 dLUMemXpand(jcol,nextl,LSUB,&nzlmax,Glu)))
-					return (mem_error);
-				    lsub = Glu->lsub;
-				}
-				if ( chmark != jcolm1 ) jsuper = EMPTY;
-			    } else {
-		    	    	/* Case kchild is in U: 
-				 *   chrep = its supernode-rep. If its rep has 
-			         *   been explored, update its repfnz[*]
-			         */
-		   	    	chrep = xsup[supno[chperm]+1] - 1;
-		   		myfnz = repfnz[chrep];
-		   		if ( myfnz != EMPTY ) { /* Visited before */
-				    if ( myfnz > chperm )
-     				  	repfnz[chrep] = chperm;
-				} else {
-		        	    /* Continue dfs at super-rep of kchild */
-		   		    xplore[krep] = xdfs;	
-		   		    oldrep = krep;
-		   		    krep = chrep; /* Go deeper down G(L^t) */
-				    parent[krep] = oldrep;
-		    		    repfnz[krep] = chperm;
-		   		    xdfs = xlsub[krep];     
-				    maxdfs = xprune[krep];
-				} /* else */
-
-			   } /* else */
-
-			} /* if */
-
-		    } /* while */
-
-		    /* krow has no more unexplored nbrs;
-	   	     *    place supernode-rep krep in postorder DFS.
-	   	     *    backtrack dfs to its parent
-		     */
-		    segrep[*nseg] = krep;
-		    ++(*nseg);
-		    kpar = parent[krep]; /* Pop from stack, mimic recursion */
-		    if ( kpar == EMPTY ) break; /* dfs done */
-		    krep = kpar;
-		    xdfs = xplore[krep];
-		    maxdfs = xprune[krep];
-
-		} while ( kpar != EMPTY ); 	/* Until empty stack */
-
-	    } /* else */
-
-	} /* else */
-
-    } /* for each nonzero ... */
-
-    /* Check to see if j belongs in the same supernode as j-1 */
-    if ( jcol == 0 ) { /* Do nothing for column 0 */
-	nsuper = supno[0] = 0;
-    } else {
-   	fsupc = xsup[nsuper];
-	jptr = xlsub[jcol];	/* Not compressed yet */
-	jm1ptr = xlsub[jcolm1];
-
-#ifdef T2_SUPER
-	if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = EMPTY;
-#endif
-	/* Make sure the number of columns in a supernode doesn't
-	   exceed threshold. */
-	if ( jcol - fsupc >= maxsuper ) jsuper = EMPTY;
-
-	/* If jcol starts a new supernode, reclaim storage space in
-	 * lsub from the previous supernode. Note we only store
-	 * the subscript set of the first and last columns of
-   	 * a supernode. (first for num values, last for pruning)
-	 */
-	if ( jsuper == EMPTY ) {	/* starts a new supernode */
-	    if ( (fsupc < jcolm1-1) ) {	/* >= 3 columns in nsuper */
-#ifdef CHK_COMPRESS
-		printf("  Compress lsub[] at super %d-%d\n", fsupc, jcolm1);
-#endif
-	        ito = xlsub[fsupc+1];
-		xlsub[jcolm1] = ito;
-		istop = ito + jptr - jm1ptr;
-		xprune[jcolm1] = istop; /* Initialize xprune[jcol-1] */
-		xlsub[jcol] = istop;
-		for (ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito)
-		    lsub[ito] = lsub[ifrom];
-		nextl = ito;            /* = istop + length(jcol) */
-	    }
-	    nsuper++;
-	    supno[jcol] = nsuper;
-	} /* if a new supernode */
-
-    }	/* else: jcol > 0 */ 
-    
-    /* Tidy up the pointers before exit */
-    xsup[nsuper+1] = jcolp1;
-    supno[jcolp1]  = nsuper;
-    xprune[jcol]   = nextl;	/* Initialize upper bound for pruning */
-    xlsub[jcolp1]  = nextl;
-
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dcomplex.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dcomplex.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dcomplex.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dcomplex.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,106 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * This file defines common arithmetic operations for complex type.
- */
-#include <math.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include "slu_dcomplex.h"
-
-
-/* Complex Division c = a/b */
-void z_div(doublecomplex *c, doublecomplex *a, doublecomplex *b)
-{
-    double ratio, den;
-    double abr, abi, cr, ci;
-  
-    if( (abr = b->r) < 0.)
-	abr = - abr;
-    if( (abi = b->i) < 0.)
-	abi = - abi;
-    if( abr <= abi ) {
-	if (abi == 0) {
-	    fprintf(stderr, "z_div.c: division by zero\n");
-            exit(-1);
-	}	  
-	ratio = b->r / b->i ;
-	den = b->i * (1 + ratio*ratio);
-	cr = (a->r*ratio + a->i) / den;
-	ci = (a->i*ratio - a->r) / den;
-    } else {
-	ratio = b->i / b->r ;
-	den = b->r * (1 + ratio*ratio);
-	cr = (a->r + a->i*ratio) / den;
-	ci = (a->i - a->r*ratio) / den;
-    }
-    c->r = cr;
-    c->i = ci;
-}
-
-
-/* Returns sqrt(z.r^2 + z.i^2) */
-double z_abs(doublecomplex *z)
-{
-    double temp;
-    double real = z->r;
-    double imag = z->i;
-
-    if (real < 0) real = -real;
-    if (imag < 0) imag = -imag;
-    if (imag > real) {
-	temp = real;
-	real = imag;
-	imag = temp;
-    }
-    if ((real+imag) == real) return(real);
-  
-    temp = imag/real;
-    temp = real*sqrt(1.0 + temp*temp);  /*overflow!!*/
-    return (temp);
-}
-
-
-/* Approximates the abs */
-/* Returns abs(z.r) + abs(z.i) */
-double z_abs1(doublecomplex *z)
-{
-    double real = z->r;
-    double imag = z->i;
-  
-    if (real < 0) real = -real;
-    if (imag < 0) imag = -imag;
-
-    return (real + imag);
-}
-
-/* Return the exponentiation */
-void z_exp(doublecomplex *r, doublecomplex *z)
-{
-    double expx;
-
-    expx = exp(z->r);
-    r->r = expx * cos(z->i);
-    r->i = expx * sin(z->i);
-}
-
-/* Return the complex conjugate */
-void d_cnjg(doublecomplex *r, doublecomplex *z)
-{
-    r->r = z->r;
-    r->i = -z->i;
-}
-
-/* Return the imaginary part */
-double d_imag(doublecomplex *z)
-{
-    return (z->i);
-}
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dcopy_to_ucol.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dcopy_to_ucol.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dcopy_to_ucol.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dcopy_to_ucol.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,107 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-/*
-  This file has been modified to take out compile warning and
-  to be compatible with the HYPRE linear solver 
-*/
-
-#include "slu_ddefs.h"
-
-int
-dcopy_to_ucol(
-	      int        jcol,	  /* in */
-	      int        nseg,	  /* in */
-	      int        *segrep,  /* in */
-	      int        *repfnz,  /* in */
-	      int        *perm_r,  /* in */
-	      double     *dense,   /* modified - reset to zero on return */
-	      GlobalLU_t *Glu      /* modified */
-	      )
-{
-/* 
- * Gather from SPA dense[*] to global ucol[*].
- */
-    int ksub, krep, ksupno;
-    int i, k, kfnz, segsze;
-    int fsupc, isub, irow;
-    int jsupno, nextu;
-    int new_next, mem_error;
-    int       *xsup, *supno;
-    int       *lsub, *xlsub;
-    double    *ucol;
-    int       *usub, *xusub;
-    int       nzumax;
-
-    double zero = 0.0;
-
-    xsup    = Glu->xsup;
-    supno   = Glu->supno;
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    ucol    = Glu->ucol;
-    usub    = Glu->usub;
-    xusub   = Glu->xusub;
-    nzumax  = Glu->nzumax;
-    
-    jsupno = supno[jcol];
-    nextu  = xusub[jcol];
-    k = nseg - 1;
-    for (ksub = 0; ksub < nseg; ksub++) {
-	krep = segrep[k--];
-	ksupno = supno[krep];
-
-	if ( ksupno != jsupno ) { /* Should go into ucol[] */
-	    kfnz = repfnz[krep];
-	    if ( kfnz != EMPTY ) {	/* Nonzero U-segment */
-
-	    	fsupc = xsup[ksupno];
-	        isub = xlsub[fsupc] + kfnz - fsupc;
-	        segsze = krep - kfnz + 1;
-
-		new_next = nextu + segsze;
-		while ( new_next > nzumax ) {
-		    if ((mem_error = dLUMemXpand(jcol, nextu, UCOL, &nzumax, Glu)))
-			return (mem_error);
-		    ucol = Glu->ucol;
-		    if ((mem_error = dLUMemXpand(jcol, nextu, USUB, &nzumax, Glu)))
-			return (mem_error);
-		    usub = Glu->usub;
-		    lsub = Glu->lsub;
-		}
-		
-		for (i = 0; i < segsze; i++) {
-		    irow = lsub[isub];
-		    usub[nextu] = perm_r[irow];
-		    ucol[nextu] = dense[irow];
-		    dense[irow] = zero;
-		    nextu++;
-		    isub++;
-		} 
-
-	    }
-
-	}
-
-    } /* for each segment... */
-
-    xusub[jcol + 1] = nextu;      /* Close U[*,jcol] */
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dGetDiagU.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dGetDiagU.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dGetDiagU.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dGetDiagU.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,59 +0,0 @@
-/* 
- * -- Auxiliary routine in SuperLU (version 2.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * Xiaoye S. Li
- * September 11, 2003
- *
- */
-
-#include "dsp_defs.h"
-
-
-void dGetDiagU(SuperMatrix *L, double *diagU)
-{
-  /*
-   * Purpose
-   * =======
-   *
-   * GetDiagU extracts the main diagonal of matrix U of the LU factorization.
-   *  
-   * Arguments
-   * =========
-   *
-   * L      (input) SuperMatrix*
-   *        The factor L from the factorization Pr*A*Pc=L*U as computed by
-   *        dgstrf(). Use compressed row subscripts storage for supernodes,
-   *        i.e., L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU.
-   *
-   * diagU  (output) double*, dimension (n)
-   *        The main diagonal of matrix U.
-   *
-   * Note
-   * ====
-   * The diagonal blocks of the L and U matrices are stored in the L
-   * data structures.
-   *
-   */
-    int_t i, k, nsupers;
-    int_t fsupc, nsupr, nsupc, luptr;
-    double *dblock, *Lval;
-    SCformat *Lstore;
-
-    Lstore = L->Store;
-    Lval = Lstore->nzval;
-    nsupers = Lstore->nsuper + 1;
-
-    for (k = 0; k < nsupers; ++k) {
-      fsupc = L_FST_SUPC(k);
-      nsupc = L_FST_SUPC(k+1) - fsupc;
-      nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
-      luptr = L_NZ_START(fsupc);
-
-      dblock = &diagU[fsupc];
-      for (i = 0; i < nsupc; ++i) {
-	dblock[i] = Lval[luptr];
-	luptr += nsupr + 1;
-      }
-    }
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgscon.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgscon.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgscon.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgscon.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,151 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
- * File name:	dgscon.c
- * History:     Modified from lapack routines DGECON.
- */
-/*
-  This file has been modified to be compatible with the HYPRE
-  linear solver
-*/
-
-#include <math.h>
-#include "slu_ddefs.h"
-
-void
-dgscon(char *norm, SuperMatrix *L, SuperMatrix *U,
-       double anorm, double *rcond, SuperLUStat_t *stat, int *info)
-{
-/*
-    Purpose   
-    =======   
-
-    DGSCON estimates the reciprocal of the condition number of a general 
-    real matrix A, in either the 1-norm or the infinity-norm, using   
-    the LU factorization computed by DGETRF.   
-
-    An estimate is obtained for norm(inv(A)), and the reciprocal of the   
-    condition number is computed as   
-       RCOND = 1 / ( norm(A) * norm(inv(A)) ).   
-
-    See supermatrix.h for the definition of 'SuperMatrix' structure.
- 
-    Arguments   
-    =========   
-
-    NORM    (input) char*
-            Specifies whether the 1-norm condition number or the   
-            infinity-norm condition number is required:   
-            = '1' or 'O':  1-norm;   
-            = 'I':         Infinity-norm.
-	    
-    L       (input) SuperMatrix*
-            The factor L from the factorization Pr*A*Pc=L*U as computed by
-            dgstrf(). Use compressed row subscripts storage for supernodes,
-            i.e., L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU.
- 
-    U       (input) SuperMatrix*
-            The factor U from the factorization Pr*A*Pc=L*U as computed by
-            dgstrf(). Use column-wise storage scheme, i.e., U has types:
-            Stype = SLU_NC, Dtype = SLU_D, Mtype = TRU.
-	    
-    ANORM   (input) double
-            If NORM = '1' or 'O', the 1-norm of the original matrix A.   
-            If NORM = 'I', the infinity-norm of the original matrix A.
-	    
-    RCOND   (output) double*
-            The reciprocal of the condition number of the matrix A,   
-            computed as RCOND = 1/(norm(A) * norm(inv(A))).
-	    
-    INFO    (output) int*
-            = 0:  successful exit   
-            < 0:  if INFO = -i, the i-th argument had an illegal value   
-
-    ===================================================================== 
-*/
-
-    /* Local variables */
-    int    kase, kase1, onenrm, i;
-    double ainvnm;
-    double *work;
-    int    *iwork;
-    extern int drscl_(int *, double *, double *, int *);
-
-    extern int dlacon_(int *, double *, double *, int *, double *, int *);
-
-    
-    /* Test the input parameters. */
-    *info = 0;
-    onenrm = *(unsigned char *)norm == '1' || superlu_lsame(norm, "O");
-    if (! onenrm && ! superlu_lsame(norm, "I")) *info = -1;
-    else if (L->nrow < 0 || L->nrow != L->ncol ||
-             L->Stype != SLU_SC || L->Dtype != SLU_D || L->Mtype != SLU_TRLU)
-	 *info = -2;
-    else if (U->nrow < 0 || U->nrow != U->ncol ||
-             U->Stype != SLU_NC || U->Dtype != SLU_D || U->Mtype != SLU_TRU) 
-	*info = -3;
-    if (*info != 0) {
-	i = -(*info);
-	superlu_xerbla("dgscon", &i);
-	return;
-    }
-
-    /* Quick return if possible */
-    *rcond = 0.;
-    if ( L->nrow == 0 || U->nrow == 0) {
-	*rcond = 1.;
-	return;
-    }
-
-    work = doubleCalloc( 3*L->nrow );
-    iwork = intMalloc( L->nrow );
-
-
-    if ( !work || !iwork )
-	ABORT("Malloc fails for work arrays in dgscon.");
-    
-    /* Estimate the norm of inv(A). */
-    ainvnm = 0.;
-    if ( onenrm ) kase1 = 1;
-    else kase1 = 2;
-    kase = 0;
-
-    do {
-	dlacon_(&L->nrow, &work[L->nrow], &work[0], &iwork[0], &ainvnm, &kase);
-
-	if (kase == 0) break;
-
-	if (kase == kase1) {
-	    /* Multiply by inv(L). */
-	    sp_dtrsv("L", "No trans", "Unit", L, U, &work[0], stat, info);
-
-	    /* Multiply by inv(U). */
-	    sp_dtrsv("U", "No trans", "Non-unit", L, U, &work[0], stat, info);
-	    
-	} else {
-
-	    /* Multiply by inv(U'). */
-	    sp_dtrsv("U", "Transpose", "Non-unit", L, U, &work[0], stat, info);
-
-	    /* Multiply by inv(L'). */
-	    sp_dtrsv("L", "Transpose", "Unit", L, U, &work[0], stat, info);
-	    
-	}
-
-    } while ( kase != 0 );
-
-    /* Compute the estimate of the reciprocal condition number. */
-    if (ainvnm != 0.) *rcond = (1. / ainvnm) / anorm;
-
-    SUPERLU_FREE (work);
-    SUPERLU_FREE (iwork);
-    return;
-
-} /* dgscon */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgsequ.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgsequ.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgsequ.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgsequ.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,193 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * File name:	dgsequ.c
- * History:     Modified from LAPACK routine DGEEQU
- */
-/*
-  This file has been modified to be compatible with the HYPRE
-  linear solver
-*/
-
-
-#include <math.h>
-#include "slu_ddefs.h"
-#ifdef HYPRE_USING_HYPRE_LAPACK
-#include "hypre_lapack.h"
-#endif
-
-void
-dgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd,
-	double *colcnd, double *amax, int *info)
-{
-/*    
-    Purpose   
-    =======   
-
-    DGSEQU computes row and column scalings intended to equilibrate an   
-    M-by-N sparse matrix A and reduce its condition number. R returns the row
-    scale factors and C the column scale factors, chosen to try to make   
-    the largest element in each row and column of the matrix B with   
-    elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.   
-
-    R(i) and C(j) are restricted to be between SMLNUM = smallest safe   
-    number and BIGNUM = largest safe number.  Use of these scaling   
-    factors is not guaranteed to reduce the condition number of A but   
-    works well in practice.   
-
-    See supermatrix.h for the definition of 'SuperMatrix' structure.
- 
-    Arguments   
-    =========   
-
-    A       (input) SuperMatrix*
-            The matrix of dimension (A->nrow, A->ncol) whose equilibration
-            factors are to be computed. The type of A can be:
-            Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE.
-	    
-    R       (output) double*, size A->nrow
-            If INFO = 0 or INFO > M, R contains the row scale factors   
-            for A.
-	    
-    C       (output) double*, size A->ncol
-            If INFO = 0,  C contains the column scale factors for A.
-	    
-    ROWCND  (output) double*
-            If INFO = 0 or INFO > M, ROWCND contains the ratio of the   
-            smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and   
-            AMAX is neither too large nor too small, it is not worth   
-            scaling by R.
-	    
-    COLCND  (output) double*
-            If INFO = 0, COLCND contains the ratio of the smallest   
-            C(i) to the largest C(i).  If COLCND >= 0.1, it is not   
-            worth scaling by C.
-	    
-    AMAX    (output) double*
-            Absolute value of largest matrix element.  If AMAX is very   
-            close to overflow or very close to underflow, the matrix   
-            should be scaled.
-	    
-    INFO    (output) int*
-            = 0:  successful exit   
-            < 0:  if INFO = -i, the i-th argument had an illegal value   
-            > 0:  if INFO = i,  and i is   
-                  <= A->nrow:  the i-th row of A is exactly zero   
-                  >  A->ncol:  the (i-M)-th column of A is exactly zero   
-
-    ===================================================================== 
-*/
-
-    /* Local variables */
-    NCformat *Astore;
-    double   *Aval;
-    int i, j, irow;
-    double rcmin, rcmax;
-    double bignum, smlnum;
-    extern double hypre_F90_NAME_LAPACK(dlamch,DLAMCH)(const char *);
-    
-    /* Test the input parameters. */
-    *info = 0;
-    if ( A->nrow < 0 || A->ncol < 0 ||
-	 A->Stype != SLU_NC || A->Dtype != SLU_D || A->Mtype != SLU_GE )
-	*info = -1;
-    if (*info != 0) {
-	i = -(*info);
-	superlu_xerbla("dgsequ", &i);
-	return;
-    }
-
-    /* Quick return if possible */
-    if ( A->nrow == 0 || A->ncol == 0 ) {
-	*rowcnd = 1.;
-	*colcnd = 1.;
-	*amax = 0.;
-	return;
-    }
-
-    Astore = (NCformat*) A->Store;
-    Aval = (double*) Astore->nzval;
-    
-    /* Get machine constants. */
-    smlnum = hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("S");
-    bignum = 1. / smlnum;
-
-    /* Compute row scale factors. */
-    for (i = 0; i < A->nrow; ++i) r[i] = 0.;
-
-    /* Find the maximum element in each row. */
-    for (j = 0; j < A->ncol; ++j)
-	for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-	    irow = Astore->rowind[i];
-	    r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[i]) );
-	}
-
-    /* Find the maximum and minimum scale factors. */
-    rcmin = bignum;
-    rcmax = 0.;
-    for (i = 0; i < A->nrow; ++i) {
-	rcmax = SUPERLU_MAX(rcmax, r[i]);
-	rcmin = SUPERLU_MIN(rcmin, r[i]);
-    }
-    *amax = rcmax;
-
-    if (rcmin == 0.) {
-	/* Find the first zero scale factor and return an error code. */
-	for (i = 0; i < A->nrow; ++i)
-	    if (r[i] == 0.) {
-		*info = i + 1;
-		return;
-	    }
-    } else {
-	/* Invert the scale factors. */
-	for (i = 0; i < A->nrow; ++i)
-	    r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum );
-	/* Compute ROWCND = min(R(I)) / max(R(I)) */
-	*rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
-    }
-
-    /* Compute column scale factors */
-    for (j = 0; j < A->ncol; ++j) c[j] = 0.;
-
-    /* Find the maximum element in each column, assuming the row
-       scalings computed above. */
-    for (j = 0; j < A->ncol; ++j)
-	for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-	    irow = Astore->rowind[i];
-	    c[j] = SUPERLU_MAX( c[j], fabs(Aval[i]) * r[irow] );
-	}
-
-    /* Find the maximum and minimum scale factors. */
-    rcmin = bignum;
-    rcmax = 0.;
-    for (j = 0; j < A->ncol; ++j) {
-	rcmax = SUPERLU_MAX(rcmax, c[j]);
-	rcmin = SUPERLU_MIN(rcmin, c[j]);
-    }
-
-    if (rcmin == 0.) {
-	/* Find the first zero scale factor and return an error code. */
-	for (j = 0; j < A->ncol; ++j)
-	    if ( c[j] == 0. ) {
-		*info = A->nrow + j + 1;
-		return;
-	    }
-    } else {
-	/* Invert the scale factors. */
-	for (j = 0; j < A->ncol; ++j)
-	    c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum);
-	/* Compute COLCND = min(C(J)) / max(C(J)) */
-	*colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
-    }
-
-    return;
-
-} /* dgsequ */
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgsrfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgsrfs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgsrfs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgsrfs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,447 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
- * File name:	dgsrfs.c
- * History:     Modified from lapack routine DGERFS
- */
-/*
-  This file has been modified to be compatible with the HYPRE
-  linear solver
-*/
-
-#include <math.h>
-#include "slu_ddefs.h"
-#ifdef HYPRE_USING_HYPRE_BLAS
-#include "hypre_blas.h"
-#endif
-#ifdef HYPRE_USING_HYPRE_LAPACK
-#include "hypre_lapack.h"
-#endif
-void
-dgsrfs(trans_t trans, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U,
-       int *perm_c, int *perm_r, char *equed, double *R, double *C,
-       SuperMatrix *B, SuperMatrix *X, double *ferr, double *berr,
-       SuperLUStat_t *stat, int *info)
-{
-/*
- *   Purpose   
- *   =======   
- *
- *   DGSRFS improves the computed solution to a system of linear   
- *   equations and provides error bounds and backward error estimates for 
- *   the solution.   
- *
- *   If equilibration was performed, the system becomes:
- *           (diag(R)*A_original*diag(C)) * X = diag(R)*B_original.
- *
- *   See supermatrix.h for the definition of 'SuperMatrix' structure.
- *
- *   Arguments   
- *   =========   
- *
- * trans   (input) trans_t
- *          Specifies the form of the system of equations:
- *          = NOTRANS: A * X = B  (No transpose)
- *          = TRANS:   A'* X = B  (Transpose)
- *          = CONJ:    A**H * X = B  (Conjugate transpose)
- *   
- *   A       (input) SuperMatrix*
- *           The original matrix A in the system, or the scaled A if
- *           equilibration was done. The type of A can be:
- *           Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_GE.
- *    
- *   L       (input) SuperMatrix*
- *	     The factor L from the factorization Pr*A*Pc=L*U. Use
- *           compressed row subscripts storage for supernodes, 
- *           i.e., L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU.
- * 
- *   U       (input) SuperMatrix*
- *           The factor U from the factorization Pr*A*Pc=L*U as computed by
- *           dgstrf(). Use column-wise storage scheme, 
- *           i.e., U has types: Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU.
- *
- *   perm_c  (input) int*, dimension (A->ncol)
- *	     Column permutation vector, which defines the 
- *           permutation matrix Pc; perm_c[i] = j means column i of A is 
- *           in position j in A*Pc.
- *
- *   perm_r  (input) int*, dimension (A->nrow)
- *           Row permutation vector, which defines the permutation matrix Pr;
- *           perm_r[i] = j means row i of A is in position j in Pr*A.
- *
- *   equed   (input) Specifies the form of equilibration that was done.
- *           = 'N': No equilibration.
- *           = 'R': Row equilibration, i.e., A was premultiplied by diag(R).
- *           = 'C': Column equilibration, i.e., A was postmultiplied by
- *                  diag(C).
- *           = 'B': Both row and column equilibration, i.e., A was replaced 
- *                  by diag(R)*A*diag(C).
- *
- *   R       (input) double*, dimension (A->nrow)
- *           The row scale factors for A.
- *           If equed = 'R' or 'B', A is premultiplied by diag(R).
- *           If equed = 'N' or 'C', R is not accessed.
- * 
- *   C       (input) double*, dimension (A->ncol)
- *           The column scale factors for A.
- *           If equed = 'C' or 'B', A is postmultiplied by diag(C).
- *           If equed = 'N' or 'R', C is not accessed.
- *
- *   B       (input) SuperMatrix*
- *           B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE.
- *           The right hand side matrix B.
- *           if equed = 'R' or 'B', B is premultiplied by diag(R).
- *
- *   X       (input/output) SuperMatrix*
- *           X has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE.
- *           On entry, the solution matrix X, as computed by dgstrs().
- *           On exit, the improved solution matrix X.
- *           if *equed = 'C' or 'B', X should be premultiplied by diag(C)
- *               in order to obtain the solution to the original system.
- *
- *   FERR    (output) double*, dimension (B->ncol)   
- *           The estimated forward error bound for each solution vector   
- *           X(j) (the j-th column of the solution matrix X).   
- *           If XTRUE is the true solution corresponding to X(j), FERR(j) 
- *           is an estimated upper bound for the magnitude of the largest 
- *           element in (X(j) - XTRUE) divided by the magnitude of the   
- *           largest element in X(j).  The estimate is as reliable as   
- *           the estimate for RCOND, and is almost always a slight   
- *           overestimate of the true error.
- *
- *   BERR    (output) double*, dimension (B->ncol)   
- *           The componentwise relative backward error of each solution   
- *           vector X(j) (i.e., the smallest relative change in   
- *           any element of A or B that makes X(j) an exact solution).
- *
- *   stat     (output) SuperLUStat_t*
- *            Record the statistics on runtime and floating-point operation count.
- *            See util.h for the definition of 'SuperLUStat_t'.
- *
- *   info    (output) int*   
- *           = 0:  successful exit   
- *            < 0:  if INFO = -i, the i-th argument had an illegal value   
- *
- *    Internal Parameters   
- *    ===================   
- *
- *    ITMAX is the maximum number of steps of iterative refinement.   
- *
- */  
-
-#define ITMAX 5
-    
-    /* Table of constant values */
-    int    ione = 1;
-    double ndone = -1.;
-    double done = 1.;
-    
-    /* Local variables */
-    NCformat *Astore;
-    double   *Aval;
-    SuperMatrix Bjcol;
-    DNformat *Bstore, *Xstore, *Bjcol_store;
-    double   *Bmat, *Xmat, *Bptr, *Xptr;
-    int      kase;
-    double   safe1, safe2;
-    int      i, j, k, irow, nz, count, notran, rowequ, colequ;
-    int      ldb, ldx, nrhs;
-    double   s, xk, lstres, eps, safmin;
-    char     transc[1];
-    trans_t  transt;
-    double   *work;
-    double   *rwork;
-    int      *iwork;
-    extern double hypre_F90_NAME_LAPACK(dlamch,DLAMCH)(const char *);
-    extern int dlacon_(int *, double *, double *, int *, double *, int *);
-#ifdef _CRAY
-    extern int SCOPY(int *, double *, int *, double *, int *);
-    extern int SSAXPY(int *, double *, double *, int *, double *, int *);
-#else
-    extern int hypre_F90_NAME_BLAS(dcopy,DCOPY)(int *,double *,int *,double *,int *);
-    extern int hypre_F90_NAME_BLAS(daxpy,DAXPY)(int *,double *,double *,int *,double *,int *);
-#endif
-
-    Astore = (NCformat*) A->Store;
-    Aval   = (  double*) Astore->nzval;
-    Bstore = (DNformat*) B->Store;
-    Xstore = (DNformat*) X->Store;
-    Bmat   = (  double*) Bstore->nzval;
-    Xmat   = (  double*) Xstore->nzval;
-    ldb    = Bstore->lda;
-    ldx    = Xstore->lda;
-    nrhs   = B->ncol;
-    
-    /* Test the input parameters */
-    *info = 0;
-    notran = (trans == NOTRANS);
-    if ( !notran && trans != TRANS && trans != CONJ ) *info = -1;
-    else if ( A->nrow != A->ncol || A->nrow < 0 ||
-	      A->Stype != SLU_NC || A->Dtype != SLU_D || A->Mtype != SLU_GE )
-	*info = -2;
-    else if ( L->nrow != L->ncol || L->nrow < 0 ||
- 	      L->Stype != SLU_SC || L->Dtype != SLU_D || L->Mtype != SLU_TRLU )
-	*info = -3;
-    else if ( U->nrow != U->ncol || U->nrow < 0 ||
- 	      U->Stype != SLU_NC || U->Dtype != SLU_D || U->Mtype != SLU_TRU )
-	*info = -4;
-    else if ( ldb < SUPERLU_MAX(0, A->nrow) ||
- 	      B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE )
-        *info = -10;
-    else if ( ldx < SUPERLU_MAX(0, A->nrow) ||
- 	      X->Stype != SLU_DN || X->Dtype != SLU_D || X->Mtype != SLU_GE )
-	*info = -11;
-    if (*info != 0) {
-	i = -(*info);
-	superlu_xerbla("dgsrfs", &i);
-	return;
-    }
-
-    /* Quick return if possible */
-    if ( A->nrow == 0 || nrhs == 0) {
-	for (j = 0; j < nrhs; ++j) {
-	    ferr[j] = 0.;
-	    berr[j] = 0.;
-	}
-	return;
-    }
-
-    rowequ = superlu_lsame(equed, "R") || superlu_lsame(equed, "B");
-    colequ = superlu_lsame(equed, "C") || superlu_lsame(equed, "B");
-    
-    /* Allocate working space */
-    work = doubleMalloc(2*A->nrow);
-    rwork = (double *) SUPERLU_MALLOC( A->nrow * sizeof(double) );
-    iwork = intMalloc(2*A->nrow);
-    if ( !work || !rwork || !iwork ) 
-        ABORT("Malloc fails for work/rwork/iwork.");
-    
-    if ( notran ) {
-	*(unsigned char *)transc = 'N';
-        transt = TRANS;
-    } else {
-	*(unsigned char *)transc = 'T';
-	transt = NOTRANS;
-    }
-
-    /* NZ = maximum number of nonzero elements in each row of A, plus 1 */
-    nz     = A->ncol + 1;
-    eps    = hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("Epsilon");
-    safmin = hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("Safe minimum");
-    safe1  = nz * safmin;
-    safe2  = safe1 / eps;
-
-    /* Compute the number of nonzeros in each row (or column) of A */
-    for (i = 0; i < A->nrow; ++i) iwork[i] = 0;
-    if ( notran ) {
-	for (k = 0; k < A->ncol; ++k)
-	    for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) 
-		++iwork[Astore->rowind[i]];
-    } else {
-	for (k = 0; k < A->ncol; ++k)
-	    iwork[k] = Astore->colptr[k+1] - Astore->colptr[k];
-    }	
-
-    /* Copy one column of RHS B into Bjcol. */
-    Bjcol.Stype = B->Stype;
-    Bjcol.Dtype = B->Dtype;
-    Bjcol.Mtype = B->Mtype;
-    Bjcol.nrow  = B->nrow;
-    Bjcol.ncol  = 1;
-    Bjcol.Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) );
-    if ( !Bjcol.Store ) ABORT("SUPERLU_MALLOC fails for Bjcol.Store");
-    Bjcol_store = (DNformat*) Bjcol.Store;
-    Bjcol_store->lda = ldb;
-    Bjcol_store->nzval = work; /* address aliasing */
-	
-    /* Do for each right hand side ... */
-    for (j = 0; j < nrhs; ++j) {
-	count = 0;
-	lstres = 3.;
-	Bptr = &Bmat[j*ldb];
-	Xptr = &Xmat[j*ldx];
-
-	while (1) { /* Loop until stopping criterion is satisfied. */
-
-	    /* Compute residual R = B - op(A) * X,   
-	       where op(A) = A, A**T, or A**H, depending on TRANS. */
-	    
-#ifdef _CRAY
-	    SCOPY(&A->nrow, Bptr, &ione, work, &ione);
-#else
-            hypre_F90_NAME_BLAS(dcopy,DCOPY)(&A->nrow,Bptr,&ione,work,&ione);
-#endif
-	    sp_dgemv(transc, ndone, A, Xptr, ione, done, work, ione);
-
-	    /* Compute componentwise relative backward error from formula 
-	       max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) )   
-	       where abs(Z) is the componentwise absolute value of the matrix
-	       or vector Z.  If the i-th component of the denominator is less
-	       than SAFE2, then SAFE1 is added to the i-th component of the   
-	       numerator and denominator before dividing. */
-
-	    for (i = 0; i < A->nrow; ++i) rwork[i] = fabs( Bptr[i] );
-	    
-	    /* Compute abs(op(A))*abs(X) + abs(B). */
-	    if (notran) {
-		for (k = 0; k < A->ncol; ++k) {
-		    xk = fabs( Xptr[k] );
-		    for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i)
-			rwork[Astore->rowind[i]] += fabs(Aval[i]) * xk;
-		}
-	    } else {
-		for (k = 0; k < A->ncol; ++k) {
-		    s = 0.;
-		    for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) {
-			irow = Astore->rowind[i];
-			s += fabs(Aval[i]) * fabs(Xptr[irow]);
-		    }
-		    rwork[k] += s;
-		}
-	    }
-	    s = 0.;
-	    for (i = 0; i < A->nrow; ++i) {
-		if (rwork[i] > safe2)
-		    s = SUPERLU_MAX( s, fabs(work[i]) / rwork[i] );
-		else
-		    s = SUPERLU_MAX( s, (fabs(work[i]) + safe1) / 
-				(rwork[i] + safe1) );
-	    }
-	    berr[j] = s;
-
-	    /* Test stopping criterion. Continue iterating if   
-	       1) The residual BERR(J) is larger than machine epsilon, and   
-	       2) BERR(J) decreased by at least a factor of 2 during the   
-	          last iteration, and   
-	       3) At most ITMAX iterations tried. */
-
-	    if (berr[j] > eps && berr[j] * 2. <= lstres && count < ITMAX) {
-		/* Update solution and try again. */
-		dgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info);
-		
-#ifdef _CRAY
-		SAXPY(&A->nrow, &done, work, &ione,
-		       &Xmat[j*ldx], &ione);
-#else
-                hypre_F90_NAME_BLAS(daxpy,DAXPY)(&A->nrow, &done, work, &ione,
-                       &Xmat[j*ldx], &ione);
-#endif
-		lstres = berr[j];
-		++count;
-	    } else {
-		break;
-	    }
-        
-	} /* end while */
-
-	stat->RefineSteps = count;
-
-	/* Bound error from formula:
-	   norm(X - XTRUE) / norm(X) .le. FERR = norm( abs(inv(op(A)))*   
-	   ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X)   
-          where   
-            norm(Z) is the magnitude of the largest component of Z   
-            inv(op(A)) is the inverse of op(A)   
-            abs(Z) is the componentwise absolute value of the matrix or
-	       vector Z   
-            NZ is the maximum number of nonzeros in any row of A, plus 1   
-            EPS is machine epsilon   
-
-          The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B))   
-          is incremented by SAFE1 if the i-th component of   
-          abs(op(A))*abs(X) + abs(B) is less than SAFE2.   
-
-          Use DLACON to estimate the infinity-norm of the matrix   
-             inv(op(A)) * diag(W),   
-          where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */
-	
-	for (i = 0; i < A->nrow; ++i) rwork[i] = fabs( Bptr[i] );
-	
-	/* Compute abs(op(A))*abs(X) + abs(B). */
-	if ( notran ) {
-	    for (k = 0; k < A->ncol; ++k) {
-		xk = fabs( Xptr[k] );
-		for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i)
-		    rwork[Astore->rowind[i]] += fabs(Aval[i]) * xk;
-	    }
-	} else {
-	    for (k = 0; k < A->ncol; ++k) {
-		s = 0.;
-		for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) {
-		    irow = Astore->rowind[i];
-		    xk = fabs( Xptr[irow] );
-		    s += fabs(Aval[i]) * xk;
-		}
-		rwork[k] += s;
-	    }
-	}
-	
-	for (i = 0; i < A->nrow; ++i)
-	    if (rwork[i] > safe2)
-		rwork[i] = fabs(work[i]) + (iwork[i]+1)*eps*rwork[i];
-	    else
-		rwork[i] = fabs(work[i])+(iwork[i]+1)*eps*rwork[i]+safe1;
-
-	kase = 0;
-
-	do {
-	    dlacon_(&A->nrow, &work[A->nrow], work,
-		    &iwork[A->nrow], &ferr[j], &kase);
-	    if (kase == 0) break;
-
-	    if (kase == 1) {
-		/* Multiply by diag(W)*inv(op(A)**T)*(diag(C) or diag(R)). */
-		if ( notran && colequ )
-		    for (i = 0; i < A->ncol; ++i) work[i] *= C[i];
-		else if ( !notran && rowequ )
-		    for (i = 0; i < A->nrow; ++i) work[i] *= R[i];
-		
-		dgstrs (transt, L, U, perm_c, perm_r, &Bjcol, stat, info);
-		
-		for (i = 0; i < A->nrow; ++i) work[i] *= rwork[i];
-	    } else {
-		/* Multiply by (diag(C) or diag(R))*inv(op(A))*diag(W). */
-		for (i = 0; i < A->nrow; ++i) work[i] *= rwork[i];
-		
-		dgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info);
-		
-		if ( notran && colequ )
-		    for (i = 0; i < A->ncol; ++i) work[i] *= C[i];
-		else if ( !notran && rowequ )
-		    for (i = 0; i < A->ncol; ++i) work[i] *= R[i];  
-	    }
-	    
-	} while ( kase != 0 );
-
-
-	/* Normalize error. */
-	lstres = 0.;
- 	if ( notran && colequ ) {
-	    for (i = 0; i < A->nrow; ++i)
-	    	lstres = SUPERLU_MAX( lstres, C[i] * fabs( Xptr[i]) );
-  	} else if ( !notran && rowequ ) {
-	    for (i = 0; i < A->nrow; ++i)
-	    	lstres = SUPERLU_MAX( lstres, R[i] * fabs( Xptr[i]) );
-	} else {
-	    for (i = 0; i < A->nrow; ++i)
-	    	lstres = SUPERLU_MAX( lstres, fabs( Xptr[i]) );
-	}
-	if ( lstres != 0. )
-	    ferr[j] /= lstres;
-
-    } /* for each RHS j ... */
-    
-    SUPERLU_FREE(work);
-    SUPERLU_FREE(rwork);
-    SUPERLU_FREE(iwork);
-    SUPERLU_FREE(Bjcol.Store);
-
-    return;
-
-} /* dgsrfs */
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgssv.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgssv.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgssv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgssv.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,226 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  This file has been modified to be compatible with the HYPRE
-  linear solver
-*/
-
-#include "slu_ddefs.h"
-
-void
-dgssv(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r,
-      SuperMatrix *L, SuperMatrix *U, SuperMatrix *B,
-      SuperLUStat_t *stat, int *info )
-{
-/*
- * Purpose
- * =======
- *
- * DGSSV solves the system of linear equations A*X=B, using the
- * LU factorization from DGSTRF. It performs the following steps:
- *
- *   1. If A is stored column-wise (A->Stype = SLU_NC):
- *
- *      1.1. Permute the columns of A, forming A*Pc, where Pc
- *           is a permutation matrix. For more details of this step, 
- *           see sp_preorder.c.
- *
- *      1.2. Factor A as Pr*A*Pc=L*U with the permutation Pr determined
- *           by Gaussian elimination with partial pivoting.
- *           L is unit lower triangular with offdiagonal entries
- *           bounded by 1 in magnitude, and U is upper triangular.
- *
- *      1.3. Solve the system of equations A*X=B using the factored
- *           form of A.
- *
- *   2. If A is stored row-wise (A->Stype = SLU_NR), apply the
- *      above algorithm to the transpose of A:
- *
- *      2.1. Permute columns of transpose(A) (rows of A),
- *           forming transpose(A)*Pc, where Pc is a permutation matrix. 
- *           For more details of this step, see sp_preorder.c.
- *
- *      2.2. Factor A as Pr*transpose(A)*Pc=L*U with the permutation Pr
- *           determined by Gaussian elimination with partial pivoting.
- *           L is unit lower triangular with offdiagonal entries
- *           bounded by 1 in magnitude, and U is upper triangular.
- *
- *      2.3. Solve the system of equations A*X=B using the factored
- *           form of A.
- *
- *   See supermatrix.h for the definition of 'SuperMatrix' structure.
- * 
- * Arguments
- * =========
- *
- * options (input) superlu_options_t*
- *         The structure defines the input parameters to control
- *         how the LU decomposition will be performed and how the
- *         system will be solved.
- *
- * A       (input) SuperMatrix*
- *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
- *         of linear equations is A->nrow. Currently, the type of A can be:
- *         Stype = SLU_NC or SLU_NR; Dtype = SLU_D; Mtype = SLU_GE.
- *         In the future, more general A may be handled.
- *
- * perm_c  (input/output) int*
- *         If A->Stype = SLU_NC, column permutation vector of size A->ncol
- *         which defines the permutation matrix Pc; perm_c[i] = j means 
- *         column i of A is in position j in A*Pc.
- *         If A->Stype = SLU_NR, column permutation vector of size A->nrow
- *         which describes permutation of columns of transpose(A) 
- *         (rows of A) as described above.
- * 
- *         If options->ColPerm = MY_PERMC or options->Fact = SamePattern or
- *            options->Fact = SamePattern_SameRowPerm, it is an input argument.
- *            On exit, perm_c may be overwritten by the product of the input
- *            perm_c and a permutation that postorders the elimination tree
- *            of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree
- *            is already in postorder.
- *         Otherwise, it is an output argument.
- * 
- * perm_r  (input/output) int*
- *         If A->Stype = SLU_NC, row permutation vector of size A->nrow, 
- *         which defines the permutation matrix Pr, and is determined 
- *         by partial pivoting.  perm_r[i] = j means row i of A is in 
- *         position j in Pr*A.
- *         If A->Stype = SLU_NR, permutation vector of size A->ncol, which
- *         determines permutation of rows of transpose(A)
- *         (columns of A) as described above.
- *
- *         If options->RowPerm = MY_PERMR or
- *            options->Fact = SamePattern_SameRowPerm, perm_r is an
- *            input argument.
- *         otherwise it is an output argument.
- *
- * L       (output) SuperMatrix*
- *         The factor L from the factorization 
- *             Pr*A*Pc=L*U              (if A->Stype = SLU_NC) or
- *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
- *         Uses compressed row subscripts storage for supernodes, i.e.,
- *         L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU.
- *         
- * U       (output) SuperMatrix*
- *	   The factor U from the factorization 
- *             Pr*A*Pc=L*U              (if A->Stype = SLU_NC) or
- *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
- *         Uses column-wise storage scheme, i.e., U has types:
- *         Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU.
- *
- * B       (input/output) SuperMatrix*
- *         B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE.
- *         On entry, the right hand side matrix.
- *         On exit, the solution matrix if info = 0;
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics on runtime and floating-point operation count.
- *        See util.h for the definition of 'SuperLUStat_t'.
- *
- * info    (output) int*
- *	   = 0: successful exit
- *         > 0: if info = i, and i is
- *             <= A->ncol: U(i,i) is exactly zero. The factorization has
- *                been completed, but the factor U is exactly singular,
- *                so the solution could not be computed.
- *             > A->ncol: number of bytes allocated when memory allocation
- *                failure occurred, plus A->ncol.
- *   
- */
-    DNformat *Bstore;
-    SuperMatrix *AA=NULL;/* A in SLU_NC format used by the factorization routine.*/
-    SuperMatrix AC; /* Matrix postmultiplied by Pc */
-    int      lwork = 0, *etree, i;
-    
-    /* Set default values for some parameters */
-    double   drop_tol = 0.;
-    int      panel_size;     /* panel size */
-    int      relax;          /* no of columns in a relaxed snodes */
-    int      permc_spec;
-    trans_t  trans = NOTRANS;
-    double   *utime;
-    double   t;	/* Temporary time */
-
-    /* Test the input parameters ... */
-    *info = 0;
-    Bstore = (DNformat*) B->Store;
-    if ( options->Fact != DOFACT ) *info = -1;
-    else if ( A->nrow != A->ncol || A->nrow < 0 ||
-	 (A->Stype != SLU_NC && A->Stype != SLU_NR) ||
-	 A->Dtype != SLU_D || A->Mtype != SLU_GE )
-	*info = -2;
-    else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) ||
-	B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE )
-	*info = -7;
-    if ( *info != 0 ) {
-	i = -(*info);
-	superlu_xerbla("dgssv", &i);
-	return;
-    }
-
-    utime = stat->utime;
-
-    /* Convert A to SLU_NC format when necessary. */
-    if ( A->Stype == SLU_NR ) {
-		NRformat *Astore = (NRformat *) A->Store;
-	AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
-	dCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, 
-						   (double*)Astore->nzval, Astore->colind, Astore->rowptr,
-						   SLU_NC, A->Dtype, A->Mtype);
-	trans = TRANS;
-    } else {
-        if ( A->Stype == SLU_NC ) AA = A;
-    }
-
-    t = SuperLU_timer_();
-    /*
-     * Get column permutation vector perm_c[], according to permc_spec:
-     *   permc_spec = NATURAL:  natural ordering 
-     *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
-     *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
-     *   permc_spec = COLAMD:   approximate minimum degree column ordering
-     *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
-     */
-    permc_spec = options->ColPerm;
-    if ( permc_spec != MY_PERMC && options->Fact == DOFACT )
-      get_perm_c(permc_spec, AA, perm_c);
-    utime[COLPERM] = SuperLU_timer_() - t;
-
-    etree = intMalloc(A->ncol);
-
-    t = SuperLU_timer_();
-    sp_preorder(options, AA, perm_c, etree, &AC);
-    utime[ETREE] = SuperLU_timer_() - t;
-
-    panel_size = sp_ienv(1);
-    relax = sp_ienv(2);
-
-    /*printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", 
-	  relax, panel_size, sp_ienv(3), sp_ienv(4));*/
-    t = SuperLU_timer_(); 
-    /* Compute the LU factorization of A. */
-    dgstrf(options, &AC, drop_tol, relax, panel_size,
-	   etree, NULL, lwork, perm_c, perm_r, L, U, stat, info);
-    utime[FACT] = SuperLU_timer_() - t;
-
-    t = SuperLU_timer_();
-    if ( *info == 0 ) {
-        /* Solve the system A*X=B, overwriting B with X. */
-        dgstrs (trans, L, U, perm_c, perm_r, B, stat, info);
-    }
-    utime[SOLVE] = SuperLU_timer_() - t;
-
-    SUPERLU_FREE (etree);
-    Destroy_CompCol_Permuted(&AC);
-    if ( A->Stype == SLU_NR ) {
-	Destroy_SuperMatrix_Store(AA);
-	SUPERLU_FREE(AA);
-    }
-
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgssvx.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgssvx.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgssvx.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgssvx.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,622 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  This file has been modified to be compatible with the HYPRE
-  linear solver
-*/
-
-
-#include "slu_ddefs.h"
-#ifdef HYPRE_USING_HYPRE_LAPACK
-#include "hypre_lapack.h"
-#endif
-
-void
-dgssvx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r,
-       int *etree, char *equed, double *R, double *C,
-       SuperMatrix *L, SuperMatrix *U, void *work, int lwork,
-       SuperMatrix *B, SuperMatrix *X, double *recip_pivot_growth, 
-       double *rcond, double *ferr, double *berr, 
-       mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info )
-{
-/*
- * Purpose
- * =======
- *
- * DGSSVX solves the system of linear equations A*X=B or A'*X=B, using
- * the LU factorization from dgstrf(). Error bounds on the solution and
- * a condition estimate are also provided. It performs the following steps:
- *
- *   1. If A is stored column-wise (A->Stype = SLU_NC):
- *  
- *      1.1. If options->Equil = YES, scaling factors are computed to
- *           equilibrate the system:
- *           options->Trans = NOTRANS:
- *               diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B
- *           options->Trans = TRANS:
- *               (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
- *           options->Trans = CONJ:
- *               (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
- *           Whether or not the system will be equilibrated depends on the
- *           scaling of the matrix A, but if equilibration is used, A is
- *           overwritten by diag(R)*A*diag(C) and B by diag(R)*B
- *           (if options->Trans=NOTRANS) or diag(C)*B (if options->Trans
- *           = TRANS or CONJ).
- *
- *      1.2. Permute columns of A, forming A*Pc, where Pc is a permutation
- *           matrix that usually preserves sparsity.
- *           For more details of this step, see sp_preorder.c.
- *
- *      1.3. If options->Fact != FACTORED, the LU decomposition is used to
- *           factor the matrix A (after equilibration if options->Equil = YES)
- *           as Pr*A*Pc = L*U, with Pr determined by partial pivoting.
- *
- *      1.4. Compute the reciprocal pivot growth factor.
- *
- *      1.5. If some U(i,i) = 0, so that U is exactly singular, then the
- *           routine returns with info = i. Otherwise, the factored form of 
- *           A is used to estimate the condition number of the matrix A. If
- *           the reciprocal of the condition number is less than machine
- *           precision, info = A->ncol+1 is returned as a warning, but the
- *           routine still goes on to solve for X and computes error bounds
- *           as described below.
- *
- *      1.6. The system of equations is solved for X using the factored form
- *           of A.
- *
- *      1.7. If options->IterRefine != NOREFINE, iterative refinement is
- *           applied to improve the computed solution matrix and calculate
- *           error bounds and backward error estimates for it.
- *
- *      1.8. If equilibration was used, the matrix X is premultiplied by
- *           diag(C) (if options->Trans = NOTRANS) or diag(R)
- *           (if options->Trans = TRANS or CONJ) so that it solves the
- *           original system before equilibration.
- *
- *   2. If A is stored row-wise (A->Stype = SLU_NR), apply the above algorithm
- *      to the transpose of A:
- *
- *      2.1. If options->Equil = YES, scaling factors are computed to
- *           equilibrate the system:
- *           options->Trans = NOTRANS:
- *               diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B
- *           options->Trans = TRANS:
- *               (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
- *           options->Trans = CONJ:
- *               (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
- *           Whether or not the system will be equilibrated depends on the
- *           scaling of the matrix A, but if equilibration is used, A' is
- *           overwritten by diag(R)*A'*diag(C) and B by diag(R)*B 
- *           (if trans='N') or diag(C)*B (if trans = 'T' or 'C').
- *
- *      2.2. Permute columns of transpose(A) (rows of A), 
- *           forming transpose(A)*Pc, where Pc is a permutation matrix that 
- *           usually preserves sparsity.
- *           For more details of this step, see sp_preorder.c.
- *
- *      2.3. If options->Fact != FACTORED, the LU decomposition is used to
- *           factor the transpose(A) (after equilibration if 
- *           options->Fact = YES) as Pr*transpose(A)*Pc = L*U with the
- *           permutation Pr determined by partial pivoting.
- *
- *      2.4. Compute the reciprocal pivot growth factor.
- *
- *      2.5. If some U(i,i) = 0, so that U is exactly singular, then the
- *           routine returns with info = i. Otherwise, the factored form 
- *           of transpose(A) is used to estimate the condition number of the
- *           matrix A. If the reciprocal of the condition number
- *           is less than machine precision, info = A->nrow+1 is returned as
- *           a warning, but the routine still goes on to solve for X and
- *           computes error bounds as described below.
- *
- *      2.6. The system of equations is solved for X using the factored form
- *           of transpose(A).
- *
- *      2.7. If options->IterRefine != NOREFINE, iterative refinement is
- *           applied to improve the computed solution matrix and calculate
- *           error bounds and backward error estimates for it.
- *
- *      2.8. If equilibration was used, the matrix X is premultiplied by
- *           diag(C) (if options->Trans = NOTRANS) or diag(R) 
- *           (if options->Trans = TRANS or CONJ) so that it solves the
- *           original system before equilibration.
- *
- *   See supermatrix.h for the definition of 'SuperMatrix' structure.
- *
- * Arguments
- * =========
- *
- * options (input) superlu_options_t*
- *         The structure defines the input parameters to control
- *         how the LU decomposition will be performed and how the
- *         system will be solved.
- *
- * A       (input/output) SuperMatrix*
- *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
- *         of the linear equations is A->nrow. Currently, the type of A can be:
- *         Stype = SLU_NC or SLU_NR, Dtype = SLU_D, Mtype = SLU_GE.
- *         In the future, more general A may be handled.
- *
- *         On entry, If options->Fact = FACTORED and equed is not 'N', 
- *         then A must have been equilibrated by the scaling factors in
- *         R and/or C.  
- *         On exit, A is not modified if options->Equil = NO, or if 
- *         options->Equil = YES but equed = 'N' on exit.
- *         Otherwise, if options->Equil = YES and equed is not 'N',
- *         A is scaled as follows:
- *         If A->Stype = SLU_NC:
- *           equed = 'R':  A := diag(R) * A
- *           equed = 'C':  A := A * diag(C)
- *           equed = 'B':  A := diag(R) * A * diag(C).
- *         If A->Stype = SLU_NR:
- *           equed = 'R':  transpose(A) := diag(R) * transpose(A)
- *           equed = 'C':  transpose(A) := transpose(A) * diag(C)
- *           equed = 'B':  transpose(A) := diag(R) * transpose(A) * diag(C).
- *
- * perm_c  (input/output) int*
- *	   If A->Stype = SLU_NC, Column permutation vector of size A->ncol,
- *         which defines the permutation matrix Pc; perm_c[i] = j means
- *         column i of A is in position j in A*Pc.
- *         On exit, perm_c may be overwritten by the product of the input
- *         perm_c and a permutation that postorders the elimination tree
- *         of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree
- *         is already in postorder.
- *
- *         If A->Stype = SLU_NR, column permutation vector of size A->nrow,
- *         which describes permutation of columns of transpose(A) 
- *         (rows of A) as described above.
- * 
- * perm_r  (input/output) int*
- *         If A->Stype = SLU_NC, row permutation vector of size A->nrow, 
- *         which defines the permutation matrix Pr, and is determined
- *         by partial pivoting.  perm_r[i] = j means row i of A is in 
- *         position j in Pr*A.
- *
- *         If A->Stype = SLU_NR, permutation vector of size A->ncol, which
- *         determines permutation of rows of transpose(A)
- *         (columns of A) as described above.
- *
- *         If options->Fact = SamePattern_SameRowPerm, the pivoting routine
- *         will try to use the input perm_r, unless a certain threshold
- *         criterion is violated. In that case, perm_r is overwritten by a
- *         new permutation determined by partial pivoting or diagonal
- *         threshold pivoting.
- *         Otherwise, perm_r is output argument.
- * 
- * etree   (input/output) int*,  dimension (A->ncol)
- *         Elimination tree of Pc'*A'*A*Pc.
- *         If options->Fact != FACTORED and options->Fact != DOFACT,
- *         etree is an input argument, otherwise it is an output argument.
- *         Note: etree is a vector of parent pointers for a forest whose
- *         vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
- *
- * equed   (input/output) char*
- *         Specifies the form of equilibration that was done.
- *         = 'N': No equilibration.
- *         = 'R': Row equilibration, i.e., A was premultiplied by diag(R).
- *         = 'C': Column equilibration, i.e., A was postmultiplied by diag(C).
- *         = 'B': Both row and column equilibration, i.e., A was replaced 
- *                by diag(R)*A*diag(C).
- *         If options->Fact = FACTORED, equed is an input argument,
- *         otherwise it is an output argument.
- *
- * R       (input/output) double*, dimension (A->nrow)
- *         The row scale factors for A or transpose(A).
- *         If equed = 'R' or 'B', A (if A->Stype = SLU_NC) or transpose(A)
- *             (if A->Stype = SLU_NR) is multiplied on the left by diag(R).
- *         If equed = 'N' or 'C', R is not accessed.
- *         If options->Fact = FACTORED, R is an input argument,
- *             otherwise, R is output.
- *         If options->zFact = FACTORED and equed = 'R' or 'B', each element
- *             of R must be positive.
- * 
- * C       (input/output) double*, dimension (A->ncol)
- *         The column scale factors for A or transpose(A).
- *         If equed = 'C' or 'B', A (if A->Stype = SLU_NC) or transpose(A)
- *             (if A->Stype = SLU_NR) is multiplied on the right by diag(C).
- *         If equed = 'N' or 'R', C is not accessed.
- *         If options->Fact = FACTORED, C is an input argument,
- *             otherwise, C is output.
- *         If options->Fact = FACTORED and equed = 'C' or 'B', each element
- *             of C must be positive.
- *         
- * L       (output) SuperMatrix*
- *	   The factor L from the factorization
- *             Pr*A*Pc=L*U              (if A->Stype SLU_= NC) or
- *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
- *         Uses compressed row subscripts storage for supernodes, i.e.,
- *         L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU.
- *
- * U       (output) SuperMatrix*
- *	   The factor U from the factorization
- *             Pr*A*Pc=L*U              (if A->Stype = SLU_NC) or
- *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
- *         Uses column-wise storage scheme, i.e., U has types:
- *         Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU.
- *
- * work    (workspace/output) void*, size (lwork) (in bytes)
- *         User supplied workspace, should be large enough
- *         to hold data structures for factors L and U.
- *         On exit, if fact is not 'F', L and U point to this array.
- *
- * lwork   (input) int
- *         Specifies the size of work array in bytes.
- *         = 0:  allocate space internally by system malloc;
- *         > 0:  use user-supplied work array of length lwork in bytes,
- *               returns error if space runs out.
- *         = -1: the routine guesses the amount of space needed without
- *               performing the factorization, and returns it in
- *               mem_usage->total_needed; no other side effects.
- *
- *         See argument 'mem_usage' for memory usage statistics.
- *
- * B       (input/output) SuperMatrix*
- *         B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE.
- *         On entry, the right hand side matrix.
- *         If B->ncol = 0, only LU decomposition is performed, the triangular
- *                         solve is skipped.
- *         On exit,
- *            if equed = 'N', B is not modified; otherwise
- *            if A->Stype = SLU_NC:
- *               if options->Trans = NOTRANS and equed = 'R' or 'B',
- *                  B is overwritten by diag(R)*B;
- *               if options->Trans = TRANS or CONJ and equed = 'C' of 'B',
- *                  B is overwritten by diag(C)*B;
- *            if A->Stype = SLU_NR:
- *               if options->Trans = NOTRANS and equed = 'C' or 'B',
- *                  B is overwritten by diag(C)*B;
- *               if options->Trans = TRANS or CONJ and equed = 'R' of 'B',
- *                  B is overwritten by diag(R)*B.
- *
- * X       (output) SuperMatrix*
- *         X has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE. 
- *         If info = 0 or info = A->ncol+1, X contains the solution matrix
- *         to the original system of equations. Note that A and B are modified
- *         on exit if equed is not 'N', and the solution to the equilibrated
- *         system is inv(diag(C))*X if options->Trans = NOTRANS and
- *         equed = 'C' or 'B', or inv(diag(R))*X if options->Trans = 'T' or 'C'
- *         and equed = 'R' or 'B'.
- *
- * recip_pivot_growth (output) double*
- *         The reciprocal pivot growth factor max_j( norm(A_j)/norm(U_j) ).
- *         The infinity norm is used. If recip_pivot_growth is much less
- *         than 1, the stability of the LU factorization could be poor.
- *
- * rcond   (output) double*
- *         The estimate of the reciprocal condition number of the matrix A
- *         after equilibration (if done). If rcond is less than the machine
- *         precision (in particular, if rcond = 0), the matrix is singular
- *         to working precision. This condition is indicated by a return
- *         code of info > 0.
- *
- * FERR    (output) double*, dimension (B->ncol)   
- *         The estimated forward error bound for each solution vector   
- *         X(j) (the j-th column of the solution matrix X).   
- *         If XTRUE is the true solution corresponding to X(j), FERR(j) 
- *         is an estimated upper bound for the magnitude of the largest 
- *         element in (X(j) - XTRUE) divided by the magnitude of the   
- *         largest element in X(j).  The estimate is as reliable as   
- *         the estimate for RCOND, and is almost always a slight   
- *         overestimate of the true error.
- *         If options->IterRefine = NOREFINE, ferr = 1.0.
- *
- * BERR    (output) double*, dimension (B->ncol)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
- *         any element of A or B that makes X(j) an exact solution).
- *         If options->IterRefine = NOREFINE, berr = 1.0.
- *
- * mem_usage (output) mem_usage_t*
- *         Record the memory usage statistics, consisting of following fields:
- *         - for_lu (float)
- *           The amount of space used in bytes for L\U data structures.
- *         - total_needed (float)
- *           The amount of space needed in bytes to perform factorization.
- *         - expansions (int)
- *           The number of memory expansions during the LU factorization.
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics on runtime and floating-point operation count.
- *        See util.h for the definition of 'SuperLUStat_t'.
- *
- * info    (output) int*
- *         = 0: successful exit   
- *         < 0: if info = -i, the i-th argument had an illegal value   
- *         > 0: if info = i, and i is   
- *              <= A->ncol: U(i,i) is exactly zero. The factorization has   
- *                    been completed, but the factor U is exactly   
- *                    singular, so the solution and error bounds   
- *                    could not be computed.   
- *              = A->ncol+1: U is nonsingular, but RCOND is less than machine
- *                    precision, meaning that the matrix is singular to
- *                    working precision. Nevertheless, the solution and
- *                    error bounds are computed because there are a number
- *                    of situations where the computed solution can be more
- *                    accurate than the value of RCOND would suggest.   
- *              > A->ncol+1: number of bytes allocated when memory allocation
- *                    failure occurred, plus A->ncol.
- *
- */
-
-    DNformat  *Bstore, *Xstore;
-    double    *Bmat, *Xmat;
-    int       ldb, ldx, nrhs;
-    SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/
-    SuperMatrix AC; /* Matrix postmultiplied by Pc */
-    int       colequ, equil, nofact, notran, rowequ, permc_spec;
-    trans_t   trant;
-    char      norm[1];
-    int       i, j, info1;
-    double    amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin;
-    int       relax, panel_size;
-    double    drop_tol;
-    double    t0;      /* temporary time */
-    double    *utime;
-
-    /* External functions */
-    extern double dlangs(char *, SuperMatrix *);
-    extern double hypre_F90_NAME_LAPACK(dlamch,DLAMCH)(const char *);
-
-    Bstore = (DNformat*) B->Store;
-    Xstore = (DNformat*) X->Store;
-    Bmat   = (  double*) Bstore->nzval;
-    Xmat   = (  double*) Xstore->nzval;
-    ldb    = Bstore->lda;
-    ldx    = Xstore->lda;
-    nrhs   = B->ncol;
-
-    *info = 0;
-    nofact = (options->Fact != FACTORED);
-    equil = (options->Equil == YES);
-    notran = (options->Trans == NOTRANS);
-    if ( nofact ) {
-	*(unsigned char *)equed = 'N';
-	rowequ = FALSE;
-	colequ = FALSE;
-    } else {
-	rowequ = superlu_lsame(equed, "R") || superlu_lsame(equed, "B");
-	colequ = superlu_lsame(equed, "C") || superlu_lsame(equed, "B");
-        smlnum = hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("Safe minimum");
-	bignum = 1. / smlnum;
-    }
-
-#if 0
-printf("dgssvx: Fact=%4d, Trans=%4d, equed=%c\n",
-       options->Fact, options->Trans, *equed);
-#endif
-
-    /* Test the input parameters */
-    if (!nofact && options->Fact != DOFACT && options->Fact != SamePattern &&
-	options->Fact != SamePattern_SameRowPerm &&
-	!notran && options->Trans != TRANS && options->Trans != CONJ &&
-	!equil && options->Equil != NO)
-	*info = -1;
-    else if ( A->nrow != A->ncol || A->nrow < 0 ||
-	      (A->Stype != SLU_NC && A->Stype != SLU_NR) ||
-	      A->Dtype != SLU_D || A->Mtype != SLU_GE )
-	*info = -2;
-    else if (options->Fact == FACTORED &&
-	     !(rowequ || colequ || superlu_lsame(equed, "N")))
-	*info = -6;
-    else {
-	if (rowequ) {
-	    rcmin = bignum;
-	    rcmax = 0.;
-	    for (j = 0; j < A->nrow; ++j) {
-		rcmin = SUPERLU_MIN(rcmin, R[j]);
-		rcmax = SUPERLU_MAX(rcmax, R[j]);
-	    }
-	    if (rcmin <= 0.) *info = -7;
-	    else if ( A->nrow > 0)
-		rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum);
-	    else rowcnd = 1.;
-	}
-	if (colequ && *info == 0) {
-	    rcmin = bignum;
-	    rcmax = 0.;
-	    for (j = 0; j < A->nrow; ++j) {
-		rcmin = SUPERLU_MIN(rcmin, C[j]);
-		rcmax = SUPERLU_MAX(rcmax, C[j]);
-	    }
-	    if (rcmin <= 0.) *info = -8;
-	    else if (A->nrow > 0)
-		colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum);
-	    else colcnd = 1.;
-	}
-	if (*info == 0) {
-	    if ( lwork < -1 ) *info = -12;
-	    else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) ||
-		      B->Stype != SLU_DN || B->Dtype != SLU_D || 
-		      B->Mtype != SLU_GE )
-		*info = -13;
-	    else if ( X->ncol < 0 || Xstore->lda < SUPERLU_MAX(0, A->nrow) ||
-		      (B->ncol != 0 && B->ncol != X->ncol) ||
-                      X->Stype != SLU_DN ||
-		      X->Dtype != SLU_D || X->Mtype != SLU_GE )
-		*info = -14;
-	}
-    }
-    if (*info != 0) {
-	i = -(*info);
-	superlu_xerbla("dgssvx", &i);
-	return;
-    }
-    
-    /* Initialization for factor parameters */
-    panel_size = sp_ienv(1);
-    relax      = sp_ienv(2);
-    drop_tol   = 0.0;
-
-    utime = stat->utime;
-    
-    /* Convert A to SLU_NC format when necessary. */
-    if ( A->Stype == SLU_NR ) {
-	NRformat *Astore = (NRformat*) A->Store;
-	AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
-	dCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, 
-			       (double*) Astore->nzval, Astore->colind, Astore->rowptr,
-			       SLU_NC, A->Dtype, A->Mtype);
-	if ( notran ) { /* Reverse the transpose argument. */
-	    trant = TRANS;
-	    notran = 0;
-	} else {
-	    trant = NOTRANS;
-	    notran = 1;
-	}
-    } else { /* A->Stype == SLU_NC */
-	trant = options->Trans;
-	AA = A;
-    }
-
-    if ( nofact && equil ) {
-	t0 = SuperLU_timer_();
-	/* Compute row and column scalings to equilibrate the matrix A. */
-	dgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1);
-	
-	if ( info1 == 0 ) {
-	    /* Equilibrate matrix A. */
-	    dlaqgs(AA, R, C, rowcnd, colcnd, amax, equed);
-	    rowequ = superlu_lsame(equed, "R") || superlu_lsame(equed, "B");
-	    colequ = superlu_lsame(equed, "C") || superlu_lsame(equed, "B");
-	}
-	utime[EQUIL] = SuperLU_timer_() - t0;
-    }
-
-    if ( nrhs > 0 ) {
-        /* Scale the right hand side if equilibration was performed. */
-        if ( notran ) {
-	    if ( rowequ ) {
-	        for (j = 0; j < nrhs; ++j)
-		    for (i = 0; i < A->nrow; ++i) {
-		        Bmat[i + j*ldb] *= R[i];
-	            }
-	    }
-        } else if ( colequ ) {
-	    for (j = 0; j < nrhs; ++j)
-	        for (i = 0; i < A->nrow; ++i) {
-	            Bmat[i + j*ldb] *= C[i];
-	        }
-        }
-    }
-
-    if ( nofact ) {
-	
-        t0 = SuperLU_timer_();
-	/*
-	 * Gnet column permutation vector perm_c[], according to permc_spec:
-	 *   permc_spec = NATURAL:  natural ordering 
-	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
-	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
-	 *   permc_spec = COLAMD:   approximate minimum degree column ordering
-	 *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
-	 */
-	permc_spec = options->ColPerm;
-	if ( permc_spec != MY_PERMC && options->Fact == DOFACT )
-            get_perm_c(permc_spec, AA, perm_c);
-	utime[COLPERM] = SuperLU_timer_() - t0;
-
-	t0 = SuperLU_timer_();
-	sp_preorder(options, AA, perm_c, etree, &AC);
-	utime[ETREE] = SuperLU_timer_() - t0;
-    
-/*	printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", 
-	       relax, panel_size, sp_ienv(3), sp_ienv(4));
-	fflush(stdout); */
-	
-	/* Compute the LU factorization of A*Pc. */
-	t0 = SuperLU_timer_();
-	dgstrf(options, &AC, drop_tol, relax, panel_size,
-	       etree, work, lwork, perm_c, perm_r, L, U, stat, info);
-	utime[FACT] = SuperLU_timer_() - t0;
-	
-	if ( lwork == -1 ) {
-	    mem_usage->total_needed = *info - A->ncol;
-	    return;
-	}
-    }
-
-    if ( options->PivotGrowth ) {
-        if ( *info > 0 ) {
-	    if ( *info <= A->ncol ) {
-	        /* Compute the reciprocal pivot growth factor of the leading
-	           rank-deficient *info columns of A. */
-	        *recip_pivot_growth = dPivotGrowth(*info, AA, perm_c, L, U);
-	    }
-	    return;
-        }
-
-        /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */
-        *recip_pivot_growth = dPivotGrowth(A->ncol, AA, perm_c, L, U);
-    }
-
-    if ( options->ConditionNumber ) {
-        /* Estimate the reciprocal of the condition number of A. */
-        t0 = SuperLU_timer_();
-        if ( notran ) {
-	    *(unsigned char *)norm = '1';
-        } else {
-	    *(unsigned char *)norm = 'I';
-        }
-        anorm = dlangs(norm, AA);
-        dgscon(norm, L, U, anorm, rcond, stat, info);
-        utime[RCOND] = SuperLU_timer_() - t0;
-    }
-    
-    if ( nrhs > 0 ) {
-        /* Compute the solution matrix X. */
-        for (j = 0; j < nrhs; j++)  /* Save a copy of the right hand sides */
-            for (i = 0; i < B->nrow; i++)
-	        Xmat[i + j*ldx] = Bmat[i + j*ldb];
-    
-        t0 = SuperLU_timer_();
-        dgstrs (trant, L, U, perm_c, perm_r, X, stat, info);
-        utime[SOLVE] = SuperLU_timer_() - t0;
-    
-        /* Use iterative refinement to improve the computed solution and compute
-           error bounds and backward error estimates for it. */
-        t0 = SuperLU_timer_();
-        if ( options->IterRefine != NOREFINE ) {
-            dgsrfs(trant, AA, L, U, perm_c, perm_r, equed, R, C, B,
-                   X, ferr, berr, stat, info);
-        } else {
-            for (j = 0; j < nrhs; ++j) ferr[j] = berr[j] = 1.0;
-        }
-        utime[REFINE] = SuperLU_timer_() - t0;
-
-        /* Transform the solution matrix X to a solution of the original system. */
-        if ( notran ) {
-	    if ( colequ ) {
-	        for (j = 0; j < nrhs; ++j)
-		    for (i = 0; i < A->nrow; ++i) {
-                        Xmat[i + j*ldx] *= C[i];
-	            }
-	    }
-        } else if ( rowequ ) {
-	    for (j = 0; j < nrhs; ++j)
-	        for (i = 0; i < A->nrow; ++i) {
-	            Xmat[i + j*ldx] *= R[i];
-                }
-        }
-    } /* end if nrhs > 0 */
-
-    if ( options->ConditionNumber ) {
-        /* Set INFO = A->ncol+1 if the matrix is singular to working precision. */
-       if (*rcond < hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("E")) *info=A->ncol+1;
-    }
-
-    if ( nofact ) {
-        dQuerySpace(L, U, mem_usage);
-        Destroy_CompCol_Permuted(&AC);
-    }
-    if ( A->Stype == SLU_NR ) {
-	Destroy_SuperMatrix_Store(AA);
-	SUPERLU_FREE(AA);
-    }
-
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgstrf.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgstrf.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgstrf.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgstrf.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,433 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_ddefs.h"
-
-void
-dgstrf (superlu_options_t *options, SuperMatrix *A, double drop_tol,
-        int relax, int panel_size, int *etree, void *work, int lwork,
-        int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U,
-        SuperLUStat_t *stat, int *info)
-{
-/*
- * Purpose
- * =======
- *
- * DGSTRF computes an LU factorization of a general sparse m-by-n
- * matrix A using partial pivoting with row interchanges.
- * The factorization has the form
- *     Pr * A = L * U
- * where Pr is a row permutation matrix, L is lower triangular with unit
- * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is upper 
- * triangular (upper trapezoidal if A->nrow < A->ncol).
- *
- * See supermatrix.h for the definition of 'SuperMatrix' structure.
- *
- * Arguments
- * =========
- *
- * options (input) superlu_options_t*
- *         The structure defines the input parameters to control
- *         how the LU decomposition will be performed.
- *
- * A        (input) SuperMatrix*
- *	    Original matrix A, permuted by columns, of dimension
- *          (A->nrow, A->ncol). The type of A can be:
- *          Stype = SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE.
- *
- * drop_tol (input) double (NOT IMPLEMENTED)
- *	    Drop tolerance parameter. At step j of the Gaussian elimination,
- *          if abs(A_ij)/(max_i abs(A_ij)) < drop_tol, drop entry A_ij.
- *          0 <= drop_tol <= 1. The default value of drop_tol is 0.
- *
- * relax    (input) int
- *          To control degree of relaxing supernodes. If the number
- *          of nodes (columns) in a subtree of the elimination tree is less
- *          than relax, this subtree is considered as one supernode,
- *          regardless of the row structures of those columns.
- *
- * panel_size (input) int
- *          A panel consists of at most panel_size consecutive columns.
- *
- * etree    (input) int*, dimension (A->ncol)
- *          Elimination tree of A'*A.
- *          Note: etree is a vector of parent pointers for a forest whose
- *          vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
- *          On input, the columns of A should be permuted so that the
- *          etree is in a certain postorder.
- *
- * work     (input/output) void*, size (lwork) (in bytes)
- *          User-supplied work space and space for the output data structures.
- *          Not referenced if lwork = 0;
- *
- * lwork   (input) int
- *         Specifies the size of work array in bytes.
- *         = 0:  allocate space internally by system malloc;
- *         > 0:  use user-supplied work array of length lwork in bytes,
- *               returns error if space runs out.
- *         = -1: the routine guesses the amount of space needed without
- *               performing the factorization, and returns it in
- *               *info; no other side effects.
- *
- * perm_c   (input) int*, dimension (A->ncol)
- *	    Column permutation vector, which defines the 
- *          permutation matrix Pc; perm_c[i] = j means column i of A is 
- *          in position j in A*Pc.
- *          When searching for diagonal, perm_c[*] is applied to the
- *          row subscripts of A, so that diagonal threshold pivoting
- *          can find the diagonal of A, rather than that of A*Pc.
- *
- * perm_r   (input/output) int*, dimension (A->nrow)
- *          Row permutation vector which defines the permutation matrix Pr,
- *          perm_r[i] = j means row i of A is in position j in Pr*A.
- *          If options->Fact = SamePattern_SameRowPerm, the pivoting routine
- *             will try to use the input perm_r, unless a certain threshold
- *             criterion is violated. In that case, perm_r is overwritten by
- *             a new permutation determined by partial pivoting or diagonal
- *             threshold pivoting.
- *          Otherwise, perm_r is output argument;
- *
- * L        (output) SuperMatrix*
- *          The factor L from the factorization Pr*A=L*U; use compressed row 
- *          subscripts storage for supernodes, i.e., L has type: 
- *          Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU.
- *
- * U        (output) SuperMatrix*
- *	    The factor U from the factorization Pr*A*Pc=L*U. Use column-wise
- *          storage scheme, i.e., U has types: Stype = SLU_NC, 
- *          Dtype = SLU_D, Mtype = SLU_TRU.
- *
- * stat     (output) SuperLUStat_t*
- *          Record the statistics on runtime and floating-point operation count.
- *          See util.h for the definition of 'SuperLUStat_t'.
- *
- * info     (output) int*
- *          = 0: successful exit
- *          < 0: if info = -i, the i-th argument had an illegal value
- *          > 0: if info = i, and i is
- *             <= A->ncol: U(i,i) is exactly zero. The factorization has
- *                been completed, but the factor U is exactly singular,
- *                and division by zero will occur if it is used to solve a
- *                system of equations.
- *             > A->ncol: number of bytes allocated when memory allocation
- *                failure occurred, plus A->ncol. If lwork = -1, it is
- *                the estimated amount of space needed, plus A->ncol.
- *
- * ======================================================================
- *
- * Local Working Arrays: 
- * ======================
- *   m = number of rows in the matrix
- *   n = number of columns in the matrix
- *
- *   xprune[0:n-1]: xprune[*] points to locations in subscript 
- *	vector lsub[*]. For column i, xprune[i] denotes the point where 
- *	structural pruning begins. I.e. only xlsub[i],..,xprune[i]-1 need 
- *	to be traversed for symbolic factorization.
- *
- *   marker[0:3*m-1]: marker[i] = j means that node i has been 
- *	reached when working on column j.
- *	Storage: relative to original row subscripts
- *	NOTE: There are 3 of them: marker/marker1 are used for panel dfs, 
- *	      see dpanel_dfs.c; marker2 is used for inner-factorization,
- *            see dcolumn_dfs.c.
- *
- *   parent[0:m-1]: parent vector used during dfs
- *      Storage: relative to new row subscripts
- *
- *   xplore[0:m-1]: xplore[i] gives the location of the next (dfs) 
- *	unexplored neighbor of i in lsub[*]
- *
- *   segrep[0:nseg-1]: contains the list of supernodal representatives
- *	in topological order of the dfs. A supernode representative is the 
- *	last column of a supernode.
- *      The maximum size of segrep[] is n.
- *
- *   repfnz[0:W*m-1]: for a nonzero segment U[*,j] that ends at a 
- *	supernodal representative r, repfnz[r] is the location of the first 
- *	nonzero in this segment.  It is also used during the dfs: repfnz[r]>0
- *	indicates the supernode r has been explored.
- *	NOTE: There are W of them, each used for one column of a panel. 
- *
- *   panel_lsub[0:W*m-1]: temporary for the nonzeros row indices below 
- *      the panel diagonal. These are filled in during dpanel_dfs(), and are
- *      used later in the inner LU factorization within the panel.
- *	panel_lsub[]/dense[] pair forms the SPA data structure.
- *	NOTE: There are W of them.
- *
- *   dense[0:W*m-1]: sparse accumulating (SPA) vector for intermediate values;
- *	    	   NOTE: there are W of them.
- *
- *   tempv[0:*]: real temporary used for dense numeric kernels;
- *	The size of this array is defined by NUM_TEMPV() in dsp_defs.h.
- *
- */
-    /* Local working arrays */
-    NCPformat *Astore;
-    int       *iperm_r = NULL; /* inverse of perm_r; used when 
-                                  options->Fact == SamePattern_SameRowPerm */
-    int       *iperm_c; /* inverse of perm_c */
-    int       *iwork;
-    double    *dwork;
-    int	      *segrep, *repfnz, *parent, *xplore;
-    int	      *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */
-    int	      *xprune;
-    int	      *marker;
-    double    *dense, *tempv;
-    int       *relax_end;
-    double    *a;
-    int       *asub;
-    int       *xa_begin, *xa_end;
-    int       *xsup, *supno;
-    int       *xlsub, *xlusup, *xusub;
-    int       nzlumax;
-    static GlobalLU_t Glu; /* persistent to facilitate multiple factors. */
-
-    /* Local scalars */
-    fact_t    fact = options->Fact;
-    double    diag_pivot_thresh = options->DiagPivotThresh;
-    int       pivrow;   /* pivotal row number in the original matrix A */
-    int       nseg1;	/* no of segments in U-column above panel row jcol */
-    int       nseg;	/* no of segments in each U-column */
-    register int jcol;	
-    register int kcol;	/* end column of a relaxed snode */
-    register int icol;
-    register int i, k, jj, new_next, iinfo;
-    int       m, n, min_mn, jsupno, fsupc, nextlu, nextu;
-    int       w_def;	/* upper bound on panel width */
-    int       usepr, iperm_r_allocated = 0;
-    int       nnzL, nnzU;
-    int       *panel_histo = stat->panel_histo;
-    flops_t   *ops = stat->ops;
-
-    iinfo    = 0;
-    m        = A->nrow;
-    n        = A->ncol;
-    min_mn   = SUPERLU_MIN(m, n);
-    Astore   = (NCPformat*) A->Store;
-    a        = (   double*) Astore->nzval;
-    asub     = Astore->rowind;
-    xa_begin = Astore->colbeg;
-    xa_end   = Astore->colend;
-
-    /* Allocate storage common to the factor routines */
-    *info = dLUMemInit(fact, work, lwork, m, n, Astore->nnz,
-                       panel_size, L, U, &Glu, &iwork, &dwork);
-    if ( *info ) return;
-    
-    xsup    = Glu.xsup;
-    supno   = Glu.supno;
-    xlsub   = Glu.xlsub;
-    xlusup  = Glu.xlusup;
-    xusub   = Glu.xusub;
-    
-    SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore,
-	     &repfnz, &panel_lsub, &xprune, &marker);
-    dSetRWork(m, panel_size, dwork, &dense, &tempv);
-    
-    usepr = (fact == SamePattern_SameRowPerm);
-    if ( usepr ) {
-	/* Compute the inverse of perm_r */
-	iperm_r = (int *) intMalloc(m);
-	for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k;
-	iperm_r_allocated = 1;
-    }
-    iperm_c = (int *) intMalloc(n);
-    for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k;
-
-    /* Identify relaxed snodes */
-    relax_end = (int *) intMalloc(n);
-    if ( options->SymmetricMode == YES ) {
-        heap_relax_snode(n, etree, relax, marker, relax_end); 
-    } else {
-        relax_snode(n, etree, relax, marker, relax_end); 
-    }
-    
-    ifill (perm_r, m, EMPTY);
-    ifill (marker, m * NO_MARKER, EMPTY);
-    supno[0] = -1;
-    xsup[0]  = xlsub[0] = xusub[0] = xlusup[0] = 0;
-    w_def    = panel_size;
-
-    /* 
-     * Work on one "panel" at a time. A panel is one of the following: 
-     *	   (a) a relaxed supernode at the bottom of the etree, or
-     *	   (b) panel_size contiguous columns, defined by the user
-     */
-    for (jcol = 0; jcol < min_mn; ) {
-
-	if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */
-   	    kcol = relax_end[jcol];	  /* end of the relaxed snode */
-	    panel_histo[kcol-jcol+1]++;
-
-	    /* --------------------------------------
-	     * Factorize the relaxed supernode(jcol:kcol) 
-	     * -------------------------------------- */
-	    /* Determine the union of the row structure of the snode */
-	    if ( (*info = dsnode_dfs(jcol, kcol, asub, xa_begin, xa_end,
-				    xprune, marker, &Glu)) != 0 )
-		return;
-
-            nextu    = xusub[jcol];
-	    nextlu   = xlusup[jcol];
-	    jsupno   = supno[jcol];
-	    fsupc    = xsup[jsupno];
-	    new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1);
-	    nzlumax = Glu.nzlumax;
-	    while ( new_next > nzlumax ) {
-		if ( (*info = dLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, &Glu)) )
-		    return;
-	    }
-    
-	    for (icol = jcol; icol<= kcol; icol++) {
-		xusub[icol+1] = nextu;
-		
-    		/* Scatter into SPA dense[*] */
-    		for (k = xa_begin[icol]; k < xa_end[icol]; k++)
-        	    dense[asub[k]] = a[k];
-
-	       	/* Numeric update within the snode */
-	        dsnode_bmod(icol, jsupno, fsupc, dense, tempv, &Glu, stat);
-
-		if ( (*info = dpivotL(icol, diag_pivot_thresh, &usepr, perm_r,
-				      iperm_r, iperm_c, &pivrow, &Glu, stat)) )
-		    if ( iinfo == 0 ) iinfo = *info;
-		
-#ifdef DEBUG
-		dprint_lu_col("[1]: ", icol, pivrow, xprune, &Glu);
-#endif
-
-	    }
-
-	    jcol = icol;
-
-	} else { /* Work on one panel of panel_size columns */
-	    
-	    /* Adjust panel_size so that a panel won't overlap with the next 
-	     * relaxed snode.
-	     */
-	    panel_size = w_def;
-	    for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++) 
-		if ( relax_end[k] != EMPTY ) {
-		    panel_size = k - jcol;
-		    break;
-		}
-	    if ( k == min_mn ) panel_size = min_mn - jcol;
-	    panel_histo[panel_size]++;
-
-	    /* symbolic factor on a panel of columns */
-	    dpanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1,
-		      dense, panel_lsub, segrep, repfnz, xprune,
-		      marker, parent, xplore, &Glu);
-	    
-	    /* numeric sup-panel updates in topological order */
-	    dpanel_bmod(m, panel_size, jcol, nseg1, dense,
-		        tempv, segrep, repfnz, &Glu, stat);
-	    
-	    /* Sparse LU within the panel, and below panel diagonal */
-    	    for ( jj = jcol; jj < jcol + panel_size; jj++) {
- 		k = (jj - jcol) * m; /* column index for w-wide arrays */
-
-		nseg = nseg1;	/* Begin after all the panel segments */
-
-	    	if ((*info = dcolumn_dfs(m, jj, perm_r, &nseg, &panel_lsub[k],
-					segrep, &repfnz[k], xprune, marker,
-					parent, xplore, &Glu)) != 0) return;
-
-	      	/* Numeric updates */
-	    	if ((*info = dcolumn_bmod(jj, (nseg - nseg1), &dense[k],
-					 tempv, &segrep[nseg1], &repfnz[k],
-					 jcol, &Glu, stat)) != 0) return;
-		
-	        /* Copy the U-segments to ucol[*] */
-		if ((*info = dcopy_to_ucol(jj, nseg, segrep, &repfnz[k],
-					  perm_r, &dense[k], &Glu)) != 0)
-		    return;
-
-	    	if ( (*info = dpivotL(jj, diag_pivot_thresh, &usepr, perm_r,
-				      iperm_r, iperm_c, &pivrow, &Glu, stat)) )
-		    if ( iinfo == 0 ) iinfo = *info;
-
-		/* Prune columns (0:jj-1) using column jj */
-	    	dpruneL(jj, perm_r, pivrow, nseg, segrep,
-                        &repfnz[k], xprune, &Glu);
-
-		/* Reset repfnz[] for this column */
-	    	resetrep_col (nseg, segrep, &repfnz[k]);
-		
-#ifdef DEBUG
-		dprint_lu_col("[2]: ", jj, pivrow, xprune, &Glu);
-#endif
-
-	    }
-
-   	    jcol += panel_size;	/* Move to the next panel */
-
-	} /* else */
-
-    } /* for */
-
-    *info = iinfo;
-    
-    if ( m > n ) {
-	k = 0;
-        for (i = 0; i < m; ++i) 
-            if ( perm_r[i] == EMPTY ) {
-    		perm_r[i] = n + k;
-		++k;
-	    }
-    }
-
-    countnz(min_mn, xprune, &nnzL, &nnzU, &Glu);
-    fixupL(min_mn, perm_r, &Glu);
-
-    dLUWorkFree(iwork, dwork, &Glu); /* Free work space and compress storage */
-
-    if ( fact == SamePattern_SameRowPerm ) {
-        /* L and U structures may have changed due to possibly different
-	   pivoting, even though the storage is available.
-	   There could also be memory expansions, so the array locations
-           may have changed, */
-        ((SCformat *)L->Store)->nnz = nnzL;
-	((SCformat *)L->Store)->nsuper = Glu.supno[n];
-	((SCformat *)L->Store)->nzval = Glu.lusup;
-	((SCformat *)L->Store)->nzval_colptr = Glu.xlusup;
-	((SCformat *)L->Store)->rowind = Glu.lsub;
-	((SCformat *)L->Store)->rowind_colptr = Glu.xlsub;
-	((NCformat *)U->Store)->nnz = nnzU;
-	((NCformat *)U->Store)->nzval = Glu.ucol;
-	((NCformat *)U->Store)->rowind = Glu.usub;
-	((NCformat *)U->Store)->colptr = Glu.xusub;
-    } else {
-        dCreate_SuperNode_Matrix(L, A->nrow, min_mn, nnzL, Glu.lusup, 
-	                         Glu.xlusup, Glu.lsub, Glu.xlsub, Glu.supno,
-			         Glu.xsup, SLU_SC, SLU_D, SLU_TRLU);
-    	dCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu.ucol, 
-			       Glu.usub, Glu.xusub, SLU_NC, SLU_D, SLU_TRU);
-    }
-    
-    ops[FACT] += ops[TRSV] + ops[GEMV];	
-    
-    if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r);
-    SUPERLU_FREE (iperm_c);
-    SUPERLU_FREE (relax_end);
-
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgstrs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgstrs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgstrs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgstrs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,341 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-/*
-  This file has been modified to be compatible with the HYPRE
-  linear solver
-*/
-
-#include "slu_ddefs.h"
-
-#ifndef HYPRE_USING_HYPRE_BLAS
-#define USE_VENDOR_BLAS
-#endif
-
-/*
- * Function prototypes
- */
-void sludusolve(int, int, double*, double*);
-void sludlsolve(int, int, double*, double*);
-void sludmatvec(int, int, int, double*, double*, double*);
-
-void
-dgstrs (trans_t trans, SuperMatrix *L, SuperMatrix *U,
-        int *perm_c, int *perm_r, SuperMatrix *B,
-        SuperLUStat_t *stat, int *info)
-{
-/*
- * Purpose
- * =======
- *
- * DGSTRS solves a system of linear equations A*X=B or A'*X=B
- * with A sparse and B dense, using the LU factorization computed by
- * DGSTRF.
- *
- * See supermatrix.h for the definition of 'SuperMatrix' structure.
- *
- * Arguments
- * =========
- *
- * trans   (input) trans_t
- *          Specifies the form of the system of equations:
- *          = NOTRANS: A * X = B  (No transpose)
- *          = TRANS:   A'* X = B  (Transpose)
- *          = CONJ:    A**H * X = B  (Conjugate transpose)
- *
- * L       (input) SuperMatrix*
- *         The factor L from the factorization Pr*A*Pc=L*U as computed by
- *         dgstrf(). Use compressed row subscripts storage for supernodes,
- *         i.e., L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU.
- *
- * U       (input) SuperMatrix*
- *         The factor U from the factorization Pr*A*Pc=L*U as computed by
- *         dgstrf(). Use column-wise storage scheme, i.e., U has types:
- *         Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU.
- *
- * perm_c  (input) int*, dimension (L->ncol)
- *	   Column permutation vector, which defines the 
- *         permutation matrix Pc; perm_c[i] = j means column i of A is 
- *         in position j in A*Pc.
- *
- * perm_r  (input) int*, dimension (L->nrow)
- *         Row permutation vector, which defines the permutation matrix Pr; 
- *         perm_r[i] = j means row i of A is in position j in Pr*A.
- *
- * B       (input/output) SuperMatrix*
- *         B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE.
- *         On entry, the right hand side matrix.
- *         On exit, the solution matrix if info = 0;
- *
- * stat     (output) SuperLUStat_t*
- *          Record the statistics on runtime and floating-point operation count.
- *          See util.h for the definition of 'SuperLUStat_t'.
- *
- * info    (output) int*
- * 	   = 0: successful exit
- *	   < 0: if info = -i, the i-th argument had an illegal value
- *
- */
-#ifdef _CRAY
-    _fcd ftcs1, ftcs2, ftcs3, ftcs4;
-#endif
-#if 0
-    int      incx = 1, incy = 1;
-#endif
-#ifdef USE_VENDOR_BLAS
-    double   alpha = 1.0, beta = 1.0;
-    double   *work_col;
-#endif
-    DNformat *Bstore;
-    double   *Bmat;
-    SCformat *Lstore;
-    NCformat *Ustore;
-    double   *Lval, *Uval;
-    int      fsupc, nrow, nsupr, nsupc, luptr, istart, irow;
-    int      i, j, k, iptr, jcol, n, ldb, nrhs;
-    double   *work, *rhs_work, *soln;
-    flops_t  solve_ops;
-    void dprint_soln();
-
-    /* Test input parameters ... */
-    *info = 0;
-    Bstore = (DNformat*) B->Store;
-    ldb = Bstore->lda;
-    nrhs = B->ncol;
-    if ( trans != NOTRANS && trans != TRANS && trans != CONJ ) *info = -1;
-    else if ( L->nrow != L->ncol || L->nrow < 0 ||
-	      L->Stype != SLU_SC || L->Dtype != SLU_D || L->Mtype != SLU_TRLU )
-	*info = -2;
-    else if ( U->nrow != U->ncol || U->nrow < 0 ||
-	      U->Stype != SLU_NC || U->Dtype != SLU_D || U->Mtype != SLU_TRU )
-	*info = -3;
-    else if ( ldb < SUPERLU_MAX(0, L->nrow) ||
-	      B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE )
-	*info = -6;
-    if ( *info ) {
-	i = -(*info);
-	superlu_xerbla("dgstrs", &i);
-	return;
-    }
-
-    n = L->nrow;
-    work = doubleCalloc(n * nrhs);
-    if ( !work ) ABORT("Malloc fails for local work[].");
-    soln = doubleMalloc(n);
-    if ( !soln ) ABORT("Malloc fails for local soln[].");
-
-    Bmat = (double*) Bstore->nzval;
-    Lstore = (SCformat*) L->Store;
-    Lval = (double*)Lstore->nzval;
-    Ustore = (NCformat*) U->Store;
-    Uval = (double*)Ustore->nzval;
-    solve_ops = 0;
-    
-    if ( trans == NOTRANS ) {
-	/* Permute right hand sides to form Pr*B */
-	for (i = 0; i < nrhs; i++) {
-	    rhs_work = &Bmat[i*ldb];
-	    for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k];
-	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
-	}
-	
-	/* Forward solve PLy=Pb. */
-	for (k = 0; k <= Lstore->nsuper; k++) {
-	    fsupc = L_FST_SUPC(k);
-	    istart = L_SUB_START(fsupc);
-	    nsupr = L_SUB_START(fsupc+1) - istart;
-	    nsupc = L_FST_SUPC(k+1) - fsupc;
-	    nrow = nsupr - nsupc;
-
-	    solve_ops += nsupc * (nsupc - 1) * nrhs;
-	    solve_ops += 2 * nrow * nsupc * nrhs;
-	    
-	    if ( nsupc == 1 ) {
-		for (j = 0; j < nrhs; j++) {
-		    rhs_work = &Bmat[j*ldb];
-	    	    luptr = L_NZ_START(fsupc);
-		    for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); iptr++){
-			irow = L_SUB(iptr);
-			++luptr;
-			rhs_work[irow] -= rhs_work[fsupc] * Lval[luptr];
-		    }
-		}
-	    } else {
-	    	luptr = L_NZ_START(fsupc);
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		ftcs1 = _cptofcd("L", strlen("L"));
-		ftcs2 = _cptofcd("N", strlen("N"));
-		ftcs3 = _cptofcd("U", strlen("U"));
-		STRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha,
-		       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
-		
-		SGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha, 
-			&Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, 
-			&beta, &work[0], &n );
-#else
-                hypre_F90_NAME_BLAS(dtrsm,DTRSM)("L","L","N","U",&nsupc,
-                       &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], 
-                       &ldb);
-                hypre_F90_NAME_BLAS(dgemm,DGEMM)("N","N",&nrow,&nrhs,&nsupc, 
-                       &alpha, &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb,
-                       &beta, &work[0], &n );
-#endif
-		for (j = 0; j < nrhs; j++) {
-		    rhs_work = &Bmat[j*ldb];
-		    work_col = &work[j*n];
-		    iptr = istart + nsupc;
-		    for (i = 0; i < nrow; i++) {
-			irow = L_SUB(iptr);
-			rhs_work[irow] -= work_col[i]; /* Scatter */
-			work_col[i] = 0.0;
-			iptr++;
-		    }
-		}
-#else		
-		for (j = 0; j < nrhs; j++) {
-		    rhs_work = &Bmat[j*ldb];
-		    sludlsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]);
-		    sludmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc],
-			    &rhs_work[fsupc], &work[0] );
-
-		    iptr = istart + nsupc;
-		    for (i = 0; i < nrow; i++) {
-			irow = L_SUB(iptr);
-			rhs_work[irow] -= work[i];
-			work[i] = 0.0;
-			iptr++;
-		    }
-		}
-#endif		    
-	    } /* else ... */
-	} /* for L-solve */
-
-#ifdef DEBUG
-  	printf("After L-solve: y=\n");
-	dprint_soln(n, nrhs, Bmat);
-#endif
-
-	/*
-	 * Back solve Ux=y.
-	 */
-	for (k = Lstore->nsuper; k >= 0; k--) {
-	    fsupc = L_FST_SUPC(k);
-	    istart = L_SUB_START(fsupc);
-	    nsupr = L_SUB_START(fsupc+1) - istart;
-	    nsupc = L_FST_SUPC(k+1) - fsupc;
-	    luptr = L_NZ_START(fsupc);
-
-	    solve_ops += nsupc * (nsupc + 1) * nrhs;
-
-	    if ( nsupc == 1 ) {
-		rhs_work = &Bmat[0];
-		for (j = 0; j < nrhs; j++) {
-		    rhs_work[fsupc] /= Lval[luptr];
-		    rhs_work += ldb;
-		}
-	    } else {
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		ftcs1 = _cptofcd("L", strlen("L"));
-		ftcs2 = _cptofcd("U", strlen("U"));
-		ftcs3 = _cptofcd("N", strlen("N"));
-		STRSM( ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha,
-		       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
-#else
-                hypre_F90_NAME_BLAS(dtrsm,DTRSM)("L","U","N","N", &nsupc, 
-                       &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], 
-                       &ldb);
-#endif
-#else		
-		for (j = 0; j < nrhs; j++)
-		    sludusolve ( nsupr, nsupc, &Lval[luptr], &Bmat[fsupc+j*ldb] );
-#endif		
-	    }
-
-	    for (j = 0; j < nrhs; ++j) {
-		rhs_work = &Bmat[j*ldb];
-		for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) {
-		    solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
-		    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++ ){
-			irow = U_SUB(i);
-			rhs_work[irow] -= rhs_work[jcol] * Uval[i];
-		    }
-		}
-	    }
-	    
-	} /* for U-solve */
-
-#ifdef DEBUG
-  	printf("After U-solve: x=\n");
-	dprint_soln(n, nrhs, Bmat);
-#endif
-
-	/* Compute the final solution X := Pc*X. */
-	for (i = 0; i < nrhs; i++) {
-	    rhs_work = &Bmat[i*ldb];
-	    for (k = 0; k < n; k++) soln[k] = rhs_work[perm_c[k]];
-	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
-	}
-	
-        stat->ops[SOLVE] = solve_ops;
-
-    } else { /* Solve A'*X=B or CONJ(A)*X=B */
-	/* Permute right hand sides to form Pc'*B. */
-	for (i = 0; i < nrhs; i++) {
-	    rhs_work = &Bmat[i*ldb];
-	    for (k = 0; k < n; k++) soln[perm_c[k]] = rhs_work[k];
-	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
-	}
-
-	stat->ops[SOLVE] = 0;
-	for (k = 0; k < nrhs; ++k) {
-	    
-	    /* Multiply by inv(U'). */
-	    sp_dtrsv("U", "T", "N", L, U, &Bmat[k*ldb], stat, info);
-	    
-	    /* Multiply by inv(L'). */
-	    sp_dtrsv("L", "T", "U", L, U, &Bmat[k*ldb], stat, info);
-	    
-	}
-	/* Compute the final solution X := Pr'*X (=inv(Pr)*X) */
-	for (i = 0; i < nrhs; i++) {
-	    rhs_work = &Bmat[i*ldb];
-	    for (k = 0; k < n; k++) soln[k] = rhs_work[perm_r[k]];
-	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
-	}
-
-    }
-
-    SUPERLU_FREE(work);
-    SUPERLU_FREE(soln);
-}
-
-/*
- * Diagnostic print of the solution vector 
- */
-void
-dprint_soln(int n, int nrhs, double *soln)
-{
-    int i;
-
-    for (i = 0; i < n; i++) 
-  	printf("\t%d: %.4f\n", i, soln[i]);
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgstrsL.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgstrsL.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dgstrsL.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dgstrsL.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,233 +0,0 @@
-
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * September 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_ddefs.h"
-#include "slu_util.h"
-
-
-/* 
- * Function prototypes 
- */
-void dusolve(int, int, double*, double*);
-void dlsolve(int, int, double*, double*);
-void dmatvec(int, int, int, double*, double*, double*);
-
-
-void
-dgstrsL(char *trans, SuperMatrix *L, int *perm_r, SuperMatrix *B, int *info)
-{
-/*
- * Purpose
- * =======
- *
- * DGSTRSL only performs the L-solve using the LU factorization computed
- * by DGSTRF.
- *
- * See supermatrix.h for the definition of 'SuperMatrix' structure.
- *
- * Arguments
- * =========
- *
- * trans   (input) char*
- *          Specifies the form of the system of equations:
- *          = 'N':  A * X = B  (No transpose)
- *          = 'T':  A'* X = B  (Transpose)
- *          = 'C':  A**H * X = B  (Conjugate transpose)
- *
- * L       (input) SuperMatrix*
- *         The factor L from the factorization Pr*A*Pc=L*U as computed by
- *         dgstrf(). Use compressed row subscripts storage for supernodes,
- *         i.e., L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU.
- *
- * U       (input) SuperMatrix*
- *         The factor U from the factorization Pr*A*Pc=L*U as computed by
- *         dgstrf(). Use column-wise storage scheme, i.e., U has types:
- *         Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU.
- *
- * perm_r  (input) int*, dimension (L->nrow)
- *         Row permutation vector, which defines the permutation matrix Pr; 
- *         perm_r[i] = j means row i of A is in position j in Pr*A.
- *
- * B       (input/output) SuperMatrix*
- *         B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE.
- *         On entry, the right hand side matrix.
- *         On exit, the solution matrix if info = 0;
- *
- * info    (output) int*
- * 	   = 0: successful exit
- *	   < 0: if info = -i, the i-th argument had an illegal value
- *
- */
-#ifdef _CRAY
-    _fcd ftcs1, ftcs2, ftcs3, ftcs4;
-#endif
-    int      incx = 1, incy = 1;
-    double   alpha = 1.0, beta = 1.0;
-    DNformat *Bstore;
-    double   *Bmat;
-    SCformat *Lstore;
-    double   *Lval, *Uval;
-    int      nrow, notran;
-    int      fsupc, nsupr, nsupc, luptr, istart, irow;
-    int      i, j, k, iptr, jcol, n, ldb, nrhs;
-    double   *work, *work_col, *rhs_work, *soln;
-    flops_t  solve_ops;
-    extern SuperLUStat_t SuperLUStat;
-    void dprint_soln();
-
-    /* Test input parameters ... */
-    *info = 0;
-    Bstore = B->Store;
-    ldb = Bstore->lda;
-    nrhs = B->ncol;
-    notran = lsame_(trans, "N");
-    if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C") ) *info = -1;
-    else if ( L->nrow != L->ncol || L->nrow < 0 ||
-	      L->Stype != SLU_SC || L->Dtype != SLU_D || L->Mtype != SLU_TRLU )
-	*info = -2;
-    else if ( ldb < SUPERLU_MAX(0, L->nrow) ||
-	      B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE )
-	*info = -4;
-    if ( *info ) {
-	i = -(*info);
-	xerbla_("dgstrsL", &i);
-	return;
-    }
-
-    n = L->nrow;
-    work = doubleCalloc(n * nrhs);
-    if ( !work ) ABORT("Malloc fails for local work[].");
-    soln = doubleMalloc(n);
-    if ( !soln ) ABORT("Malloc fails for local soln[].");
-
-    Bmat = Bstore->nzval;
-    Lstore = L->Store;
-    Lval = Lstore->nzval;
-    solve_ops = 0;
-    
-    if ( notran ) {
-	/* Permute right hand sides to form Pr*B */
-	for (i = 0; i < nrhs; i++) {
-	    rhs_work = &Bmat[i*ldb];
-	    for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k];
-	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
-	}
-	
-	/* Forward solve PLy=Pb. */
-	for (k = 0; k <= Lstore->nsuper; k++) {
-	    fsupc = L_FST_SUPC(k);
-	    istart = L_SUB_START(fsupc);
-	    nsupr = L_SUB_START(fsupc+1) - istart;
-	    nsupc = L_FST_SUPC(k+1) - fsupc;
-	    nrow = nsupr - nsupc;
-
-	    solve_ops += nsupc * (nsupc - 1) * nrhs;
-	    solve_ops += 2 * nrow * nsupc * nrhs;
-	    
-	    if ( nsupc == 1 ) {
-		for (j = 0; j < nrhs; j++) {
-		    rhs_work = &Bmat[j*ldb];
-	    	    luptr = L_NZ_START(fsupc);
-		    for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); iptr++){
-			irow = L_SUB(iptr);
-			++luptr;
-			rhs_work[irow] -= rhs_work[fsupc] * Lval[luptr];
-		    }
-		}
-	    } else {
-	    	luptr = L_NZ_START(fsupc);
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		ftcs1 = _cptofcd("L", strlen("L"));
-		ftcs2 = _cptofcd("N", strlen("N"));
-		ftcs3 = _cptofcd("U", strlen("U"));
-		STRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha,
-		       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
-		
-		SGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha, 
-			&Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, 
-			&beta, &work[0], &n );
-#else
-		dtrsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha,
-		       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
-		
-		dgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha, 
-			&Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, 
-			&beta, &work[0], &n );
-#endif
-		for (j = 0; j < nrhs; j++) {
-		    rhs_work = &Bmat[j*ldb];
-		    work_col = &work[j*n];
-		    iptr = istart + nsupc;
-		    for (i = 0; i < nrow; i++) {
-			irow = L_SUB(iptr);
-			rhs_work[irow] -= work_col[i]; /* Scatter */
-			work_col[i] = 0.0;
-			iptr++;
-		    }
-		}
-#else		
-		for (j = 0; j < nrhs; j++) {
-		    rhs_work = &Bmat[j*ldb];
-		    dlsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]);
-		    dmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc],
-			    &rhs_work[fsupc], &work[0] );
-
-		    iptr = istart + nsupc;
-		    for (i = 0; i < nrow; i++) {
-			irow = L_SUB(iptr);
-			rhs_work[irow] -= work[i];
-			work[i] = 0.0;
-			iptr++;
-		    }
-		}
-#endif		    
-	    } /* else ... */
-	} /* for L-solve */
-
-#ifdef DEBUG
-  	printf("After L-solve: y=\n");
-	dprint_soln(n, nrhs, Bmat);
-#endif
-	
-        SuperLUStat.ops[SOLVE] = solve_ops;
-
-    } else { 
-      printf("Transposed solve not implemented.\n");
-      exit(0);
-    }
-
-    SUPERLU_FREE(work);
-    SUPERLU_FREE(soln);
-}
-
-/*
- * Diagnostic print of the solution vector 
- */
-void
-dprint_soln(int n, int nrhs, double *soln)
-{
-    int i;
-
-    for (i = 0; i < n; i++) 
-  	printf("\t%d: %.4f\n", i, soln[i]);
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dlacon.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dlacon.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dlacon.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dlacon.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,237 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
-  This file has been modified to be compatible with the HYPRE
-  linear solver
-*/
-
-#include <math.h>
-#include <fortran.h>
-#ifdef HYPRE_USING_HYPRE_BLAS
-#include "hypre_blas.h"
-#endif
-/*#include "slu_Cnames.h"*/
-
-int
-dlacon_(int *n, double *v, double *x, int *isgn, double *est, int *kase)
-
-{
-/*
-    Purpose   
-    =======   
-
-    DLACON estimates the 1-norm of a square matrix A.   
-    Reverse communication is used for evaluating matrix-vector products. 
-  
-
-    Arguments   
-    =========   
-
-    N      (input) INT
-           The order of the matrix.  N >= 1.   
-
-    V      (workspace) DOUBLE PRECISION array, dimension (N)   
-           On the final return, V = A*W,  where  EST = norm(V)/norm(W)   
-           (W is not returned).   
-
-    X      (input/output) DOUBLE PRECISION array, dimension (N)   
-           On an intermediate return, X should be overwritten by   
-                 A * X,   if KASE=1,   
-                 A' * X,  if KASE=2,
-           and DLACON must be re-called with all the other parameters   
-           unchanged.   
-
-    ISGN   (workspace) INT array, dimension (N)
-
-    EST    (output) DOUBLE PRECISION   
-           An estimate (a lower bound) for norm(A).   
-
-    KASE   (input/output) INT
-           On the initial call to DLACON, KASE should be 0.   
-           On an intermediate return, KASE will be 1 or 2, indicating   
-           whether X should be overwritten by A * X  or A' * X.   
-           On the final return from DLACON, KASE will again be 0.   
-
-    Further Details   
-    ======= =======   
-
-    Contributed by Nick Higham, University of Manchester.   
-    Originally named CONEST, dated March 16, 1988.   
-
-    Reference: N.J. Higham, "FORTRAN codes for estimating the one-norm of 
-    a real or complex matrix, with applications to condition estimation", 
-    ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988.   
-    ===================================================================== 
-*/
-
-    /* Table of constant values */
-    int c__1 = 1;
-    double      zero = 0.0;
-    double      one = 1.0;
-    
-    /* Local variables */
-    static int iter;
-    static int jump, jlast;
-    static double altsgn, estold;
-    static int i, j;
-    double temp;
-#ifdef _CRAY
-    extern int ISAMAX(int *, double *, int *);
-    extern double SASUM(int *, double *, int *);
-    extern int SCOPY(int *, double *, int *, double *, int *);
-#else	
-    extern int hypre_F90_NAME_BLAS(idamax,IDAMAX)(int *, double *, int *);
-    extern double hypre_F90_NAME_BLAS(dasum,DASUM)(int *, double *, int *);
-    extern int hypre_F90_NAME_BLAS(dcopy,DCOPY)(int *, double *, int *, double *, int *);	
-#endif
-#define d_sign(a, b) (b >= 0 ? fabs(a) : -fabs(a))    /* Copy sign */
-#define i_dnnt(a) \
-	( a>=0 ? floor(a+.5) : -floor(.5-a) ) /* Round to nearest integer */
-
-    if ( *kase == 0 ) {
-	for (i = 0; i < *n; ++i) {
-	    x[i] = 1. / (double) (*n);
-	}
-	*kase = 1;
-	jump = 1;
-	return 0;
-    }
-
-    switch (jump) {
-	case 1:  goto L20;
-	case 2:  goto L40;
-	case 3:  goto L70;
-	case 4:  goto L110;
-	case 5:  goto L140;
-    }
-
-    /*     ................ ENTRY   (JUMP = 1)   
-	   FIRST ITERATION.  X HAS BEEN OVERWRITTEN BY A*X. */
-  L20:
-    if (*n == 1) {
-	v[0] = x[0];
-	*est = fabs(v[0]);
-	/*        ... QUIT */
-	goto L150;
-    }
-#ifdef _CRAY
-    *est = SASUM(n, x, &c__1);
-#else
-    *est = hypre_F90_NAME_BLAS(dasum,DASUM)(n, x, &c__1);
-#endif
-
-    for (i = 0; i < *n; ++i) {
-	x[i] = d_sign(one, x[i]);
-	isgn[i] = i_dnnt(x[i]);
-    }
-    *kase = 2;
-    jump = 2;
-    return 0;
-
-    /*     ................ ENTRY   (JUMP = 2)   
-	   FIRST ITERATION.  X HAS BEEN OVERWRITTEN BY TRANSPOSE(A)*X. */
-L40:
-#ifdef _CRAY
-    j = ISAMAX(n, &x[0], &c__1);
-#else
-    j = hypre_F90_NAME_BLAS(idamax,IDAMAX)(n, &x[0], &c__1);
-#endif
-    --j;
-    iter = 2;
-
-    /*     MAIN LOOP - ITERATIONS 2,3,...,ITMAX. */
-L50:
-    for (i = 0; i < *n; ++i) x[i] = zero;
-    x[j] = one;
-    *kase = 1;
-    jump = 3;
-    return 0;
-
-    /*     ................ ENTRY   (JUMP = 3)   
-	   X HAS BEEN OVERWRITTEN BY A*X. */
-L70:
-#ifdef _CRAY
-    SCOPY(n, x, &c__1, v, &c__1);
-#else
-    hypre_F90_NAME_BLAS(dcopy,DCOPY)(n, x, &c__1, v, &c__1);
-#endif
-    estold = *est;
-#ifdef _CRAY
-    *est = SASUM(n, v, &c__1);
-#else
-    *est = hypre_F90_NAME_BLAS(dasum,DASUM)(n, v, &c__1);
-#endif
-
-    for (i = 0; i < *n; ++i)
-	if (i_dnnt(d_sign(one, x[i])) != isgn[i])
-	    goto L90;
-
-    /*     REPEATED SIGN VECTOR DETECTED, HENCE ALGORITHM HAS CONVERGED. */
-    goto L120;
-
-L90:
-    /*     TEST FOR CYCLING. */
-    if (*est <= estold) goto L120;
-
-    for (i = 0; i < *n; ++i) {
-	x[i] = d_sign(one, x[i]);
-	isgn[i] = i_dnnt(x[i]);
-    }
-    *kase = 2;
-    jump = 4;
-    return 0;
-
-    /*     ................ ENTRY   (JUMP = 4)   
-	   X HAS BEEN OVERWRITTEN BY TRANDPOSE(A)*X. */
-L110:
-    jlast = j;
-#ifdef _CRAY
-    j = ISAMAX(n, &x[0], &c__1);
-#else
-    j = hypre_F90_NAME_BLAS(idamax,IDAMAX)(n, &x[0], &c__1);
-#endif
-    --j;
-    if (x[jlast] != fabs(x[j]) && iter < 5) {
-	++iter;
-	goto L50;
-    }
-
-    /*     ITERATION COMPLETE.  FINAL STAGE. */
-L120:
-    altsgn = 1.;
-    for (i = 1; i <= *n; ++i) {
-	x[i-1] = altsgn * ((double)(i - 1) / (double)(*n - 1) + 1.);
-	altsgn = -altsgn;
-    }
-    *kase = 1;
-    jump = 5;
-    return 0;
-    
-    /*     ................ ENTRY   (JUMP = 5)   
-	   X HAS BEEN OVERWRITTEN BY A*X. */
-L140:
-#ifdef _CRAY
-    temp = SASUM(n, x, &c__1) / (double)(*n * 3) * 2.;
-#else
-    temp = hypre_F90_NAME_BLAS(dasum,DASUM)(n,x,&c__1)/(double)(*n * 3) * 2.;
-#endif
-    if (temp > *est) {
-#ifdef _CRAY
-	SCOPY(n, &x[0], &c__1, &v[0], &c__1);
-#else
-        hypre_F90_NAME_BLAS(dcopy,DCOPY)(n, &x[0], &c__1, &v[0], &c__1);
-#endif
-	*est = temp;
-    }
-
-L150:
-    *kase = 0;
-    return 0;
-
-} /* dlacon_ */
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dlamch.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dlamch.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dlamch.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dlamch.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,965 +0,0 @@
-#include <stdio.h>
-#include "slu_Cnames.h"
-
-#define TRUE_ (1)
-#define FALSE_ (0)
-#define abs(x) ((x) >= 0 ? (x) : -(x))
-#define min(a,b) ((a) <= (b) ? (a) : (b))
-#define max(a,b) ((a) >= (b) ? (a) : (b))
-
-double dlamch_(char *cmach)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-    Purpose   
-    =======   
-
-    DLAMCH determines double precision machine parameters.   
-
-    Arguments   
-    =========   
-
-    CMACH   (input) CHARACTER*1   
-            Specifies the value to be returned by DLAMCH:   
-            = 'E' or 'e',   DLAMCH := eps   
-            = 'S' or 's ,   DLAMCH := sfmin   
-            = 'B' or 'b',   DLAMCH := base   
-            = 'P' or 'p',   DLAMCH := eps*base   
-            = 'N' or 'n',   DLAMCH := t   
-            = 'R' or 'r',   DLAMCH := rnd   
-            = 'M' or 'm',   DLAMCH := emin   
-            = 'U' or 'u',   DLAMCH := rmin   
-            = 'L' or 'l',   DLAMCH := emax   
-            = 'O' or 'o',   DLAMCH := rmax   
-
-            where   
-
-            eps   = relative machine precision   
-            sfmin = safe minimum, such that 1/sfmin does not overflow   
-            base  = base of the machine   
-            prec  = eps*base   
-            t     = number of (base) digits in the mantissa   
-            rnd   = 1.0 when rounding occurs in addition, 0.0 otherwise   
-            emin  = minimum exponent before (gradual) underflow   
-            rmin  = underflow threshold - base**(emin-1)   
-            emax  = largest exponent before overflow   
-            rmax  = overflow threshold  - (base**emax)*(1-eps)   
-
-   ===================================================================== 
-*/
-
-    static int first = TRUE_;
-
-    /* System generated locals */
-    int i__1;
-    double ret_val;
-    /* Builtin functions */
-    double pow_di(double *, int *);
-    /* Local variables */
-    static double base;
-    static int beta;
-    static double emin, prec, emax;
-    static int imin, imax;
-    static int lrnd;
-    static double rmin, rmax, t, rmach;
-    extern int lsame_(char *, char *);
-    static double small, sfmin;
-    extern /* Subroutine */ int dlamc2_(int *, int *, int *, 
-	    double *, int *, double *, int *, double *);
-    static int it;
-    static double rnd, eps;
-
-    if (first) {
-	first = FALSE_;
-	dlamc2_(&beta, &it, &lrnd, &eps, &imin, &rmin, &imax, &rmax);
-	base = (double) beta;
-	t = (double) it;
-	if (lrnd) {
-	    rnd = 1.;
-	    i__1 = 1 - it;
-	    eps = pow_di(&base, &i__1) / 2;
-	} else {
-	    rnd = 0.;
-	    i__1 = 1 - it;
-	    eps = pow_di(&base, &i__1);
-	}
-	prec = eps * base;
-	emin = (double) imin;
-	emax = (double) imax;
-	sfmin = rmin;
-	small = 1. / rmax;
-	if (small >= sfmin) {
-
-	/* Use SMALL plus a bit, to avoid the possibility of rounding   
-             causing overflow when computing  1/sfmin. */
-	    sfmin = small * (eps + 1.);
-	}
-    }
-
-    if (lsame_(cmach, "E")) {
-	rmach = eps;
-    } else if (lsame_(cmach, "S")) {
-	rmach = sfmin;
-    } else if (lsame_(cmach, "B")) {
-	rmach = base;
-    } else if (lsame_(cmach, "P")) {
-	rmach = prec;
-    } else if (lsame_(cmach, "N")) {
-	rmach = t;
-    } else if (lsame_(cmach, "R")) {
-	rmach = rnd;
-    } else if (lsame_(cmach, "M")) {
-	rmach = emin;
-    } else if (lsame_(cmach, "U")) {
-	rmach = rmin;
-    } else if (lsame_(cmach, "L")) {
-	rmach = emax;
-    } else if (lsame_(cmach, "O")) {
-	rmach = rmax;
-    }
-
-    ret_val = rmach;
-    return ret_val;
-
-/*     End of DLAMCH */
-
-} /* dlamch_ */
-
-
-/* Subroutine */ int dlamc1_(int *beta, int *t, int *rnd, int 
-	*ieee1)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    DLAMC1 determines the machine parameters given by BETA, T, RND, and   
-    IEEE1.   
-
-    Arguments   
-    =========   
-
-    BETA    (output) INT   
-            The base of the machine.   
-
-    T       (output) INT   
-            The number of ( BETA ) digits in the mantissa.   
-
-    RND     (output) INT   
-            Specifies whether proper rounding  ( RND = .TRUE. )  or   
-            chopping  ( RND = .FALSE. )  occurs in addition. This may not 
-  
-            be a reliable guide to the way in which the machine performs 
-  
-            its arithmetic.   
-
-    IEEE1   (output) INT   
-            Specifies whether rounding appears to be done in the IEEE   
-            'round to nearest' style.   
-
-    Further Details   
-    ===============   
-
-    The routine is based on the routine  ENVRON  by Malcolm and   
-    incorporates suggestions by Gentleman and Marovich. See   
-
-       Malcolm M. A. (1972) Algorithms to reveal properties of   
-          floating-point arithmetic. Comms. of the ACM, 15, 949-951.   
-
-       Gentleman W. M. and Marovich S. B. (1974) More on algorithms   
-          that reveal properties of floating point arithmetic units.   
-          Comms. of the ACM, 17, 276-277.   
-
-   ===================================================================== 
-*/
-    /* Initialized data */
-    static int first = TRUE_;
-    /* System generated locals */
-    double d__1, d__2;
-    /* Local variables */
-    static int lrnd;
-    static double a, b, c, f;
-    static int lbeta;
-    static double savec;
-    extern double dlamc3_(double *, double *);
-    static int lieee1;
-    static double t1, t2;
-    static int lt;
-    static double one, qtr;
-
-    if (first) {
-	first = FALSE_;
-	one = 1.;
-
-/*        LBETA,  LIEEE1,  LT and  LRND  are the  local values  of  BE
-TA,   
-          IEEE1, T and RND.   
-
-          Throughout this routine  we use the function  DLAMC3  to ens
-ure   
-          that relevant values are  stored and not held in registers, 
- or   
-          are not affected by optimizers.   
-
-          Compute  a = 2.0**m  with the  smallest positive integer m s
-uch   
-          that   
-
-             fl( a + 1.0 ) = a. */
-
-	a = 1.;
-	c = 1.;
-
-/* +       WHILE( C.EQ.ONE )LOOP */
-L10:
-	if (c == one) {
-	    a *= 2;
-	    c = dlamc3_(&a, &one);
-	    d__1 = -a;
-	    c = dlamc3_(&c, &d__1);
-	    goto L10;
-	}
-/* +       END WHILE   
-
-          Now compute  b = 2.0**m  with the smallest positive integer 
-m   
-          such that   
-
-             fl( a + b ) .gt. a. */
-
-	b = 1.;
-	c = dlamc3_(&a, &b);
-
-/* +       WHILE( C.EQ.A )LOOP */
-L20:
-	if (c == a) {
-	    b *= 2;
-	    c = dlamc3_(&a, &b);
-	    goto L20;
-	}
-/* +       END WHILE   
-
-          Now compute the base.  a and c  are neighbouring floating po
-int   
-          numbers  in the  interval  ( beta**t, beta**( t + 1 ) )  and
- so   
-          their difference is beta. Adding 0.25 to c is to ensure that
- it   
-          is truncated to beta and not ( beta - 1 ). */
-
-	qtr = one / 4;
-	savec = c;
-	d__1 = -a;
-	c = dlamc3_(&c, &d__1);
-	lbeta = (int) (c + qtr);
-
-/*        Now determine whether rounding or chopping occurs,  by addin
-g a   
-          bit  less  than  beta/2  and a  bit  more  than  beta/2  to 
- a. */
-
-	b = (double) lbeta;
-	d__1 = b / 2;
-	d__2 = -b / 100;
-	f = dlamc3_(&d__1, &d__2);
-	c = dlamc3_(&f, &a);
-	if (c == a) {
-	    lrnd = TRUE_;
-	} else {
-	    lrnd = FALSE_;
-	}
-	d__1 = b / 2;
-	d__2 = b / 100;
-	f = dlamc3_(&d__1, &d__2);
-	c = dlamc3_(&f, &a);
-	if (lrnd && c == a) {
-	    lrnd = FALSE_;
-	}
-
-/*        Try and decide whether rounding is done in the  IEEE  'round
- to   
-          nearest' style. B/2 is half a unit in the last place of the 
-two   
-          numbers A and SAVEC. Furthermore, A is even, i.e. has last  
-bit   
-          zero, and SAVEC is odd. Thus adding B/2 to A should not  cha
-nge   
-          A, but adding B/2 to SAVEC should change SAVEC. */
-
-	d__1 = b / 2;
-	t1 = dlamc3_(&d__1, &a);
-	d__1 = b / 2;
-	t2 = dlamc3_(&d__1, &savec);
-	lieee1 = t1 == a && t2 > savec && lrnd;
-
-/*        Now find  the  mantissa, t.  It should  be the  integer part
- of   
-          log to the base beta of a,  however it is safer to determine
-  t   
-          by powering.  So we find t as the smallest positive integer 
-for   
-          which   
-
-             fl( beta**t + 1.0 ) = 1.0. */
-
-	lt = 0;
-	a = 1.;
-	c = 1.;
-
-/* +       WHILE( C.EQ.ONE )LOOP */
-L30:
-	if (c == one) {
-	    ++lt;
-	    a *= lbeta;
-	    c = dlamc3_(&a, &one);
-	    d__1 = -a;
-	    c = dlamc3_(&c, &d__1);
-	    goto L30;
-	}
-/* +       END WHILE */
-
-    }
-
-    *beta = lbeta;
-    *t = lt;
-    *rnd = lrnd;
-    *ieee1 = lieee1;
-    return 0;
-
-/*     End of DLAMC1 */
-
-} /* dlamc1_ */
-
-
-/* Subroutine */ int dlamc2_(int *beta, int *t, int *rnd, 
-	double *eps, int *emin, double *rmin, int *emax, 
-	double *rmax)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    DLAMC2 determines the machine parameters specified in its argument   
-    list.   
-
-    Arguments   
-    =========   
-
-    BETA    (output) INT   
-            The base of the machine.   
-
-    T       (output) INT   
-            The number of ( BETA ) digits in the mantissa.   
-
-    RND     (output) INT   
-            Specifies whether proper rounding  ( RND = .TRUE. )  or   
-            chopping  ( RND = .FALSE. )  occurs in addition. This may not 
-  
-            be a reliable guide to the way in which the machine performs 
-  
-            its arithmetic.   
-
-    EPS     (output) DOUBLE PRECISION   
-            The smallest positive number such that   
-
-               fl( 1.0 - EPS ) .LT. 1.0,   
-
-            where fl denotes the computed value.   
-
-    EMIN    (output) INT   
-            The minimum exponent before (gradual) underflow occurs.   
-
-    RMIN    (output) DOUBLE PRECISION   
-            The smallest normalized number for the machine, given by   
-            BASE**( EMIN - 1 ), where  BASE  is the floating point value 
-  
-            of BETA.   
-
-    EMAX    (output) INT   
-            The maximum exponent before overflow occurs.   
-
-    RMAX    (output) DOUBLE PRECISION   
-            The largest positive number for the machine, given by   
-            BASE**EMAX * ( 1 - EPS ), where  BASE  is the floating point 
-  
-            value of BETA.   
-
-    Further Details   
-    ===============   
-
-    The computation of  EPS  is based on a routine PARANOIA by   
-    W. Kahan of the University of California at Berkeley.   
-
-   ===================================================================== 
-*/
-    /* Table of constant values */
-    static int c__1 = 1;
-    
-    /* Initialized data */
-    static int first = TRUE_;
-    static int iwarn = FALSE_;
-    /* System generated locals */
-    int i__1;
-    double d__1, d__2, d__3, d__4, d__5;
-    /* Builtin functions */
-    double pow_di(double *, int *);
-    /* Local variables */
-    static int ieee;
-    static double half;
-    static int lrnd;
-    static double leps, zero, a, b, c;
-    static int i, lbeta;
-    static double rbase;
-    static int lemin, lemax, gnmin;
-    static double small;
-    static int gpmin;
-    static double third, lrmin, lrmax, sixth;
-    extern /* Subroutine */ int dlamc1_(int *, int *, int *, 
-	    int *);
-    extern double dlamc3_(double *, double *);
-    static int lieee1;
-    extern /* Subroutine */ int dlamc4_(int *, double *, int *), 
-	    dlamc5_(int *, int *, int *, int *, int *, 
-	    double *);
-    static int lt, ngnmin, ngpmin;
-    static double one, two;
-
-    if (first) {
-	first = FALSE_;
-	zero = 0.;
-	one = 1.;
-	two = 2.;
-
-/*        LBETA, LT, LRND, LEPS, LEMIN and LRMIN  are the local values
- of   
-          BETA, T, RND, EPS, EMIN and RMIN.   
-
-          Throughout this routine  we use the function  DLAMC3  to ens
-ure   
-          that relevant values are stored  and not held in registers, 
- or   
-          are not affected by optimizers.   
-
-          DLAMC1 returns the parameters  LBETA, LT, LRND and LIEEE1. 
-*/
-
-	dlamc1_(&lbeta, &lt, &lrnd, &lieee1);
-
-/*        Start to find EPS. */
-
-	b = (double) lbeta;
-	i__1 = -lt;
-	a = pow_di(&b, &i__1);
-	leps = a;
-
-/*        Try some tricks to see whether or not this is the correct  E
-PS. */
-
-	b = two / 3;
-	half = one / 2;
-	d__1 = -half;
-	sixth = dlamc3_(&b, &d__1);
-	third = dlamc3_(&sixth, &sixth);
-	d__1 = -half;
-	b = dlamc3_(&third, &d__1);
-	b = dlamc3_(&b, &sixth);
-	b = abs(b);
-	if (b < leps) {
-	    b = leps;
-	}
-
-	leps = 1.;
-
-/* +       WHILE( ( LEPS.GT.B ).AND.( B.GT.ZERO ) )LOOP */
-L10:
-	if (leps > b && b > zero) {
-	    leps = b;
-	    d__1 = half * leps;
-/* Computing 5th power */
-	    d__3 = two, d__4 = d__3, d__3 *= d__3;
-/* Computing 2nd power */
-	    d__5 = leps;
-	    d__2 = d__4 * (d__3 * d__3) * (d__5 * d__5);
-	    c = dlamc3_(&d__1, &d__2);
-	    d__1 = -c;
-	    c = dlamc3_(&half, &d__1);
-	    b = dlamc3_(&half, &c);
-	    d__1 = -b;
-	    c = dlamc3_(&half, &d__1);
-	    b = dlamc3_(&half, &c);
-	    goto L10;
-	}
-/* +       END WHILE */
-
-	if (a < leps) {
-	    leps = a;
-	}
-
-/*        Computation of EPS complete.   
-
-          Now find  EMIN.  Let A = + or - 1, and + or - (1 + BASE**(-3
-)).   
-          Keep dividing  A by BETA until (gradual) underflow occurs. T
-his   
-          is detected when we cannot recover the previous A. */
-
-	rbase = one / lbeta;
-	small = one;
-	for (i = 1; i <= 3; ++i) {
-	    d__1 = small * rbase;
-	    small = dlamc3_(&d__1, &zero);
-/* L20: */
-	}
-	a = dlamc3_(&one, &small);
-	dlamc4_(&ngpmin, &one, &lbeta);
-	d__1 = -one;
-	dlamc4_(&ngnmin, &d__1, &lbeta);
-	dlamc4_(&gpmin, &a, &lbeta);
-	d__1 = -a;
-	dlamc4_(&gnmin, &d__1, &lbeta);
-	ieee = FALSE_;
-
-	if (ngpmin == ngnmin && gpmin == gnmin) {
-	    if (ngpmin == gpmin) {
-		lemin = ngpmin;
-/*            ( Non twos-complement machines, no gradual under
-flow;   
-                e.g.,  VAX ) */
-	    } else if (gpmin - ngpmin == 3) {
-		lemin = ngpmin - 1 + lt;
-		ieee = TRUE_;
-/*            ( Non twos-complement machines, with gradual und
-erflow;   
-                e.g., IEEE standard followers ) */
-	    } else {
-		lemin = min(ngpmin,gpmin);
-/*            ( A guess; no known machine ) */
-		iwarn = TRUE_;
-	    }
-
-	} else if (ngpmin == gpmin && ngnmin == gnmin) {
-	    if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1) {
-		lemin = max(ngpmin,ngnmin);
-/*            ( Twos-complement machines, no gradual underflow
-;   
-                e.g., CYBER 205 ) */
-	    } else {
-		lemin = min(ngpmin,ngnmin);
-/*            ( A guess; no known machine ) */
-		iwarn = TRUE_;
-	    }
-
-	} else if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1 && gpmin == gnmin)
-		 {
-	    if (gpmin - min(ngpmin,ngnmin) == 3) {
-		lemin = max(ngpmin,ngnmin) - 1 + lt;
-/*            ( Twos-complement machines with gradual underflo
-w;   
-                no known machine ) */
-	    } else {
-		lemin = min(ngpmin,ngnmin);
-/*            ( A guess; no known machine ) */
-		iwarn = TRUE_;
-	    }
-
-	} else {
-/* Computing MIN */
-	    i__1 = min(ngpmin,ngnmin), i__1 = min(i__1,gpmin);
-	    lemin = min(i__1,gnmin);
-/*         ( A guess; no known machine ) */
-	    iwarn = TRUE_;
-	}
-/* **   
-   Comment out this if block if EMIN is ok */
-	if (iwarn) {
-	    first = TRUE_;
-	    printf("\n\n WARNING. The value EMIN may be incorrect:- ");
-	    printf("EMIN = %8i\n",lemin);
-	    printf("If, after inspection, the value EMIN looks acceptable");
-            printf("please comment out \n the IF block as marked within the"); 
-            printf("code of routine DLAMC2, \n otherwise supply EMIN"); 
-            printf("explicitly.\n");
-	}
-/* **   
-
-          Assume IEEE arithmetic if we found denormalised  numbers abo
-ve,   
-          or if arithmetic seems to round in the  IEEE style,  determi
-ned   
-          in routine DLAMC1. A true IEEE machine should have both  thi
-ngs   
-          true; however, faulty machines may have one or the other. */
-
-	ieee = ieee || lieee1;
-
-/*        Compute  RMIN by successive division by  BETA. We could comp
-ute   
-          RMIN as BASE**( EMIN - 1 ),  but some machines underflow dur
-ing   
-          this computation. */
-
-	lrmin = 1.;
-	i__1 = 1 - lemin;
-	for (i = 1; i <= 1-lemin; ++i) {
-	    d__1 = lrmin * rbase;
-	    lrmin = dlamc3_(&d__1, &zero);
-/* L30: */
-	}
-
-/*        Finally, call DLAMC5 to compute EMAX and RMAX. */
-
-	dlamc5_(&lbeta, &lt, &lemin, &ieee, &lemax, &lrmax);
-    }
-
-    *beta = lbeta;
-    *t = lt;
-    *rnd = lrnd;
-    *eps = leps;
-    *emin = lemin;
-    *rmin = lrmin;
-    *emax = lemax;
-    *rmax = lrmax;
-
-    return 0;
-
-
-/*     End of DLAMC2 */
-
-} /* dlamc2_ */
-
-
-double dlamc3_(double *a, double *b)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    DLAMC3  is intended to force  A  and  B  to be stored prior to doing 
-  
-    the addition of  A  and  B ,  for use in situations where optimizers 
-  
-    might hold one of these in a register.   
-
-    Arguments   
-    =========   
-
-    A, B    (input) DOUBLE PRECISION   
-            The values A and B.   
-
-   ===================================================================== 
-*/
-/* >>Start of File<<   
-       System generated locals */
-    double ret_val;
-
-    ret_val = *a + *b;
-
-    return ret_val;
-
-/*     End of DLAMC3 */
-
-} /* dlamc3_ */
-
-
-/* Subroutine */ int dlamc4_(int *emin, double *start, int *base)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    DLAMC4 is a service routine for DLAMC2.   
-
-    Arguments   
-    =========   
-
-    EMIN    (output) EMIN   
-            The minimum exponent before (gradual) underflow, computed by 
-  
-            setting A = START and dividing by BASE until the previous A   
-            can not be recovered.   
-
-    START   (input) DOUBLE PRECISION   
-            The starting point for determining EMIN.   
-
-    BASE    (input) INT   
-            The base of the machine.   
-
-   ===================================================================== 
-*/
-    /* System generated locals */
-    int i__1;
-    double d__1;
-    /* Local variables */
-    static double zero, a;
-    static int i;
-    static double rbase, b1, b2, c1, c2, d1, d2;
-    extern double dlamc3_(double *, double *);
-    static double one;
-
-    a = *start;
-    one = 1.;
-    rbase = one / *base;
-    zero = 0.;
-    *emin = 1;
-    d__1 = a * rbase;
-    b1 = dlamc3_(&d__1, &zero);
-    c1 = a;
-    c2 = a;
-    d1 = a;
-    d2 = a;
-/* +    WHILE( ( C1.EQ.A ).AND.( C2.EQ.A ).AND.   
-      $       ( D1.EQ.A ).AND.( D2.EQ.A )      )LOOP */
-L10:
-    if (c1 == a && c2 == a && d1 == a && d2 == a) {
-	--(*emin);
-	a = b1;
-	d__1 = a / *base;
-	b1 = dlamc3_(&d__1, &zero);
-	d__1 = b1 * *base;
-	c1 = dlamc3_(&d__1, &zero);
-	d1 = zero;
-	i__1 = *base;
-	for (i = 1; i <= *base; ++i) {
-	    d1 += b1;
-/* L20: */
-	}
-	d__1 = a * rbase;
-	b2 = dlamc3_(&d__1, &zero);
-	d__1 = b2 / rbase;
-	c2 = dlamc3_(&d__1, &zero);
-	d2 = zero;
-	i__1 = *base;
-	for (i = 1; i <= *base; ++i) {
-	    d2 += b2;
-/* L30: */
-	}
-	goto L10;
-    }
-/* +    END WHILE */
-
-    return 0;
-
-/*     End of DLAMC4 */
-
-} /* dlamc4_ */
-
-
-/* Subroutine */ int dlamc5_(int *beta, int *p, int *emin, 
-	int *ieee, int *emax, double *rmax)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    DLAMC5 attempts to compute RMAX, the largest machine floating-point   
-    number, without overflow.  It assumes that EMAX + abs(EMIN) sum   
-    approximately to a power of 2.  It will fail on machines where this   
-    assumption does not hold, for example, the Cyber 205 (EMIN = -28625, 
-  
-    EMAX = 28718).  It will also fail if the value supplied for EMIN is   
-    too large (i.e. too close to zero), probably with overflow.   
-
-    Arguments   
-    =========   
-
-    BETA    (input) INT   
-            The base of floating-point arithmetic.   
-
-    P       (input) INT   
-            The number of base BETA digits in the mantissa of a   
-            floating-point value.   
-
-    EMIN    (input) INT   
-            The minimum exponent before (gradual) underflow.   
-
-    IEEE    (input) INT   
-            A int flag specifying whether or not the arithmetic   
-            system is thought to comply with the IEEE standard.   
-
-    EMAX    (output) INT   
-            The largest exponent before overflow   
-
-    RMAX    (output) DOUBLE PRECISION   
-            The largest machine floating-point number.   
-
-   ===================================================================== 
-  
-
-
-       First compute LEXP and UEXP, two powers of 2 that bound   
-       abs(EMIN). We then assume that EMAX + abs(EMIN) will sum   
-       approximately to the bound that is closest to abs(EMIN).   
-       (EMAX is the exponent of the required number RMAX). */
-    /* Table of constant values */
-    static double c_b5 = 0.;
-    
-    /* System generated locals */
-    int i__1;
-    double d__1;
-    /* Local variables */
-    static int lexp;
-    static double oldy;
-    static int uexp, i;
-    static double y, z;
-    static int nbits;
-    extern double dlamc3_(double *, double *);
-    static double recbas;
-    static int exbits, expsum, try__;
-
-
-
-    lexp = 1;
-    exbits = 1;
-L10:
-    try__ = lexp << 1;
-    if (try__ <= -(*emin)) {
-	lexp = try__;
-	++exbits;
-	goto L10;
-    }
-    if (lexp == -(*emin)) {
-	uexp = lexp;
-    } else {
-	uexp = try__;
-	++exbits;
-    }
-
-/*     Now -LEXP is less than or equal to EMIN, and -UEXP is greater   
-       than or equal to EMIN. EXBITS is the number of bits needed to   
-       store the exponent. */
-
-    if (uexp + *emin > -lexp - *emin) {
-	expsum = lexp << 1;
-    } else {
-	expsum = uexp << 1;
-    }
-
-/*     EXPSUM is the exponent range, approximately equal to   
-       EMAX - EMIN + 1 . */
-
-    *emax = expsum + *emin - 1;
-    nbits = exbits + 1 + *p;
-
-/*     NBITS is the total number of bits needed to store a   
-       floating-point number. */
-
-    if (nbits % 2 == 1 && *beta == 2) {
-
-/*        Either there are an odd number of bits used to store a   
-          floating-point number, which is unlikely, or some bits are 
-  
-          not used in the representation of numbers, which is possible
-,   
-          (e.g. Cray machines) or the mantissa has an implicit bit,   
-          (e.g. IEEE machines, Dec Vax machines), which is perhaps the
-   
-          most likely. We have to assume the last alternative.   
-          If this is true, then we need to reduce EMAX by one because 
-  
-          there must be some way of representing zero in an implicit-b
-it   
-          system. On machines like Cray, we are reducing EMAX by one 
-  
-          unnecessarily. */
-
-	--(*emax);
-    }
-
-    if (*ieee) {
-
-/*        Assume we are on an IEEE machine which reserves one exponent
-   
-          for infinity and NaN. */
-
-	--(*emax);
-    }
-
-/*     Now create RMAX, the largest machine number, which should   
-       be equal to (1.0 - BETA**(-P)) * BETA**EMAX .   
-
-       First compute 1.0 - BETA**(-P), being careful that the   
-       result is less than 1.0 . */
-
-    recbas = 1. / *beta;
-    z = *beta - 1.;
-    y = 0.;
-    i__1 = *p;
-    for (i = 1; i <= *p; ++i) {
-	z *= recbas;
-	if (y < 1.) {
-	    oldy = y;
-	}
-	y = dlamc3_(&y, &z);
-/* L20: */
-    }
-    if (y >= 1.) {
-	y = oldy;
-    }
-
-/*     Now multiply by BETA**EMAX to get RMAX. */
-
-    i__1 = *emax;
-    for (i = 1; i <= *emax; ++i) {
-	d__1 = y * *beta;
-	y = dlamc3_(&d__1, &c_b5);
-/* L30: */
-    }
-
-    *rmax = y;
-    return 0;
-
-/*     End of DLAMC5 */
-
-} /* dlamc5_ */
-
-double pow_di(double *ap, int *bp)
-{
-    double pow, x;
-    int n;
-
-    pow = 1;
-    x = *ap;
-    n = *bp;
-
-    if(n != 0){
-	if(n < 0) {
-	    n = -n;
-	    x = 1/x;
-	}
-	for( ; ; ) {
-	    if(n & 01) pow *= x;
-	    if(n >>= 1)	x *= x;
-	    else break;
-	}
-    }
-    return(pow);
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dlangs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dlangs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dlangs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dlangs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,115 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * File name:	dlangs.c
- * History:     Modified from lapack routine DLANGE
- */
-/*
-  This file has been modified to be compatible with the HYPRE
-  linear solver
-*/
-
-#include <math.h>
-#include "slu_ddefs.h"
-
-double dlangs(char *norm, SuperMatrix *A)
-{
-/* 
-    Purpose   
-    =======   
-
-    DLANGS returns the value of the one norm, or the Frobenius norm, or 
-    the infinity norm, or the element of largest absolute value of a 
-    real matrix A.   
-
-    Description   
-    ===========   
-
-    DLANGE returns the value   
-
-       DLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'   
-                (   
-                ( norm1(A),         NORM = '1', 'O' or 'o'   
-                (   
-                ( normI(A),         NORM = 'I' or 'i'   
-                (   
-                ( normF(A),         NORM = 'F', 'f', 'E' or 'e'   
-
-    where  norm1  denotes the  one norm of a matrix (maximum column sum), 
-    normI  denotes the  infinity norm  of a matrix  (maximum row sum) and 
-    normF  denotes the  Frobenius norm of a matrix (square root of sum of 
-    squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.   
-
-    Arguments   
-    =========   
-
-    NORM    (input) CHARACTER*1   
-            Specifies the value to be returned in DLANGE as described above.   
-    A       (input) SuperMatrix*
-            The M by N sparse matrix A. 
-
-   ===================================================================== 
-*/
-    
-    /* Local variables */
-    NCformat *Astore;
-    double   *Aval;
-    int      i, j, irow;
-    double   value=0.0, sum;
-    double   *rwork;
-
-    Astore = (NCformat*) A->Store;
-    Aval   = (double*)Astore->nzval;
-    
-    if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) {
-	value = 0.;
-	
-    } else if (superlu_lsame(norm, "M")) {
-	/* Find max(abs(A(i,j))). */
-	value = 0.;
-	for (j = 0; j < A->ncol; ++j)
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++)
-		value = SUPERLU_MAX( value, fabs( Aval[i]) );
-	
-    } else if (superlu_lsame(norm, "O") || *(unsigned char *)norm == '1') {
-	/* Find norm1(A). */
-	value = 0.;
-	for (j = 0; j < A->ncol; ++j) {
-	    sum = 0.;
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) 
-		sum += fabs(Aval[i]);
-	    value = SUPERLU_MAX(value,sum);
-	}
-	
-    } else if (superlu_lsame(norm, "I")) {
-	/* Find normI(A). */
-	if ( !(rwork = (double *) SUPERLU_MALLOC(A->nrow * sizeof(double))) )
-	    ABORT("SUPERLU_MALLOC fails for rwork.");
-	for (i = 0; i < A->nrow; ++i) rwork[i] = 0.;
-	for (j = 0; j < A->ncol; ++j)
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) {
-		irow = Astore->rowind[i];
-		rwork[irow] += fabs(Aval[i]);
-	    }
-	value = 0.;
-	for (i = 0; i < A->nrow; ++i)
-	    value = SUPERLU_MAX(value, rwork[i]);
-	
-	SUPERLU_FREE (rwork);
-	
-    } else if (superlu_lsame(norm, "F") || superlu_lsame(norm, "E")) {
-	/* Find normF(A). */
-	ABORT("Not implemented.");
-    } else
-	ABORT("Illegal norm specified.");
-
-    return (value);
-
-} /* dlangs */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dlaqgs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dlaqgs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dlaqgs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dlaqgs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,144 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * File name:	dlaqgs.c
- * History:     Modified from LAPACK routine DLAQGE
- */
-/*
-  This file has been modified to be compatible with the HYPRE
-  linear solver
-*/
-
-#include <math.h>
-#include "slu_ddefs.h"
-#ifdef HYPRE_USING_HYPRE_LAPACK
-#include "hypre_lapack.h"
-#endif
-
-void
-dlaqgs(SuperMatrix *A, double *r, double *c, 
-	double rowcnd, double colcnd, double amax, char *equed)
-{
-/*
-    Purpose   
-    =======   
-
-    DLAQGS equilibrates a general sparse M by N matrix A using the row and   
-    scaling factors in the vectors R and C.   
-
-    See supermatrix.h for the definition of 'SuperMatrix' structure.
-
-    Arguments   
-    =========   
-
-    A       (input/output) SuperMatrix*
-            On exit, the equilibrated matrix.  See EQUED for the form of 
-            the equilibrated matrix. The type of A can be:
-	    Stype = NC; Dtype = SLU_D; Mtype = GE.
-	    
-    R       (input) double*, dimension (A->nrow)
-            The row scale factors for A.
-	    
-    C       (input) double*, dimension (A->ncol)
-            The column scale factors for A.
-	    
-    ROWCND  (input) double
-            Ratio of the smallest R(i) to the largest R(i).
-	    
-    COLCND  (input) double
-            Ratio of the smallest C(i) to the largest C(i).
-	    
-    AMAX    (input) double
-            Absolute value of largest matrix entry.
-	    
-    EQUED   (output) char*
-            Specifies the form of equilibration that was done.   
-            = 'N':  No equilibration   
-            = 'R':  Row equilibration, i.e., A has been premultiplied by  
-                    diag(R).   
-            = 'C':  Column equilibration, i.e., A has been postmultiplied  
-                    by diag(C).   
-            = 'B':  Both row and column equilibration, i.e., A has been
-                    replaced by diag(R) * A * diag(C).   
-
-    Internal Parameters   
-    ===================   
-
-    THRESH is a threshold value used to decide if row or column scaling   
-    should be done based on the ratio of the row or column scaling   
-    factors.  If ROWCND < THRESH, row scaling is done, and if   
-    COLCND < THRESH, column scaling is done.   
-
-    LARGE and SMALL are threshold values used to decide if row scaling   
-    should be done based on the absolute size of the largest matrix   
-    element.  If AMAX > LARGE or AMAX < SMALL, row scaling is done.   
-
-    ===================================================================== 
-*/
-
-#define THRESH    (0.1)
-    
-    /* Local variables */
-    NCformat *Astore;
-    double   *Aval;
-    int i, j, irow;
-    double large, small, cj;
-    extern double hypre_F90_NAME_LAPACK(dlamch,DLAMCH)(const char *);
-
-
-    /* Quick return if possible */
-    if (A->nrow <= 0 || A->ncol <= 0) {
-	*(unsigned char *)equed = 'N';
-	return;
-    }
-
-    Astore = (NCformat*) A->Store;
-    Aval = (double*) Astore->nzval;
-    
-    /* Initialize LARGE and SMALL. */
-    small = hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("Safe minimum") / hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("Precision");
-    large = 1. / small;
-
-    if (rowcnd >= THRESH && amax >= small && amax <= large) {
-	if (colcnd >= THRESH)
-	    *(unsigned char *)equed = 'N';
-	else {
-	    /* Column scaling */
-	    for (j = 0; j < A->ncol; ++j) {
-		cj = c[j];
-		for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-		    Aval[i] *= cj;
-                }
-	    }
-	    *(unsigned char *)equed = 'C';
-	}
-    } else if (colcnd >= THRESH) {
-	/* Row scaling, no column scaling */
-	for (j = 0; j < A->ncol; ++j)
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-		irow = Astore->rowind[i];
-		Aval[i] *= r[irow];
-	    }
-	*(unsigned char *)equed = 'R';
-    } else {
-	/* Row and column scaling */
-	for (j = 0; j < A->ncol; ++j) {
-	    cj = c[j];
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-		irow = Astore->rowind[i];
-		Aval[i] *= cj * r[irow];
-	    }
-	}
-	*(unsigned char *)equed = 'B';
-    }
-
-    return;
-
-} /* dlaqgs */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dmemory.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dmemory.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dmemory.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dmemory.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,690 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-#include "slu_ddefs.h"
-
-/* Constants */
-#define NO_MEMTYPE  4      /* 0: lusup;
-			      1: ucol;
-			      2: lsub;
-			      3: usub */
-#define GluIntArray(n)   (5 * (n) + 5)
-
-/* Internal prototypes */
-void  *dexpand (int *, MemType,int, int, GlobalLU_t *);
-int   dLUWorkInit (int, int, int, int **, double **, LU_space_t);
-void  copy_mem_double (int, void *, void *);
-void  dStackCompress (GlobalLU_t *);
-void  dSetupSpace (void *, int, LU_space_t *);
-void  *duser_malloc (int, int);
-void  duser_free (int, int);
-
-/* External prototypes (in memory.c - prec-indep) */
-extern void    copy_mem_int    (int, void *, void *);
-extern void    user_bcopy      (char *, char *, int);
-
-/* Headers for 4 types of dynamatically managed memory */
-typedef struct e_node {
-    int size;      /* length of the memory that has been used */
-    void *mem;     /* pointer to the new malloc'd store */
-} ExpHeader;
-
-typedef struct {
-    int  size;
-    int  used;
-    int  top1;  /* grow upward, relative to &array[0] */
-    int  top2;  /* grow downward */
-    void *array;
-} LU_stack_t;
-
-/* Variables local to this file */
-static ExpHeader *expanders = 0; /* Array of pointers to 4 types of memory */
-static LU_stack_t stack;
-static int no_expand;
-
-/* Macros to manipulate stack */
-#define StackFull(x)         ( x + stack.used >= stack.size )
-#define NotDoubleAlign(addr) ( (long int)addr & 7 )
-#define DoubleAlign(addr)    ( ((long int)addr + 7) & ~7L )
-#define TempSpace(m, w)      ( (2*w + 4 + NO_MARKER) * m * sizeof(int) + \
-			      (w + 1) * m * sizeof(double) )
-#define Reduce(alpha)        ((alpha + 1) / 2)  /* i.e. (alpha-1)/2 + 1 */
-
-
-
-
-/*
- * Setup the memory model to be used for factorization.
- *    lwork = 0: use system malloc;
- *    lwork > 0: use user-supplied work[] space.
- */
-void dSetupSpace(void *work, int lwork, LU_space_t *MemModel)
-{
-    if ( lwork == 0 ) {
-	*MemModel = SYSTEM; /* malloc/free */
-    } else if ( lwork > 0 ) {
-	*MemModel = USER;   /* user provided space */
-	stack.used = 0;
-	stack.top1 = 0;
-	stack.top2 = (lwork/4)*4; /* must be word addressable */
-	stack.size = stack.top2;
-	stack.array = (void *) work;
-    }
-}
-
-
-
-void *duser_malloc(int bytes, int which_end)
-{
-    void *buf;
-    
-    if ( StackFull(bytes) ) return (NULL);
-
-    if ( which_end == HEAD ) {
-	buf = (char*) stack.array + stack.top1;
-	stack.top1 += bytes;
-    } else {
-	stack.top2 -= bytes;
-	buf = (char*) stack.array + stack.top2;
-    }
-    
-    stack.used += bytes;
-    return buf;
-}
-
-
-void duser_free(int bytes, int which_end)
-{
-    if ( which_end == HEAD ) {
-	stack.top1 -= bytes;
-    } else {
-	stack.top2 += bytes;
-    }
-    stack.used -= bytes;
-}
-
-
-
-/*
- * mem_usage consists of the following fields:
- *    - for_lu (float)
- *      The amount of space used in bytes for the L\U data structures.
- *    - total_needed (float)
- *      The amount of space needed in bytes to perform factorization.
- *    - expansions (int)
- *      Number of memory expansions during the LU factorization.
- */
-int dQuerySpace(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage)
-{
-    SCformat *Lstore;
-    NCformat *Ustore;
-    register int n, iword, dword, panel_size = sp_ienv(1);
-
-    Lstore = (SCformat*) L->Store;
-    Ustore = (NCformat*) U->Store;
-    n = L->ncol;
-    iword = sizeof(int);
-    dword = sizeof(double);
-
-    /* For LU factors */
-    mem_usage->for_lu = (float)( (4*n + 3) * iword + Lstore->nzval_colptr[n] *
-				 dword + Lstore->rowind_colptr[n] * iword );
-    mem_usage->for_lu += (float)( (n + 1) * iword +
-				 Ustore->colptr[n] * (dword + iword) );
-
-    /* Working storage to support factorization */
-    mem_usage->total_needed = mem_usage->for_lu +
-	(float)( (2 * panel_size + 4 + NO_MARKER) * n * iword +
-		(panel_size + 1) * n * dword );
-
-    mem_usage->expansions = --no_expand;
-
-    return 0;
-} /* dQuerySpace */
-
-/*
- * Allocate storage for the data structures common to all factor routines.
- * For those unpredictable size, make a guess as FILL * nnz(A).
- * Return value:
- *     If lwork = -1, return the estimated amount of space required, plus n;
- *     otherwise, return the amount of space actually allocated when
- *     memory allocation failure occurred.
- */
-int
-dLUMemInit(fact_t fact, void *work, int lwork, int m, int n, int annz,
-	  int panel_size, SuperMatrix *L, SuperMatrix *U, GlobalLU_t *Glu,
-	  int **iwork, double **dwork)
-{
-    int      info, iword, dword;
-    SCformat *Lstore;
-    NCformat *Ustore;
-    int      *xsup, *supno;
-    int      *lsub, *xlsub;
-    double   *lusup;
-    int      *xlusup;
-    double   *ucol;
-    int      *usub, *xusub;
-    int      nzlmax, nzumax, nzlumax;
-    int      FILL = sp_ienv(6);
-    
-    Glu->n    = n;
-    no_expand = 0;
-    iword     = sizeof(int);
-    dword     = sizeof(double);
-
-    if ( !expanders )	
-        expanders = (ExpHeader*)SUPERLU_MALLOC(NO_MEMTYPE * sizeof(ExpHeader));
-    if ( !expanders ) ABORT("SUPERLU_MALLOC fails for expanders");
-    
-    if ( fact != SamePattern_SameRowPerm ) {
-	/* Guess for L\U factors */
-	nzumax = nzlumax = FILL * annz;
-	nzlmax = SUPERLU_MAX(1, FILL/4.) * annz;
-
-	if ( lwork == -1 ) {
-	    return ( GluIntArray(n) * iword + TempSpace(m, panel_size)
-		    + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n );
-        } else {
-	    dSetupSpace(work, lwork, &Glu->MemModel);
-	}
-	
-#if ( PRNTlevel >= 1 )
-	printf("dLUMemInit() called: FILL %ld, nzlmax %ld, nzumax %ld\n", 
-	       FILL, nzlmax, nzumax);
-	fflush(stdout);
-#endif	
-	
-	/* Integer pointers for L\U factors */
-	if ( Glu->MemModel == SYSTEM ) {
-	    xsup   = intMalloc(n+1);
-	    supno  = intMalloc(n+1);
-	    xlsub  = intMalloc(n+1);
-	    xlusup = intMalloc(n+1);
-	    xusub  = intMalloc(n+1);
-	} else {
-	    xsup   = (int *)duser_malloc((n+1) * iword, HEAD);
-	    supno  = (int *)duser_malloc((n+1) * iword, HEAD);
-	    xlsub  = (int *)duser_malloc((n+1) * iword, HEAD);
-	    xlusup = (int *)duser_malloc((n+1) * iword, HEAD);
-	    xusub  = (int *)duser_malloc((n+1) * iword, HEAD);
-	}
-
-	lusup = (double *) dexpand( &nzlumax, LUSUP, 0, 0, Glu );
-	ucol  = (double *) dexpand( &nzumax, UCOL, 0, 0, Glu );
-	lsub  = (int *)    dexpand( &nzlmax, LSUB, 0, 0, Glu );
-	usub  = (int *)    dexpand( &nzumax, USUB, 0, 1, Glu );
-
-	while ( !lusup || !ucol || !lsub || !usub ) {
-	    if ( Glu->MemModel == SYSTEM ) {
-		SUPERLU_FREE(lusup); 
-		SUPERLU_FREE(ucol); 
-		SUPERLU_FREE(lsub); 
-		SUPERLU_FREE(usub);
-	    } else {
-		duser_free((nzlumax+nzumax)*dword+(nzlmax+nzumax)*iword, HEAD);
-	    }
-	    nzlumax /= 2;
-	    nzumax /= 2;
-	    nzlmax /= 2;
-	    if ( nzlumax < annz ) {
-		printf("Not enough memory to perform factorization.\n");
-		return (dmemory_usage(nzlmax, nzumax, nzlumax, n) + n);
-	    }
-#if ( PRNTlevel >= 1)
-	    printf("dLUMemInit() reduce size: nzlmax %ld, nzumax %ld\n", 
-		   nzlmax, nzumax);
-	    fflush(stdout);
-#endif
-	    lusup = (double *) dexpand( &nzlumax, LUSUP, 0, 0, Glu );
-	    ucol  = (double *) dexpand( &nzumax, UCOL, 0, 0, Glu );
-	    lsub  = (int *)    dexpand( &nzlmax, LSUB, 0, 0, Glu );
-	    usub  = (int *)    dexpand( &nzumax, USUB, 0, 1, Glu );
-	}
-	
-    } else {
-	/* fact == SamePattern_SameRowPerm */
-	Lstore   = (SCformat*) L->Store;
-	Ustore   = (NCformat*) U->Store;
-	xsup     = Lstore->sup_to_col;
-	supno    = Lstore->col_to_sup;
-	xlsub    = Lstore->rowind_colptr;
-	xlusup   = Lstore->nzval_colptr;
-	xusub    = Ustore->colptr;
-	nzlmax   = Glu->nzlmax;    /* max from previous factorization */
-	nzumax   = Glu->nzumax;
-	nzlumax  = Glu->nzlumax;
-	
-	if ( lwork == -1 ) {
-	    return ( GluIntArray(n) * iword + TempSpace(m, panel_size)
-		    + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n );
-        } else if ( lwork == 0 ) {
-	    Glu->MemModel = SYSTEM;
-	} else {
-	    Glu->MemModel = USER;
-	    stack.top2 = (lwork/4)*4; /* must be word-addressable */
-	    stack.size = stack.top2;
-	}
-	
-	lsub  = (int*)  Lstore->rowind;
-	expanders[LSUB].mem  = (int*)  Lstore->rowind;
-	lusup = (double*) Lstore->nzval;
-	expanders[LUSUP].mem = (double*) Lstore->nzval;
-	usub  = (int*) Ustore->rowind;
-	expanders[USUB].mem  = (int*) Ustore->rowind;
-	ucol  = (double*) Ustore->nzval;
-	expanders[UCOL].mem  = (double*) Ustore->nzval;;
-	expanders[LSUB].size         = nzlmax;
-	expanders[LUSUP].size        = nzlumax;
-	expanders[USUB].size         = nzumax;
-	expanders[UCOL].size         = nzumax;	
-    }
-
-    Glu->xsup    = xsup;
-    Glu->supno   = supno;
-    Glu->lsub    = lsub;
-    Glu->xlsub   = xlsub;
-    Glu->lusup   = lusup;
-    Glu->xlusup  = xlusup;
-    Glu->ucol    = ucol;
-    Glu->usub    = usub;
-    Glu->xusub   = xusub;
-    Glu->nzlmax  = nzlmax;
-    Glu->nzumax  = nzumax;
-    Glu->nzlumax = nzlumax;
-    
-    info = dLUWorkInit(m, n, panel_size, iwork, dwork, Glu->MemModel);
-    if ( info )
-	return ( info + dmemory_usage(nzlmax, nzumax, nzlumax, n) + n);
-    
-    ++no_expand;
-    return 0;
-    
-} /* dLUMemInit */
-
-/* Allocate known working storage. Returns 0 if success, otherwise
-   returns the number of bytes allocated so far when failure occurred. */
-int
-dLUWorkInit(int m, int n, int panel_size, int **iworkptr, 
-            double **dworkptr, LU_space_t MemModel)
-{
-    int    isize, dsize, extra;
-    double *old_ptr;
-    int    maxsuper = sp_ienv(3),
-           rowblk   = sp_ienv(4);
-
-    isize = ( (2 * panel_size + 3 + NO_MARKER ) * m + n ) * sizeof(int);
-    dsize = (m * panel_size +
-	     NUM_TEMPV(m,panel_size,maxsuper,rowblk)) * sizeof(double);
-    
-    if ( MemModel == SYSTEM ) 
-	*iworkptr = (int *) intCalloc(isize/sizeof(int));
-    else
-	*iworkptr = (int *) duser_malloc(isize, TAIL);
-    if ( ! *iworkptr ) {
-	fprintf(stderr, "dLUWorkInit: malloc fails for local iworkptr[]\n");
-	return (isize + n);
-    }
-
-    if ( MemModel == SYSTEM )
-	*dworkptr = (double *) SUPERLU_MALLOC(dsize);
-    else {
-	*dworkptr = (double *) duser_malloc(dsize, TAIL);
-	if ( NotDoubleAlign(*dworkptr) ) {
-	    old_ptr = *dworkptr;
-	    *dworkptr = (double*) DoubleAlign(*dworkptr);
-	    *dworkptr = (double*) ((double*)*dworkptr - 1);
-	    extra = (char*)old_ptr - (char*)*dworkptr;
-#ifdef DEBUG	    
-	    printf("dLUWorkInit: not aligned, extra %d\n", extra);
-#endif	    
-	    stack.top2 -= extra;
-	    stack.used += extra;
-	}
-    }
-    if ( ! *dworkptr ) {
-	fprintf(stderr, "malloc fails for local dworkptr[].");
-	return (isize + dsize + n);
-    }
-	
-    return 0;
-}
-
-
-/*
- * Set up pointers for real working arrays.
- */
-void
-dSetRWork(int m, int panel_size, double *dworkptr,
-	 double **dense, double **tempv)
-{
-    double zero = 0.0;
-
-    int maxsuper = sp_ienv(3),
-        rowblk   = sp_ienv(4);
-    *dense = dworkptr;
-    *tempv = *dense + panel_size*m;
-    dfill (*dense, m * panel_size, zero);
-    dfill (*tempv, NUM_TEMPV(m,panel_size,maxsuper,rowblk), zero);     
-}
-	
-/*
- * Free the working storage used by factor routines.
- */
-void dLUWorkFree(int *iwork, double *dwork, GlobalLU_t *Glu)
-{
-    if ( Glu->MemModel == SYSTEM ) {
-	SUPERLU_FREE (iwork);
-	SUPERLU_FREE (dwork);
-    } else {
-	stack.used -= (stack.size - stack.top2);
-	stack.top2 = stack.size;
-/*	dStackCompress(Glu);  */
-    }
-    
-    SUPERLU_FREE (expanders);	
-    expanders = 0;
-}
-
-/* Expand the data structures for L and U during the factorization.
- * Return value:   0 - successful return
- *               > 0 - number of bytes allocated when run out of space
- */
-int
-dLUMemXpand(int jcol,
-	   int next,          /* number of elements currently in the factors */
-	   MemType mem_type,  /* which type of memory to expand  */
-	   int *maxlen,       /* modified - maximum length of a data structure */
-	   GlobalLU_t *Glu    /* modified - global LU data structures */
-	   )
-{
-    void   *new_mem;
-    
-#ifdef DEBUG    
-    printf("dLUMemXpand(): jcol %d, next %d, maxlen %d, MemType %d\n",
-	   jcol, next, *maxlen, mem_type);
-#endif    
-
-    if (mem_type == USUB) 
-    	new_mem = dexpand(maxlen, mem_type, next, 1, Glu);
-    else
-	new_mem = dexpand(maxlen, mem_type, next, 0, Glu);
-    
-    if ( !new_mem ) {
-	int    nzlmax  = Glu->nzlmax;
-	int    nzumax  = Glu->nzumax;
-	int    nzlumax = Glu->nzlumax;
-    	fprintf(stderr, "Can't expand MemType %d: jcol %d\n", mem_type, jcol);
-    	return (dmemory_usage(nzlmax, nzumax, nzlumax, Glu->n) + Glu->n);
-    }
-
-    switch ( mem_type ) {
-      case LUSUP:
-	Glu->lusup   = (double *) new_mem;
-	Glu->nzlumax = *maxlen;
-	break;
-      case UCOL:
-	Glu->ucol   = (double *) new_mem;
-	Glu->nzumax = *maxlen;
-	break;
-      case LSUB:
-	Glu->lsub   = (int *) new_mem;
-	Glu->nzlmax = *maxlen;
-	break;
-      case USUB:
-	Glu->usub   = (int *) new_mem;
-	Glu->nzumax = *maxlen;
-	break;
-    }
-    
-    return 0;
-    
-}
-
-
-
-void
-copy_mem_double(int howmany, void *oldV, void *newV)
-{
-    register int i;
-    double *dold = (double *)oldV;
-    double *dnew = (double *)newV;
-    for (i = 0; i < howmany; i++) dnew[i] = dold[i];
-}
-
-/*
- * Expand the existing storage to accommodate more fill-ins.
- */
-void
-*dexpand (
-	 int *prev_len,   /* length used from previous call */
-	 MemType type,    /* which part of the memory to expand */
-	 int len_to_copy, /* size of the memory to be copied to new store */
-	 int keep_prev,   /* = 1: use prev_len;
-			     = 0: compute new_len to expand */
-	 GlobalLU_t *Glu  /* modified - global LU data structures */
-	)
-{
-    float    EXPAND = 1.5;
-    float    alpha;
-    void     *new_mem, *old_mem;
-    int      new_len, tries, lword, extra, bytes_to_copy;
-
-    alpha = EXPAND;
-
-    if ( no_expand == 0 || keep_prev ) /* First time allocate requested */
-        new_len = *prev_len;
-    else {
-	new_len = alpha * *prev_len;
-    }
-    
-    if ( type == LSUB || type == USUB ) lword = sizeof(int);
-    else lword = sizeof(double);
-
-    if ( Glu->MemModel == SYSTEM ) {
-	new_mem = (void *) SUPERLU_MALLOC((size_t)new_len * lword);
-	if ( no_expand != 0 ) {
-	    tries = 0;
-	    if ( keep_prev ) {
-		if ( !new_mem ) return (NULL);
-	    } else {
-		while ( !new_mem ) {
-		    if ( ++tries > 10 ) return (NULL);
-		    alpha = Reduce(alpha);
-		    new_len = alpha * *prev_len;
-		    new_mem = (void *) SUPERLU_MALLOC((size_t)new_len * lword);
-		}
-	    }
-	    if ( type == LSUB || type == USUB ) {
-		copy_mem_int(len_to_copy, expanders[type].mem, new_mem);
-	    } else {
-		copy_mem_double(len_to_copy, expanders[type].mem, new_mem);
-	    }
-	    SUPERLU_FREE (expanders[type].mem);
-	}
-	expanders[type].mem = (void *) new_mem;
-	
-    } else { /* MemModel == USER */
-	if ( no_expand == 0 ) {
-	    new_mem = duser_malloc(new_len * lword, HEAD);
-	    if ( NotDoubleAlign(new_mem) &&
-		(type == LUSUP || type == UCOL) ) {
-		old_mem = new_mem;
-		new_mem = (void *)DoubleAlign(new_mem);
-		extra = (char*)new_mem - (char*)old_mem;
-#ifdef DEBUG		
-		printf("expand(): not aligned, extra %d\n", extra);
-#endif		
-		stack.top1 += extra;
-		stack.used += extra;
-	    }
-	    expanders[type].mem = (void *) new_mem;
-	}
-	else {
-	    tries = 0;
-	    extra = (new_len - *prev_len) * lword;
-	    if ( keep_prev ) {
-		if ( StackFull(extra) ) return (NULL);
-	    } else {
-		while ( StackFull(extra) ) {
-		    if ( ++tries > 10 ) return (NULL);
-		    alpha = Reduce(alpha);
-		    new_len = alpha * *prev_len;
-		    extra = (new_len - *prev_len) * lword;	    
-		}
-	    }
-
-	    if ( type != USUB ) {
-		new_mem = (void*)((char*)expanders[type + 1].mem + extra);
-		bytes_to_copy = (char*)stack.array + stack.top1
-		    - (char*)expanders[type + 1].mem;
-		user_bcopy((char*)expanders[type+1].mem,(char*) new_mem, bytes_to_copy);
-
-		if ( type < USUB ) {
-		    Glu->usub =
-			(int*)((char*)expanders[USUB].mem + extra);
-			expanders[USUB].mem =
-			(int*)((char*)expanders[USUB].mem + extra);
-		}
-		if ( type < LSUB ) {
-		    Glu->lsub =
-			(int*)((char*)expanders[LSUB].mem + extra);
-		    expanders[LSUB].mem =
-			(int*)((char*)expanders[LSUB].mem + extra);
-		}
-		if ( type < UCOL ) {
-		    Glu->ucol =
-			(double*)((char*)expanders[UCOL].mem + extra);
-		    expanders[UCOL].mem =
-			(double*)((char*)expanders[UCOL].mem + extra);
-		}
-		stack.top1 += extra;
-		stack.used += extra;
-		if ( type == UCOL ) {
-		    stack.top1 += extra;   /* Add same amount for USUB */
-		    stack.used += extra;
-		}
-		
-	    } /* if ... */
-
-	} /* else ... */
-    }
-
-    expanders[type].size = new_len;
-    *prev_len = new_len;
-    if ( no_expand ) ++no_expand;
-    
-    return (void *) expanders[type].mem;
-    
-} /* dexpand */
-
-
-/*
- * Compress the work[] array to remove fragmentation.
- */
-void
-dStackCompress(GlobalLU_t *Glu)
-{
-    register int iword, dword, ndim;
-    char    *last, *fragment;
-    int      *ifrom, *ito;
-    double   *dfrom, *dto;
-    int      *xlsub, *lsub, *xusub, *usub, *xlusup;
-    double   *ucol, *lusup;
-    
-    iword = sizeof(int);
-    dword = sizeof(double);
-    ndim = Glu->n;
-
-    xlsub  = Glu->xlsub;
-    lsub   = Glu->lsub;
-    xusub  = Glu->xusub;
-    usub   = Glu->usub;
-    xlusup = Glu->xlusup;
-    ucol   = Glu->ucol;
-    lusup  = Glu->lusup;
-    
-    dfrom = ucol;
-    dto = (double *)((char*)lusup + xlusup[ndim] * dword);
-    copy_mem_double(xusub[ndim], dfrom, dto);
-    ucol = dto;
-
-    ifrom = lsub;
-    ito = (int *) ((char*)ucol + xusub[ndim] * iword);
-    copy_mem_int(xlsub[ndim], ifrom, ito);
-    lsub = ito;
-    
-    ifrom = usub;
-    ito = (int *) ((char*)lsub + xlsub[ndim] * iword);
-    copy_mem_int(xusub[ndim], ifrom, ito);
-    usub = ito;
-    
-    last = (char*)usub + xusub[ndim] * iword;
-    fragment = (char*) (((char*)stack.array + stack.top1) - last);
-    stack.used -= (long int) fragment;
-    stack.top1 -= (long int) fragment;
-
-    Glu->ucol = ucol;
-    Glu->lsub = lsub;
-    Glu->usub = usub;
-    
-#ifdef DEBUG
-    printf("dStackCompress: fragment %d\n", fragment);
-    /* for (last = 0; last < ndim; ++last)
-	print_lu_col("After compress:", last, 0);*/
-#endif    
-    
-}
-
-/*
- * Allocate storage for original matrix A
- */
-void
-dallocateA(int n, int nnz, double **a, int **asub, int **xa)
-{
-    *a    = (double *) doubleMalloc(nnz);
-    *asub = (int *) intMalloc(nnz);
-    *xa   = (int *) intMalloc(n+1);
-}
-
-
-double *doubleMalloc(int n)
-{
-    double *buf;
-    buf = (double *) SUPERLU_MALLOC((size_t)n * sizeof(double)); 
-    if ( !buf ) {
-	ABORT("SUPERLU_MALLOC failed for buf in doubleMalloc()\n");
-    }
-    return (buf);
-}
-
-double *doubleCalloc(int n)
-{
-    double *buf;
-    register int i;
-    double zero = 0.0;
-    buf = (double *) SUPERLU_MALLOC((size_t)n * sizeof(double));
-    if ( !buf ) {
-	ABORT("SUPERLU_MALLOC failed for buf in doubleCalloc()\n");
-    }
-    for (i = 0; i < n; ++i) buf[i] = zero;
-    return (buf);
-}
-
-
-int dmemory_usage(const int nzlmax, const int nzumax, 
-		  const int nzlumax, const int n)
-{
-    register int iword, dword;
-
-    iword   = sizeof(int);
-    dword   = sizeof(double);
-    
-    return (10 * n * iword +
-	    nzlmax * iword + nzumax * (iword + dword) + nzlumax * dword);
-
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dmyblas2.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dmyblas2.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dmyblas2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dmyblas2.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,224 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * File name:		dmyblas2.c
- * Purpose:
- *     Level 2 BLAS operations: solves and matvec, written in C.
- * Note:
- *     This is only used when the system lacks an efficient BLAS library.
- */
-
-/*
- * Solves a dense UNIT lower triangular system. The unit lower 
- * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). 
- * The solution will be returned in the rhs vector.
- */
-void dlsolve ( int ldm, int ncol, double *M, double *rhs )
-{
-    int k;
-    double x0, x1, x2, x3, x4, x5, x6, x7;
-    double *M0;
-    register double *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7;
-    register int firstcol = 0;
-
-    M0 = &M[0];
-
-    while ( firstcol < ncol - 7 ) { /* Do 8 columns */
-      Mki0 = M0 + 1;
-      Mki1 = Mki0 + ldm + 1;
-      Mki2 = Mki1 + ldm + 1;
-      Mki3 = Mki2 + ldm + 1;
-      Mki4 = Mki3 + ldm + 1;
-      Mki5 = Mki4 + ldm + 1;
-      Mki6 = Mki5 + ldm + 1;
-      Mki7 = Mki6 + ldm + 1;
-
-      x0 = rhs[firstcol];
-      x1 = rhs[firstcol+1] - x0 * *Mki0++;
-      x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++;
-      x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++;
-      x4 = rhs[firstcol+4] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
-	                   - x3 * *Mki3++;
-      x5 = rhs[firstcol+5] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
-	                   - x3 * *Mki3++ - x4 * *Mki4++;
-      x6 = rhs[firstcol+6] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
-	                   - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++;
-      x7 = rhs[firstcol+7] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
-	                   - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++
-			   - x6 * *Mki6++;
-
-      rhs[++firstcol] = x1;
-      rhs[++firstcol] = x2;
-      rhs[++firstcol] = x3;
-      rhs[++firstcol] = x4;
-      rhs[++firstcol] = x5;
-      rhs[++firstcol] = x6;
-      rhs[++firstcol] = x7;
-      ++firstcol;
-    
-      for (k = firstcol; k < ncol; k++)
-	rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++
-	                - x2 * *Mki2++ - x3 * *Mki3++
-                        - x4 * *Mki4++ - x5 * *Mki5++
-			- x6 * *Mki6++ - x7 * *Mki7++;
- 
-      M0 += 8 * ldm + 8;
-    }
-
-    while ( firstcol < ncol - 3 ) { /* Do 4 columns */
-      Mki0 = M0 + 1;
-      Mki1 = Mki0 + ldm + 1;
-      Mki2 = Mki1 + ldm + 1;
-      Mki3 = Mki2 + ldm + 1;
-
-      x0 = rhs[firstcol];
-      x1 = rhs[firstcol+1] - x0 * *Mki0++;
-      x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++;
-      x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++;
-
-      rhs[++firstcol] = x1;
-      rhs[++firstcol] = x2;
-      rhs[++firstcol] = x3;
-      ++firstcol;
-    
-      for (k = firstcol; k < ncol; k++)
-	rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++
-	                - x2 * *Mki2++ - x3 * *Mki3++;
- 
-      M0 += 4 * ldm + 4;
-    }
-
-    if ( firstcol < ncol - 1 ) { /* Do 2 columns */
-      Mki0 = M0 + 1;
-      Mki1 = Mki0 + ldm + 1;
-
-      x0 = rhs[firstcol];
-      x1 = rhs[firstcol+1] - x0 * *Mki0++;
-
-      rhs[++firstcol] = x1;
-      ++firstcol;
-    
-      for (k = firstcol; k < ncol; k++)
-	rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++;
- 
-    }
-    
-}
-
-/*
- * Solves a dense upper triangular system. The upper triangular matrix is
- * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned
- * in the rhs vector.
- */
-void
-dusolve ( ldm, ncol, M, rhs )
-int ldm;	/* in */
-int ncol;	/* in */
-double *M;	/* in */
-double *rhs;	/* modified */
-{
-    double xj;
-    int jcol, j, irow;
-
-    jcol = ncol - 1;
-
-    for (j = 0; j < ncol; j++) {
-
-	xj = rhs[jcol] / M[jcol + jcol*ldm]; 		/* M(jcol, jcol) */
-	rhs[jcol] = xj;
-	
-	for (irow = 0; irow < jcol; irow++)
-	    rhs[irow] -= xj * M[irow + jcol*ldm];	/* M(irow, jcol) */
-
-	jcol--;
-
-    }
-}
-
-
-/*
- * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec.
- * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[].
- */
-void dmatvec ( ldm, nrow, ncol, M, vec, Mxvec )
-
-int ldm;	/* in -- leading dimension of M */
-int nrow;	/* in */ 
-int ncol;	/* in */
-double *M;	/* in */
-double *vec;	/* in */
-double *Mxvec;	/* in/out */
-
-{
-    double vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7;
-    double *M0;
-    register double *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7;
-    register int firstcol = 0;
-    int k;
-
-    M0 = &M[0];
-    while ( firstcol < ncol - 7 ) {	/* Do 8 columns */
-
-	Mki0 = M0;
-	Mki1 = Mki0 + ldm;
-        Mki2 = Mki1 + ldm;
-        Mki3 = Mki2 + ldm;
-	Mki4 = Mki3 + ldm;
-	Mki5 = Mki4 + ldm;
-	Mki6 = Mki5 + ldm;
-	Mki7 = Mki6 + ldm;
-
-	vi0 = vec[firstcol++];
-	vi1 = vec[firstcol++];
-	vi2 = vec[firstcol++];
-	vi3 = vec[firstcol++];	
-	vi4 = vec[firstcol++];
-	vi5 = vec[firstcol++];
-	vi6 = vec[firstcol++];
-	vi7 = vec[firstcol++];	
-
-	for (k = 0; k < nrow; k++) 
-	    Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++
-		      + vi2 * *Mki2++ + vi3 * *Mki3++ 
-		      + vi4 * *Mki4++ + vi5 * *Mki5++
-		      + vi6 * *Mki6++ + vi7 * *Mki7++;
-
-	M0 += 8 * ldm;
-    }
-
-    while ( firstcol < ncol - 3 ) {	/* Do 4 columns */
-
-	Mki0 = M0;
-	Mki1 = Mki0 + ldm;
-	Mki2 = Mki1 + ldm;
-	Mki3 = Mki2 + ldm;
-
-	vi0 = vec[firstcol++];
-	vi1 = vec[firstcol++];
-	vi2 = vec[firstcol++];
-	vi3 = vec[firstcol++];	
-	for (k = 0; k < nrow; k++) 
-	    Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++
-		      + vi2 * *Mki2++ + vi3 * *Mki3++ ;
-
-	M0 += 4 * ldm;
-    }
-
-    while ( firstcol < ncol ) {		/* Do 1 column */
-
- 	Mki0 = M0;
-	vi0 = vec[firstcol++];
-	for (k = 0; k < nrow; k++)
-	    Mxvec[k] += vi0 * *Mki0++;
-
-	M0 += ldm;
-    }
-	
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dpanel_bmod.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dpanel_bmod.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dpanel_bmod.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dpanel_bmod.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,462 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-/*
-  This file has been modified to be compatible with the HYPRE
-  linear solver
-*/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "slu_ddefs.h"
-
-#ifndef HYPRE_USING_HYPRE_BLAS
-#define USE_VENDOR_BLAS
-#endif
-
-/* 
- * Function prototypes 
- */
-void sludlsolve(int, int, double *, double *);
-void sludmatvec(int, int, int, double *, double *, double *);
-extern void dcheck_tempv();
-
-void
-dpanel_bmod (
-	    const int  m,          /* in - number of rows in the matrix */
-	    const int  w,          /* in */
-	    const int  jcol,       /* in */
-	    const int  nseg,       /* in */
-	    double     *dense,     /* out, of size n by w */
-	    double     *tempv,     /* working array */
-	    int        *segrep,    /* in */
-	    int        *repfnz,    /* in, of size n by w */
-	    GlobalLU_t *Glu,       /* modified */
-	    SuperLUStat_t *stat    /* output */
-	    )
-{
-/* 
- * Purpose
- * =======
- *
- *    Performs numeric block updates (sup-panel) in topological order.
- *    It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
- *    Special processing on the supernodal portion of L\U[*,j]
- *
- *    Before entering this routine, the original nonzeros in the panel 
- *    were already copied into the spa[m,w].
- *
- *    Updated/Output parameters-
- *	dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned 
- *      collectively in the m-by-w vector dense[*]. 
- *
- */
-
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-    _fcd ftcs1 = _cptofcd("L", strlen("L")),
-         ftcs2 = _cptofcd("N", strlen("N")),
-         ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-    int          incx = 1, incy = 1;
-    double       alpha, beta;
-#endif
-
-    register int k, ksub;
-    int          fsupc, nsupc, nsupr, nrow;
-    int          krep, krep_ind;
-    double       ukj, ukj1, ukj2;
-    int          luptr, luptr1, luptr2;
-    int          segsze;
-    int          block_nrow;  /* no of rows in a block row */
-    register int lptr;	      /* Points to the row subscripts of a supernode */
-    int          kfnz, irow, no_zeros; 
-    register int isub, isub1, i;
-    register int jj;	      /* Index through each column in the panel */
-    int          *xsup, *supno;
-    int          *lsub, *xlsub;
-    double       *lusup;
-    int          *xlusup;
-    int          *repfnz_col; /* repfnz[] for a column in the panel */
-    double       *dense_col;  /* dense[] for a column in the panel */
-    double       *tempv1;             /* Used in 1-D update */
-    double       *TriTmp, *MatvecTmp; /* used in 2-D update */
-    double      zero = 0.0;
-#ifdef USE_VENDOR_BLAS
-    double      one = 1.0;
-#endif
-    register int ldaTmp;
-    register int r_ind, r_hi;
-    static   int first = 1, maxsuper, rowblk, colblk;
-    flops_t  *ops = stat->ops;
-    
-    xsup    = Glu->xsup;
-    supno   = Glu->supno;
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    lusup   = Glu->lusup;
-    xlusup  = Glu->xlusup;
-    
-    if ( first ) {
-	maxsuper = sp_ienv(3);
-	rowblk   = sp_ienv(4);
-	colblk   = sp_ienv(5);
-	first = 0;
-    }
-    ldaTmp = maxsuper + rowblk;
-
-    /* 
-     * For each nonz supernode segment of U[*,j] in topological order 
-     */
-    k = nseg - 1;
-    for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */
-
-	/* krep = representative of current k-th supernode
-	 * fsupc = first supernodal column
-	 * nsupc = no of columns in a supernode
-	 * nsupr = no of rows in a supernode
-	 */
-        krep = segrep[k--];
-	fsupc = xsup[supno[krep]];
-	nsupc = krep - fsupc + 1;
-	nsupr = xlsub[fsupc+1] - xlsub[fsupc];
-	nrow = nsupr - nsupc;
-	lptr = xlsub[fsupc];
-	krep_ind = lptr + nsupc - 1;
-
-	repfnz_col = repfnz;
-	dense_col = dense;
-	
-	if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */
-
-	    TriTmp = tempv;
-	
-	    /* Sequence through each column in panel -- triangular solves */
-	    for (jj = jcol; jj < jcol + w; jj++,
-		 repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) {
-
-		kfnz = repfnz_col[krep];
-		if ( kfnz == EMPTY ) continue;	/* Skip any zero segment */
-	    
-		segsze = krep - kfnz + 1;
-		luptr = xlusup[fsupc];
-
-		ops[TRSV] += segsze * (segsze - 1);
-		ops[GEMV] += 2 * nrow * segsze;
-	
-		/* Case 1: Update U-segment of size 1 -- col-col update */
-		if ( segsze == 1 ) {
-		    ukj = dense_col[lsub[krep_ind]];
-		    luptr += nsupr*(nsupc-1) + nsupc;
-
-		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
-			irow = lsub[i];
-			dense_col[irow] -= ukj * lusup[luptr];
-			++luptr;
-		    }
-
-		} else if ( segsze <= 3 ) {
-		    ukj = dense_col[lsub[krep_ind]];
-		    ukj1 = dense_col[lsub[krep_ind - 1]];
-		    luptr += nsupr*(nsupc-1) + nsupc-1;
-		    luptr1 = luptr - nsupr;
-
-		    if ( segsze == 2 ) {
-			ukj -= ukj1 * lusup[luptr1];
-			dense_col[lsub[krep_ind]] = ukj;
-			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-			    irow = lsub[i];
-			    luptr++; luptr1++;
-			    dense_col[irow] -= (ukj*lusup[luptr]
-						+ ukj1*lusup[luptr1]);
-			}
-		    } else {
-			ukj2 = dense_col[lsub[krep_ind - 2]];
-			luptr2 = luptr1 - nsupr;
-			ukj1 -= ukj2 * lusup[luptr2-1];
-			ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
-			dense_col[lsub[krep_ind]] = ukj;
-			dense_col[lsub[krep_ind-1]] = ukj1;
-			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-			    irow = lsub[i];
-			    luptr++; luptr1++; luptr2++;
-			    dense_col[irow] -= ( ukj*lusup[luptr]
-                             + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
-			}
-		    }
-
-		} else  {	/* segsze >= 4 */
-		    
-		    /* Copy U[*,j] segment from dense[*] to TriTmp[*], which
-		       holds the result of triangular solves.    */
-		    no_zeros = kfnz - fsupc;
-		    isub = lptr + no_zeros;
-		    for (i = 0; i < segsze; ++i) {
-			irow = lsub[isub];
-			TriTmp[i] = dense_col[irow]; /* Gather */
-			++isub;
-		    }
-		    
-		    /* start effective triangle */
-		    luptr += nsupr * no_zeros + no_zeros;
-
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		    STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
-			   &nsupr, TriTmp, &incx );
-#else
-                    hypre_F90_NAME_BLAS(dtrsv,DTRSV)("L","N","U",&segsze,
-                           &lusup[luptr], &nsupr, TriTmp, &incx );
-#endif
-#else		
-		    sludlsolve ( nsupr, segsze, &lusup[luptr], TriTmp );
-#endif
-		    
-
-		} /* else ... */
-	    
-	    }  /* for jj ... end tri-solves */
-
-	    /* Block row updates; push all the way into dense[*] block */
-	    for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) {
-		
-		r_hi = SUPERLU_MIN(nrow, r_ind + rowblk);
-		block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind);
-		luptr = xlusup[fsupc] + nsupc + r_ind;
-		isub1 = lptr + nsupc + r_ind;
-		
-		repfnz_col = repfnz;
-		TriTmp = tempv;
-		dense_col = dense;
-		
-		/* Sequence through each column in panel -- matrix-vector */
-		for (jj = jcol; jj < jcol + w; jj++,
-		     repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
-		    
-		    kfnz = repfnz_col[krep];
-		    if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
-		    
-		    segsze = krep - kfnz + 1;
-		    if ( segsze <= 3 ) continue;   /* skip unrolled cases */
-		    
-		    /* Perform a block update, and scatter the result of
-		       matrix-vector to dense[].		 */
-		    no_zeros = kfnz - fsupc;
-		    luptr1 = luptr + nsupr * no_zeros;
-		    MatvecTmp = &TriTmp[maxsuper];
-		    
-#ifdef USE_VENDOR_BLAS
-		    alpha = one; 
-                    beta = zero;
-#ifdef _CRAY
-		    SGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1], 
-			   &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
-#else
-                    hypre_F90_NAME_BLAS(dgemv,DGEMV)("N", &block_nrow, &segsze,
-                           &alpha, &lusup[luptr1],
-                           &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
-#endif
-#else
-		    sludmatvec(nsupr, block_nrow, segsze, &lusup[luptr1],
-			   TriTmp, MatvecTmp);
-#endif
-		    
-		    /* Scatter MatvecTmp[*] into SPA dense[*] temporarily
-		     * such that MatvecTmp[*] can be re-used for the
-		     * the next blok row update. dense[] will be copied into 
-		     * global store after the whole panel has been finished.
-		     */
-		    isub = isub1;
-		    for (i = 0; i < block_nrow; i++) {
-			irow = lsub[isub];
-			dense_col[irow] -= MatvecTmp[i];
-			MatvecTmp[i] = zero;
-			++isub;
-		    }
-		    
-		} /* for jj ... */
-		
-	    } /* for each block row ... */
-	    
-	    /* Scatter the triangular solves into SPA dense[*] */
-	    repfnz_col = repfnz;
-	    TriTmp = tempv;
-	    dense_col = dense;
-	    
-	    for (jj = jcol; jj < jcol + w; jj++,
-		 repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
-		kfnz = repfnz_col[krep];
-		if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
-		
-		segsze = krep - kfnz + 1;
-		if ( segsze <= 3 ) continue; /* skip unrolled cases */
-		
-		no_zeros = kfnz - fsupc;		
-		isub = lptr + no_zeros;
-		for (i = 0; i < segsze; i++) {
-		    irow = lsub[isub];
-		    dense_col[irow] = TriTmp[i];
-		    TriTmp[i] = zero;
-		    ++isub;
-		}
-		
-	    } /* for jj ... */
-	    
-	} else { /* 1-D block modification */
-	    
-	    
-	    /* Sequence through each column in the panel */
-	    for (jj = jcol; jj < jcol + w; jj++,
-		 repfnz_col += m, dense_col += m) {
-		
-		kfnz = repfnz_col[krep];
-		if ( kfnz == EMPTY ) continue;	/* Skip any zero segment */
-		
-		segsze = krep - kfnz + 1;
-		luptr = xlusup[fsupc];
-
-		ops[TRSV] += segsze * (segsze - 1);
-		ops[GEMV] += 2 * nrow * segsze;
-		
-		/* Case 1: Update U-segment of size 1 -- col-col update */
-		if ( segsze == 1 ) {
-		    ukj = dense_col[lsub[krep_ind]];
-		    luptr += nsupr*(nsupc-1) + nsupc;
-
-		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
-			irow = lsub[i];
-			dense_col[irow] -= ukj * lusup[luptr];
-			++luptr;
-		    }
-
-		} else if ( segsze <= 3 ) {
-		    ukj = dense_col[lsub[krep_ind]];
-		    luptr += nsupr*(nsupc-1) + nsupc-1;
-		    ukj1 = dense_col[lsub[krep_ind - 1]];
-		    luptr1 = luptr - nsupr;
-
-		    if ( segsze == 2 ) {
-			ukj -= ukj1 * lusup[luptr1];
-			dense_col[lsub[krep_ind]] = ukj;
-			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-			    irow = lsub[i];
-			    ++luptr;  ++luptr1;
-			    dense_col[irow] -= (ukj*lusup[luptr]
-						+ ukj1*lusup[luptr1]);
-			}
-		    } else {
-			ukj2 = dense_col[lsub[krep_ind - 2]];
-			luptr2 = luptr1 - nsupr;
-			ukj1 -= ukj2 * lusup[luptr2-1];
-			ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
-			dense_col[lsub[krep_ind]] = ukj;
-			dense_col[lsub[krep_ind-1]] = ukj1;
-			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-			    irow = lsub[i];
-			    ++luptr; ++luptr1; ++luptr2;
-			    dense_col[irow] -= ( ukj*lusup[luptr]
-                             + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
-			}
-		    }
-
-		} else  { /* segsze >= 4 */
-		    /* 
-		     * Perform a triangular solve and block update,
-		     * then scatter the result of sup-col update to dense[].
-		     */
-		    no_zeros = kfnz - fsupc;
-		    
-		    /* Copy U[*,j] segment from dense[*] to tempv[*]: 
-		     *    The result of triangular solve is in tempv[*];
-		     *    The result of matrix vector update is in dense_col[*]
-		     */
-		    isub = lptr + no_zeros;
-		    for (i = 0; i < segsze; ++i) {
-			irow = lsub[isub];
-			tempv[i] = dense_col[irow]; /* Gather */
-			++isub;
-		    }
-		    
-		    /* start effective triangle */
-		    luptr += nsupr * no_zeros + no_zeros;
-		    
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		    STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
-			   &nsupr, tempv, &incx );
-#else
-                    hypre_F90_NAME_BLAS(dtrsv,DTRSV)("L","N","U",&segsze,
-                           &lusup[luptr], &nsupr, tempv, &incx );
-#endif
-		    
-		    luptr += segsze;	/* Dense matrix-vector */
-		    tempv1 = &tempv[segsze];
-                    alpha = one;
-                    beta = zero;
-#ifdef _CRAY
-		    SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
-			   &nsupr, tempv, &incx, &beta, tempv1, &incy );
-#else
-                    hypre_F90_NAME_BLAS(dgemv,DGEMV)("N",&nrow,&segsze,&alpha,
-                           &lusup[luptr],
-                           &nsupr, tempv, &incx, &beta, tempv1, &incy );
-#endif
-#else
-		    sludlsolve ( nsupr, segsze, &lusup[luptr], tempv );
-		    
-		    luptr += segsze;        /* Dense matrix-vector */
-		    tempv1 = &tempv[segsze];
-		    sludmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1);
-#endif
-		    
-		    /* Scatter tempv[*] into SPA dense[*] temporarily, such
-		     * that tempv[*] can be used for the triangular solve of
-		     * the next column of the panel. They will be copied into 
-		     * ucol[*] after the whole panel has been finished.
-		     */
-		    isub = lptr + no_zeros;
-		    for (i = 0; i < segsze; i++) {
-			irow = lsub[isub];
-			dense_col[irow] = tempv[i];
-			tempv[i] = zero;
-			isub++;
-		    }
-		    
-		    /* Scatter the update from tempv1[*] into SPA dense[*] */
-		    /* Start dense rectangular L */
-		    for (i = 0; i < nrow; i++) {
-			irow = lsub[isub];
-			dense_col[irow] -= tempv1[i];
-			tempv1[i] = zero;
-			++isub;	
-		    }
-		    
-		} /* else segsze>=4 ... */
-		
-	    } /* for each column in the panel... */
-	    
-	} /* else 1-D update ... */
-
-    } /* for each updating supernode ... */
-
-}
-
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dpanel_dfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dpanel_dfs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dpanel_dfs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dpanel_dfs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,247 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_ddefs.h"
-
-void
-dpanel_dfs (
-	   const int  m,           /* in - number of rows in the matrix */
-	   const int  w,           /* in */
-	   const int  jcol,        /* in */
-	   SuperMatrix *A,       /* in - original matrix */
-	   int        *perm_r,     /* in */
-	   int        *nseg,	   /* out */
-	   double     *dense,      /* out */
-	   int        *panel_lsub, /* out */
-	   int        *segrep,     /* out */
-	   int        *repfnz,     /* out */
-	   int        *xprune,     /* out */
-	   int        *marker,     /* out */     
-	   int        *parent,     /* working array */
-	   int        *xplore,     /* working array */
-	   GlobalLU_t *Glu         /* modified */
-	   )
-{
-/*
- * Purpose
- * =======
- *
- *   Performs a symbolic factorization on a panel of columns [jcol, jcol+w).
- *
- *   A supernode representative is the last column of a supernode.
- *   The nonzeros in U[*,j] are segments that end at supernodal
- *   representatives.
- *
- *   The routine returns one list of the supernodal representatives
- *   in topological order of the dfs that generates them. This list is
- *   a superset of the topological order of each individual column within
- *   the panel. 
- *   The location of the first nonzero in each supernodal segment
- *   (supernodal entry location) is also returned. Each column has a 
- *   separate list for this purpose.
- *
- *   Two marker arrays are used for dfs:
- *     marker[i] == jj, if i was visited during dfs of current column jj;
- *     marker1[i] >= jcol, if i was visited by earlier columns in this panel;
- *
- *   marker: A-row --> A-row/col (0/1)
- *   repfnz: SuperA-col --> PA-row
- *   parent: SuperA-col --> SuperA-col
- *   xplore: SuperA-col --> index to L-structure
- *
- */
-    NCPformat *Astore;
-    double    *a;
-    int       *asub;
-    int       *xa_begin, *xa_end;
-    int	      krep, chperm, chmark, chrep, oldrep, kchild, myfnz;
-    int       k, krow, kmark, kperm;
-    int       xdfs, maxdfs, kpar;
-    int       jj;	   /* index through each column in the panel */
-    int       *marker1;	   /* marker1[jj] >= jcol if vertex jj was visited 
-			      by a previous column within this panel.   */
-    int       *repfnz_col; /* start of each column in the panel */
-    double    *dense_col;  /* start of each column in the panel */
-    int       nextl_col;   /* next available position in panel_lsub[*,jj] */
-    int       *xsup, *supno;
-    int       *lsub, *xlsub;
-
-    /* Initialize pointers */
-    Astore     = (NCPformat*) A->Store;
-    a          = (   double*) Astore->nzval;
-    asub       = Astore->rowind;
-    xa_begin   = Astore->colbeg;
-    xa_end     = Astore->colend;
-    marker1    = marker + m;
-    repfnz_col = repfnz;
-    dense_col  = dense;
-    *nseg      = 0;
-    xsup       = Glu->xsup;
-    supno      = Glu->supno;
-    lsub       = Glu->lsub;
-    xlsub      = Glu->xlsub;
-
-    /* For each column in the panel */
-    for (jj = jcol; jj < jcol + w; jj++) {
-	nextl_col = (jj - jcol) * m;
-
-#ifdef CHK_DFS
-	printf("\npanel col %d: ", jj);
-#endif
-
-	/* For each nonz in A[*,jj] do dfs */
-	for (k = xa_begin[jj]; k < xa_end[jj]; k++) {
-	    krow = asub[k];
-            dense_col[krow] = a[k];
-	    kmark = marker[krow];    	
-	    if ( kmark == jj ) 
-		continue;     /* krow visited before, go to the next nonzero */
-
-	    /* For each unmarked nbr krow of jj
-	     * krow is in L: place it in structure of L[*,jj]
-	     */
-	    marker[krow] = jj;
-	    kperm = perm_r[krow];
-	    
-	    if ( kperm == EMPTY ) {
-		panel_lsub[nextl_col++] = krow; /* krow is indexed into A */
-	    }
-	    /* 
-	     * krow is in U: if its supernode-rep krep
-	     * has been explored, update repfnz[*]
-	     */
-	    else {
-		
-		krep = xsup[supno[kperm]+1] - 1;
-		myfnz = repfnz_col[krep];
-		
-#ifdef CHK_DFS
-		printf("krep %d, myfnz %d, perm_r[%d] %d\n", krep, myfnz, krow, kperm);
-#endif
-		if ( myfnz != EMPTY ) {	/* Representative visited before */
-		    if ( myfnz > kperm ) repfnz_col[krep] = kperm;
-		    /* continue; */
-		}
-		else {
-		    /* Otherwise, perform dfs starting at krep */
-		    oldrep = EMPTY;
-		    parent[krep] = oldrep;
-		    repfnz_col[krep] = kperm;
-		    xdfs = xlsub[krep];
-		    maxdfs = xprune[krep];
-		    
-#ifdef CHK_DFS 
-		    printf("  xdfs %d, maxdfs %d: ", xdfs, maxdfs);
-		    for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]);
-		    printf("\n");
-#endif
-		    do {
-			/* 
-			 * For each unmarked kchild of krep 
-			 */
-			while ( xdfs < maxdfs ) {
-			    
-			    kchild = lsub[xdfs];
-			    xdfs++;
-			    chmark = marker[kchild];
-			    
-			    if ( chmark != jj ) { /* Not reached yet */
-				marker[kchild] = jj;
-				chperm = perm_r[kchild];
-			      
-				/* Case kchild is in L: place it in L[*,j] */
-				if ( chperm == EMPTY ) {
-				    panel_lsub[nextl_col++] = kchild;
-				} 
-				/* Case kchild is in U: 
-				 *   chrep = its supernode-rep. If its rep has 
-				 *   been explored, update its repfnz[*]
-				 */
-				else {
-				    
-				    chrep = xsup[supno[chperm]+1] - 1;
-				    myfnz = repfnz_col[chrep];
-#ifdef CHK_DFS
-				    printf("chrep %d,myfnz %d,perm_r[%d] %d\n",chrep,myfnz,kchild,chperm);
-#endif
-				    if ( myfnz != EMPTY ) { /* Visited before */
-					if ( myfnz > chperm )
-					    repfnz_col[chrep] = chperm;
-				    }
-				    else {
-					/* Cont. dfs at snode-rep of kchild */
-					xplore[krep] = xdfs;	
-					oldrep = krep;
-					krep = chrep; /* Go deeper down G(L) */
-					parent[krep] = oldrep;
-					repfnz_col[krep] = chperm;
-					xdfs = xlsub[krep];     
-					maxdfs = xprune[krep];
-#ifdef CHK_DFS 
-					printf("  xdfs %d, maxdfs %d: ", xdfs, maxdfs);
-					for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]);	
-					printf("\n");
-#endif
-				    } /* else */
-				  
-				} /* else */
-			      
-			    } /* if... */
-			    
-			} /* while xdfs < maxdfs */
-			
-			/* krow has no more unexplored nbrs:
-			 *    Place snode-rep krep in postorder DFS, if this 
-			 *    segment is seen for the first time. (Note that
-			 *    "repfnz[krep]" may change later.)
-			 *    Backtrack dfs to its parent.
-			 */
-			if ( marker1[krep] < jcol ) {
-			    segrep[*nseg] = krep;
-			    ++(*nseg);
-			    marker1[krep] = jj;
-			}
-			
-			kpar = parent[krep]; /* Pop stack, mimic recursion */
-			if ( kpar == EMPTY ) break; /* dfs done */
-			krep = kpar;
-			xdfs = xplore[krep];
-			maxdfs = xprune[krep];
-			
-#ifdef CHK_DFS 
-			printf("  pop stack: krep %d,xdfs %d,maxdfs %d: ", krep,xdfs,maxdfs);
-			for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]);
-			printf("\n");
-#endif
-		    } while ( kpar != EMPTY ); /* do-while - until empty stack */
-		    
-		} /* else */
-		
-	    } /* else */
-	    
-	} /* for each nonz in A[*,jj] */
-	
-	repfnz_col += m;    /* Move to next column */
-        dense_col += m;
-	
-    } /* for jj ... */
-    
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dpivotgrowth.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dpivotgrowth.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dpivotgrowth.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dpivotgrowth.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,115 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
-  This file has been modified to be compatible with the HYPRE
-  linear solver
-*/
-
-#include <math.h>
-#include "slu_ddefs.h"
-#ifdef HYPRE_USING_HYPRE_LAPACK
-#include "hypre_lapack.h"
-#endif
-
-double
-dPivotGrowth(int ncols, SuperMatrix *A, int *perm_c, 
-             SuperMatrix *L, SuperMatrix *U)
-{
-/*
- * Purpose
- * =======
- *
- * Compute the reciprocal pivot growth factor of the leading ncols columns
- * of the matrix, using the formula:
- *     min_j ( max_i(abs(A_ij)) / max_i(abs(U_ij)) )
- *
- * Arguments
- * =========
- *
- * ncols    (input) int
- *          The number of columns of matrices A, L and U.
- *
- * A        (input) SuperMatrix*
- *	    Original matrix A, permuted by columns, of dimension
- *          (A->nrow, A->ncol). The type of A can be:
- *          Stype = NC; Dtype = SLU_D; Mtype = GE.
- *
- * L        (output) SuperMatrix*
- *          The factor L from the factorization Pr*A=L*U; use compressed row 
- *          subscripts storage for supernodes, i.e., L has type: 
- *          Stype = SC; Dtype = SLU_D; Mtype = TRLU.
- *
- * U        (output) SuperMatrix*
- *	    The factor U from the factorization Pr*A*Pc=L*U. Use column-wise
- *          storage scheme, i.e., U has types: Stype = NC;
- *          Dtype = SLU_D; Mtype = TRU.
- *
- */
-    NCformat *Astore;
-    SCformat *Lstore;
-    NCformat *Ustore;
-    double  *Aval, *Lval, *Uval;
-    int      fsupc, nsupr, luptr, nz_in_U;
-    int      i, j, k, oldcol;
-    int      *inv_perm_c;
-    double   rpg, maxaj, maxuj;
-    extern   double hypre_F90_NAME_LAPACK(dlamch,DLAMCH)(const char *);
-    double   smlnum;
-    double   *luval;
-   
-    /* Get machine constants. */
-    smlnum = hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("S");
-    rpg = 1. / smlnum;
-
-    Astore = (NCformat*) A->Store;
-    Lstore = (SCformat*) L->Store;
-    Ustore = (NCformat*) U->Store;
-    Aval = (double*) Astore->nzval;
-    Lval = (double*) Lstore->nzval;
-    Uval = (double*) Ustore->nzval;
-    
-    inv_perm_c = (int *) SUPERLU_MALLOC(A->ncol*sizeof(int));
-    for (j = 0; j < A->ncol; ++j) inv_perm_c[perm_c[j]] = j;
-
-    for (k = 0; k <= Lstore->nsuper; ++k) {
-	fsupc = L_FST_SUPC(k);
-	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
-	luptr = L_NZ_START(fsupc);
-	luval = &Lval[luptr];
-	nz_in_U = 1;
-	
-	for (j = fsupc; j < L_FST_SUPC(k+1) && j < ncols; ++j) {
-	    maxaj = 0.;
-            oldcol = inv_perm_c[j];
-	    for (i = Astore->colptr[oldcol]; i < Astore->colptr[oldcol+1]; ++i)
-		maxaj = SUPERLU_MAX( maxaj, fabs(Aval[i]) );
-	
-	    maxuj = 0.;
-	    for (i = Ustore->colptr[j]; i < Ustore->colptr[j+1]; i++)
-		maxuj = SUPERLU_MAX( maxuj, fabs(Uval[i]) );
-	    
-	    /* Supernode */
-	    for (i = 0; i < nz_in_U; ++i)
-		maxuj = SUPERLU_MAX( maxuj, fabs(luval[i]) );
-
-	    ++nz_in_U;
-	    luval += nsupr;
-
-	    if ( maxuj == 0. )
-		rpg = SUPERLU_MIN( rpg, 1.);
-	    else
-		rpg = SUPERLU_MIN( rpg, maxaj / maxuj );
-	}
-	
-	if ( j >= ncols ) break;
-    }
-
-    SUPERLU_FREE(inv_perm_c);
-    return (rpg);
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dpivotL.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dpivotL.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dpivotL.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dpivotL.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,173 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include <math.h>
-#include <stdlib.h>
-#include "slu_ddefs.h"
-
-#undef DEBUG
-
-int
-dpivotL(
-        const int  jcol,     /* in */
-        const double u,      /* in - diagonal pivoting threshold */
-        int        *usepr,   /* re-use the pivot sequence given by perm_r/iperm_r */
-        int        *perm_r,  /* may be modified */
-        int        *iperm_r, /* in - inverse of perm_r */
-        int        *iperm_c, /* in - used to find diagonal of Pc*A*Pc' */
-        int        *pivrow,  /* out */
-        GlobalLU_t *Glu,     /* modified - global LU data structures */
-	SuperLUStat_t *stat  /* output */
-       )
-{
-/*
- * Purpose
- * =======
- *   Performs the numerical pivoting on the current column of L,
- *   and the CDIV operation.
- *
- *   Pivot policy:
- *   (1) Compute thresh = u * max_(i>=j) abs(A_ij);
- *   (2) IF user specifies pivot row k and abs(A_kj) >= thresh THEN
- *           pivot row = k;
- *       ELSE IF abs(A_jj) >= thresh THEN
- *           pivot row = j;
- *       ELSE
- *           pivot row = m;
- * 
- *   Note: If you absolutely want to use a given pivot order, then set u=0.0.
- *
- *   Return value: 0      success;
- *                 i > 0  U(i,i) is exactly zero.
- *
- */
-    int          fsupc;	    /* first column in the supernode */
-    int          nsupc;	    /* no of columns in the supernode */
-    int          nsupr;     /* no of rows in the supernode */
-    int          lptr;	    /* points to the starting subscript of the supernode */
-    int          pivptr, old_pivptr, diag, diagind;
-    double       pivmax, rtemp, thresh;
-    double       temp;
-    double       *lu_sup_ptr; 
-    double       *lu_col_ptr;
-    int          *lsub_ptr;
-    int          isub, icol, k, itemp;
-    int          *lsub, *xlsub;
-    double       *lusup;
-    int          *xlusup;
-    flops_t      *ops = stat->ops;
-
-    /* Initialize pointers */
-    lsub       = Glu->lsub;
-    xlsub      = Glu->xlsub;
-    lusup      = Glu->lusup;
-    xlusup     = Glu->xlusup;
-    fsupc      = (Glu->xsup)[(Glu->supno)[jcol]];
-    nsupc      = jcol - fsupc;	        /* excluding jcol; nsupc >= 0 */
-    lptr       = xlsub[fsupc];
-    nsupr      = xlsub[fsupc+1] - lptr;
-    lu_sup_ptr = &lusup[xlusup[fsupc]];	/* start of the current supernode */
-    lu_col_ptr = &lusup[xlusup[jcol]];	/* start of jcol in the supernode */
-    lsub_ptr   = &lsub[lptr];	/* start of row indices of the supernode */
-
-#ifdef DEBUG
-if ( jcol == MIN_COL ) {
-    printf("Before cdiv: col %d\n", jcol);
-    for (k = nsupc; k < nsupr; k++) 
-	printf("  lu[%d] %f\n", lsub_ptr[k], lu_col_ptr[k]);
-}
-#endif
-    
-    /* Determine the largest abs numerical value for partial pivoting;
-       Also search for user-specified pivot, and diagonal element. */
-    if ( *usepr ) *pivrow = iperm_r[jcol];
-    diagind = iperm_c[jcol];
-    pivmax = 0.0;
-    pivptr = nsupc;
-    diag = EMPTY;
-    old_pivptr = nsupc;
-    for (isub = nsupc; isub < nsupr; ++isub) {
-	rtemp = fabs (lu_col_ptr[isub]);
-	if ( rtemp > pivmax ) {
-	    pivmax = rtemp;
-	    pivptr = isub;
-	}
-	if ( *usepr && lsub_ptr[isub] == *pivrow ) old_pivptr = isub;
-	if ( lsub_ptr[isub] == diagind ) diag = isub;
-    }
-
-    /* Test for singularity */
-    if ( pivmax == 0.0 ) {
-	*pivrow = lsub_ptr[pivptr];
-	perm_r[*pivrow] = jcol;
-	*usepr = 0;
-	return (jcol+1);
-    }
-
-    thresh = u * pivmax;
-    
-    /* Choose appropriate pivotal element by our policy. */
-    if ( *usepr ) {
-        rtemp = fabs (lu_col_ptr[old_pivptr]);
-	if ( rtemp != 0.0 && rtemp >= thresh )
-	    pivptr = old_pivptr;
-	else
-	    *usepr = 0;
-    }
-    if ( *usepr == 0 ) {
-	/* Use diagonal pivot? */
-	if ( diag >= 0 ) { /* diagonal exists */
-	    rtemp = fabs (lu_col_ptr[diag]);
-	    if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = diag;
-        }
-	*pivrow = lsub_ptr[pivptr];
-    }
-    
-    /* Record pivot row */
-    perm_r[*pivrow] = jcol;
-    
-    /* Interchange row subscripts */
-    if ( pivptr != nsupc ) {
-	itemp = lsub_ptr[pivptr];
-	lsub_ptr[pivptr] = lsub_ptr[nsupc];
-	lsub_ptr[nsupc] = itemp;
-
-	/* Interchange numerical values as well, for the whole snode, such 
-	 * that L is indexed the same way as A.
- 	 */
-	for (icol = 0; icol <= nsupc; icol++) {
-	    itemp = pivptr + icol * nsupr;
-	    temp = lu_sup_ptr[itemp];
-	    lu_sup_ptr[itemp] = lu_sup_ptr[nsupc + icol*nsupr];
-	    lu_sup_ptr[nsupc + icol*nsupr] = temp;
-	}
-    } /* if */
-
-    /* cdiv operation */
-    ops[FACT] += nsupr - nsupc;
-
-    temp = 1.0 / lu_col_ptr[nsupc];
-    for (k = nsupc+1; k < nsupr; k++) 
-	lu_col_ptr[k] *= temp;
-
-    return 0;
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dpruneL.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dpruneL.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dpruneL.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dpruneL.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,147 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_ddefs.h"
-
-void
-dpruneL(
-       const int  jcol,	     /* in */
-       const int  *perm_r,   /* in */
-       const int  pivrow,    /* in */
-       const int  nseg,	     /* in */
-       const int  *segrep,   /* in */
-       const int  *repfnz,   /* in */
-       int        *xprune,   /* out */
-       GlobalLU_t *Glu       /* modified - global LU data structures */
-       )
-{
-/*
- * Purpose
- * =======
- *   Prunes the L-structure of supernodes whose L-structure
- *   contains the current pivot row "pivrow"
- *
- */
-    double     utemp;
-    int        jsupno, irep, irep1, kmin, kmax, krow, movnum;
-    int        i, ktemp, minloc, maxloc;
-    int        do_prune; /* logical variable */
-    int        *xsup, *supno;
-    int        *lsub, *xlsub;
-    double     *lusup;
-    int        *xlusup;
-
-    xsup       = Glu->xsup;
-    supno      = Glu->supno;
-    lsub       = Glu->lsub;
-    xlsub      = Glu->xlsub;
-    lusup      = Glu->lusup;
-    xlusup     = Glu->xlusup;
-    
-    /*
-     * For each supernode-rep irep in U[*,j]
-     */
-    jsupno = supno[jcol];
-    for (i = 0; i < nseg; i++) {
-
-	irep = segrep[i];
-	irep1 = irep + 1;
-	do_prune = FALSE;
-
-	/* Don't prune with a zero U-segment */
- 	if ( repfnz[irep] == EMPTY )
-		continue;
-
-     	/* If a snode overlaps with the next panel, then the U-segment 
-   	 * is fragmented into two parts -- irep and irep1. We should let
-	 * pruning occur at the rep-column in irep1's snode. 
-	 */
-	if ( supno[irep] == supno[irep1] ) 	/* Don't prune */
-		continue;
-
-	/*
-	 * If it has not been pruned & it has a nonz in row L[pivrow,i]
-	 */
-	if ( supno[irep] != jsupno ) {
-	    if ( xprune[irep] >= xlsub[irep1] ) {
-		kmin = xlsub[irep];
-		kmax = xlsub[irep1] - 1;
-		for (krow = kmin; krow <= kmax; krow++) 
-		    if ( lsub[krow] == pivrow ) {
-			do_prune = TRUE;
-			break;
-		    }
-	    }
-	    
-    	    if ( do_prune ) {
-
-	     	/* Do a quicksort-type partition
-	     	 * movnum=TRUE means that the num values have to be exchanged.
-	     	 */
-	        movnum = FALSE;
-	        if ( irep == xsup[supno[irep]] ) /* Snode of size 1 */
-			movnum = TRUE;
-
-	        while ( kmin <= kmax ) {
-
-	    	    if ( perm_r[lsub[kmax]] == EMPTY ) 
-			kmax--;
-		    else if ( perm_r[lsub[kmin]] != EMPTY )
-			kmin++;
-		    else { /* kmin below pivrow, and kmax above pivrow: 
-		            * 	interchange the two subscripts
-			    */
-		        ktemp = lsub[kmin];
-		        lsub[kmin] = lsub[kmax];
-		        lsub[kmax] = ktemp;
-
-			/* If the supernode has only one column, then we
- 			 * only keep one set of subscripts. For any subscript 
-			 * interchange performed, similar interchange must be 
-			 * done on the numerical values.
- 			 */
-		        if ( movnum ) {
-		    	    minloc = xlusup[irep] + (kmin - xlsub[irep]);
-		    	    maxloc = xlusup[irep] + (kmax - xlsub[irep]);
-			    utemp = lusup[minloc];
-		  	    lusup[minloc] = lusup[maxloc];
-			    lusup[maxloc] = utemp;
-		        }
-
-		        kmin++;
-		        kmax--;
-
-		    }
-
-	        } /* while */
-
-	        xprune[irep] = kmin;	/* Pruning */
-
-#ifdef CHK_PRUNE
-	printf("    After dpruneL(),using col %d:  xprune[%d] = %d\n", 
-			jcol, irep, kmin);
-#endif
-	    } /* if do_prune */
-
-	} /* if */
-
-    } /* for each U-segment... */
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dreadhb.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dreadhb.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dreadhb.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dreadhb.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,258 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include "slu_ddefs.h"
-
-
-/* Eat up the rest of the current line */
-int dDumpLine(FILE *fp)
-{
-    register int c;
-    while ((c = fgetc(fp)) != '\n') ;
-    return 0;
-}
-
-int dParseIntFormat(char *buf, int *num, int *size)
-{
-    char *tmp;
-
-    tmp = buf;
-    while (*tmp++ != '(') ;
-    sscanf(tmp, "%d", num);
-    while (*tmp != 'I' && *tmp != 'i') ++tmp;
-    ++tmp;
-    sscanf(tmp, "%d", size);
-    return 0;
-}
-
-int dParseFloatFormat(char *buf, int *num, int *size)
-{
-    char *tmp, *period;
-    
-    tmp = buf;
-    while (*tmp++ != '(') ;
-    *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/
-    while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd'
-	   && *tmp != 'F' && *tmp != 'f') {
-        /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the
-           num picked up refers to P, which should be skipped. */
-        if (*tmp=='p' || *tmp=='P') {
-           ++tmp;
-           *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/
-        } else {
-           ++tmp;
-        }
-    }
-    ++tmp;
-    period = tmp;
-    while (*period != '.' && *period != ')') ++period ;
-    *period = '\0';
-    *size = atoi(tmp); /*sscanf(tmp, "%2d", size);*/
-
-    return 0;
-}
-
-int dReadVector(FILE *fp, int n, int *where, int perline, int persize)
-{
-    register int i, j, item;
-    char tmp, buf[100];
-    
-    i = 0;
-    while (i < n) {
-	fgets(buf, 100, fp);    /* read a line at a time */
-	for (j=0; j<perline && i<n; j++) {
-	    tmp = buf[(j+1)*persize];     /* save the char at that place */
-	    buf[(j+1)*persize] = 0;       /* null terminate */
-	    item = atoi(&buf[j*persize]); 
-	    buf[(j+1)*persize] = tmp;     /* recover the char at that place */
-	    where[i++] = item - 1;
-	}
-    }
-
-    return 0;
-}
-
-int dReadValues(FILE *fp, int n, double *destination, int perline, int persize)
-{
-    register int i, j, k, s;
-    char tmp, buf[100];
-    
-    i = 0;
-    while (i < n) {
-	fgets(buf, 100, fp);    /* read a line at a time */
-	for (j=0; j<perline && i<n; j++) {
-	    tmp = buf[(j+1)*persize];     /* save the char at that place */
-	    buf[(j+1)*persize] = 0;       /* null terminate */
-	    s = j*persize;
-	    for (k = 0; k < persize; ++k) /* No D_ format in C */
-		if ( buf[s+k] == 'D' || buf[s+k] == 'd' ) buf[s+k] = 'E';
-	    destination[i++] = atof(&buf[s]);
-	    buf[(j+1)*persize] = tmp;     /* recover the char at that place */
-	}
-    }
-
-    return 0;
-}
-
-
-
-void
-dreadhb(int *nrow, int *ncol, int *nonz,
-	double **nzval, int **rowind, int **colptr)
-{
-/* 
- * Purpose
- * =======
- * 
- * Read a DOUBLE PRECISION matrix stored in Harwell-Boeing format 
- * as described below.
- * 
- * Line 1 (A72,A8) 
- *  	Col. 1 - 72   Title (TITLE) 
- *	Col. 73 - 80  Key (KEY) 
- * 
- * Line 2 (5I14) 
- * 	Col. 1 - 14   Total number of lines excluding header (TOTCRD) 
- * 	Col. 15 - 28  Number of lines for pointers (PTRCRD) 
- * 	Col. 29 - 42  Number of lines for row (or variable) indices (INDCRD) 
- * 	Col. 43 - 56  Number of lines for numerical values (VALCRD) 
- *	Col. 57 - 70  Number of lines for right-hand sides (RHSCRD) 
- *                    (including starting guesses and solution vectors 
- *		       if present) 
- *           	      (zero indicates no right-hand side data is present) 
- *
- * Line 3 (A3, 11X, 4I14) 
- *   	Col. 1 - 3    Matrix type (see below) (MXTYPE) 
- * 	Col. 15 - 28  Number of rows (or variables) (NROW) 
- * 	Col. 29 - 42  Number of columns (or elements) (NCOL) 
- *	Col. 43 - 56  Number of row (or variable) indices (NNZERO) 
- *	              (equal to number of entries for assembled matrices) 
- * 	Col. 57 - 70  Number of elemental matrix entries (NELTVL) 
- *	              (zero in the case of assembled matrices) 
- * Line 4 (2A16, 2A20) 
- * 	Col. 1 - 16   Format for pointers (PTRFMT) 
- *	Col. 17 - 32  Format for row (or variable) indices (INDFMT) 
- *	Col. 33 - 52  Format for numerical values of coefficient matrix (VALFMT) 
- * 	Col. 53 - 72 Format for numerical values of right-hand sides (RHSFMT) 
- *
- * Line 5 (A3, 11X, 2I14) Only present if there are right-hand sides present 
- *    	Col. 1 	      Right-hand side type: 
- *	         	  F for full storage or M for same format as matrix 
- *    	Col. 2        G if a starting vector(s) (Guess) is supplied. (RHSTYP) 
- *    	Col. 3        X if an exact solution vector(s) is supplied. 
- *	Col. 15 - 28  Number of right-hand sides (NRHS) 
- *	Col. 29 - 42  Number of row indices (NRHSIX) 
- *          	      (ignored in case of unassembled matrices) 
- *
- * The three character type field on line 3 describes the matrix type. 
- * The following table lists the permitted values for each of the three 
- * characters. As an example of the type field, RSA denotes that the matrix 
- * is real, symmetric, and assembled. 
- *
- * First Character: 
- *	R Real matrix 
- *	C Complex matrix 
- *	P Pattern only (no numerical values supplied) 
- *
- * Second Character: 
- *	S Symmetric 
- *	U Unsymmetric 
- *	H Hermitian 
- *	Z Skew symmetric 
- *	R Rectangular 
- *
- * Third Character: 
- *	A Assembled 
- *	E Elemental matrices (unassembled) 
- *
- */
-
-    register int i, numer_lines = 0, rhscrd = 0;
-    int tmp, colnum, colsize, rownum, rowsize, valnum, valsize;
-    char buf[100], type[4];
-#if 0
-    char key[10];
-#endif
-    FILE *fp;
-
-    fp = stdin;
-
-    /* Line 1 */
-    fgets(buf, 100, fp);
-    fputs(buf, stdout);
-#if 0
-    fscanf(fp, "%72c", buf); buf[72] = 0;
-    printf("Title: %s", buf);
-    fscanf(fp, "%8c", key);  key[8] = 0;
-    printf("Key: %s\n", key);
-    dDumpLine(fp);
-#endif
-
-    /* Line 2 */
-    for (i=0; i<5; i++) {
-	fscanf(fp, "%14c", buf); buf[14] = 0;
-	sscanf(buf, "%d", &tmp);
-	if (i == 3) numer_lines = tmp;
-	if (i == 4 && tmp) rhscrd = tmp;
-    }
-    dDumpLine(fp);
-
-    /* Line 3 */
-    fscanf(fp, "%3c", type);
-    fscanf(fp, "%11c", buf); /* pad */
-    type[3] = 0;
-#ifdef DEBUG
-    printf("Matrix type %s\n", type);
-#endif
-    
-    fscanf(fp, "%14c", buf); sscanf(buf, "%d", nrow);
-    fscanf(fp, "%14c", buf); sscanf(buf, "%d", ncol);
-    fscanf(fp, "%14c", buf); sscanf(buf, "%d", nonz);
-    fscanf(fp, "%14c", buf); sscanf(buf, "%d", &tmp);
-    
-    if (tmp != 0)
-	  printf("This is not an assembled matrix!\n");
-    if (*nrow != *ncol)
-	printf("Matrix is not square.\n");
-    dDumpLine(fp);
-
-    /* Allocate storage for the three arrays ( nzval, rowind, colptr ) */
-    dallocateA(*ncol, *nonz, nzval, rowind, colptr);
-
-    /* Line 4: format statement */
-    fscanf(fp, "%16c", buf);
-    dParseIntFormat(buf, &colnum, &colsize);
-    fscanf(fp, "%16c", buf);
-    dParseIntFormat(buf, &rownum, &rowsize);
-    fscanf(fp, "%20c", buf);
-    dParseFloatFormat(buf, &valnum, &valsize);
-    fscanf(fp, "%20c", buf);
-    dDumpLine(fp);
-
-    /* Line 5: right-hand side */    
-    if ( rhscrd ) dDumpLine(fp); /* skip RHSFMT */
-    
-#ifdef DEBUG
-    printf("%d rows, %d nonzeros\n", *nrow, *nonz);
-    printf("colnum %d, colsize %d\n", colnum, colsize);
-    printf("rownum %d, rowsize %d\n", rownum, rowsize);
-    printf("valnum %d, valsize %d\n", valnum, valsize);
-#endif
-    
-    dReadVector(fp, *ncol+1, *colptr, colnum, colsize);
-    dReadVector(fp, *nonz, *rowind, rownum, rowsize);
-    if ( numer_lines ) {
-        dReadValues(fp, *nonz, *nzval, valnum, valsize);
-    }
-    
-    fclose(fp);
-
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dsnode_bmod.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dsnode_bmod.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dsnode_bmod.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dsnode_bmod.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,134 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-/*
-  This file has been modified to be compatible with the HYPRE
-  linear solver
-*/
-
-#include "slu_ddefs.h"
-
-#ifndef HYPRE_USING_HYPRE_BLAS
-#define USE_VENDOR_BLAS
-#endif
-
-/*
- * Function prototypes
- */
-extern int hypre_F90_NAME_BLAS(dtrsv,DTRSV)(char *, char *, char *, int *, double *, int *,
-                                            double *, int *);
-extern int hypre_F90_NAME_BLAS(dgemv,DGEMV)(char *, int *, int *, double *, double *, int *,
-                                            double *, int *, double *, double *, int *);
-void sludlsolve(int, int, double*, double*);
-void sludmatvec(int, int, int, double*, double*, double*);
-
-/*
- * Performs numeric block updates within the relaxed snode. 
- */
-int
-dsnode_bmod (
-	    const int  jcol,	  /* in */
-	    const int  jsupno,    /* in */
-	    const int  fsupc,     /* in */
-	    double     *dense,    /* in */
-	    double     *tempv,    /* working array */
-	    GlobalLU_t *Glu,      /* modified */
-	    SuperLUStat_t *stat   /* output */
-	    )
-{
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-    _fcd ftcs1 = _cptofcd("L", strlen("L")),
-	 ftcs2 = _cptofcd("N", strlen("N")),
-	 ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-    int            incx = 1, incy = 1;
-    double         alpha = -1.0, beta = 1.0;
-#endif
-
-    int            luptr, nsupc, nsupr, nrow;
-    int            isub, irow, i, iptr; 
-    register int   ufirst, nextlu;
-    int            *lsub, *xlsub;
-    double         *lusup;
-    int            *xlusup;
-    flops_t *ops = stat->ops;
-
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    lusup   = Glu->lusup;
-    xlusup  = Glu->xlusup;
-
-    nextlu = xlusup[jcol];
-    
-    /*
-     *	Process the supernodal portion of L\U[*,j]
-     */
-    for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) {
-  	irow = lsub[isub];
-	lusup[nextlu] = dense[irow];
-	dense[irow] = 0;
-	++nextlu;
-    }
-
-    xlusup[jcol + 1] = nextlu;	/* Initialize xlusup for next column */
-    
-    if ( fsupc < jcol ) {
-
-	luptr = xlusup[fsupc];
-	nsupr = xlsub[fsupc+1] - xlsub[fsupc];
-	nsupc = jcol - fsupc;	/* Excluding jcol */
-	ufirst = xlusup[jcol];	/* Points to the beginning of column
-				   jcol in supernode L\U(jsupno). */
-	nrow = nsupr - nsupc;
-
-	ops[TRSV] += nsupc * (nsupc - 1);
-	ops[GEMV] += 2 * nrow * nsupc;
-
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-	STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, 
-	      &lusup[ufirst], &incx );
-	SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, 
-		&lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
-#else
-        hypre_F90_NAME_BLAS(dtrsv,DTRSV)("L","N","U",&nsupc,&lusup[luptr],
-              &nsupr, &lusup[ufirst], &incx );
-        hypre_F90_NAME_BLAS(dgemv,DGEMV)("N",&nrow,&nsupc,&alpha,
-                &lusup[luptr+nsupc], &nsupr,
-                &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
-#endif
-#else
-	sludlsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );
-	sludmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], 
-			&lusup[ufirst], &tempv[0] );
-
-        /* Scatter tempv[*] into lusup[*] */
-	iptr = ufirst + nsupc;
-	for (i = 0; i < nrow; i++) {
-	    lusup[iptr++] -= tempv[i];
-	    tempv[i] = 0.0;
-	}
-#endif
-
-    }
-
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dsnode_dfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dsnode_dfs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dsnode_dfs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dsnode_dfs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,104 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_ddefs.h"
-
-int
-dsnode_dfs (
-	   const int  jcol,	    /* in - start of the supernode */
-	   const int  kcol, 	    /* in - end of the supernode */
-	   const int  *asub,        /* in */
-	   const int  *xa_begin,    /* in */
-	   const int  *xa_end,      /* in */
-	   int        *xprune,      /* out */
-	   int        *marker,      /* modified */
-	   GlobalLU_t *Glu          /* modified */
-	   )
-{
-/* Purpose
- * =======
- *    dsnode_dfs() - Determine the union of the row structures of those 
- *    columns within the relaxed snode.
- *    Note: The relaxed snodes are leaves of the supernodal etree, therefore, 
- *    the portion outside the rectangular supernode must be zero.
- *
- * Return value
- * ============
- *     0   success;
- *    >0   number of bytes allocated when run out of memory.
- *
- */
-    register int i, k, ifrom, ito, nextl, new_next;
-    int          nsuper, krow, kmark, mem_error;
-    int          *xsup, *supno;
-    int          *lsub, *xlsub;
-    int          nzlmax;
-    
-    xsup    = Glu->xsup;
-    supno   = Glu->supno;
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    nzlmax  = Glu->nzlmax;
-
-    nsuper = ++supno[jcol];	/* Next available supernode number */
-    nextl = xlsub[jcol];
-
-    for (i = jcol; i <= kcol; i++) {
-	/* For each nonzero in A[*,i] */
-	for (k = xa_begin[i]; k < xa_end[i]; k++) {	
-	    krow = asub[k];
-	    kmark = marker[krow];
-	    if ( kmark != kcol ) { /* First time visit krow */
-		marker[krow] = kcol;
-		lsub[nextl++] = krow;
-		if ( nextl >= nzlmax ) {
-		    if ((mem_error = dLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu)))
-			return (mem_error);
-		    lsub = Glu->lsub;
-		}
-	    }
-    	}
-	supno[i] = nsuper;
-    }
-
-    /* Supernode > 1, then make a copy of the subscripts for pruning */
-    if ( jcol < kcol ) {
-	new_next = nextl + (nextl - xlsub[jcol]);
-	while ( new_next > nzlmax ) {
-	    if ((mem_error = dLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu)))
-		return (mem_error);
-	    lsub = Glu->lsub;
-	}
-	ito = nextl;
-	for (ifrom = xlsub[jcol]; ifrom < nextl; )
-	    lsub[ito++] = lsub[ifrom++];	
-        for (i = jcol+1; i <= kcol; i++) xlsub[i] = nextl;
-	nextl = ito;
-    }
-
-    xsup[nsuper+1] = kcol + 1;
-    supno[kcol+1]  = nsuper;
-    xprune[kcol]   = nextl;
-    xlsub[kcol+1]  = nextl;
-
-    return 0;
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dsp_blas2.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dsp_blas2.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dsp_blas2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dsp_blas2.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,487 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
- * File name:		dsp_blas2.c
- * Purpose:		Sparse BLAS 2, using some dense BLAS 2 operations.
- */
-/*
-  This file has been modified to be compatible with the HYPRE
-  linear solver
-*/
-
-#include "slu_ddefs.h"
-
-#ifndef HYPRE_USING_HYPRE_BLAS
-#define USE_VENDOR_BLAS
-#endif
-#ifdef __cplusplus
-extern "C" {
-#endif
-extern int hypre_F90_NAME_BLAS(dtrsv,DTRSV)(const char *,const char *,const char *,int *,double *,int *,double *,int *);
-#ifdef __cplusplus
-}
-#endif
-/* 
- * Function prototypes 
- */
-extern void sludusolve(int, int, double*, double*);
-extern void sludlsolve(int, int, double*, double*);
-extern void sludmatvec(int, int, int, double*, double*, double*);
-
-int
-sp_dtrsv(const char *uplo,const char *trans,const char *diag, SuperMatrix *L, 
-         SuperMatrix *U, double *x, SuperLUStat_t *stat, int *info)
-{
-/*
- *   Purpose
- *   =======
- *
- *   sp_dtrsv() solves one of the systems of equations   
- *       A*x = b,   or   A'*x = b,
- *   where b and x are n element vectors and A is a sparse unit , or   
- *   non-unit, upper or lower triangular matrix.   
- *   No test for singularity or near-singularity is included in this   
- *   routine. Such tests must be performed before calling this routine.   
- *
- *   Parameters   
- *   ==========   
- *
- *   uplo   - (input) char*
- *            On entry, uplo specifies whether the matrix is an upper or   
- *             lower triangular matrix as follows:   
- *                uplo = 'U' or 'u'   A is an upper triangular matrix.   
- *                uplo = 'L' or 'l'   A is a lower triangular matrix.   
- *
- *   trans  - (input) char*
- *             On entry, trans specifies the equations to be solved as   
- *             follows:   
- *                trans = 'N' or 'n'   A*x = b.   
- *                trans = 'T' or 't'   A'*x = b.
- *                trans = 'C' or 'c'   A'*x = b.   
- *
- *   diag   - (input) char*
- *             On entry, diag specifies whether or not A is unit   
- *             triangular as follows:   
- *                diag = 'U' or 'u'   A is assumed to be unit triangular.   
- *                diag = 'N' or 'n'   A is not assumed to be unit   
- *                                    triangular.   
- *	     
- *   L       - (input) SuperMatrix*
- *	       The factor L from the factorization Pr*A*Pc=L*U. Use
- *             compressed row subscripts storage for supernodes,
- *             i.e., L has types: Stype = SC, Dtype = SLU_D, Mtype = TRLU.
- *
- *   U       - (input) SuperMatrix*
- *	        The factor U from the factorization Pr*A*Pc=L*U.
- *	        U has types: Stype = NC, Dtype = SLU_D, Mtype = TRU.
- *    
- *   x       - (input/output) double*
- *             Before entry, the incremented array X must contain the n   
- *             element right-hand side vector b. On exit, X is overwritten 
- *             with the solution vector x.
- *
- *   info    - (output) int*
- *             If *info = -i, the i-th argument had an illegal value.
- *
- */
-#ifdef _CRAY
-    _fcd ftcs1 = _cptofcd("L", strlen("L")),
-	 ftcs2 = _cptofcd("N", strlen("N")),
-	 ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-    SCformat *Lstore;
-    NCformat *Ustore;
-    double   *Lval, *Uval;
-    int incx = 1;
-#ifdef USE_VENDOR_BLAS
-    int incy = 1;
-    double alpha = 1.0, beta = 1.0;
-#endif
-    int nrow;
-    int fsupc, nsupr, nsupc, luptr, istart, irow;
-    int i, k, iptr, jcol;
-    double *work;
-    flops_t solve_ops;
-
-    /* Test the input parameters */
-    *info = 0;
-    if ( !superlu_lsame(uplo,"L") && !superlu_lsame(uplo, "U") ) *info = -1;
-    else if ( !superlu_lsame(trans, "N") && !superlu_lsame(trans, "T") && 
-              !superlu_lsame(trans, "C")) *info = -2;
-    else if ( !superlu_lsame(diag, "U") && !superlu_lsame(diag, "N") ) *info = -3;
-    else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4;
-    else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5;
-    if ( *info ) {
-	i = -(*info);
-	superlu_xerbla("sp_dtrsv", &i);
-	return 0;
-    }
-
-    Lstore = (SCformat*) L->Store;
-    Lval = (double*) Lstore->nzval;
-    Ustore = (NCformat*) U->Store;
-    Uval = (double*) Ustore->nzval;
-    solve_ops = 0;
-
-    if ( !(work = doubleCalloc(L->nrow)) )
-	ABORT("Malloc fails for work in sp_dtrsv().");
-    
-    if ( superlu_lsame(trans, "N") ) {	/* Form x := inv(A)*x. */
-	
-	if ( superlu_lsame(uplo, "L") ) {
-	    /* Form x := inv(L)*x */
-    	    if ( L->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = 0; k <= Lstore->nsuper; k++) {
-		fsupc = L_FST_SUPC(k);
-		istart = L_SUB_START(fsupc);
-		nsupr = L_SUB_START(fsupc+1) - istart;
-		nsupc = L_FST_SUPC(k+1) - fsupc;
-		luptr = L_NZ_START(fsupc);
-		nrow = nsupr - nsupc;
-
-	        solve_ops += nsupc * (nsupc - 1);
-	        solve_ops += 2 * nrow * nsupc;
-
-		if ( nsupc == 1 ) {
-		    for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) {
-			irow = L_SUB(iptr);
-			++luptr;
-			x[irow] -= x[fsupc] * Lval[luptr];
-		    }
-		} else {
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
-		       	&x[fsupc], &incx);
-		
-		    SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
-		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
-#else
-                    hypre_F90_NAME_BLAS(dtrsv,DTRSV)("L","N","U", &nsupc, 
-                        &Lval[luptr], &nsupr, &x[fsupc], &incx);
-
-                    hypre_F90_NAME_BLAS(dgemv,DGEMV)("N",&nrow, &nsupc, 
-                        &alpha, &Lval[luptr+nsupc],
-                        &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
-#endif
-#else
-		    sludlsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]);
-		
-		    sludmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
-                             &x[fsupc], &work[0] );
-#endif		
-		
-		    iptr = istart + nsupc;
-		    for (i = 0; i < nrow; ++i, ++iptr) {
-			irow = L_SUB(iptr);
-			x[irow] -= work[i];	/* Scatter */
-			work[i] = 0.0;
-
-		    }
-	 	}
-	    } /* for k ... */
-	    
-	} else {
-	    /* Form x := inv(U)*x */
-	    
-	    if ( U->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = Lstore->nsuper; k >= 0; k--) {
-	    	fsupc = L_FST_SUPC(k);
-	    	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
-	    	nsupc = L_FST_SUPC(k+1) - fsupc;
-	    	luptr = L_NZ_START(fsupc);
-		
-    	        solve_ops += nsupc * (nsupc + 1);
-
-		if ( nsupc == 1 ) {
-		    x[fsupc] /= Lval[luptr];
-		    for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) {
-			irow = U_SUB(i);
-			x[irow] -= x[fsupc] * Uval[i];
-		    }
-		} else {
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		    STRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr,
-		       &x[fsupc], &incx);
-#else
-                    hypre_F90_NAME_BLAS(dtrsv,DTRSV)("U","N","N",&nsupc,
-                        &Lval[luptr], &nsupr, &x[fsupc], &incx);
-#endif
-#else		
-		    sludusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] );
-#endif		
-
-		    for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
-		        solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
-		    	for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); 
-				i++) {
-			    irow = U_SUB(i);
-			    x[irow] -= x[jcol] * Uval[i];
-		    	}
-                    }
-		}
-	    } /* for k ... */
-	    
-	}
-    } else { /* Form x := inv(A')*x */
-	
-	if ( superlu_lsame(uplo, "L") ) {
-	    /* Form x := inv(L')*x */
-    	    if ( L->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = Lstore->nsuper; k >= 0; --k) {
-	    	fsupc = L_FST_SUPC(k);
-	    	istart = L_SUB_START(fsupc);
-	    	nsupr = L_SUB_START(fsupc+1) - istart;
-	    	nsupc = L_FST_SUPC(k+1) - fsupc;
-	    	luptr = L_NZ_START(fsupc);
-
-		solve_ops += 2 * (nsupr - nsupc) * nsupc;
-
-		for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
-		    iptr = istart + nsupc;
-		    for (i = L_NZ_START(jcol) + nsupc; 
-				i < L_NZ_START(jcol+1); i++) {
-			irow = L_SUB(iptr);
-			x[jcol] -= x[irow] * Lval[i];
-			iptr++;
-		    }
-		}
-		
-		if ( nsupc > 1 ) {
-		    solve_ops += nsupc * (nsupc - 1);
-#ifdef _CRAY
-                    ftcs1 = _cptofcd("L", strlen("L"));
-                    ftcs2 = _cptofcd("T", strlen("T"));
-                    ftcs3 = _cptofcd("U", strlen("U"));
-		    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
-			&x[fsupc], &incx);
-#else
-                    hypre_F90_NAME_BLAS(dtrsv,DTRSV)("L","T","U",&nsupc, 
-                        &Lval[luptr], &nsupr, &x[fsupc], &incx);
-#endif
-		}
-	    }
-	} else {
-	    /* Form x := inv(U')*x */
-	    if ( U->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = 0; k <= Lstore->nsuper; k++) {
-	    	fsupc = L_FST_SUPC(k);
-	    	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
-	    	nsupc = L_FST_SUPC(k+1) - fsupc;
-	    	luptr = L_NZ_START(fsupc);
-
-		for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
-		    solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
-		    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
-			irow = U_SUB(i);
-			x[jcol] -= x[irow] * Uval[i];
-		    }
-		}
-
-		solve_ops += nsupc * (nsupc + 1);
-
-		if ( nsupc == 1 ) {
-		    x[fsupc] /= Lval[luptr];
-		} else {
-#ifdef _CRAY
-                    ftcs1 = _cptofcd("U", strlen("U"));
-                    ftcs2 = _cptofcd("T", strlen("T"));
-                    ftcs3 = _cptofcd("N", strlen("N"));
-		    STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
-			    &x[fsupc], &incx);
-#else
-                    hypre_F90_NAME_BLAS(dtrsv,DTRSV)("U","T","N", &nsupc, 
-                           &Lval[luptr], &nsupr, &x[fsupc], &incx);
-#endif
-		}
-	    } /* for k ... */
-	}
-    }
-
-    stat->ops[SOLVE] += solve_ops;
-    SUPERLU_FREE(work);
-    return 0;
-}
-
-
-
-
-int
-sp_dgemv(const char *trans, double alpha, SuperMatrix *A, double *x, 
-	 int incx, double beta, double *y, int incy)
-{
-/*  Purpose   
-    =======   
-
-    sp_dgemv()  performs one of the matrix-vector operations   
-       y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   
-    where alpha and beta are scalars, x and y are vectors and A is a
-    sparse A->nrow by A->ncol matrix.   
-
-    Parameters   
-    ==========   
-
-    TRANS  - (input) char*
-             On entry, TRANS specifies the operation to be performed as   
-             follows:   
-                TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.   
-                TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.   
-                TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.   
-
-    ALPHA  - (input) double
-             On entry, ALPHA specifies the scalar alpha.   
-
-    A      - (input) SuperMatrix*
-             Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
-             Currently, the type of A can be:
-                 Stype = NC or NCP; Dtype = SLU_D; Mtype = GE. 
-             In the future, more general A can be handled.
-
-    X      - (input) double*, array of DIMENSION at least   
-             ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'   
-             and at least   
-             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.   
-             Before entry, the incremented array X must contain the   
-             vector x.   
-
-    INCX   - (input) int
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-
-    BETA   - (input) double
-             On entry, BETA specifies the scalar beta. When BETA is   
-             supplied as zero then Y need not be set on input.   
-
-    Y      - (output) double*,  array of DIMENSION at least   
-             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'   
-             and at least   
-             ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.   
-             Before entry with BETA non-zero, the incremented array Y   
-             must contain the vector y. On exit, Y is overwritten by the 
-             updated vector y.
-	     
-    INCY   - (input) int
-             On entry, INCY specifies the increment for the elements of   
-             Y. INCY must not be zero.   
-
-    ==== Sparse Level 2 Blas routine.   
-*/
-
-    /* Local variables */
-    NCformat *Astore;
-    double   *Aval;
-    int info;
-    double temp;
-    int lenx, leny, i, j, irow;
-    int iy, jx, jy, kx, ky;
-    int notran;
-
-    notran = superlu_lsame(trans, "N");
-    Astore = (NCformat*) A->Store;
-    Aval = (double*) Astore->nzval;
-    
-    /* Test the input parameters */
-    info = 0;
-    if ( !notran && !superlu_lsame(trans, "T") && !superlu_lsame(trans, "C")) info = 1;
-    else if ( A->nrow < 0 || A->ncol < 0 ) info = 3;
-    else if (incx == 0) info = 5;
-    else if (incy == 0)	info = 8;
-    if (info != 0) {
-	superlu_xerbla("sp_dgemv ", &info);
-	return 0;
-    }
-
-    /* Quick return if possible. */
-    if (A->nrow == 0 || A->ncol == 0 || (alpha == 0. && beta == 1.))
-	return 0;
-
-    /* Set  LENX  and  LENY, the lengths of the vectors x and y, and set 
-       up the start points in  X  and  Y. */
-    if (superlu_lsame(trans, "N")) {
-	lenx = A->ncol;
-	leny = A->nrow;
-    } else {
-	lenx = A->nrow;
-	leny = A->ncol;
-    }
-    if (incx > 0) kx = 0;
-    else kx =  - (lenx - 1) * incx;
-    if (incy > 0) ky = 0;
-    else ky =  - (leny - 1) * incy;
-
-    /* Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through A. */
-    /* First form  y := beta*y. */
-    if (beta != 1.) {
-	if (incy == 1) {
-	    if (beta == 0.)
-		for (i = 0; i < leny; ++i) y[i] = 0.;
-	    else
-		for (i = 0; i < leny; ++i) y[i] = beta * y[i];
-	} else {
-	    iy = ky;
-	    if (beta == 0.)
-		for (i = 0; i < leny; ++i) {
-		    y[iy] = 0.;
-		    iy += incy;
-		}
-	    else
-		for (i = 0; i < leny; ++i) {
-		    y[iy] = beta * y[iy];
-		    iy += incy;
-		}
-	}
-    }
-    
-    if (alpha == 0.) return 0;
-
-    if ( notran ) {
-	/* Form  y := alpha*A*x + y. */
-	jx = kx;
-	if (incy == 1) {
-	    for (j = 0; j < A->ncol; ++j) {
-		if (x[jx] != 0.) {
-		    temp = alpha * x[jx];
-		    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-			irow = Astore->rowind[i];
-			y[irow] += temp * Aval[i];
-		    }
-		}
-		jx += incx;
-	    }
-	} else {
-	    ABORT("Not implemented.");
-	}
-    } else {
-	/* Form  y := alpha*A'*x + y. */
-	jy = ky;
-	if (incx == 1) {
-	    for (j = 0; j < A->ncol; ++j) {
-		temp = 0.;
-		for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-		    irow = Astore->rowind[i];
-		    temp += Aval[i] * x[irow];
-		}
-		y[jy] += alpha * temp;
-		jy += incy;
-	    }
-	} else {
-	    ABORT("Not implemented.");
-	}
-    }
-    return 0;
-} /* sp_dgemv */
-
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dsp_blas3.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dsp_blas3.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dsp_blas3.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dsp_blas3.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,119 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * File name:		sp_blas3.c
- * Purpose:		Sparse BLAS3, using some dense BLAS3 operations.
- */
-
-#include "slu_ddefs.h"
-
-int
-sp_dgemm(const char *transa,const char *transb, int m, int n, int k, 
-         double alpha, SuperMatrix *A, double *b, int ldb, 
-         double beta, double *c, int ldc)
-{
-/*  Purpose   
-    =======   
-
-    sp_d performs one of the matrix-matrix operations   
-
-       C := alpha*op( A )*op( B ) + beta*C,   
-
-    where  op( X ) is one of 
-
-       op( X ) = X   or   op( X ) = X'   or   op( X ) = conjg( X' ),
-
-    alpha and beta are scalars, and A, B and C are matrices, with op( A ) 
-    an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix. 
-  
-
-    Parameters   
-    ==========   
-
-    TRANSA - (input) char*
-             On entry, TRANSA specifies the form of op( A ) to be used in 
-             the matrix multiplication as follows:   
-                TRANSA = 'N' or 'n',  op( A ) = A.   
-                TRANSA = 'T' or 't',  op( A ) = A'.   
-                TRANSA = 'C' or 'c',  op( A ) = conjg( A' ).   
-             Unchanged on exit.   
-
-    TRANSB - (input) char*
-             On entry, TRANSB specifies the form of op( B ) to be used in 
-             the matrix multiplication as follows:   
-                TRANSB = 'N' or 'n',  op( B ) = B.   
-                TRANSB = 'T' or 't',  op( B ) = B'.   
-                TRANSB = 'C' or 'c',  op( B ) = conjg( B' ).   
-             Unchanged on exit.   
-
-    M      - (input) int   
-             On entry,  M  specifies  the number of rows of the matrix 
-	     op( A ) and of the matrix C.  M must be at least zero. 
-	     Unchanged on exit.   
-
-    N      - (input) int
-             On entry,  N specifies the number of columns of the matrix 
-	     op( B ) and the number of columns of the matrix C. N must be 
-	     at least zero.
-	     Unchanged on exit.   
-
-    K      - (input) int
-             On entry, K specifies the number of columns of the matrix 
-	     op( A ) and the number of rows of the matrix op( B ). K must 
-	     be at least  zero.   
-             Unchanged on exit.
-	     
-    ALPHA  - (input) double
-             On entry, ALPHA specifies the scalar alpha.   
-
-    A      - (input) SuperMatrix*
-             Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
-             Currently, the type of A can be:
-                 Stype = NC or NCP; Dtype = SLU_D; Mtype = GE. 
-             In the future, more general A can be handled.
-
-    B      - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is 
-             n when TRANSB = 'N' or 'n',  and is  k otherwise.   
-             Before entry with  TRANSB = 'N' or 'n',  the leading k by n 
-             part of the array B must contain the matrix B, otherwise 
-             the leading n by k part of the array B must contain the 
-             matrix B.   
-             Unchanged on exit.   
-
-    LDB    - (input) int
-             On entry, LDB specifies the first dimension of B as declared 
-             in the calling (sub) program. LDB must be at least max( 1, n ).  
-             Unchanged on exit.   
-
-    BETA   - (input) double
-             On entry, BETA specifies the scalar beta. When BETA is   
-             supplied as zero then C need not be set on input.   
-
-    C      - DOUBLE PRECISION array of DIMENSION ( LDC, n ).   
-             Before entry, the leading m by n part of the array C must 
-             contain the matrix C,  except when beta is zero, in which 
-             case C need not be set on entry.   
-             On exit, the array C is overwritten by the m by n matrix 
-	     ( alpha*op( A )*B + beta*C ).   
-
-    LDC    - (input) int
-             On entry, LDC specifies the first dimension of C as declared 
-             in the calling (sub)program. LDC must be at least max(1,m).   
-             Unchanged on exit.   
-
-    ==== Sparse Level 3 Blas routine.   
-*/
-    int    incx = 1, incy = 1;
-    int    j;
-
-    for (j = 0; j < n; ++j) {
-	sp_dgemv(transa, alpha, A, &b[ldb*j], incx, beta, &c[ldc*j], incy);
-    }
-    return 0;    
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dutil.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dutil.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dutil.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dutil.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,478 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include <math.h>
-#include "slu_ddefs.h"
-
-void
-dCreate_CompCol_Matrix(SuperMatrix *A, int m, int n, int nnz, 
-		       double *nzval, int *rowind, int *colptr,
-		       Stype_t stype, Dtype_t dtype, Mtype_t mtype)
-{
-    NCformat *Astore;
-
-    A->Stype = stype;
-    A->Dtype = dtype;
-    A->Mtype = mtype;
-    A->nrow = m;
-    A->ncol = n;
-    A->Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) );
-    if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store");
-    Astore = (NCformat*) A->Store;
-    Astore->nnz = nnz;
-    Astore->nzval = nzval;
-    Astore->rowind = rowind;
-    Astore->colptr = colptr;
-}
-
-void
-dCreate_CompRow_Matrix(SuperMatrix *A, int m, int n, int nnz, 
-		       double *nzval, int *colind, int *rowptr,
-		       Stype_t stype, Dtype_t dtype, Mtype_t mtype)
-{
-    NRformat *Astore;
-
-    A->Stype = stype;
-    A->Dtype = dtype;
-    A->Mtype = mtype;
-    A->nrow = m;
-    A->ncol = n;
-    A->Store = (void *) SUPERLU_MALLOC( sizeof(NRformat) );
-    if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store");
-    Astore = (NRformat*) A->Store;
-    Astore->nnz = nnz;
-    Astore->nzval = nzval;
-    Astore->colind = colind;
-    Astore->rowptr = rowptr;
-}
-
-/* Copy matrix A into matrix B. */
-void
-dCopy_CompCol_Matrix(SuperMatrix *A, SuperMatrix *B)
-{
-    NCformat *Astore, *Bstore;
-    int      ncol, nnz, i;
-
-    B->Stype = A->Stype;
-    B->Dtype = A->Dtype;
-    B->Mtype = A->Mtype;
-    B->nrow  = A->nrow;;
-    B->ncol  = ncol = A->ncol;
-    Astore   = (NCformat *) A->Store;
-    Bstore   = (NCformat *) B->Store;
-    Bstore->nnz = nnz = Astore->nnz;
-    for (i = 0; i < nnz; ++i)
-	((double *)Bstore->nzval)[i] = ((double *)Astore->nzval)[i];
-    for (i = 0; i < nnz; ++i) Bstore->rowind[i] = Astore->rowind[i];
-    for (i = 0; i <= ncol; ++i) Bstore->colptr[i] = Astore->colptr[i];
-}
-
-
-void
-dCreate_Dense_Matrix(SuperMatrix *X, int m, int n, double *x, int ldx,
-		    Stype_t stype, Dtype_t dtype, Mtype_t mtype)
-{
-    DNformat    *Xstore;
-    
-    X->Stype = stype;
-    X->Dtype = dtype;
-    X->Mtype = mtype;
-    X->nrow = m;
-    X->ncol = n;
-    X->Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) );
-    if ( !(X->Store) ) ABORT("SUPERLU_MALLOC fails for X->Store");
-    Xstore = (DNformat *) X->Store;
-    Xstore->lda = ldx;
-    Xstore->nzval = (double *) x;
-}
-
-void
-dCopy_Dense_Matrix(int M, int N, double *X, int ldx,
-			double *Y, int ldy)
-{
-/*
- *
- *  Purpose
- *  =======
- *
- *  Copies a two-dimensional matrix X to another matrix Y.
- */
-    int    i, j;
-    
-    for (j = 0; j < N; ++j)
-        for (i = 0; i < M; ++i)
-            Y[i + j*ldy] = X[i + j*ldx];
-}
-
-void
-dCreate_SuperNode_Matrix(SuperMatrix *L, int m, int n, int nnz, 
-			double *nzval, int *nzval_colptr, int *rowind,
-			int *rowind_colptr, int *col_to_sup, int *sup_to_col,
-			Stype_t stype, Dtype_t dtype, Mtype_t mtype)
-{
-    SCformat *Lstore;
-
-    L->Stype = stype;
-    L->Dtype = dtype;
-    L->Mtype = mtype;
-    L->nrow = m;
-    L->ncol = n;
-    L->Store = (void *) SUPERLU_MALLOC( sizeof(SCformat) );
-    if ( !(L->Store) ) ABORT("SUPERLU_MALLOC fails for L->Store");
-    Lstore = (SCformat*) L->Store;
-    Lstore->nnz = nnz;
-    Lstore->nsuper = col_to_sup[n];
-    Lstore->nzval = nzval;
-    Lstore->nzval_colptr = nzval_colptr;
-    Lstore->rowind = rowind;
-    Lstore->rowind_colptr = rowind_colptr;
-    Lstore->col_to_sup = col_to_sup;
-    Lstore->sup_to_col = sup_to_col;
-
-}
-
-
-/*
- * Convert a row compressed storage into a column compressed storage.
- */
-void
-dCompRow_to_CompCol(int m, int n, int nnz, 
-		    double *a, int *colind, int *rowptr,
-		    double **at, int **rowind, int **colptr)
-{
-    register int i, j, col, relpos;
-    int *marker;
-
-    /* Allocate storage for another copy of the matrix. */
-    *at = (double *) doubleMalloc(nnz);
-    *rowind = (int *) intMalloc(nnz);
-    *colptr = (int *) intMalloc(n+1);
-    marker = (int *) intCalloc(n);
-    
-    /* Get counts of each column of A, and set up column pointers */
-    for (i = 0; i < m; ++i)
-	for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]];
-    (*colptr)[0] = 0;
-    for (j = 0; j < n; ++j) {
-	(*colptr)[j+1] = (*colptr)[j] + marker[j];
-	marker[j] = (*colptr)[j];
-    }
-
-    /* Transfer the matrix into the compressed column storage. */
-    for (i = 0; i < m; ++i) {
-	for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
-	    col = colind[j];
-	    relpos = marker[col];
-	    (*rowind)[relpos] = i;
-	    (*at)[relpos] = a[j];
-	    ++marker[col];
-	}
-    }
-
-    SUPERLU_FREE(marker);
-}
-
-
-void
-dPrint_CompCol_Matrix(char *what, SuperMatrix *A)
-{
-    NCformat     *Astore;
-    register int i,n;
-    double       *dp;
-    
-    printf("\nCompCol matrix %s:\n", what);
-    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
-    n = A->ncol;
-    Astore = (NCformat *) A->Store;
-    dp = (double *) Astore->nzval;
-    printf("nrow %d, ncol %d, nnz %d\n", A->nrow,A->ncol,Astore->nnz);
-    printf("nzval: ");
-    for (i = 0; i < Astore->colptr[n]; ++i) printf("%f  ", dp[i]);
-    printf("\nrowind: ");
-    for (i = 0; i < Astore->colptr[n]; ++i) printf("%d  ", Astore->rowind[i]);
-    printf("\ncolptr: ");
-    for (i = 0; i <= n; ++i) printf("%d  ", Astore->colptr[i]);
-    printf("\n");
-    fflush(stdout);
-}
-
-void
-dPrint_SuperNode_Matrix(char *what, SuperMatrix *A)
-{
-    SCformat     *Astore;
-    register int i, j, k, c, d, n, nsup;
-    double       *dp;
-    int *col_to_sup, *sup_to_col, *rowind, *rowind_colptr;
-    
-    printf("\nSuperNode matrix %s:\n", what);
-    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
-    n = A->ncol;
-    Astore = (SCformat *) A->Store;
-    dp = (double *) Astore->nzval;
-    col_to_sup = Astore->col_to_sup;
-    sup_to_col = Astore->sup_to_col;
-    rowind_colptr = Astore->rowind_colptr;
-    rowind = Astore->rowind;
-    printf("nrow %d, ncol %d, nnz %d, nsuper %d\n", 
-	   A->nrow,A->ncol,Astore->nnz,Astore->nsuper);
-    printf("nzval:\n");
-    for (k = 0; k <= Astore->nsuper; ++k) {
-      c = sup_to_col[k];
-      nsup = sup_to_col[k+1] - c;
-      for (j = c; j < c + nsup; ++j) {
-	d = Astore->nzval_colptr[j];
-	for (i = rowind_colptr[c]; i < rowind_colptr[c+1]; ++i) {
-	  printf("%d\t%d\t%e\n", rowind[i], j, dp[d++]);
-	}
-      }
-    }
-#if 0
-    for (i = 0; i < Astore->nzval_colptr[n]; ++i) printf("%f  ", dp[i]);
-#endif
-    printf("\nnzval_colptr: ");
-    for (i = 0; i <= n; ++i) printf("%d  ", Astore->nzval_colptr[i]);
-    printf("\nrowind: ");
-    for (i = 0; i < Astore->rowind_colptr[n]; ++i) 
-        printf("%d  ", Astore->rowind[i]);
-    printf("\nrowind_colptr: ");
-    for (i = 0; i <= n; ++i) printf("%d  ", Astore->rowind_colptr[i]);
-    printf("\ncol_to_sup: ");
-    for (i = 0; i < n; ++i) printf("%d  ", col_to_sup[i]);
-    printf("\nsup_to_col: ");
-    for (i = 0; i <= Astore->nsuper+1; ++i) 
-        printf("%d  ", sup_to_col[i]);
-    printf("\n");
-    fflush(stdout);
-}
-
-void
-dPrint_Dense_Matrix(char *what, SuperMatrix *A)
-{
-    DNformat     *Astore;
-    register int i, j, lda;
-    double       *dp;
-    
-    printf("\nDense matrix %s:\n", what);
-    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
-    Astore = (DNformat *) A->Store;
-    lda = Astore->lda;
-    dp = (double *) Astore->nzval;
-    printf("nrow %d, ncol %d, lda %d\n", A->nrow,A->ncol,lda);
-    printf("\nnzval: ");
-    for (j = 0; j < A->ncol; ++j) {
-        for (i = 0; i < A->nrow; ++i) printf("%f  ", dp[i + j*lda]);
-        printf("\n");
-    }
-    printf("\n");
-    fflush(stdout);
-}
-
-/*
- * Diagnostic print of column "jcol" in the U/L factor.
- */
-void
-dprint_lu_col(char *msg, int jcol, int pivrow, int *xprune, GlobalLU_t *Glu)
-{
-    int     i, k, fsupc;
-    int     *xsup, *supno;
-    int     *xlsub, *lsub;
-    double  *lusup;
-    int     *xlusup;
-    double  *ucol;
-    int     *usub, *xusub;
-
-    xsup    = Glu->xsup;
-    supno   = Glu->supno;
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    lusup   = Glu->lusup;
-    xlusup  = Glu->xlusup;
-    ucol    = Glu->ucol;
-    usub    = Glu->usub;
-    xusub   = Glu->xusub;
-    
-    printf("%s", msg);
-    printf("col %d: pivrow %d, supno %d, xprune %d\n", 
-	   jcol, pivrow, supno[jcol], xprune[jcol]);
-    
-    printf("\tU-col:\n");
-    for (i = xusub[jcol]; i < xusub[jcol+1]; i++)
-	printf("\t%d%10.4f\n", usub[i], ucol[i]);
-    printf("\tL-col in rectangular snode:\n");
-    fsupc = xsup[supno[jcol]];	/* first col of the snode */
-    i = xlsub[fsupc];
-    k = xlusup[jcol];
-    while ( i < xlsub[fsupc+1] && k < xlusup[jcol+1] ) {
-	printf("\t%d\t%10.4f\n", lsub[i], lusup[k]);
-	i++; k++;
-    }
-    fflush(stdout);
-}
-
-
-/*
- * Check whether tempv[] == 0. This should be true before and after 
- * calling any numeric routines, i.e., "panel_bmod" and "column_bmod". 
- */
-void dcheck_tempv(int n, double *tempv)
-{
-    int i;
-	
-    for (i = 0; i < n; i++) {
-	if (tempv[i] != 0.0) 
-	{
-	    fprintf(stderr,"tempv[%d] = %f\n", i,tempv[i]);
-	    ABORT("dcheck_tempv");
-	}
-    }
-}
-
-
-void
-dGenXtrue(int n, int nrhs, double *x, int ldx)
-{
-    int  i, j;
-    for (j = 0; j < nrhs; ++j)
-	for (i = 0; i < n; ++i) {
-	    x[i + j*ldx] = 1.0;/* + (double)(i+1.)/n;*/
-	}
-}
-
-/*
- * Let rhs[i] = sum of i-th row of A, so the solution vector is all 1's
- */
-void
-dFillRHS(trans_t trans, int nrhs, double *x, int ldx,
-         SuperMatrix *A, SuperMatrix *B)
-{
-    DNformat *Bstore;
-    double   *rhs;
-    double one = 1.0;
-    double zero = 0.0;
-    int      ldc;
-    char transc[1];
-
-    Bstore = (DNformat*) B->Store;
-    rhs    = (double*) Bstore->nzval;
-    ldc    = Bstore->lda;
-    
-    if ( trans == NOTRANS ) *(unsigned char *)transc = 'N';
-    else *(unsigned char *)transc = 'T';
-
-    sp_dgemm(transc, "N", A->nrow, nrhs, A->ncol, one, A,
-	     x, ldx, zero, rhs, ldc);
-
-}
-
-/* 
- * Fills a double precision array with a given value.
- */
-void 
-dfill(double *a, int alen, double dval)
-{
-    register int i;
-    for (i = 0; i < alen; i++) a[i] = dval;
-}
-
-
-
-/* 
- * Check the inf-norm of the error vector 
- */
-void dinf_norm_error(int nrhs, SuperMatrix *X, double *xtrue)
-{
-    DNformat *Xstore;
-    double err, xnorm;
-    double *Xmat, *soln_work;
-    int i, j;
-
-    Xstore = (DNformat*) X->Store;
-    Xmat = (double*) Xstore->nzval;
-
-    for (j = 0; j < nrhs; j++) {
-      soln_work = &Xmat[j*Xstore->lda];
-      err = xnorm = 0.0;
-      for (i = 0; i < X->nrow; i++) {
-	err = SUPERLU_MAX(err, fabs(soln_work[i] - xtrue[i]));
-	xnorm = SUPERLU_MAX(xnorm, fabs(soln_work[i]));
-      }
-      err = err / xnorm;
-      printf("||X - Xtrue||/||X|| = %e\n", err);
-    }
-}
-
-
-
-/* Print performance of the code. */
-void
-dPrintPerf(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage,
-           double rpg, double rcond, double *ferr,
-           double *berr, char *equed, SuperLUStat_t *stat)
-{
-    SCformat *Lstore;
-    NCformat *Ustore;
-    double   *utime;
-    flops_t  *ops;
-    
-    utime = stat->utime;
-    ops   = stat->ops;
-    
-    if ( utime[FACT] != 0. )
-	printf("Factor flops = %e\tMflops = %8.2f\n", ops[FACT],
-	       ops[FACT]*1e-6/utime[FACT]);
-    printf("Identify relaxed snodes	= %8.2f\n", utime[RELAX]);
-    if ( utime[SOLVE] != 0. )
-	printf("Solve flops = %.0f, Mflops = %8.2f\n", ops[SOLVE],
-	       ops[SOLVE]*1e-6/utime[SOLVE]);
-    
-    Lstore = (SCformat *) L->Store;
-    Ustore = (NCformat *) U->Store;
-    printf("\tNo of nonzeros in factor L = %d\n", Lstore->nnz);
-    printf("\tNo of nonzeros in factor U = %d\n", Ustore->nnz);
-    printf("\tNo of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz);
-	
-    printf("L\\U MB %.3f\ttotal MB needed %.3f\texpansions %d\n",
-	   mem_usage->for_lu/1e6, mem_usage->total_needed/1e6,
-	   mem_usage->expansions);
-	
-    printf("\tFactor\tMflops\tSolve\tMflops\tEtree\tEquil\tRcond\tRefine\n");
-    printf("PERF:%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f\n",
-	   utime[FACT], ops[FACT]*1e-6/utime[FACT],
-	   utime[SOLVE], ops[SOLVE]*1e-6/utime[SOLVE],
-	   utime[ETREE], utime[EQUIL], utime[RCOND], utime[REFINE]);
-    
-    printf("\tRpg\t\tRcond\t\tFerr\t\tBerr\t\tEquil?\n");
-    printf("NUM:\t%e\t%e\t%e\t%e\t%s\n",
-	   rpg, rcond, ferr[0], berr[0], equed);
-    
-}
-
-
-
-
-int print_double_vec(char *what, int n, double *vec)
-{
-    int i;
-    printf("%s: n %d\n", what, n);
-    for (i = 0; i < n; ++i) printf("%d\t%f\n", i, vec[i]);
-    return 0;
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dzsum1.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dzsum1.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/dzsum1.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/dzsum1.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,84 +0,0 @@
-#include "slu_Cnames.h"
-#include "slu_dcomplex.h"
-
-double dzsum1_(int *n, doublecomplex *cx, int *incx)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-    Purpose   
-    =======   
-
-    DZSUM1 takes the sum of the absolute values of a complex   
-    vector and returns a double precision result.   
-
-    Based on DZASUM from the Level 1 BLAS.   
-    The change is to use the 'genuine' absolute value.   
-
-    Contributed by Nick Higham for use with ZLACON.   
-
-    Arguments   
-    =========   
-
-    N       (input) INT   
-            The number of elements in the vector CX.   
-
-    CX      (input) COMPLEX*16 array, dimension (N)   
-            The vector whose elements will be summed.   
-
-    INCX    (input) INT   
-            The spacing between successive values of CX.  INCX > 0.   
-
-    ===================================================================== 
-*/  
-
-    /* Builtin functions */
-    double z_abs(doublecomplex *);
-    
-    /* Local variables */
-    int i, nincx;
-    double stemp;
-
-
-#define CX(I) cx[(I)-1]
-
-    stemp = 0.;
-    if (*n <= 0) {
-	return stemp;
-    }
-    if (*incx == 1) {
-	goto L20;
-    }
-
-    /*     CODE FOR INCREMENT NOT EQUAL TO 1 */
-
-    nincx = *n * *incx;
-    for (i = 1; *incx < 0 ? i >= nincx : i <= nincx; i += *incx) {
-
-	/*        NEXT LINE MODIFIED. */
-
-	stemp += z_abs(&CX(i));
-/* L10: */
-    }
-    
-    return stemp;
-
-    /*     CODE FOR INCREMENT EQUAL TO 1 */
-
-L20:
-    for (i = 1; i <= *n; ++i) {
-
-	/*        NEXT LINE MODIFIED. */
-
-	stemp += z_abs(&CX(i));
-/* L30: */
-    }
-    
-    return stemp;
-
-    /*     End of DZSUM1 */
-
-} /* dzsum1_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/get_perm_c.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/get_perm_c.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/get_perm_c.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/get_perm_c.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,451 +0,0 @@
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-#include "slu_ddefs.h"
-#include "slu_util.h"
-#include "colamd.h"
-
-extern int  genmmd_(int *, int *, int *, int *, int *, int *, int *, 
-		    int *, int *, int *, int *, int *);
-
-void
-get_colamd(
-	   const int m,  /* number of rows in matrix A. */
-	   const int n,  /* number of columns in matrix A. */
-	   const int nnz,/* number of nonzeros in matrix A. */
-	   int *colptr,  /* column pointer of size n+1 for matrix A. */
-	   int *rowind,  /* row indices of size nz for matrix A. */
-	   int *perm_c   /* out - the column permutation vector. */
-	   )
-{
-    int Alen, *A, i, info, *p;
-    double knobs[COLAMD_KNOBS];
-    int stats[COLAMD_STATS];
-
-    Alen = colamd_recommended(nnz, m, n);
-
-    colamd_set_defaults(knobs);
-
-    if (!(A = (int *) SUPERLU_MALLOC(Alen * sizeof(int))) )
-        ABORT("Malloc fails for A[]");
-    if (!(p = (int *) SUPERLU_MALLOC((n+1) * sizeof(int))) )
-        ABORT("Malloc fails for p[]");
-    for (i = 0; i <= n; ++i) p[i] = colptr[i];
-    for (i = 0; i < nnz; ++i) A[i] = rowind[i];
-    info = colamd(m, n, Alen, A, p, knobs, stats);
-    if ( info == FALSE ) ABORT("COLAMD failed");
-
-    for (i = 0; i < n; ++i) perm_c[p[i]] = i;
-
-    SUPERLU_FREE(A);
-    SUPERLU_FREE(p);
-}
-
-void
-getata(
-       const int m,      /* number of rows in matrix A. */
-       const int n,      /* number of columns in matrix A. */
-       const int nz,     /* number of nonzeros in matrix A */
-       int *colptr,      /* column pointer of size n+1 for matrix A. */
-       int *rowind,      /* row indices of size nz for matrix A. */
-       int *atanz,       /* out - on exit, returns the actual number of
-                            nonzeros in matrix A'*A. */
-       int **ata_colptr, /* out - size n+1 */
-       int **ata_rowind  /* out - size *atanz */
-       )
-/*
- * Purpose
- * =======
- *
- * Form the structure of A'*A. A is an m-by-n matrix in column oriented
- * format represented by (colptr, rowind). The output A'*A is in column
- * oriented format (symmetrically, also row oriented), represented by
- * (ata_colptr, ata_rowind).
- *
- * This routine is modified from GETATA routine by Tim Davis.
- * The complexity of this algorithm is: SUM_{i=1,m} r(i)^2,
- * i.e., the sum of the square of the row counts.
- *
- * Questions
- * =========
- *     o  Do I need to withhold the *dense* rows?
- *     o  How do I know the number of nonzeros in A'*A?
- * 
- */
-{
-    register int i, j, k, col, num_nz, ti, trow;
-    int *marker, *b_colptr, *b_rowind;
-    int *t_colptr, *t_rowind; /* a column oriented form of T = A' */
-
-    if ( !(marker = (int*) SUPERLU_MALLOC((SUPERLU_MAX(m,n)+1)*sizeof(int))) )
-	ABORT("SUPERLU_MALLOC fails for marker[]");
-    if ( !(t_colptr = (int*) SUPERLU_MALLOC((m+1) * sizeof(int))) )
-	ABORT("SUPERLU_MALLOC t_colptr[]");
-    if ( !(t_rowind = (int*) SUPERLU_MALLOC(nz * sizeof(int))) )
-	ABORT("SUPERLU_MALLOC fails for t_rowind[]");
-
-    
-    /* Get counts of each column of T, and set up column pointers */
-    for (i = 0; i < m; ++i) marker[i] = 0;
-    for (j = 0; j < n; ++j) {
-	for (i = colptr[j]; i < colptr[j+1]; ++i)
-	    ++marker[rowind[i]];
-    }
-    t_colptr[0] = 0;
-    for (i = 0; i < m; ++i) {
-	t_colptr[i+1] = t_colptr[i] + marker[i];
-	marker[i] = t_colptr[i];
-    }
-
-    /* Transpose the matrix from A to T */
-    for (j = 0; j < n; ++j)
-	for (i = colptr[j]; i < colptr[j+1]; ++i) {
-	    col = rowind[i];
-	    t_rowind[marker[col]] = j;
-	    ++marker[col];
-	}
-
-    
-    /* ----------------------------------------------------------------
-       compute B = T * A, where column j of B is:
-
-       Struct (B_*j) =    UNION   ( Struct (T_*k) )
-                        A_kj != 0
-
-       do not include the diagonal entry
-   
-       ( Partition A as: A = (A_*1, ..., A_*n)
-         Then B = T * A = (T * A_*1, ..., T * A_*n), where
-         T * A_*j = (T_*1, ..., T_*m) * A_*j.  )
-       ---------------------------------------------------------------- */
-
-    /* Zero the diagonal flag */
-    for (i = 0; i < n; ++i) marker[i] = -1;
-
-    /* First pass determines number of nonzeros in B */
-    num_nz = 0;
-    for (j = 0; j < n; ++j) {
-	/* Flag the diagonal so it's not included in the B matrix */
-	marker[j] = j;
-
-	for (i = colptr[j]; i < colptr[j+1]; ++i) {
-	    /* A_kj is nonzero, add pattern of column T_*k to B_*j */
-	    k = rowind[i];
-	    for (ti = t_colptr[k]; ti < t_colptr[k+1]; ++ti) {
-		trow = t_rowind[ti];
-		if ( marker[trow] != j ) {
-		    marker[trow] = j;
-		    num_nz++;
-		}
-	    }
-	}
-    }
-    *atanz = num_nz;
-    
-    /* Allocate storage for A'*A */
-    if ( !(*ata_colptr = (int*) SUPERLU_MALLOC( (n+1) * sizeof(int)) ) )
-	ABORT("SUPERLU_MALLOC fails for ata_colptr[]");
-    if ( *atanz ) {
-	if ( !(*ata_rowind = (int*) SUPERLU_MALLOC( *atanz * sizeof(int)) ) )
-	    ABORT("SUPERLU_MALLOC fails for ata_rowind[]");
-    }
-    b_colptr = *ata_colptr; /* aliasing */
-    b_rowind = *ata_rowind;
-    
-    /* Zero the diagonal flag */
-    for (i = 0; i < n; ++i) marker[i] = -1;
-    
-    /* Compute each column of B, one at a time */
-    num_nz = 0;
-    for (j = 0; j < n; ++j) {
-	b_colptr[j] = num_nz;
-	
-	/* Flag the diagonal so it's not included in the B matrix */
-	marker[j] = j;
-
-	for (i = colptr[j]; i < colptr[j+1]; ++i) {
-	    /* A_kj is nonzero, add pattern of column T_*k to B_*j */
-	    k = rowind[i];
-	    for (ti = t_colptr[k]; ti < t_colptr[k+1]; ++ti) {
-		trow = t_rowind[ti];
-		if ( marker[trow] != j ) {
-		    marker[trow] = j;
-		    b_rowind[num_nz++] = trow;
-		}
-	    }
-	}
-    }
-    b_colptr[n] = num_nz;
-       
-    SUPERLU_FREE(marker);
-    SUPERLU_FREE(t_colptr);
-    SUPERLU_FREE(t_rowind);
-}
-
-
-void
-at_plus_a(
-	  const int n,      /* number of columns in matrix A. */
-	  const int nz,     /* number of nonzeros in matrix A */
-	  int *colptr,      /* column pointer of size n+1 for matrix A. */
-	  int *rowind,      /* row indices of size nz for matrix A. */
-	  int *bnz,         /* out - on exit, returns the actual number of
-                               nonzeros in matrix A'*A. */
-	  int **b_colptr,   /* out - size n+1 */
-	  int **b_rowind    /* out - size *bnz */
-	  )
-{
-/*
- * Purpose
- * =======
- *
- * Form the structure of A'+A. A is an n-by-n matrix in column oriented
- * format represented by (colptr, rowind). The output A'+A is in column
- * oriented format (symmetrically, also row oriented), represented by
- * (b_colptr, b_rowind).
- *
- */
-    register int i, j, k, col, num_nz;
-    int *t_colptr, *t_rowind; /* a column oriented form of T = A' */
-    int *marker;
-
-    if ( !(marker = (int*) SUPERLU_MALLOC( n * sizeof(int)) ) )
-	ABORT("SUPERLU_MALLOC fails for marker[]");
-    if ( !(t_colptr = (int*) SUPERLU_MALLOC( (n+1) * sizeof(int)) ) )
-	ABORT("SUPERLU_MALLOC fails for t_colptr[]");
-    if ( !(t_rowind = (int*) SUPERLU_MALLOC( nz * sizeof(int)) ) )
-	ABORT("SUPERLU_MALLOC fails t_rowind[]");
-
-    
-    /* Get counts of each column of T, and set up column pointers */
-    for (i = 0; i < n; ++i) marker[i] = 0;
-    for (j = 0; j < n; ++j) {
-	for (i = colptr[j]; i < colptr[j+1]; ++i)
-	    ++marker[rowind[i]];
-    }
-    t_colptr[0] = 0;
-    for (i = 0; i < n; ++i) {
-	t_colptr[i+1] = t_colptr[i] + marker[i];
-	marker[i] = t_colptr[i];
-    }
-
-    /* Transpose the matrix from A to T */
-    for (j = 0; j < n; ++j)
-	for (i = colptr[j]; i < colptr[j+1]; ++i) {
-	    col = rowind[i];
-	    t_rowind[marker[col]] = j;
-	    ++marker[col];
-	}
-
-
-    /* ----------------------------------------------------------------
-       compute B = A + T, where column j of B is:
-
-       Struct (B_*j) = Struct (A_*k) UNION Struct (T_*k)
-
-       do not include the diagonal entry
-       ---------------------------------------------------------------- */
-
-    /* Zero the diagonal flag */
-    for (i = 0; i < n; ++i) marker[i] = -1;
-
-    /* First pass determines number of nonzeros in B */
-    num_nz = 0;
-    for (j = 0; j < n; ++j) {
-	/* Flag the diagonal so it's not included in the B matrix */
-	marker[j] = j;
-
-	/* Add pattern of column A_*k to B_*j */
-	for (i = colptr[j]; i < colptr[j+1]; ++i) {
-	    k = rowind[i];
-	    if ( marker[k] != j ) {
-		marker[k] = j;
-		++num_nz;
-	    }
-	}
-
-	/* Add pattern of column T_*k to B_*j */
-	for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) {
-	    k = t_rowind[i];
-	    if ( marker[k] != j ) {
-		marker[k] = j;
-		++num_nz;
-	    }
-	}
-    }
-    *bnz = num_nz;
-    
-    /* Allocate storage for A+A' */
-    if ( !(*b_colptr = (int*) SUPERLU_MALLOC( (n+1) * sizeof(int)) ) )
-	ABORT("SUPERLU_MALLOC fails for b_colptr[]");
-    if ( *bnz) {
-      if ( !(*b_rowind = (int*) SUPERLU_MALLOC( *bnz * sizeof(int)) ) )
-	ABORT("SUPERLU_MALLOC fails for b_rowind[]");
-    }
-    
-    /* Zero the diagonal flag */
-    for (i = 0; i < n; ++i) marker[i] = -1;
-    
-    /* Compute each column of B, one at a time */
-    num_nz = 0;
-    for (j = 0; j < n; ++j) {
-	(*b_colptr)[j] = num_nz;
-	
-	/* Flag the diagonal so it's not included in the B matrix */
-	marker[j] = j;
-
-	/* Add pattern of column A_*k to B_*j */
-	for (i = colptr[j]; i < colptr[j+1]; ++i) {
-	    k = rowind[i];
-	    if ( marker[k] != j ) {
-		marker[k] = j;
-		(*b_rowind)[num_nz++] = k;
-	    }
-	}
-
-	/* Add pattern of column T_*k to B_*j */
-	for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) {
-	    k = t_rowind[i];
-	    if ( marker[k] != j ) {
-		marker[k] = j;
-		(*b_rowind)[num_nz++] = k;
-	    }
-	}
-    }
-    (*b_colptr)[n] = num_nz;
-       
-    SUPERLU_FREE(marker);
-    SUPERLU_FREE(t_colptr);
-    SUPERLU_FREE(t_rowind);
-}
-
-void
-get_perm_c(int ispec, SuperMatrix *A, int *perm_c)
-/*
- * Purpose
- * =======
- *
- * GET_PERM_C obtains a permutation matrix Pc, by applying the multiple
- * minimum degree ordering code by Joseph Liu to matrix A'*A or A+A'.
- * or using approximate minimum degree column ordering by Davis et. al.
- * The LU factorization of A*Pc tends to have less fill than the LU 
- * factorization of A.
- *
- * Arguments
- * =========
- *
- * ispec   (input) int
- *         Specifies the type of column ordering to reduce fill:
- *         = 1: minimum degree on the structure of A^T * A
- *         = 2: minimum degree on the structure of A^T + A
- *         = 3: approximate minimum degree for unsymmetric matrices
- *         If ispec == 0, the natural ordering (i.e., Pc = I) is returned.
- * 
- * A       (input) SuperMatrix*
- *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
- *         of the linear equations is A->nrow. Currently, the type of A 
- *         can be: Stype = NC; Dtype = _D; Mtype = GE. In the future,
- *         more general A can be handled.
- *
- * perm_c  (output) int*
- *	   Column permutation vector of size A->ncol, which defines the 
- *         permutation matrix Pc; perm_c[i] = j means column i of A is 
- *         in position j in A*Pc.
- *
- */
-{
-    NCformat *Astore = (NCformat*) A->Store;
-    int m, n, bnz = 0, *b_colptr, i;
-    int delta, maxint, nofsub, *invp;
-    int *b_rowind, *dhead, *qsize, *llist, *marker;
-    double t;//, SuperLU_timer_();
-    
-    m = A->nrow;
-    n = A->ncol;
-
-    t = SuperLU_timer_();
-    switch ( ispec ) {
-        case 0: /* Natural ordering */
-	      for (i = 0; i < n; ++i) perm_c[i] = i;
-#if ( PRNTlevel>=1 )
-	      printf("Use natural column ordering.\n");
-#endif
-	      return;
-        case 1: /* Minimum degree ordering on A'*A */
-	      getata(m, n, Astore->nnz, Astore->colptr, Astore->rowind,
-		     &bnz, &b_colptr, &b_rowind);
-#if ( PRNTlevel>=1 )
-	      printf("Use minimum degree ordering on A'*A.\n");
-#endif
-	      t = SuperLU_timer_() - t;
-	      /*printf("Form A'*A time = %8.3f\n", t);*/
-	      break;
-        case 2: /* Minimum degree ordering on A'+A */
-	      if ( m != n ) ABORT("Matrix is not square");
-	      at_plus_a(n, Astore->nnz, Astore->colptr, Astore->rowind,
-			&bnz, &b_colptr, &b_rowind);
-#if ( PRNTlevel>=1 )
-	      printf("Use minimum degree ordering on A'+A.\n");
-#endif
-	      t = SuperLU_timer_() - t;
-	      /*printf("Form A'+A time = %8.3f\n", t);*/
-	      break;
-        case 3: /* Approximate minimum degree column ordering. */
-	      get_colamd(m, n, Astore->nnz, Astore->colptr, Astore->rowind,
-			 perm_c);
-#if ( PRNTlevel>=1 )
-	      printf(".. Use approximate minimum degree column ordering.\n");
-#endif
-	      return; 
-        default:
-	      ABORT("Invalid ISPEC");
-    }
-
-    if ( bnz != 0 ) {
-	t = SuperLU_timer_();
-
-	/* Initialize and allocate storage for GENMMD. */
-	delta = 1; /* DELTA is a parameter to allow the choice of nodes
-		      whose degree <= min-degree + DELTA. */
-	maxint = 2147483647; /* 2**31 - 1 */
-	invp = (int *) SUPERLU_MALLOC((n+delta)*sizeof(int));
-	if ( !invp ) ABORT("SUPERLU_MALLOC fails for invp.");
-	dhead = (int *) SUPERLU_MALLOC((n+delta)*sizeof(int));
-	if ( !dhead ) ABORT("SUPERLU_MALLOC fails for dhead.");
-	qsize = (int *) SUPERLU_MALLOC((n+delta)*sizeof(int));
-	if ( !qsize ) ABORT("SUPERLU_MALLOC fails for qsize.");
-	llist = (int *) SUPERLU_MALLOC(n*sizeof(int));
-	if ( !llist ) ABORT("SUPERLU_MALLOC fails for llist.");
-	marker = (int *) SUPERLU_MALLOC(n*sizeof(int));
-	if ( !marker ) ABORT("SUPERLU_MALLOC fails for marker.");
-
-	/* Transform adjacency list into 1-based indexing required by GENMMD.*/
-	for (i = 0; i <= n; ++i) ++b_colptr[i];
-	for (i = 0; i < bnz; ++i) ++b_rowind[i];
-	
-	genmmd_(&n, b_colptr, b_rowind, perm_c, invp, &delta, dhead, 
-		qsize, llist, marker, &maxint, &nofsub);
-
-	/* Transform perm_c into 0-based indexing. */
-	for (i = 0; i < n; ++i) --perm_c[i];
-
-	SUPERLU_FREE(invp);
-	SUPERLU_FREE(dhead);
-	SUPERLU_FREE(qsize);
-	SUPERLU_FREE(llist);
-	SUPERLU_FREE(marker);
-	SUPERLU_FREE(b_rowind);
-
-	t = SuperLU_timer_() - t;
-	/*  printf("call GENMMD time = %8.3f\n", t);*/
-
-    } else { /* Empty adjacency structure */
-	for (i = 0; i < n; ++i) perm_c[i] = i;
-    }
-
-    SUPERLU_FREE(b_colptr);
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/heap_relax_snode.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/heap_relax_snode.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/heap_relax_snode.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/heap_relax_snode.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,116 +0,0 @@
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_ddefs.h"
-
-void
-heap_relax_snode (
-	     const     int n,
-	     int       *et,           /* column elimination tree */
-	     const int relax_columns, /* max no of columns allowed in a
-					 relaxed snode */
-	     int       *descendants,  /* no of descendants of each node
-					 in the etree */
-	     int       *relax_end     /* last column in a supernode */
-	     )
-{
-/*
- * Purpose
- * =======
- *    relax_snode() - Identify the initial relaxed supernodes, assuming that 
- *    the matrix has been reordered according to the postorder of the etree.
- *
- */ 
-    register int i, j, k, l, parent;
-    register int snode_start;	/* beginning of a snode */
-    int *et_save, *post, *inv_post, *iwork;
-    int nsuper_et = 0, nsuper_et_post = 0;
-
-    /* The etree may not be postordered, but is heap ordered. */
-
-    iwork = (int*) intMalloc(3*n+2); 
-    if ( !iwork ) ABORT("SUPERLU_MALLOC fails for iwork[]");
-    inv_post = iwork + n+1;
-    et_save = inv_post + n+1;
-
-    /* Post order etree */
-    post = (int *) TreePostorder(n, et);
-    for (i = 0; i < n+1; ++i) inv_post[post[i]] = i;
-
-    /* Renumber etree in postorder */
-    for (i = 0; i < n; ++i) {
-        iwork[post[i]] = post[et[i]];
-	et_save[i] = et[i]; /* Save the original etree */
-    }
-    for (i = 0; i < n; ++i) et[i] = iwork[i];
-
-    /* Compute the number of descendants of each node in the etree */
-    ifill (relax_end, n, EMPTY);
-    for (j = 0; j < n; j++) descendants[j] = 0;
-    for (j = 0; j < n; j++) {
-	parent = et[j];
-	if ( parent != n )  /* not the dummy root */
-	    descendants[parent] += descendants[j] + 1;
-    }
-
-    /* Identify the relaxed supernodes by postorder traversal of the etree. */
-    for (j = 0; j < n; ) { 
-     	parent = et[j];
-        snode_start = j;
- 	while ( parent != n && descendants[parent] < relax_columns ) {
-	    j = parent;
-	    parent = et[j];
-	}
-	/* Found a supernode in postordered etree; j is the last column. */
-	++nsuper_et_post;
-	k = n;
-	for (i = snode_start; i <= j; ++i)
-	    k = SUPERLU_MIN(k, inv_post[i]);
-	l = inv_post[j];
-	if ( (l - k) == (j - snode_start) ) {
-	    /* It's also a supernode in the original etree */
-	    relax_end[k] = l;		/* Last column is recorded */
-	    ++nsuper_et;
-	} else {
-	    for (i = snode_start; i <= j; ++i) {
-	        l = inv_post[i];
-	        if ( descendants[i] == 0 ) relax_end[l] = l;
-	    }
-	}
-	j++;
-	/* Search for a new leaf */
-	while ( descendants[j] != 0 && j < n ) j++;
-    }
-
-#if ( PRNTlevel>=1 )
-    printf(".. heap_snode_relax:\n"
-	   "\tNo of relaxed snodes in postordered etree:\t%d\n"
-	   "\tNo of relaxed snodes in original etree:\t%d\n",
-	   nsuper_et_post, nsuper_et);
-#endif
-
-    /* Recover the original etree */
-    for (i = 0; i < n; ++i) et[i] = et_save[i];
-
-    SUPERLU_FREE(post);
-    SUPERLU_FREE(iwork);
-}
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/icmax1.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/icmax1.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/icmax1.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/icmax1.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,109 +0,0 @@
-#include <math.h>
-#include "slu_scomplex.h"
-#include "slu_Cnames.h"
-
-int icmax1_(int *n, complex *cx, int *incx)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       September 30, 1994   
-
-
-    Purpose   
-    =======   
-
-    ICMAX1 finds the index of the element whose real part has maximum   
-    absolute value.   
-
-    Based on ICAMAX from Level 1 BLAS.   
-    The change is to use the 'genuine' absolute value.   
-
-    Contributed by Nick Higham for use with CLACON.   
-
-    Arguments   
-    =========   
-
-    N       (input) INT   
-            The number of elements in the vector CX.   
-
-    CX      (input) COMPLEX array, dimension (N)   
-            The vector whose elements will be summed.   
-
-    INCX    (input) INT   
-            The spacing between successive values of CX.  INCX >= 1.   
-
-   ===================================================================== 
-  
-
-
-       NEXT LINE IS THE ONLY MODIFICATION.   
-
-    
-   Parameter adjustments   
-       Function Body */
-    /* System generated locals */
-    int ret_val, i__1, i__2;
-    float r__1;
-    /* Local variables */
-    static float smax;
-    static int i, ix;
-
-
-#define CX(I) cx[(I)-1]
-
-
-    ret_val = 0;
-    if (*n < 1) {
-	return ret_val;
-    }
-    ret_val = 1;
-    if (*n == 1) {
-	return ret_val;
-    }
-    if (*incx == 1) {
-	goto L30;
-    }
-
-/*     CODE FOR INCREMENT NOT EQUAL TO 1 */
-
-    ix = 1;
-    smax = (r__1 = CX(1).r, fabs(r__1));
-    ix += *incx;
-    i__1 = *n;
-    for (i = 2; i <= *n; ++i) {
-	i__2 = ix;
-	if ((r__1 = CX(ix).r, fabs(r__1)) <= smax) {
-	    goto L10;
-	}
-	ret_val = i;
-	i__2 = ix;
-	smax = (r__1 = CX(ix).r, fabs(r__1));
-L10:
-	ix += *incx;
-/* L20: */
-    }
-    return ret_val;
-
-/*     CODE FOR INCREMENT EQUAL TO 1 */
-
-L30:
-    smax = (r__1 = CX(1).r, fabs(r__1));
-    i__1 = *n;
-    for (i = 2; i <= *n; ++i) {
-	i__2 = i;
-	if ((r__1 = CX(i).r, fabs(r__1)) <= smax) {
-	    goto L40;
-	}
-	ret_val = i;
-	i__2 = i;
-	smax = (r__1 = CX(i).r, fabs(r__1));
-L40:
-	;
-    }
-    return ret_val;
-
-/*     End of ICMAX1 */
-
-} /* icmax1_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/izmax1.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/izmax1.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/izmax1.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/izmax1.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,103 +0,0 @@
-#include <math.h>
-#include "slu_Cnames.h"
-#include "slu_dcomplex.h"
-
-int
-izmax1_(int *n, doublecomplex *cx, int *incx)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       September 30, 1994   
-
-
-    Purpose   
-    =======   
-
-    IZMAX1 finds the index of the element whose real part has maximum   
-    absolute value.   
-
-    Based on IZAMAX from Level 1 BLAS.   
-    The change is to use the 'genuine' absolute value.   
-
-    Contributed by Nick Higham for use with ZLACON.   
-
-    Arguments   
-    =========   
-
-    N       (input) INT   
-            The number of elements in the vector CX.   
-
-    CX      (input) COMPLEX*16 array, dimension (N)   
-            The vector whose elements will be summed.   
-
-    INCX    (input) INT   
-            The spacing between successive values of CX.  INCX >= 1.   
-
-   ===================================================================== 
-*/  
-
-    /* System generated locals */
-    int ret_val, i__1, i__2;
-    double d__1;
-    
-    /* Local variables */
-    double smax;
-    int i, ix;
-
-#define CX(I) cx[(I)-1]
-
-    ret_val = 0;
-    if (*n < 1) {
-	return ret_val;
-    }
-    ret_val = 1;
-    if (*n == 1) {
-	return ret_val;
-    }
-    if (*incx == 1) {
-	goto L30;
-    }
-
-/*     CODE FOR INCREMENT NOT EQUAL TO 1 */
-
-    ix = 1;
-    smax = (d__1 = CX(1).r, fabs(d__1));
-    ix += *incx;
-    i__1 = *n;
-    for (i = 2; i <= *n; ++i) {
-	i__2 = ix;
-	if ((d__1 = CX(ix).r, fabs(d__1)) <= smax) {
-	    goto L10;
-	}
-	ret_val = i;
-	i__2 = ix;
-	smax = (d__1 = CX(ix).r, fabs(d__1));
-L10:
-	ix += *incx;
-/* L20: */
-    }
-    return ret_val;
-
-/*     CODE FOR INCREMENT EQUAL TO 1 */
-
-L30:
-    smax = (d__1 = CX(1).r, fabs(d__1));
-    i__1 = *n;
-    for (i = 2; i <= *n; ++i) {
-	i__2 = i;
-	if ((d__1 = CX(i).r, fabs(d__1)) <= smax) {
-	    goto L40;
-	}
-	ret_val = i;
-	i__2 = i;
-	smax = (d__1 = CX(i).r, fabs(d__1));
-L40:
-	;
-    }
-    return ret_val;
-
-/*     End of IZMAX1 */
-
-} /* izmax1_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/lsame.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/lsame.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/lsame.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/lsame.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,72 +0,0 @@
-#include "slu_Cnames.h"
-
-int lsame_(char *ca, char *cb)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       September 30, 1994   
-
-    Purpose   
-    =======   
-
-    LSAME returns .TRUE. if CA is the same letter as CB regardless of case.   
-
-    Arguments   
-    =========   
-
-    CA      (input) CHARACTER*1   
-    CB      (input) CHARACTER*1   
-            CA and CB specify the single characters to be compared.   
-
-   ===================================================================== 
-*/  
-
-    /* System generated locals */
-    int ret_val;
-    
-    /* Local variables */
-    int inta, intb, zcode;
-
-    ret_val = *(unsigned char *)ca == *(unsigned char *)cb;
-    if (ret_val) {
-	return ret_val;
-    }
-
-    /* Now test for equivalence if both characters are alphabetic. */
-
-    zcode = 'Z';
-
-    /* Use 'Z' rather than 'A' so that ASCII can be detected on Prime   
-       machines, on which ICHAR returns a value with bit 8 set.   
-       ICHAR('A') on Prime machines returns 193 which is the same as   
-       ICHAR('A') on an EBCDIC machine. */
-
-    inta = *(unsigned char *)ca;
-    intb = *(unsigned char *)cb;
-
-    if (zcode == 90 || zcode == 122) {
-	/* ASCII is assumed - ZCODE is the ASCII code of either lower or   
-          upper case 'Z'. */
-	if (inta >= 97 && inta <= 122) inta += -32;
-	if (intb >= 97 && intb <= 122) intb += -32;
-
-    } else if (zcode == 233 || zcode == 169) {
-	/* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or   
-          upper case 'Z'. */
-	if (inta >= 129 && inta <= 137 || inta >= 145 && inta <= 153 || inta 
-		>= 162 && inta <= 169)
-	    inta += 64;
-	if (intb >= 129 && intb <= 137 || intb >= 145 && intb <= 153 || intb 
-		>= 162 && intb <= 169)
-	    intb += 64;
-    } else if (zcode == 218 || zcode == 250) {
-	/* ASCII is assumed, on Prime machines - ZCODE is the ASCII code   
-          plus 128 of either lower or upper case 'Z'. */
-	if (inta >= 225 && inta <= 250) inta += -32;
-	if (intb >= 225 && intb <= 250) intb += -32;
-    }
-    ret_val = inta == intb;
-    return ret_val;
-    
-} /* lsame_ */
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/Makefile hypre-2.13.0/src/FEI_mv/SuperLU/SRC/Makefile
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/Makefile	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/Makefile	1970-01-01 00:00:00.000000000 +0000
@@ -1,218 +0,0 @@
-# makefile for sparse supernodal LU, implemented in ANSI C
-include ../make.inc
-
-#######################################################################
-#  This is the makefile to create a library for SuperLU.
-#  The files are organized as follows:
-#
-#       ALLAUX -- Auxiliary routines called from all precisions of SuperLU
-#	LAAUX  -- LAPACK auxiliary routines called from all precisions
-#	SLASRC -- LAPACK single precision real routines
-#	DLASRC -- LAPACK double precision real routines
-#	CLASRC -- LAPACK single precision complex routines
-#	ZLASRC -- LAPACK double precision complex routines
-#       SCLAUX -- LAPACK Auxiliary routines called from both real and complex
-#       DZLAUX -- LAPACK Auxiliary routines called from both double precision
-#                 and complex*16
-#	SLUSRC -- Single precision real SuperLU routines
-#       DLUSRC -- Double precision real SuperLU routines
-#	CLUSRC -- Single precision complex SuperLU routines
-#       ZLUSRC -- Double precision complex SuperLU routines
-#
-#  The library can be set up to include routines for any combination
-#  of the four precisions.  To create or add to the library, enter make
-#  followed by one or more of the precisions desired.  Some examples:
-#       make single
-#       make single double
-#       make single double complex complex16
-#  Alternatively, the command
-#       make
-#  without any arguments creates a library of all four precisions.
-#  The library is called
-#       superlu.a
-#  and is created at the next higher directory level.
-#
-#  To remove the object files after the library is created, enter
-#       make clean
-#
-#######################################################################
-
-#######################################
-### added for hypre
-#######################################
-include ../../../config/Makefile.config
-CINCLUDES = ${INCLUDES} ${MPIINCLUDE}
-C_COMPILE_FLAGS = \
- -I$(srcdir)\
- -I$(srcdir)/../../../\
- -I$(srcdir)/../../../utilities\
- -I$(srcdir)/../../../blas\
- -I$(srcdir)/../../../lapack\
- ${CINCLUDES}
-
-HEADERS =\
- colamd.h\
- slu_ddefs.h\
- slu_util.h\
- supermatrix.h
-#######################################
-
-### LAPACK 
-LAAUX 	= lsame.o xerbla.o
-SLASRC 	= slacon.o 
-DLASRC	= dlacon.o
-CLASRC	= clacon.o scsum1.o icmax1.o
-ZLASRC	= zlacon.o dzsum1.o izmax1.o
-SCLAUX 	= slamch.o
-DZLAUX 	= dlamch.o
-
-### SuperLU 
-ALLAUX 	= superlu_timer.o util.o memory.o get_perm_c.o mmd.o \
-	  sp_coletree.o sp_preorder.o sp_ienv.o relax_snode.o \
-	  heap_relax_snode.o colamd.o
-
-SLUSRC = \
-	sgssv.o sgssvx.o \
-	ssp_blas2.o ssp_blas3.o sgscon.o  \
-	slangs.o sgsequ.o slaqgs.o spivotgrowth.o \
-	sgsrfs.o sgstrf.o sgstrs.o scopy_to_ucol.o \
-	ssnode_dfs.o ssnode_bmod.o \
-	spanel_dfs.o spanel_bmod.o sreadhb.o \
-	scolumn_dfs.o scolumn_bmod.o spivotL.o spruneL.o \
-	smemory.o sutil.o smyblas2.o
-
-DLUSRC = \
-	dgssv.o dgssvx.o \
-	dsp_blas2.o dsp_blas3.o dgscon.o \
-	dlangs.o dgsequ.o dlaqgs.o dpivotgrowth.o  \
-	dgsrfs.o dgstrf.o dgstrs.o dcopy_to_ucol.o \
-	dsnode_dfs.o dsnode_bmod.o \
-	dpanel_dfs.o dpanel_bmod.o dreadhb.o \
-	dcolumn_dfs.o dcolumn_bmod.o dpivotL.o dpruneL.o \
-	dmemory.o dutil.o dmyblas2.o
-
-CLUSRC = \
-	scomplex.o cgssv.o cgssvx.o csp_blas2.o csp_blas3.o cgscon.o \
-	clangs.o cgsequ.o claqgs.o cpivotgrowth.o  \
-	cgsrfs.o cgstrf.o cgstrs.o ccopy_to_ucol.o \
-	csnode_dfs.o csnode_bmod.o \
-	cpanel_dfs.o cpanel_bmod.o creadhb.o \
-	ccolumn_dfs.o ccolumn_bmod.o cpivotL.o cpruneL.o \
-	cmemory.o cutil.o cmyblas2.o
-
-ZLUSRC = \
-	dcomplex.o zgssv.o zgssvx.o zsp_blas2.o zsp_blas3.o zgscon.o \
-	zlangs.o zgsequ.o zlaqgs.o zpivotgrowth.o  \
-	zgsrfs.o zgstrf.o zgstrs.o zcopy_to_ucol.o \
-	zsnode_dfs.o zsnode_bmod.o \
-	zpanel_dfs.o zpanel_bmod.o zreadhb.o \
-	zcolumn_dfs.o zcolumn_bmod.o zpivotL.o zpruneL.o \
-	zmemory.o zutil.o zmyblas2.o
-
-#######################################
-### changed for hypre
-#######################################
-
-FILES =\
- colamd.c \
- dcolumn_bmod.c \
- dcolumn_dfs.c \
- dcopy_to_ucol.c \
- dgscon.c \
- dgsequ.c \
- dgsrfs.c \
- dgssv.c \
- dgssvx.c \
- dgstrf.c \
- dgstrs.c \
- dlacon.c \
- dlangs.c \
- dlaqgs.c \
- dmemory.c \
- dpanel_bmod.c \
- dpanel_dfs.c \
- dpivotgrowth.c \
- dpivotL.c \
- dpruneL.c \
- dreadhb.c \
- dsnode_bmod.c \
- dsnode_dfs.c \
- dsp_blas2.c \
- dsp_blas3.c \
- dutil.c \
- get_perm_c.c \
- heap_relax_snode.o \
- memory.o \
- mmd.c \
- relax_snode.c \
- sp_coletree.c \
- sp_ienv.c \
- sp_preorder.c \
- superlu_timer.c \
- slu_util.c
-
-OBJS = ${FILES:.c=.o}
-
-all: libHYPRE_superlu${HYPRE_LIB_SUFFIX}
-	cp -fp *.h $(HYPRE_BUILD_DIR)/include
-#	cp -fp libHYPRE* $(HYPRE_BUILD_DIR)/lib
-
-install: libHYPRE_superlu${HYPRE_LIB_SUFFIX}
-	cp -f *.h $(HYPRE_INC_INSTALL)
-#	cp -f libHYPRE* $(HYPRE_LIB_INSTALL)
-	@echo  " "
-
-clean:
-	rm -f *.o libHYPRE*
-	rm -rf pchdir tca.map *inslog*
-distclean: clean
-
-libHYPRE_superlu.a: ${OBJS}
-	@echo  "Building $@ ... "
-	${AR} $@ ${OBJS}
-	${RANLIB} $@
-
-libHYPRE_superlu.so: ${OBJS}
-	@echo  "Building $@ ... "
-	${BUILD_CC_SHARED} -o $@ ${OBJS}
-
-${OBJS}: ${HEADERS}
-
-#######################################
-
-all4:    single double complex complex16
-
-single: $(SLUSRC) $(ALLAUX) $(LAAUX) $(SLASRC) $(SCLAUX)
-	$(ARCH) $(ARCHFLAGS) ../$(SUPERLULIB) \
-		$(SLUSRC) $(ALLAUX) $(LAAUX) $(SLASRC) $(SCLAUX)
-	$(RANLIB) ../$(SUPERLULIB)
-
-double: $(DLUSRC) $(ALLAUX) $(LAAUX) $(DLASRC) $(DZLAUX)
-	$(ARCH) $(ARCHFLAGS) ../$(SUPERLULIB) \
-                $(DLUSRC) $(ALLAUX) $(LAAUX) $(DLASRC) $(DZLAUX)
-	$(RANLIB) ../$(SUPERLULIB)
-
-complex: $(CLUSRC) $(ALLAUX) $(LAAUX) $(CLASRC) $(SCLAUX)
-	$(ARCH) $(ARCHFLAGS) ../$(SUPERLULIB) \
-		$(CLUSRC) $(ALLAUX) $(LAAUX) $(CLASRC) $(SCLAUX)
-	$(RANLIB) ../$(SUPERLULIB)
-
-complex16: $(ZLUSRC) $(ALLAUX) $(LAAUX) $(ZLASRC) $(DZLAUX)
-	$(ARCH) $(ARCHFLAGS) ../$(SUPERLULIB) \
-		$(ZLUSRC) $(ALLAUX) $(LAAUX) $(ZLASRC) $(DZLAUX)
-	$(RANLIB) ../$(SUPERLULIB)
-
-
-##################################
-# Do not optimize these routines #
-##################################
-slamch.o: slamch.c ; $(CC) -g -c ${SHARED_COMPILE_FLAG} $<
-dlamch.o: dlamch.c ; $(CC) -g -c ${SHARED_COMPILE_FLAG} $<
-#superlu_timer.o:  superlu_timer.c ; $(CC) -g -c -DNO_TIMER ${SHARED_COMPILE_FLAG} $<
-##################################
-
-#.c.o:
-#	$(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) -c $< $(VERBOSE)
-
-#clean:	
-#	rm -f *.o ../libsuperlu_3.0.a
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/memory.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/memory.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/memory.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/memory.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,208 +0,0 @@
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/** Precision-independent memory-related routines.
-    (Shared by [sdcz]memory.c) **/
-
-#include "slu_ddefs.h"
-
-
-#if ( DEBUGlevel>=1 )           /* Debug malloc/free. */
-int superlu_malloc_total = 0;
-
-#define PAD_FACTOR  2
-#define DWORD  (sizeof(double)) /* Be sure it's no smaller than double. */
-/* size_t is usually defined as 'unsigned long' */
-
-void *superlu_malloc(size_t size)
-{
-    char *buf;
-
-    buf = (char *) malloc(size + DWORD);
-    if ( !buf ) {
-	printf("superlu_malloc fails: malloc_total %.0f MB, size %ld\n",
-	       superlu_malloc_total*1e-6, size);
-	ABORT("superlu_malloc: out of memory");
-    }
-
-    ((int_t *) buf)[0] = size;
-#if 0
-    superlu_malloc_total += size + DWORD;
-#else
-    superlu_malloc_total += size;
-#endif
-    return (void *) (buf + DWORD);
-}
-
-void superlu_free(void *addr)
-{
-    char *p = ((char *) addr) - DWORD;
-
-    if ( !addr )
-	ABORT("superlu_free: tried to free NULL pointer");
-
-    if ( !p )
-	ABORT("superlu_free: tried to free NULL+DWORD pointer");
-
-    { 
-	int_t n = ((int_t *) p)[0];
-	
-	if ( !n )
-	    ABORT("superlu_free: tried to free a freed pointer");
-	*((int_t *) p) = 0; /* Set to zero to detect duplicate free's. */
-#if 0	
-	superlu_malloc_total -= (n + DWORD);
-#else
-	superlu_malloc_total -= n;
-#endif
-
-	if ( superlu_malloc_total < 0 )
-	    ABORT("superlu_malloc_total went negative!");
-	
-	/*free (addr);*/
-	free (p);
-    }
-
-}
-
-#else   /* production mode */
-
-void *superlu_malloc(size_t size)
-{
-    void *buf;
-    buf = (void *) malloc(size);
-    return (buf);
-}
-
-void superlu_free(void *addr)
-{
-    free (addr);
-}
-
-#endif
-
-
-/*
- * Set up pointers for integer working arrays.
- */
-void
-SetIWork(int m, int n, int panel_size, int *iworkptr, int **segrep,
-	 int **parent, int **xplore, int **repfnz, int **panel_lsub,
-	 int **xprune, int **marker)
-{
-    *segrep = iworkptr;
-    *parent = iworkptr + m;
-    *xplore = *parent + m;
-    *repfnz = *xplore + m;
-    *panel_lsub = *repfnz + panel_size * m;
-    *xprune = *panel_lsub + panel_size * m;
-    *marker = *xprune + n;
-    ifill (*repfnz, m * panel_size, EMPTY);
-    ifill (*panel_lsub, m * panel_size, EMPTY);
-}
-
-
-void
-copy_mem_int(int howmany, void *oldM, void *newM)
-{
-    register int i;
-    int *iold = (int*) oldM;
-    int *inew = (int*) newM;
-    for (i = 0; i < howmany; i++) inew[i] = iold[i];
-}
-
-
-void
-user_bcopy(char *src, char *dest, int bytes)
-{
-    char *s_ptr, *d_ptr;
-
-    s_ptr = src + bytes - 1;
-    d_ptr = dest + bytes - 1;
-    for (; d_ptr >= dest; --s_ptr, --d_ptr ) *d_ptr = *s_ptr;
-}
-
-
-
-int *intMalloc(int n)
-{
-    int *buf;
-    buf = (int *) SUPERLU_MALLOC(n * sizeof(int));
-    if ( !buf ) {
-	ABORT("SUPERLU_MALLOC fails for buf in intMalloc()");
-    }
-    return (buf);
-}
-
-int *intCalloc(int n)
-{
-    int *buf;
-    register int i;
-    buf = (int *) SUPERLU_MALLOC(n * sizeof(int));
-    if ( !buf ) {
-	ABORT("SUPERLU_MALLOC fails for buf in intCalloc()");
-    }
-    for (i = 0; i < n; ++i) buf[i] = 0;
-    return (buf);
-}
-
-
-
-#if 0
-check_expanders()
-{
-    int p;
-    printf("Check expanders:\n");
-    for (p = 0; p < NO_MEMTYPE; p++) {
-	printf("type %d, size %d, mem %d\n",
-	       p, expanders[p].size, (int)expanders[p].mem);
-    }
-
-    return 0;
-}
-
-
-StackInfo()
-{
-    printf("Stack: size %d, used %d, top1 %d, top2 %d\n",
-	   stack.size, stack.used, stack.top1, stack.top2);
-    return 0;
-}
-
-
-
-PrintStack(char *msg, GlobalLU_t *Glu)
-{
-    int i;
-    int *xlsub, *lsub, *xusub, *usub;
-
-    xlsub = Glu->xlsub;
-    lsub  = Glu->lsub;
-    xusub = Glu->xusub;
-    usub  = Glu->usub;
-
-    printf("%s\n", msg);
-    
-/*    printf("\nUCOL: ");
-    for (i = 0; i < xusub[ndim]; ++i)
-	printf("%f  ", ucol[i]);
-
-    printf("\nLSUB: ");
-    for (i = 0; i < xlsub[ndim]; ++i)
-	printf("%d  ", lsub[i]);
-
-    printf("\nUSUB: ");
-    for (i = 0; i < xusub[ndim]; ++i)
-	printf("%d  ", usub[i]);
-
-    printf("\n");*/
-    return 0;
-}   
-#endif
-
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/mmd.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/mmd.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/mmd.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/mmd.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,1012 +0,0 @@
-
-typedef int shortint;
-
-/* *************************************************************** */
-/* *************************************************************** */
-/* ****     GENMMD ..... MULTIPLE MINIMUM EXTERNAL DEGREE     **** */
-/* *************************************************************** */
-/* *************************************************************** */
-
-/*     AUTHOR - JOSEPH W.H. LIU */
-/*              DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */
-
-/*     PURPOSE - THIS ROUTINE IMPLEMENTS THE MINIMUM DEGREE */
-/*        ALGORITHM.  IT MAKES USE OF THE IMPLICIT REPRESENTATION */
-/*        OF ELIMINATION GRAPHS BY QUOTIENT GRAPHS, AND THE */
-/*        NOTION OF INDISTINGUISHABLE NODES.  IT ALSO IMPLEMENTS */
-/*        THE MODIFICATIONS BY MULTIPLE ELIMINATION AND MINIMUM */
-/*        EXTERNAL DEGREE. */
-/*        --------------------------------------------- */
-/*        CAUTION - THE ADJACENCY VECTOR ADJNCY WILL BE */
-/*        DESTROYED. */
-/*        --------------------------------------------- */
-
-/*     INPUT PARAMETERS - */
-/*        NEQNS  - NUMBER OF EQUATIONS. */
-/*        (XADJ,ADJNCY) - THE ADJACENCY STRUCTURE. */
-/*        DELTA  - TOLERANCE VALUE FOR MULTIPLE ELIMINATION. */
-/*        MAXINT - MAXIMUM MACHINE REPRESENTABLE (SHORT) INTEGER */
-/*                 (ANY SMALLER ESTIMATE WILL DO) FOR MARKING */
-/*                 NODES. */
-
-/*     OUTPUT PARAMETERS - */
-/*        PERM   - THE MINIMUM DEGREE ORDERING. */
-/*        INVP   - THE INVERSE OF PERM. */
-/*        NOFSUB - AN UPPER BOUND ON THE NUMBER OF NONZERO */
-/*                 SUBSCRIPTS FOR THE COMPRESSED STORAGE SCHEME. */
-
-/*     WORKING PARAMETERS - */
-/*        DHEAD  - VECTOR FOR HEAD OF DEGREE LISTS. */
-/*        INVP   - USED TEMPORARILY FOR DEGREE FORWARD LINK. */
-/*        PERM   - USED TEMPORARILY FOR DEGREE BACKWARD LINK. */
-/*        QSIZE  - VECTOR FOR SIZE OF SUPERNODES. */
-/*        LLIST  - VECTOR FOR TEMPORARY LINKED LISTS. */
-/*        MARKER - A TEMPORARY MARKER VECTOR. */
-
-/*     PROGRAM SUBROUTINES - */
-/*        MMDELM, MMDINT, MMDNUM, MMDUPD. */
-
-/* *************************************************************** */
-
-/* Subroutine */ int genmmd_(int *neqns, int *xadj, shortint *adjncy, 
-	shortint *invp, shortint *perm, int *delta, shortint *dhead, 
-	shortint *qsize, shortint *llist, shortint *marker, int *maxint, 
-	int *nofsub)
-{
-    /* System generated locals */
-    int i__1;
-
-    /* Local variables */
-    static int mdeg, ehead, i, mdlmt, mdnode;
-    extern /* Subroutine */ int mmdelm_(int *, int *, shortint *, 
-	    shortint *, shortint *, shortint *, shortint *, shortint *, 
-	    shortint *, int *, int *), mmdupd_(int *, int *, 
-	    int *, shortint *, int *, int *, shortint *, shortint 
-	    *, shortint *, shortint *, shortint *, shortint *, int *, 
-	    int *), mmdint_(int *, int *, shortint *, shortint *, 
-	    shortint *, shortint *, shortint *, shortint *, shortint *), 
-	    mmdnum_(int *, shortint *, shortint *, shortint *);
-    static int nextmd, tag, num;
-
-
-/* *************************************************************** */
-
-
-/* *************************************************************** */
-
-    /* Parameter adjustments */
-    --marker;
-    --llist;
-    --qsize;
-    --dhead;
-    --perm;
-    --invp;
-    --adjncy;
-    --xadj;
-
-    /* Function Body */
-    if (*neqns <= 0) {
-	return 0;
-    }
-
-/*        ------------------------------------------------ */
-/*        INITIALIZATION FOR THE MINIMUM DEGREE ALGORITHM. */
-/*        ------------------------------------------------ */
-    *nofsub = 0;
-    mmdint_(neqns, &xadj[1], &adjncy[1], &dhead[1], &invp[1], &perm[1], &
-	    qsize[1], &llist[1], &marker[1]);
-
-/*        ---------------------------------------------- */
-/*        NUM COUNTS THE NUMBER OF ORDERED NODES PLUS 1. */
-/*        ---------------------------------------------- */
-    num = 1;
-
-/*        ----------------------------- */
-/*        ELIMINATE ALL ISOLATED NODES. */
-/*        ----------------------------- */
-    nextmd = dhead[1];
-L100:
-    if (nextmd <= 0) {
-	goto L200;
-    }
-    mdnode = nextmd;
-    nextmd = invp[mdnode];
-    marker[mdnode] = *maxint;
-    invp[mdnode] = -num;
-    ++num;
-    goto L100;
-
-L200:
-/*        ---------------------------------------- */
-/*        SEARCH FOR NODE OF THE MINIMUM DEGREE. */
-/*        MDEG IS THE CURRENT MINIMUM DEGREE; */
-/*        TAG IS USED TO FACILITATE MARKING NODES. */
-/*        ---------------------------------------- */
-    if (num > *neqns) {
-	goto L1000;
-    }
-    tag = 1;
-    dhead[1] = 0;
-    mdeg = 2;
-L300:
-    if (dhead[mdeg] > 0) {
-	goto L400;
-    }
-    ++mdeg;
-    goto L300;
-L400:
-/*            ------------------------------------------------- */
-/*            USE VALUE OF DELTA TO SET UP MDLMT, WHICH GOVERNS */
-/*            WHEN A DEGREE UPDATE IS TO BE PERFORMED. */
-/*            ------------------------------------------------- */
-    mdlmt = mdeg + *delta;
-    ehead = 0;
-
-L500:
-    mdnode = dhead[mdeg];
-    if (mdnode > 0) {
-	goto L600;
-    }
-    ++mdeg;
-    if (mdeg > mdlmt) {
-	goto L900;
-    }
-    goto L500;
-L600:
-/*                ---------------------------------------- */
-/*                REMOVE MDNODE FROM THE DEGREE STRUCTURE. */
-/*                ---------------------------------------- */
-    nextmd = invp[mdnode];
-    dhead[mdeg] = nextmd;
-    if (nextmd > 0) {
-	perm[nextmd] = -mdeg;
-    }
-    invp[mdnode] = -num;
-    *nofsub = *nofsub + mdeg + qsize[mdnode] - 2;
-    if (num + qsize[mdnode] > *neqns) {
-	goto L1000;
-    }
-/*                ---------------------------------------------- */
-/*                ELIMINATE MDNODE AND PERFORM QUOTIENT GRAPH */
-/*                TRANSFORMATION.  RESET TAG VALUE IF NECESSARY. */
-/*                ---------------------------------------------- */
-    ++tag;
-    if (tag < *maxint) {
-	goto L800;
-    }
-    tag = 1;
-    i__1 = *neqns;
-    for (i = 1; i <= i__1; ++i) {
-	if (marker[i] < *maxint) {
-	    marker[i] = 0;
-	}
-/* L700: */
-    }
-L800:
-    mmdelm_(&mdnode, &xadj[1], &adjncy[1], &dhead[1], &invp[1], &perm[1], &
-	    qsize[1], &llist[1], &marker[1], maxint, &tag);
-    num += qsize[mdnode];
-    llist[mdnode] = ehead;
-    ehead = mdnode;
-    if (*delta >= 0) {
-	goto L500;
-    }
-L900:
-/*            ------------------------------------------- */
-/*            UPDATE DEGREES OF THE NODES INVOLVED IN THE */
-/*            MINIMUM DEGREE NODES ELIMINATION. */
-/*            ------------------------------------------- */
-    if (num > *neqns) {
-	goto L1000;
-    }
-    mmdupd_(&ehead, neqns, &xadj[1], &adjncy[1], delta, &mdeg, &dhead[1], &
-	    invp[1], &perm[1], &qsize[1], &llist[1], &marker[1], maxint, &tag)
-	    ;
-    goto L300;
-
-L1000:
-    mmdnum_(neqns, &perm[1], &invp[1], &qsize[1]);
-    return 0;
-
-} /* genmmd_ */
-
-/* *************************************************************** */
-/* *************************************************************** */
-/* ***     MMDINT ..... MULT MINIMUM DEGREE INITIALIZATION     *** */
-/* *************************************************************** */
-/* *************************************************************** */
-
-/*     AUTHOR - JOSEPH W.H. LIU */
-/*              DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */
-
-/*     PURPOSE - THIS ROUTINE PERFORMS INITIALIZATION FOR THE */
-/*        MULTIPLE ELIMINATION VERSION OF THE MINIMUM DEGREE */
-/*        ALGORITHM. */
-
-/*     INPUT PARAMETERS - */
-/*        NEQNS  - NUMBER OF EQUATIONS. */
-/*        (XADJ,ADJNCY) - ADJACENCY STRUCTURE. */
-
-/*     OUTPUT PARAMETERS - */
-/*        (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */
-/*        QSIZE  - SIZE OF SUPERNODE (INITIALIZED TO ONE). */
-/*        LLIST  - LINKED LIST. */
-/*        MARKER - MARKER VECTOR. */
-
-/* *************************************************************** */
-
-/* Subroutine */ int mmdint_(int *neqns, int *xadj, shortint *adjncy, 
-	shortint *dhead, shortint *dforw, shortint *dbakw, shortint *qsize, 
-	shortint *llist, shortint *marker)
-{
-    /* System generated locals */
-    int i__1;
-
-    /* Local variables */
-    static int ndeg, node, fnode;
-
-
-/* *************************************************************** */
-
-
-/* *************************************************************** */
-
-    /* Parameter adjustments */
-    --marker;
-    --llist;
-    --qsize;
-    --dbakw;
-    --dforw;
-    --dhead;
-    --adjncy;
-    --xadj;
-
-    /* Function Body */
-    i__1 = *neqns;
-    for (node = 1; node <= i__1; ++node) {
-	dhead[node] = 0;
-	qsize[node] = 1;
-	marker[node] = 0;
-	llist[node] = 0;
-/* L100: */
-    }
-/*        ------------------------------------------ */
-/*        INITIALIZE THE DEGREE DOUBLY LINKED LISTS. */
-/*        ------------------------------------------ */
-    i__1 = *neqns;
-    for (node = 1; node <= i__1; ++node) {
-	ndeg = xadj[node + 1] - xadj[node] + 1;
-	fnode = dhead[ndeg];
-	dforw[node] = fnode;
-	dhead[ndeg] = node;
-	if (fnode > 0) {
-	    dbakw[fnode] = node;
-	}
-	dbakw[node] = -ndeg;
-/* L200: */
-    }
-    return 0;
-
-} /* mmdint_ */
-
-/* *************************************************************** */
-/* *************************************************************** */
-/* **     MMDELM ..... MULTIPLE MINIMUM DEGREE ELIMINATION     *** */
-/* *************************************************************** */
-/* *************************************************************** */
-
-/*     AUTHOR - JOSEPH W.H. LIU */
-/*              DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */
-
-/*     PURPOSE - THIS ROUTINE ELIMINATES THE NODE MDNODE OF */
-/*        MINIMUM DEGREE FROM THE ADJACENCY STRUCTURE, WHICH */
-/*        IS STORED IN THE QUOTIENT GRAPH FORMAT.  IT ALSO */
-/*        TRANSFORMS THE QUOTIENT GRAPH REPRESENTATION OF THE */
-/*        ELIMINATION GRAPH. */
-
-/*     INPUT PARAMETERS - */
-/*        MDNODE - NODE OF MINIMUM DEGREE. */
-/*        MAXINT - ESTIMATE OF MAXIMUM REPRESENTABLE (SHORT) */
-/*                 INT. */
-/*        TAG    - TAG VALUE. */
-
-/*     UPDATED PARAMETERS - */
-/*        (XADJ,ADJNCY) - UPDATED ADJACENCY STRUCTURE. */
-/*        (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */
-/*        QSIZE  - SIZE OF SUPERNODE. */
-/*        MARKER - MARKER VECTOR. */
-/*        LLIST  - TEMPORARY LINKED LIST OF ELIMINATED NABORS. */
-
-/* *************************************************************** */
-
-/* Subroutine */ int mmdelm_(int *mdnode, int *xadj, shortint *adjncy,
-	 shortint *dhead, shortint *dforw, shortint *dbakw, shortint *qsize, 
-	shortint *llist, shortint *marker, int *maxint, int *tag)
-{
-    /* System generated locals */
-    int i__1, i__2;
-
-    /* Local variables */
-    static int node, link, rloc, rlmt, i, j, nabor, rnode, elmnt, xqnbr, 
-	    istop, jstop, istrt, jstrt, nxnode, pvnode, nqnbrs, npv;
-
-
-/* *************************************************************** */
-
-
-/* *************************************************************** */
-
-/*        ----------------------------------------------- */
-/*        FIND REACHABLE SET AND PLACE IN DATA STRUCTURE. */
-/*        ----------------------------------------------- */
-    /* Parameter adjustments */
-    --marker;
-    --llist;
-    --qsize;
-    --dbakw;
-    --dforw;
-    --dhead;
-    --adjncy;
-    --xadj;
-
-    /* Function Body */
-    marker[*mdnode] = *tag;
-    istrt = xadj[*mdnode];
-    istop = xadj[*mdnode + 1] - 1;
-/*        ------------------------------------------------------- */
-/*        ELMNT POINTS TO THE BEGINNING OF THE LIST OF ELIMINATED */
-/*        NABORS OF MDNODE, AND RLOC GIVES THE STORAGE LOCATION */
-/*        FOR THE NEXT REACHABLE NODE. */
-/*        ------------------------------------------------------- */
-    elmnt = 0;
-    rloc = istrt;
-    rlmt = istop;
-    i__1 = istop;
-    for (i = istrt; i <= i__1; ++i) {
-	nabor = adjncy[i];
-	if (nabor == 0) {
-	    goto L300;
-	}
-	if (marker[nabor] >= *tag) {
-	    goto L200;
-	}
-	marker[nabor] = *tag;
-	if (dforw[nabor] < 0) {
-	    goto L100;
-	}
-	adjncy[rloc] = nabor;
-	++rloc;
-	goto L200;
-L100:
-	llist[nabor] = elmnt;
-	elmnt = nabor;
-L200:
-	;
-    }
-L300:
-/*            ----------------------------------------------------- */
-/*            MERGE WITH REACHABLE NODES FROM GENERALIZED ELEMENTS. */
-/*            ----------------------------------------------------- */
-    if (elmnt <= 0) {
-	goto L1000;
-    }
-    adjncy[rlmt] = -elmnt;
-    link = elmnt;
-L400:
-    jstrt = xadj[link];
-    jstop = xadj[link + 1] - 1;
-    i__1 = jstop;
-    for (j = jstrt; j <= i__1; ++j) {
-	node = adjncy[j];
-	link = -node;
-	if (node < 0) {
-	    goto L400;
-	} else if (node == 0) {
-	    goto L900;
-	} else {
-	    goto L500;
-	}
-L500:
-	if (marker[node] >= *tag || dforw[node] < 0) {
-	    goto L800;
-	}
-	marker[node] = *tag;
-/*                            --------------------------------- */
-/*                            USE STORAGE FROM ELIMINATED NODES */
-/*                            IF NECESSARY. */
-/*                            --------------------------------- */
-L600:
-	if (rloc < rlmt) {
-	    goto L700;
-	}
-	link = -adjncy[rlmt];
-	rloc = xadj[link];
-	rlmt = xadj[link + 1] - 1;
-	goto L600;
-L700:
-	adjncy[rloc] = node;
-	++rloc;
-L800:
-	;
-    }
-L900:
-    elmnt = llist[elmnt];
-    goto L300;
-L1000:
-    if (rloc <= rlmt) {
-	adjncy[rloc] = 0;
-    }
-/*        -------------------------------------------------------- */
-/*        FOR EACH NODE IN THE REACHABLE SET, DO THE FOLLOWING ... */
-/*        -------------------------------------------------------- */
-    link = *mdnode;
-L1100:
-    istrt = xadj[link];
-    istop = xadj[link + 1] - 1;
-    i__1 = istop;
-    for (i = istrt; i <= i__1; ++i) {
-	rnode = adjncy[i];
-	link = -rnode;
-	if (rnode < 0) {
-	    goto L1100;
-	} else if (rnode == 0) {
-	    goto L1800;
-	} else {
-	    goto L1200;
-	}
-L1200:
-/*                -------------------------------------------- */
-/*                IF RNODE IS IN THE DEGREE LIST STRUCTURE ... */
-/*                -------------------------------------------- */
-	pvnode = dbakw[rnode];
-	if (pvnode == 0 || pvnode == -(*maxint)) {
-	    goto L1300;
-	}
-/*                    ------------------------------------- */
-/*                    THEN REMOVE RNODE FROM THE STRUCTURE. */
-/*                    ------------------------------------- */
-	nxnode = dforw[rnode];
-	if (nxnode > 0) {
-	    dbakw[nxnode] = pvnode;
-	}
-	if (pvnode > 0) {
-	    dforw[pvnode] = nxnode;
-	}
-	npv = -pvnode;
-	if (pvnode < 0) {
-	    dhead[npv] = nxnode;
-	}
-L1300:
-/*                ---------------------------------------- */
-/*                PURGE INACTIVE QUOTIENT NABORS OF RNODE. */
-/*                ---------------------------------------- */
-	jstrt = xadj[rnode];
-	jstop = xadj[rnode + 1] - 1;
-	xqnbr = jstrt;
-	i__2 = jstop;
-	for (j = jstrt; j <= i__2; ++j) {
-	    nabor = adjncy[j];
-	    if (nabor == 0) {
-		goto L1500;
-	    }
-	    if (marker[nabor] >= *tag) {
-		goto L1400;
-	    }
-	    adjncy[xqnbr] = nabor;
-	    ++xqnbr;
-L1400:
-	    ;
-	}
-L1500:
-/*                ---------------------------------------- */
-/*                IF NO ACTIVE NABOR AFTER THE PURGING ... */
-/*                ---------------------------------------- */
-	nqnbrs = xqnbr - jstrt;
-	if (nqnbrs > 0) {
-	    goto L1600;
-	}
-/*                    ----------------------------- */
-/*                    THEN MERGE RNODE WITH MDNODE. */
-/*                    ----------------------------- */
-	qsize[*mdnode] += qsize[rnode];
-	qsize[rnode] = 0;
-	marker[rnode] = *maxint;
-	dforw[rnode] = -(*mdnode);
-	dbakw[rnode] = -(*maxint);
-	goto L1700;
-L1600:
-/*                -------------------------------------- */
-/*                ELSE FLAG RNODE FOR DEGREE UPDATE, AND */
-/*                ADD MDNODE AS A NABOR OF RNODE. */
-/*                -------------------------------------- */
-	dforw[rnode] = nqnbrs + 1;
-	dbakw[rnode] = 0;
-	adjncy[xqnbr] = *mdnode;
-	++xqnbr;
-	if (xqnbr <= jstop) {
-	    adjncy[xqnbr] = 0;
-	}
-
-L1700:
-	;
-    }
-L1800:
-    return 0;
-
-} /* mmdelm_ */
-
-/* *************************************************************** */
-/* *************************************************************** */
-/* *****     MMDUPD ..... MULTIPLE MINIMUM DEGREE UPDATE     ***** */
-/* *************************************************************** */
-/* *************************************************************** */
-
-/*     AUTHOR - JOSEPH W.H. LIU */
-/*              DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */
-
-/*     PURPOSE - THIS ROUTINE UPDATES THE DEGREES OF NODES */
-/*        AFTER A MULTIPLE ELIMINATION STEP. */
-
-/*     INPUT PARAMETERS - */
-/*        EHEAD  - THE BEGINNING OF THE LIST OF ELIMINATED */
-/*                 NODES (I.E., NEWLY FORMED ELEMENTS). */
-/*        NEQNS  - NUMBER OF EQUATIONS. */
-/*        (XADJ,ADJNCY) - ADJACENCY STRUCTURE. */
-/*        DELTA  - TOLERANCE VALUE FOR MULTIPLE ELIMINATION. */
-/*        MAXINT - MAXIMUM MACHINE REPRESENTABLE (SHORT) */
-/*                 INTEGER. */
-
-/*     UPDATED PARAMETERS - */
-/*        MDEG   - NEW MINIMUM DEGREE AFTER DEGREE UPDATE. */
-/*        (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */
-/*        QSIZE  - SIZE OF SUPERNODE. */
-/*        LLIST  - WORKING LINKED LIST. */
-/*        MARKER - MARKER VECTOR FOR DEGREE UPDATE. */
-/*        TAG    - TAG VALUE. */
-
-/* *************************************************************** */
-
-/* Subroutine */ int mmdupd_(int *ehead, int *neqns, int *xadj, 
-	shortint *adjncy, int *delta, int *mdeg, shortint *dhead, 
-	shortint *dforw, shortint *dbakw, shortint *qsize, shortint *llist, 
-	shortint *marker, int *maxint, int *tag)
-{
-    /* System generated locals */
-    int i__1, i__2;
-
-    /* Local variables */
-    static int node, mtag, link, mdeg0, i, j, enode, fnode, nabor, elmnt, 
-	    istop, jstop, q2head, istrt, jstrt, qxhead, iq2, deg, deg0;
-
-
-/* *************************************************************** */
-
-
-/* *************************************************************** */
-
-    /* Parameter adjustments */
-    --marker;
-    --llist;
-    --qsize;
-    --dbakw;
-    --dforw;
-    --dhead;
-    --adjncy;
-    --xadj;
-
-    /* Function Body */
-    mdeg0 = *mdeg + *delta;
-    elmnt = *ehead;
-L100:
-/*            ------------------------------------------------------- */
-/*            FOR EACH OF THE NEWLY FORMED ELEMENT, DO THE FOLLOWING. */
-/*            (RESET TAG VALUE IF NECESSARY.) */
-/*            ------------------------------------------------------- */
-    if (elmnt <= 0) {
-	return 0;
-    }
-    mtag = *tag + mdeg0;
-    if (mtag < *maxint) {
-	goto L300;
-    }
-    *tag = 1;
-    i__1 = *neqns;
-    for (i = 1; i <= i__1; ++i) {
-	if (marker[i] < *maxint) {
-	    marker[i] = 0;
-	}
-/* L200: */
-    }
-    mtag = *tag + mdeg0;
-L300:
-/*            --------------------------------------------- */
-/*            CREATE TWO LINKED LISTS FROM NODES ASSOCIATED */
-/*            WITH ELMNT: ONE WITH TWO NABORS (Q2HEAD) IN */
-/*            ADJACENCY STRUCTURE, AND THE OTHER WITH MORE */
-/*            THAN TWO NABORS (QXHEAD).  ALSO COMPUTE DEG0, */
-/*            NUMBER OF NODES IN THIS ELEMENT. */
-/*            --------------------------------------------- */
-    q2head = 0;
-    qxhead = 0;
-    deg0 = 0;
-    link = elmnt;
-L400:
-    istrt = xadj[link];
-    istop = xadj[link + 1] - 1;
-    i__1 = istop;
-    for (i = istrt; i <= i__1; ++i) {
-	enode = adjncy[i];
-	link = -enode;
-	if (enode < 0) {
-	    goto L400;
-	} else if (enode == 0) {
-	    goto L800;
-	} else {
-	    goto L500;
-	}
-
-L500:
-	if (qsize[enode] == 0) {
-	    goto L700;
-	}
-	deg0 += qsize[enode];
-	marker[enode] = mtag;
-/*                        ---------------------------------- */
-/*                        IF ENODE REQUIRES A DEGREE UPDATE, */
-/*                        THEN DO THE FOLLOWING. */
-/*                        ---------------------------------- */
-	if (dbakw[enode] != 0) {
-	    goto L700;
-	}
-/*                            --------------------------------------- 
-*/
-/*                            PLACE EITHER IN QXHEAD OR Q2HEAD LISTS. 
-*/
-/*                            --------------------------------------- 
-*/
-	if (dforw[enode] == 2) {
-	    goto L600;
-	}
-	llist[enode] = qxhead;
-	qxhead = enode;
-	goto L700;
-L600:
-	llist[enode] = q2head;
-	q2head = enode;
-L700:
-	;
-    }
-L800:
-/*            -------------------------------------------- */
-/*            FOR EACH ENODE IN Q2 LIST, DO THE FOLLOWING. */
-/*            -------------------------------------------- */
-    enode = q2head;
-    iq2 = 1;
-L900:
-    if (enode <= 0) {
-	goto L1500;
-    }
-    if (dbakw[enode] != 0) {
-	goto L2200;
-    }
-    ++(*tag);
-    deg = deg0;
-/*                    ------------------------------------------ */
-/*                    IDENTIFY THE OTHER ADJACENT ELEMENT NABOR. */
-/*                    ------------------------------------------ */
-    istrt = xadj[enode];
-    nabor = adjncy[istrt];
-    if (nabor == elmnt) {
-	nabor = adjncy[istrt + 1];
-    }
-/*                    ------------------------------------------------ */
-/*                    IF NABOR IS UNELIMINATED, INCREASE DEGREE COUNT. */
-/*                    ------------------------------------------------ */
-    link = nabor;
-    if (dforw[nabor] < 0) {
-	goto L1000;
-    }
-    deg += qsize[nabor];
-    goto L2100;
-L1000:
-/*                        -------------------------------------------- */
-/*                        OTHERWISE, FOR EACH NODE IN THE 2ND ELEMENT, */
-/*                        DO THE FOLLOWING. */
-/*                        -------------------------------------------- */
-    istrt = xadj[link];
-    istop = xadj[link + 1] - 1;
-    i__1 = istop;
-    for (i = istrt; i <= i__1; ++i) {
-	node = adjncy[i];
-	link = -node;
-	if (node == enode) {
-	    goto L1400;
-	}
-	if (node < 0) {
-	    goto L1000;
-	} else if (node == 0) {
-	    goto L2100;
-	} else {
-	    goto L1100;
-	}
-
-L1100:
-	if (qsize[node] == 0) {
-	    goto L1400;
-	}
-	if (marker[node] >= *tag) {
-	    goto L1200;
-	}
-/*                                -----------------------------------
--- */
-/*                                CASE WHEN NODE IS NOT YET CONSIDERED
-. */
-/*                                -----------------------------------
--- */
-	marker[node] = *tag;
-	deg += qsize[node];
-	goto L1400;
-L1200:
-/*                            ----------------------------------------
- */
-/*                            CASE WHEN NODE IS INDISTINGUISHABLE FROM
- */
-/*                            ENODE.  MERGE THEM INTO A NEW SUPERNODE.
- */
-/*                            ----------------------------------------
- */
-	if (dbakw[node] != 0) {
-	    goto L1400;
-	}
-	if (dforw[node] != 2) {
-	    goto L1300;
-	}
-	qsize[enode] += qsize[node];
-	qsize[node] = 0;
-	marker[node] = *maxint;
-	dforw[node] = -enode;
-	dbakw[node] = -(*maxint);
-	goto L1400;
-L1300:
-/*                            -------------------------------------- 
-*/
-/*                            CASE WHEN NODE IS OUTMATCHED BY ENODE. 
-*/
-/*                            -------------------------------------- 
-*/
-	if (dbakw[node] == 0) {
-	    dbakw[node] = -(*maxint);
-	}
-L1400:
-	;
-    }
-    goto L2100;
-L1500:
-/*                ------------------------------------------------ */
-/*                FOR EACH ENODE IN THE QX LIST, DO THE FOLLOWING. */
-/*                ------------------------------------------------ */
-    enode = qxhead;
-    iq2 = 0;
-L1600:
-    if (enode <= 0) {
-	goto L2300;
-    }
-    if (dbakw[enode] != 0) {
-	goto L2200;
-    }
-    ++(*tag);
-    deg = deg0;
-/*                        --------------------------------- */
-/*                        FOR EACH UNMARKED NABOR OF ENODE, */
-/*                        DO THE FOLLOWING. */
-/*                        --------------------------------- */
-    istrt = xadj[enode];
-    istop = xadj[enode + 1] - 1;
-    i__1 = istop;
-    for (i = istrt; i <= i__1; ++i) {
-	nabor = adjncy[i];
-	if (nabor == 0) {
-	    goto L2100;
-	}
-	if (marker[nabor] >= *tag) {
-	    goto L2000;
-	}
-	marker[nabor] = *tag;
-	link = nabor;
-/*                                ------------------------------ */
-/*                                IF UNELIMINATED, INCLUDE IT IN */
-/*                                DEG COUNT. */
-/*                                ------------------------------ */
-	if (dforw[nabor] < 0) {
-	    goto L1700;
-	}
-	deg += qsize[nabor];
-	goto L2000;
-L1700:
-/*                                    ------------------------------- 
-*/
-/*                                    IF ELIMINATED, INCLUDE UNMARKED 
-*/
-/*                                    NODES IN THIS ELEMENT INTO THE 
-*/
-/*                                    DEGREE COUNT. */
-/*                                    ------------------------------- 
-*/
-	jstrt = xadj[link];
-	jstop = xadj[link + 1] - 1;
-	i__2 = jstop;
-	for (j = jstrt; j <= i__2; ++j) {
-	    node = adjncy[j];
-	    link = -node;
-	    if (node < 0) {
-		goto L1700;
-	    } else if (node == 0) {
-		goto L2000;
-	    } else {
-		goto L1800;
-	    }
-
-L1800:
-	    if (marker[node] >= *tag) {
-		goto L1900;
-	    }
-	    marker[node] = *tag;
-	    deg += qsize[node];
-L1900:
-	    ;
-	}
-L2000:
-	;
-    }
-L2100:
-/*                    ------------------------------------------- */
-/*                    UPDATE EXTERNAL DEGREE OF ENODE IN DEGREE */
-/*                    STRUCTURE, AND MDEG (MIN DEG) IF NECESSARY. */
-/*                    ------------------------------------------- */
-    deg = deg - qsize[enode] + 1;
-    fnode = dhead[deg];
-    dforw[enode] = fnode;
-    dbakw[enode] = -deg;
-    if (fnode > 0) {
-	dbakw[fnode] = enode;
-    }
-    dhead[deg] = enode;
-    if (deg < *mdeg) {
-	*mdeg = deg;
-    }
-L2200:
-/*                    ---------------------------------- */
-/*                    GET NEXT ENODE IN CURRENT ELEMENT. */
-/*                    ---------------------------------- */
-    enode = llist[enode];
-    if (iq2 == 1) {
-	goto L900;
-    }
-    goto L1600;
-L2300:
-/*            ----------------------------- */
-/*            GET NEXT ELEMENT IN THE LIST. */
-/*            ----------------------------- */
-    *tag = mtag;
-    elmnt = llist[elmnt];
-    goto L100;
-
-} /* mmdupd_ */
-
-/* *************************************************************** */
-/* *************************************************************** */
-/* *****     MMDNUM ..... MULTI MINIMUM DEGREE NUMBERING     ***** */
-/* *************************************************************** */
-/* *************************************************************** */
-
-/*     AUTHOR - JOSEPH W.H. LIU */
-/*              DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */
-
-/*     PURPOSE - THIS ROUTINE PERFORMS THE FINAL STEP IN */
-/*        PRODUCING THE PERMUTATION AND INVERSE PERMUTATION */
-/*        VECTORS IN THE MULTIPLE ELIMINATION VERSION OF THE */
-/*        MINIMUM DEGREE ORDERING ALGORITHM. */
-
-/*     INPUT PARAMETERS - */
-/*        NEQNS  - NUMBER OF EQUATIONS. */
-/*        QSIZE  - SIZE OF SUPERNODES AT ELIMINATION. */
-
-/*     UPDATED PARAMETERS - */
-/*        INVP   - INVERSE PERMUTATION VECTOR.  ON INPUT, */
-/*                 IF QSIZE(NODE)=0, THEN NODE HAS BEEN MERGED */
-/*                 INTO THE NODE -INVP(NODE); OTHERWISE, */
-/*                 -INVP(NODE) IS ITS INVERSE LABELLING. */
-
-/*     OUTPUT PARAMETERS - */
-/*        PERM   - THE PERMUTATION VECTOR. */
-
-/* *************************************************************** */
-
-/* Subroutine */ int mmdnum_(int *neqns, shortint *perm, shortint *invp, 
-	shortint *qsize)
-{
-    /* System generated locals */
-    int i__1;
-
-    /* Local variables */
-    static int node, root, nextf, father, nqsize, num;
-
-
-/* *************************************************************** */
-
-
-/* *************************************************************** */
-
-    /* Parameter adjustments */
-    --qsize;
-    --invp;
-    --perm;
-
-    /* Function Body */
-    i__1 = *neqns;
-    for (node = 1; node <= i__1; ++node) {
-	nqsize = qsize[node];
-	if (nqsize <= 0) {
-	    perm[node] = invp[node];
-	}
-	if (nqsize > 0) {
-	    perm[node] = -invp[node];
-	}
-/* L100: */
-    }
-/*        ------------------------------------------------------ */
-/*        FOR EACH NODE WHICH HAS BEEN MERGED, DO THE FOLLOWING. */
-/*        ------------------------------------------------------ */
-    i__1 = *neqns;
-    for (node = 1; node <= i__1; ++node) {
-	if (perm[node] > 0) {
-	    goto L500;
-	}
-/*                ----------------------------------------- */
-/*                TRACE THE MERGED TREE UNTIL ONE WHICH HAS */
-/*                NOT BEEN MERGED, CALL IT ROOT. */
-/*                ----------------------------------------- */
-	father = node;
-L200:
-	if (perm[father] > 0) {
-	    goto L300;
-	}
-	father = -perm[father];
-	goto L200;
-L300:
-/*                ----------------------- */
-/*                NUMBER NODE AFTER ROOT. */
-/*                ----------------------- */
-	root = father;
-	num = perm[root] + 1;
-	invp[node] = -num;
-	perm[root] = num;
-/*                ------------------------ */
-/*                SHORTEN THE MERGED TREE. */
-/*                ------------------------ */
-	father = node;
-L400:
-	nextf = -perm[father];
-	if (nextf <= 0) {
-	    goto L500;
-	}
-	perm[father] = -root;
-	father = nextf;
-	goto L400;
-L500:
-	;
-    }
-/*        ---------------------- */
-/*        READY TO COMPUTE PERM. */
-/*        ---------------------- */
-    i__1 = *neqns;
-    for (node = 1; node <= i__1; ++node) {
-	num = -invp[node];
-	invp[node] = num;
-	perm[num] = node;
-/* L600: */
-    }
-    return 0;
-
-} /* mmdnum_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/old_colamd.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/old_colamd.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/old_colamd.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/old_colamd.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,2583 +0,0 @@
-/* ========================================================================== */
-/* === colamd - a sparse matrix column ordering algorithm =================== */
-/* ========================================================================== */
-
-/*
-    colamd:  An approximate minimum degree column ordering algorithm.
-
-    Purpose:
-
-	Colamd computes a permutation Q such that the Cholesky factorization of
-	(AQ)'(AQ) has less fill-in and requires fewer floating point operations
-	than A'A.  This also provides a good ordering for sparse partial
-	pivoting methods, P(AQ) = LU, where Q is computed prior to numerical
-	factorization, and P is computed during numerical factorization via
-	conventional partial pivoting with row interchanges.  Colamd is the
-	column ordering method used in SuperLU, part of the ScaLAPACK library.
-	It is also available as user-contributed software for Matlab 5.2,
-	available from MathWorks, Inc. (http://www.mathworks.com).  This
-	routine can be used in place of COLMMD in Matlab.  By default, the \
-	and / operators in Matlab perform a column ordering (using COLMMD)
-	prior to LU factorization using sparse partial pivoting, in the
-	built-in Matlab LU(A) routine.
-
-    Authors:
-
-	The authors of the code itself are Stefan I. Larimore and Timothy A.
-	Davis (davis@cise.ufl.edu), University of Florida.  The algorithm was
-	developed in collaboration with John Gilbert, Xerox PARC, and Esmond
-	Ng, Oak Ridge National Laboratory.
-
-    Date:
-
-	August 3, 1998.  Version 1.0.
-
-    Acknowledgements:
-
-	This work was supported by the National Science Foundation, under
-	grants DMS-9504974 and DMS-9803599.
-
-    Notice:
-
-	Copyright (c) 1998 by the University of Florida.  All Rights Reserved.
-
-	THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-	EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
-
-	Permission is hereby granted to use or copy this program for any
-	purpose, provided the above notices are retained on all copies.
-	User documentation of any code that uses this code must cite the
-	Authors, the Copyright, and "Used by permission."  If this code is
-	accessible from within Matlab, then typing "help colamd" or "colamd"
-	(with no arguments) must cite the Authors.  Permission to modify the
-	code and to distribute modified code is granted, provided the above
-	notices are retained, and a notice that the code was modified is
-	included with the above copyright notice.  You must also retain the
-	Availability information below, of the original version.
-
-	This software is provided free of charge.
-
-    Availability:
-
-	This file is located at
-
-		http://www.cise.ufl.edu/~davis/colamd/colamd.c
-
-	The colamd.h file is required, located in the same directory.
-	The colamdmex.c file provides a Matlab interface for colamd.
-	The symamdmex.c file provides a Matlab interface for symamd, which is
-	a symmetric ordering based on this code, colamd.c.  All codes are
-	purely ANSI C compliant (they use no Unix-specific routines, include
-	files, etc.).
-*/
-
-/* ========================================================================== */
-/* === Description of user-callable routines ================================ */
-/* ========================================================================== */
-
-/*
-    Each user-callable routine (declared as PUBLIC) is briefly described below.
-    Refer to the comments preceding each routine for more details.
-
-    ----------------------------------------------------------------------------
-    colamd_recommended:
-    ----------------------------------------------------------------------------
-
-	Usage:
-
-	    Alen = colamd_recommended (nnz, n_row, n_col) ;
-
-	Purpose:
-
-	    Returns recommended value of Alen for use by colamd.  Returns -1
-	    if any input argument is negative.
-
-	Arguments:
-
-	    int nnz ;		Number of nonzeros in the matrix A.  This must
-				be the same value as p [n_col] in the call to
-				colamd - otherwise you will get a wrong value
-				of the recommended memory to use.
-	    int n_row ;		Number of rows in the matrix A.
-	    int n_col ;		Number of columns in the matrix A.
-
-    ----------------------------------------------------------------------------
-    colamd_set_defaults:
-    ----------------------------------------------------------------------------
-
-	Usage:
-
-	    colamd_set_defaults (knobs) ;
-
-	Purpose:
-
-	    Sets the default parameters.
-
-	Arguments:
-
-	    double knobs [COLAMD_KNOBS] ;	Output only.
-
-		Rows with more than (knobs [COLAMD_DENSE_ROW] * n_col) entries
-		are removed prior to ordering.  Columns with more than
-		(knobs [COLAMD_DENSE_COL] * n_row) entries are removed
-		prior to ordering, and placed last in the output column
-		ordering.  Default values of these two knobs are both 0.5.
-		Currently, only knobs [0] and knobs [1] are used, but future
-		versions may use more knobs.  If so, they will be properly set
-		to their defaults by the future version of colamd_set_defaults,
-		so that the code that calls colamd will not need to change,
-		assuming that you either use colamd_set_defaults, or pass a
-		(double *) NULL pointer as the knobs array to colamd.
-
-    ----------------------------------------------------------------------------
-    colamd:
-    ----------------------------------------------------------------------------
-
-	Usage:
-
-	    colamd (n_row, n_col, Alen, A, p, knobs) ;
-
-	Purpose:
-
-	    Computes a column ordering (Q) of A such that P(AQ)=LU or
-	    (AQ)'AQ=LL' have less fill-in and require fewer floating point
-	    operations than factorizing the unpermuted matrix A or A'A,
-	    respectively.
-
-	Arguments:
-
-	    int n_row ;
-
-		Number of rows in the matrix A.
-		Restriction:  n_row >= 0.
-		Colamd returns FALSE if n_row is negative.
-
-	    int n_col ;
-
-		Number of columns in the matrix A.
-		Restriction:  n_col >= 0.
-		Colamd returns FALSE if n_col is negative.
-
-	    int Alen ;
-
-		Restriction (see note):
-		Alen >= 2*nnz + 6*(n_col+1) + 4*(n_row+1) + n_col + COLAMD_STATS
-		Colamd returns FALSE if these conditions are not met.
-
-		Note:  this restriction makes an modest assumption regarding
-		the size of the two typedef'd structures, below.  We do,
-		however, guarantee that
-		Alen >= colamd_recommended (nnz, n_row, n_col)
-		will be sufficient.
-
-	    int A [Alen] ;	Input argument, stats on output.
-
-		A is an integer array of size Alen.  Alen must be at least as
-		large as the bare minimum value given above, but this is very
-		low, and can result in excessive run time.  For best
-		performance, we recommend that Alen be greater than or equal to
-		colamd_recommended (nnz, n_row, n_col), which adds
-		nnz/5 to the bare minimum value given above.
-
-		On input, the row indices of the entries in column c of the
-		matrix are held in A [(p [c]) ... (p [c+1]-1)].  The row indices
-		in a given column c need not be in ascending order, and
-		duplicate row indices may be be present.  However, colamd will
-		work a little faster if both of these conditions are met
-		(Colamd puts the matrix into this format, if it finds that the
-		the conditions are not met).
-
-		The matrix is 0-based.  That is, rows are in the range 0 to
-		n_row-1, and columns are in the range 0 to n_col-1.  Colamd
-		returns FALSE if any row index is out of range.
-
-		The contents of A are modified during ordering, and are thus
-		undefined on output with the exception of a few statistics
-		about the ordering (A [0..COLAMD_STATS-1]):
-		A [0]:  number of dense or empty rows ignored.
-		A [1]:  number of dense or empty columns ignored (and ordered
-			last in the output permutation p)
-		A [2]:  number of garbage collections performed.
-		A [3]:  0, if all row indices in each column were in sorted
-			  order, and no duplicates were present.
-			1, otherwise (in which case colamd had to do more work)
-		Note that a row can become "empty" if it contains only
-		"dense" and/or "empty" columns, and similarly a column can
-		become "empty" if it only contains "dense" and/or "empty" rows.
-		Future versions may return more statistics in A, but the usage
-		of these 4 entries in A will remain unchanged.
-
-	    int p [n_col+1] ;	Both input and output argument.
-
-		p is an integer array of size n_col+1.  On input, it holds the
-		"pointers" for the column form of the matrix A.  Column c of
-		the matrix A is held in A [(p [c]) ... (p [c+1]-1)].  The first
-		entry, p [0], must be zero, and p [c] <= p [c+1] must hold
-		for all c in the range 0 to n_col-1.  The value p [n_col] is
-		thus the total number of entries in the pattern of the matrix A.
-		Colamd returns FALSE if these conditions are not met.
-
-		On output, if colamd returns TRUE, the array p holds the column
-		permutation (Q, for P(AQ)=LU or (AQ)'(AQ)=LL'), where p [0] is
-		the first column index in the new ordering, and p [n_col-1] is
-		the last.  That is, p [k] = j means that column j of A is the
-		kth pivot column, in AQ, where k is in the range 0 to n_col-1
-		(p [0] = j means that column j of A is the first column in AQ).
-
-		If colamd returns FALSE, then no permutation is returned, and
-		p is undefined on output.
-
-	    double knobs [COLAMD_KNOBS] ;	Input only.
-
-		See colamd_set_defaults for a description.  If the knobs array
-		is not present (that is, if a (double *) NULL pointer is passed
-		in its place), then the default values of the parameters are
-		used instead.
-
-*/
-
-
-/* ========================================================================== */
-/* === Include files ======================================================== */
-/* ========================================================================== */
-
-/* limits.h:  the largest positive integer (INT_MAX) */
-#include <limits.h>
-
-/* colamd.h:  knob array size, stats output size, and global prototypes */
-#include "colamd.h"
-
-/* ========================================================================== */
-/* === Scaffolding code definitions  ======================================== */
-/* ========================================================================== */
-
-/* Ensure that debugging is turned off: */
-#ifndef NDEBUG
-#define NDEBUG
-#endif
-
-/* assert.h:  the assert macro (no debugging if NDEBUG is defined) */
-#include <assert.h>
-
-/*
-   Our "scaffolding code" philosophy:  In our opinion, well-written library
-   code should keep its "debugging" code, and just normally have it turned off
-   by the compiler so as not to interfere with performance.  This serves
-   several purposes:
-
-   (1) assertions act as comments to the reader, telling you what the code
-	expects at that point.  All assertions will always be true (unless
-	there really is a bug, of course).
-
-   (2) leaving in the scaffolding code assists anyone who would like to modify
-	the code, or understand the algorithm (by reading the debugging output,
-	one can get a glimpse into what the code is doing).
-
-   (3) (gasp!) for actually finding bugs.  This code has been heavily tested
-	and "should" be fully functional and bug-free ... but you never know...
-
-    To enable debugging, comment out the "#define NDEBUG" above.  The code will
-    become outrageously slow when debugging is enabled.  To control the level of
-    debugging output, set an environment variable D to 0 (little), 1 (some),
-    2, 3, or 4 (lots).
-*/
-
-/* ========================================================================== */
-/* === Row and Column structures ============================================ */
-/* ========================================================================== */
-
-typedef struct ColInfo_struct
-{
-    int start ;		/* index for A of first row in this column, or DEAD */
-			/* if column is dead */
-    int length ;	/* number of rows in this column */
-    union
-    {
-	int thickness ;	/* number of original columns represented by this */
-			/* col, if the column is alive */
-	int parent ;	/* parent in parent tree super-column structure, if */
-			/* the column is dead */
-    } shared1 ;
-    union
-    {
-	int score ;	/* the score used to maintain heap, if col is alive */
-	int order ;	/* pivot ordering of this column, if col is dead */
-    } shared2 ;
-    union
-    {
-	int headhash ;	/* head of a hash bucket, if col is at the head of */
-			/* a degree list */
-	int hash ;	/* hash value, if col is not in a degree list */
-	int prev ;	/* previous column in degree list, if col is in a */
-			/* degree list (but not at the head of a degree list) */
-    } shared3 ;
-    union
-    {
-	int degree_next ;	/* next column, if col is in a degree list */
-	int hash_next ;		/* next column, if col is in a hash list */
-    } shared4 ;
-
-} ColInfo ;
-
-typedef struct RowInfo_struct
-{
-    int start ;		/* index for A of first col in this row */
-    int length ;	/* number of principal columns in this row */
-    union
-    {
-	int degree ;	/* number of principal & non-principal columns in row */
-	int p ;		/* used as a row pointer in init_rows_cols () */
-    } shared1 ;
-    union
-    {
-	int mark ;	/* for computing set differences and marking dead rows*/
-	int first_column ;/* first column in row (used in garbage collection) */
-    } shared2 ;
-
-} RowInfo ;
-
-/* ========================================================================== */
-/* === Definitions ========================================================== */
-/* ========================================================================== */
-
-#define MAX(a,b) (((a) > (b)) ? (a) : (b))
-#define MIN(a,b) (((a) < (b)) ? (a) : (b))
-
-#define ONES_COMPLEMENT(r) (-(r)-1)
-
-#define TRUE	(1)
-#define FALSE	(0)
-#define EMPTY	(-1)
-
-/* Row and column status */
-#define ALIVE	(0)
-#define DEAD	(-1)
-
-/* Column status */
-#define DEAD_PRINCIPAL		(-1)
-#define DEAD_NON_PRINCIPAL	(-2)
-
-/* Macros for row and column status update and checking. */
-#define ROW_IS_DEAD(r)			ROW_IS_MARKED_DEAD (Row[r].shared2.mark)
-#define ROW_IS_MARKED_DEAD(row_mark)	(row_mark < ALIVE)
-#define ROW_IS_ALIVE(r)			(Row [r].shared2.mark >= ALIVE)
-#define COL_IS_DEAD(c)			(Col [c].start < ALIVE)
-#define COL_IS_ALIVE(c)			(Col [c].start >= ALIVE)
-#define COL_IS_DEAD_PRINCIPAL(c)	(Col [c].start == DEAD_PRINCIPAL)
-#define KILL_ROW(r)			{ Row [r].shared2.mark = DEAD ; }
-#define KILL_PRINCIPAL_COL(c)		{ Col [c].start = DEAD_PRINCIPAL ; }
-#define KILL_NON_PRINCIPAL_COL(c)	{ Col [c].start = DEAD_NON_PRINCIPAL ; }
-
-/* Routines are either PUBLIC (user-callable) or PRIVATE (not user-callable) */
-#define PUBLIC
-#define PRIVATE static
-
-/* ========================================================================== */
-/* === Prototypes of PRIVATE routines ======================================= */
-/* ========================================================================== */
-
-PRIVATE int init_rows_cols
-(
-    int n_row,
-    int n_col,
-    RowInfo Row [],
-    ColInfo Col [],
-    int A [],
-    int p []
-) ;
-
-PRIVATE void init_scoring
-(
-    int n_row,
-    int n_col,
-    RowInfo Row [],
-    ColInfo Col [],
-    int A [],
-    int head [],
-    double knobs [COLAMD_KNOBS],
-    int *p_n_row2,
-    int *p_n_col2,
-    int *p_max_deg
-) ;
-
-PRIVATE int find_ordering
-(
-    int n_row,
-    int n_col,
-    int Alen,
-    RowInfo Row [],
-    ColInfo Col [],
-    int A [],
-    int head [],
-    int n_col2,
-    int max_deg,
-    int pfree
-) ;
-
-PRIVATE void order_children
-(
-    int n_col,
-    ColInfo Col [],
-    int p []
-) ;
-
-PRIVATE void detect_super_cols
-(
-#ifndef NDEBUG
-    int n_col,
-    RowInfo Row [],
-#endif
-    ColInfo Col [],
-    int A [],
-    int head [],
-    int row_start,
-    int row_length
-) ;
-
-PRIVATE int garbage_collection
-(
-    int n_row,
-    int n_col,
-    RowInfo Row [],
-    ColInfo Col [],
-    int A [],
-    int *pfree
-) ;
-
-PRIVATE int clear_mark
-(
-    int n_row,
-    RowInfo Row []
-) ;
-
-/* ========================================================================== */
-/* === Debugging definitions ================================================ */
-/* ========================================================================== */
-
-#ifndef NDEBUG
-
-/* === With debugging ======================================================= */
-
-/* stdlib.h: for getenv and atoi, to get debugging level from environment */
-#include <stdlib.h>
-
-/* stdio.h:  for printf (no printing if debugging is turned off) */
-#include <stdio.h>
-
-PRIVATE void debug_deg_lists
-(
-    int n_row,
-    int n_col,
-    RowInfo Row [],
-    ColInfo Col [],
-    int head [],
-    int min_score,
-    int should,
-    int max_deg
-) ;
-
-PRIVATE void debug_mark
-(
-    int n_row,
-    RowInfo Row [],
-    int tag_mark,
-    int max_mark
-) ;
-
-PRIVATE void debug_matrix
-(
-    int n_row,
-    int n_col,
-    RowInfo Row [],
-    ColInfo Col [],
-    int A []
-) ;
-
-PRIVATE void debug_structures
-(
-    int n_row,
-    int n_col,
-    RowInfo Row [],
-    ColInfo Col [],
-    int A [],
-    int n_col2
-) ;
-
-/* the following is the *ONLY* global variable in this file, and is only */
-/* present when debugging */
-
-PRIVATE int debug_colamd ;	/* debug print level */
-
-#define DEBUG0(params) { (void) printf params ; }
-#define DEBUG1(params) { if (debug_colamd >= 1) (void) printf params ; }
-#define DEBUG2(params) { if (debug_colamd >= 2) (void) printf params ; }
-#define DEBUG3(params) { if (debug_colamd >= 3) (void) printf params ; }
-#define DEBUG4(params) { if (debug_colamd >= 4) (void) printf params ; }
-
-#else
-
-/* === No debugging ========================================================= */
-
-#define DEBUG0(params) ;
-#define DEBUG1(params) ;
-#define DEBUG2(params) ;
-#define DEBUG3(params) ;
-#define DEBUG4(params) ;
-
-#endif
-
-/* ========================================================================== */
-
-
-/* ========================================================================== */
-/* === USER-CALLABLE ROUTINES: ============================================== */
-/* ========================================================================== */
-
-
-/* ========================================================================== */
-/* === colamd_recommended =================================================== */
-/* ========================================================================== */
-
-/*
-    The colamd_recommended routine returns the suggested size for Alen.  This
-    value has been determined to provide good balance between the number of
-    garbage collections and the memory requirements for colamd.
-*/
-
-PUBLIC int colamd_recommended	/* returns recommended value of Alen. */
-(
-    /* === Parameters ======================================================= */
-
-    int nnz,			/* number of nonzeros in A */
-    int n_row,			/* number of rows in A */
-    int n_col			/* number of columns in A */
-)
-{
-    /* === Local variables ================================================== */
-
-    int minimum ;		/* bare minimum requirements */
-    int recommended ;		/* recommended value of Alen */
-
-    if (nnz < 0 || n_row < 0 || n_col < 0)
-    {
-	/* return -1 if any input argument is corrupted */
-	DEBUG0 (("colamd_recommended error!")) ;
-	DEBUG0 ((" nnz: %d, n_row: %d, n_col: %d\n", nnz, n_row, n_col)) ;
-	return (-1) ;
-    }
-
-    minimum =
-	2 * (nnz)		/* for A */
-	+ (((n_col) + 1) * sizeof (ColInfo) / sizeof (int))	/* for Col */
-	+ (((n_row) + 1) * sizeof (RowInfo) / sizeof (int))	/* for Row */
-	+ n_col			/* minimum elbow room to guarrantee success */
-	+ COLAMD_STATS ;	/* for output statistics */
-
-    /* recommended is equal to the minumum plus enough memory to keep the */
-    /* number garbage collections low */
-    recommended = minimum + nnz/5 ;
-
-    return (recommended) ;
-}
-
-
-/* ========================================================================== */
-/* === colamd_set_defaults ================================================== */
-/* ========================================================================== */
-
-/*
-    The colamd_set_defaults routine sets the default values of the user-
-    controllable parameters for colamd:
-
-	knobs [0]	rows with knobs[0]*n_col entries or more are removed
-			prior to ordering.
-
-	knobs [1]	columns with knobs[1]*n_row entries or more are removed
-			prior to ordering, and placed last in the column
-			permutation.
-
-	knobs [2..19]	unused, but future versions might use this
-*/
-
-PUBLIC void colamd_set_defaults
-(
-    /* === Parameters ======================================================= */
-
-    double knobs [COLAMD_KNOBS]		/* knob array */
-)
-{
-    /* === Local variables ================================================== */
-
-    int i ;
-
-    if (!knobs)
-    {
-	return ;			/* no knobs to initialize */
-    }
-    for (i = 0 ; i < COLAMD_KNOBS ; i++)
-    {
-	knobs [i] = 0 ;
-    }
-    knobs [COLAMD_DENSE_ROW] = 0.5 ;	/* ignore rows over 50% dense */
-    knobs [COLAMD_DENSE_COL] = 0.5 ;	/* ignore columns over 50% dense */
-}
-
-
-/* ========================================================================== */
-/* === colamd =============================================================== */
-/* ========================================================================== */
-
-/*
-    The colamd routine computes a column ordering Q of a sparse matrix
-    A such that the LU factorization P(AQ) = LU remains sparse, where P is
-    selected via partial pivoting.   The routine can also be viewed as
-    providing a permutation Q such that the Cholesky factorization
-    (AQ)'(AQ) = LL' remains sparse.
-
-    On input, the nonzero patterns of the columns of A are stored in the
-    array A, in order 0 to n_col-1.  A is held in 0-based form (rows in the
-    range 0 to n_row-1 and columns in the range 0 to n_col-1).  Row indices
-    for column c are located in A [(p [c]) ... (p [c+1]-1)], where p [0] = 0,
-    and thus p [n_col] is the number of entries in A.  The matrix is
-    destroyed on output.  The row indices within each column do not have to
-    be sorted (from small to large row indices), and duplicate row indices
-    may be present.  However, colamd will work a little faster if columns are
-    sorted and no duplicates are present.  Matlab 5.2 always passes the matrix
-    with sorted columns, and no duplicates.
-
-    The integer array A is of size Alen.  Alen must be at least of size
-    (where nnz is the number of entries in A):
-
-	nnz			for the input column form of A
-	+ nnz			for a row form of A that colamd generates
-	+ 6*(n_col+1)		for a ColInfo Col [0..n_col] array
-				(this assumes sizeof (ColInfo) is 6 int's).
-	+ 4*(n_row+1)		for a RowInfo Row [0..n_row] array
-				(this assumes sizeof (RowInfo) is 4 int's).
-	+ elbow_room		must be at least n_col.  We recommend at least
-				nnz/5 in addition to that.  If sufficient,
-				changes in the elbow room affect the ordering
-				time only, not the ordering itself.
-	+ COLAMD_STATS		for the output statistics
-
-    Colamd returns FALSE is memory is insufficient, or TRUE otherwise.
-
-    On input, the caller must specify:
-
-	n_row			the number of rows of A
-	n_col			the number of columns of A
-	Alen			the size of the array A
-	A [0 ... nnz-1]		the row indices, where nnz = p [n_col]
-	A [nnz ... Alen-1]	(need not be initialized by the user)
-	p [0 ... n_col]		the column pointers,  p [0] = 0, and p [n_col]
-				is the number of entries in A.  Column c of A
-				is stored in A [p [c] ... p [c+1]-1].
-	knobs [0 ... 19]	a set of parameters that control the behavior
-				of colamd.  If knobs is a NULL pointer the
-				defaults are used.  The user-callable
-				colamd_set_defaults routine sets the default
-				parameters.  See that routine for a description
-				of the user-controllable parameters.
-
-    If the return value of Colamd is TRUE, then on output:
-
-	p [0 ... n_col-1]	the column permutation. p [0] is the first
-				column index, and p [n_col-1] is the last.
-				That is, p [k] = j means that column j of A
-				is the kth column of AQ.
-
-	A			is undefined on output (the matrix pattern is
-				destroyed), except for the following statistics:
-
-	A [0]			the number of dense (or empty) rows ignored
-	A [1]			the number of dense (or empty) columms.  These
-				are ordered last, in their natural order.
-	A [2]			the number of garbage collections performed.
-				If this is excessive, then you would have
-				gotten your results faster if Alen was larger.
-	A [3]			0, if all row indices in each column were in
-				sorted order and no duplicates were present.
-				1, if there were unsorted or duplicate row
-				indices in the input.  You would have gotten
-				your results faster if A [3] was returned as 0.
-
-    If the return value of Colamd is FALSE, then A and p are undefined on
-    output.
-*/
-
-PUBLIC int colamd		/* returns TRUE if successful */
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,			/* number of rows in A */
-    int n_col,			/* number of columns in A */
-    int Alen,			/* length of A */
-    int A [],			/* row indices of A */
-    int p [],			/* pointers to columns in A */
-    double knobs [COLAMD_KNOBS]	/* parameters (uses defaults if NULL) */
-)
-{
-    /* === Local variables ================================================== */
-
-    int i ;			/* loop index */
-    int nnz ;			/* nonzeros in A */
-    int Row_size ;		/* size of Row [], in integers */
-    int Col_size ;		/* size of Col [], in integers */
-    int elbow_room ;		/* remaining free space */
-    RowInfo *Row ;		/* pointer into A of Row [0..n_row] array */
-    ColInfo *Col ;		/* pointer into A of Col [0..n_col] array */
-    int n_col2 ;		/* number of non-dense, non-empty columns */
-    int n_row2 ;		/* number of non-dense, non-empty rows */
-    int ngarbage ;		/* number of garbage collections performed */
-    int max_deg ;		/* maximum row degree */
-    double default_knobs [COLAMD_KNOBS] ;	/* default knobs knobs array */
-    int init_result ;		/* return code from initialization */
-
-#ifndef NDEBUG
-    debug_colamd = 0 ;		/* no debug printing */
-    /* get "D" environment variable, which gives the debug printing level */
-    if (getenv ("D")) debug_colamd = atoi (getenv ("D")) ;
-    DEBUG0 (("debug version, D = %d (THIS WILL BE SLOOOOW!)\n", debug_colamd)) ;
-#endif
-
-    /* === Check the input arguments ======================================== */
-
-    if (n_row < 0 || n_col < 0 || !A || !p)
-    {
-	/* n_row and n_col must be non-negative, A and p must be present */
-	DEBUG0 (("colamd error! %d %d %d\n", n_row, n_col, Alen)) ;
-	return (FALSE) ;
-    }
-    nnz = p [n_col] ;
-    if (nnz < 0 || p [0] != 0)
-    {
-	/* nnz must be non-negative, and p [0] must be zero */
-	DEBUG0 (("colamd error! %d %d\n", nnz, p [0])) ;
-	return (FALSE) ;
-    }
-
-    /* === If no knobs, set default parameters ============================== */
-
-    if (!knobs)
-    {
-	knobs = default_knobs ;
-	colamd_set_defaults (knobs) ;
-    }
-
-    /* === Allocate the Row and Col arrays from array A ===================== */
-
-    Col_size = (n_col + 1) * sizeof (ColInfo) / sizeof (int) ;
-    Row_size = (n_row + 1) * sizeof (RowInfo) / sizeof (int) ;
-    elbow_room = Alen - (2*nnz + Col_size + Row_size) ;
-    if (elbow_room < n_col + COLAMD_STATS)
-    {
-	/* not enough space in array A to perform the ordering */
-	DEBUG0 (("colamd error! elbow_room %d, %d\n", elbow_room,n_col)) ;
-	return (FALSE) ;
-    }
-    Alen = 2*nnz + elbow_room ;
-    Col  = (ColInfo *) &A [Alen] ;
-    Row  = (RowInfo *) &A [Alen + Col_size] ;
-
-    /* === Construct the row and column data structures ===================== */
-
-    init_result = init_rows_cols (n_row, n_col, Row, Col, A, p) ;
-    if (init_result == -1)
-    {
-	/* input matrix is invalid */
-	DEBUG0 (("colamd error! matrix invalid\n")) ;
-	return (FALSE) ;
-    }
-
-    /* === Initialize scores, kill dense rows/columns ======================= */
-
-    init_scoring (n_row, n_col, Row, Col, A, p, knobs,
-	&n_row2, &n_col2, &max_deg) ;
-
-    /* === Order the supercolumns =========================================== */
-
-    ngarbage = find_ordering (n_row, n_col, Alen, Row, Col, A, p,
-	n_col2, max_deg, 2*nnz) ;
-
-    /* === Order the non-principal columns ================================== */
-
-    order_children (n_col, Col, p) ;
-
-    /* === Return statistics in A =========================================== */
-
-    for (i = 0 ; i < COLAMD_STATS ; i++)
-    {
-	A [i] = 0 ;
-    }
-    A [COLAMD_DENSE_ROW] = n_row - n_row2 ;
-    A [COLAMD_DENSE_COL] = n_col - n_col2 ;
-    A [COLAMD_DEFRAG_COUNT] = ngarbage ;
-    A [COLAMD_JUMBLED_COLS] = init_result ;
-
-    return (TRUE) ;
-}
-
-
-/* ========================================================================== */
-/* === NON-USER-CALLABLE ROUTINES: ========================================== */
-/* ========================================================================== */
-
-/* There are no user-callable routines beyond this point in the file */
-
-
-/* ========================================================================== */
-/* === init_rows_cols ======================================================= */
-/* ========================================================================== */
-
-/*
-    Takes the column form of the matrix in A and creates the row form of the
-    matrix.  Also, row and column attributes are stored in the Col and Row
-    structs.  If the columns are un-sorted or contain duplicate row indices,
-    this routine will also sort and remove duplicate row indices from the
-    column form of the matrix.  Returns -1 on error, 1 if columns jumbled,
-    or 0 if columns not jumbled.  Not user-callable.
-*/
-
-PRIVATE int init_rows_cols	/* returns status code */
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,			/* number of rows of A */
-    int n_col,			/* number of columns of A */
-    RowInfo Row [],		/* of size n_row+1 */
-    ColInfo Col [],		/* of size n_col+1 */
-    int A [],			/* row indices of A, of size Alen */
-    int p []			/* pointers to columns in A, of size n_col+1 */
-)
-{
-    /* === Local variables ================================================== */
-
-    int col ;			/* a column index */
-    int row ;			/* a row index */
-    int *cp ;			/* a column pointer */
-    int *cp_end ;		/* a pointer to the end of a column */
-    int *rp ;			/* a row pointer */
-    int *rp_end ;		/* a pointer to the end of a row */
-    int last_start ;		/* start index of previous column in A */
-    int start ;			/* start index of column in A */
-    int last_row ;		/* previous row */
-    int jumbled_columns ;	/* indicates if columns are jumbled */
-
-    /* === Initialize columns, and check column pointers ==================== */
-
-    last_start = 0 ;
-    for (col = 0 ; col < n_col ; col++)
-    {
-	start = p [col] ;
-	if (start < last_start)
-	{
-	    /* column pointers must be non-decreasing */
-	    DEBUG0 (("colamd error!  last p %d p [col] %d\n",last_start,start));
-	    return (-1) ;
-	}
-	Col [col].start = start ;
-	Col [col].length = p [col+1] - start ;
-	Col [col].shared1.thickness = 1 ;
-	Col [col].shared2.score = 0 ;
-	Col [col].shared3.prev = EMPTY ;
-	Col [col].shared4.degree_next = EMPTY ;
-	last_start = start ;
-    }
-    /* must check the end pointer for last column */
-    if (p [n_col] < last_start)
-    {
-	/* column pointers must be non-decreasing */
-	DEBUG0 (("colamd error!  last p %d p [n_col] %d\n",p[col],last_start)) ;
-	return (-1) ;
-    }
-
-    /* p [0..n_col] no longer needed, used as "head" in subsequent routines */
-
-    /* === Scan columns, compute row degrees, and check row indices ========= */
-
-    jumbled_columns = FALSE ;
-
-    for (row = 0 ; row < n_row ; row++)
-    {
-	Row [row].length = 0 ;
-	Row [row].shared2.mark = -1 ;
-    }
-
-    for (col = 0 ; col < n_col ; col++)
-    {
-	last_row = -1 ;
-
-	cp = &A [p [col]] ;
-	cp_end = &A [p [col+1]] ;
-
-	while (cp < cp_end)
-	{
-	    row = *cp++ ;
-
-	    /* make sure row indices within range */
-	    if (row < 0 || row >= n_row)
-	    {
-		DEBUG0 (("colamd error!  col %d row %d last_row %d\n",
-			 col, row, last_row)) ;
-		return (-1) ;
-	    }
-	    else if (row <= last_row)
-	    {
-		/* row indices are not sorted or repeated, thus cols */
-		/* are jumbled */
-		jumbled_columns = TRUE ;
-	    }
-	    /* prevent repeated row from being counted */
-	    if (Row [row].shared2.mark != col)
-	    {
-		Row [row].length++ ;
-		Row [row].shared2.mark = col ;
-		last_row = row ;
-	    }
-	    else
-	    {
-		/* this is a repeated entry in the column, */
-		/* it will be removed */
-		Col [col].length-- ;
-	    }
-	}
-    }
-
-    /* === Compute row pointers ============================================= */
-
-    /* row form of the matrix starts directly after the column */
-    /* form of matrix in A */
-    Row [0].start = p [n_col] ;
-    Row [0].shared1.p = Row [0].start ;
-    Row [0].shared2.mark = -1 ;
-    for (row = 1 ; row < n_row ; row++)
-    {
-	Row [row].start = Row [row-1].start + Row [row-1].length ;
-	Row [row].shared1.p = Row [row].start ;
-	Row [row].shared2.mark = -1 ;
-    }
-
-    /* === Create row form ================================================== */
-
-    if (jumbled_columns)
-    {
-	/* if cols jumbled, watch for repeated row indices */
-	for (col = 0 ; col < n_col ; col++)
-	{
-	    cp = &A [p [col]] ;
-	    cp_end = &A [p [col+1]] ;
-	    while (cp < cp_end)
-	    {
-		row = *cp++ ;
-		if (Row [row].shared2.mark != col)
-		{
-		    A [(Row [row].shared1.p)++] = col ;
-		    Row [row].shared2.mark = col ;
-		}
-	    }
-	}
-    }
-    else
-    {
-	/* if cols not jumbled, we don't need the mark (this is faster) */
-	for (col = 0 ; col < n_col ; col++)
-	{
-	    cp = &A [p [col]] ;
-	    cp_end = &A [p [col+1]] ;
-	    while (cp < cp_end)
-	    {
-		A [(Row [*cp++].shared1.p)++] = col ;
-	    }
-	}
-    }
-
-    /* === Clear the row marks and set row degrees ========================== */
-
-    for (row = 0 ; row < n_row ; row++)
-    {
-	Row [row].shared2.mark = 0 ;
-	Row [row].shared1.degree = Row [row].length ;
-    }
-
-    /* === See if we need to re-create columns ============================== */
-
-    if (jumbled_columns)
-    {
-
-#ifndef NDEBUG
-	/* make sure column lengths are correct */
-	for (col = 0 ; col < n_col ; col++)
-	{
-	    p [col] = Col [col].length ;
-	}
-	for (row = 0 ; row < n_row ; row++)
-	{
-	    rp = &A [Row [row].start] ;
-	    rp_end = rp + Row [row].length ;
-	    while (rp < rp_end)
-	    {
-		p [*rp++]-- ;
-	    }
-	}
-	for (col = 0 ; col < n_col ; col++)
-	{
-	    assert (p [col] == 0) ;
-	}
-	/* now p is all zero (different than when debugging is turned off) */
-#endif
-
-	/* === Compute col pointers ========================================= */
-
-	/* col form of the matrix starts at A [0]. */
-	/* Note, we may have a gap between the col form and the row */
-	/* form if there were duplicate entries, if so, it will be */
-	/* removed upon the first garbage collection */
-	Col [0].start = 0 ;
-	p [0] = Col [0].start ;
-	for (col = 1 ; col < n_col ; col++)
-	{
-	    /* note that the lengths here are for pruned columns, i.e. */
-	    /* no duplicate row indices will exist for these columns */
-	    Col [col].start = Col [col-1].start + Col [col-1].length ;
-	    p [col] = Col [col].start ;
-	}
-
-	/* === Re-create col form =========================================== */
-
-	for (row = 0 ; row < n_row ; row++)
-	{
-	    rp = &A [Row [row].start] ;
-	    rp_end = rp + Row [row].length ;
-	    while (rp < rp_end)
-	    {
-		A [(p [*rp++])++] = row ;
-	    }
-	}
-	return (1) ;
-    }
-    else
-    {
-	/* no columns jumbled (this is faster) */
-	return (0) ;
-    }
-}
-
-
-/* ========================================================================== */
-/* === init_scoring ========================================================= */
-/* ========================================================================== */
-
-/*
-    Kills dense or empty columns and rows, calculates an initial score for
-    each column, and places all columns in the degree lists.  Not user-callable.
-*/
-
-PRIVATE void init_scoring
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,			/* number of rows of A */
-    int n_col,			/* number of columns of A */
-    RowInfo Row [],		/* of size n_row+1 */
-    ColInfo Col [],		/* of size n_col+1 */
-    int A [],			/* column form and row form of A */
-    int head [],		/* of size n_col+1 */
-    double knobs [COLAMD_KNOBS],/* parameters */
-    int *p_n_row2,		/* number of non-dense, non-empty rows */
-    int *p_n_col2,		/* number of non-dense, non-empty columns */
-    int *p_max_deg		/* maximum row degree */
-)
-{
-    /* === Local variables ================================================== */
-
-    int c ;			/* a column index */
-    int r, row ;		/* a row index */
-    int *cp ;			/* a column pointer */
-    int deg ;			/* degree (# entries) of a row or column */
-    int *cp_end ;		/* a pointer to the end of a column */
-    int *new_cp ;		/* new column pointer */
-    int col_length ;		/* length of pruned column */
-    int score ;			/* current column score */
-    int n_col2 ;		/* number of non-dense, non-empty columns */
-    int n_row2 ;		/* number of non-dense, non-empty rows */
-    int dense_row_count ;	/* remove rows with more entries than this */
-    int dense_col_count ;	/* remove cols with more entries than this */
-    int min_score ;		/* smallest column score */
-    int max_deg ;		/* maximum row degree */
-    int next_col ;		/* Used to add to degree list.*/
-#ifndef NDEBUG
-    int debug_count ;		/* debug only. */
-#endif
-
-    /* === Extract knobs ==================================================== */
-
-    dense_row_count = MAX (0, MIN (knobs [COLAMD_DENSE_ROW] * n_col, n_col)) ;
-    dense_col_count = MAX (0, MIN (knobs [COLAMD_DENSE_COL] * n_row, n_row)) ;
-    DEBUG0 (("densecount: %d %d\n", dense_row_count, dense_col_count)) ;
-    max_deg = 0 ;
-    n_col2 = n_col ;
-    n_row2 = n_row ;
-
-    /* === Kill empty columns =============================================== */
-
-    /* Put the empty columns at the end in their natural, so that LU */
-    /* factorization can proceed as far as possible. */
-    for (c = n_col-1 ; c >= 0 ; c--)
-    {
-	deg = Col [c].length ;
-	if (deg == 0)
-	{
-	    /* this is a empty column, kill and order it last */
-	    Col [c].shared2.order = --n_col2 ;
-	    KILL_PRINCIPAL_COL (c) ;
-	}
-    }
-    DEBUG0 (("null columns killed: %d\n", n_col - n_col2)) ;
-
-    /* === Kill dense columns =============================================== */
-
-    /* Put the dense columns at the end, in their natural order */
-    for (c = n_col-1 ; c >= 0 ; c--)
-    {
-	/* skip any dead columns */
-	if (COL_IS_DEAD (c))
-	{
-	    continue ;
-	}
-	deg = Col [c].length ;
-	if (deg > dense_col_count)
-	{
-	    /* this is a dense column, kill and order it last */
-	    Col [c].shared2.order = --n_col2 ;
-	    /* decrement the row degrees */
-	    cp = &A [Col [c].start] ;
-	    cp_end = cp + Col [c].length ;
-	    while (cp < cp_end)
-	    {
-		Row [*cp++].shared1.degree-- ;
-	    }
-	    KILL_PRINCIPAL_COL (c) ;
-	}
-    }
-    DEBUG0 (("Dense and null columns killed: %d\n", n_col - n_col2)) ;
-
-    /* === Kill dense and empty rows ======================================== */
-
-    for (r = 0 ; r < n_row ; r++)
-    {
-	deg = Row [r].shared1.degree ;
-	assert (deg >= 0 && deg <= n_col) ;
-	if (deg > dense_row_count || deg == 0)
-	{
-	    /* kill a dense or empty row */
-	    KILL_ROW (r) ;
-	    --n_row2 ;
-	}
-	else
-	{
-	    /* keep track of max degree of remaining rows */
-	    max_deg = MAX (max_deg, deg) ;
-	}
-    }
-    DEBUG0 (("Dense and null rows killed: %d\n", n_row - n_row2)) ;
-
-    /* === Compute initial column scores ==================================== */
-
-    /* At this point the row degrees are accurate.  They reflect the number */
-    /* of "live" (non-dense) columns in each row.  No empty rows exist. */
-    /* Some "live" columns may contain only dead rows, however.  These are */
-    /* pruned in the code below. */
-
-    /* now find the initial matlab score for each column */
-    for (c = n_col-1 ; c >= 0 ; c--)
-    {
-	/* skip dead column */
-	if (COL_IS_DEAD (c))
-	{
-	    continue ;
-	}
-	score = 0 ;
-	cp = &A [Col [c].start] ;
-	new_cp = cp ;
-	cp_end = cp + Col [c].length ;
-	while (cp < cp_end)
-	{
-	    /* get a row */
-	    row = *cp++ ;
-	    /* skip if dead */
-	    if (ROW_IS_DEAD (row))
-	    {
-		continue ;
-	    }
-	    /* compact the column */
-	    *new_cp++ = row ;
-	    /* add row's external degree */
-	    score += Row [row].shared1.degree - 1 ;
-	    /* guard against integer overflow */
-	    score = MIN (score, n_col) ;
-	}
-	/* determine pruned column length */
-	col_length = (int) (new_cp - &A [Col [c].start]) ;
-	if (col_length == 0)
-	{
-	    /* a newly-made null column (all rows in this col are "dense" */
-	    /* and have already been killed) */
-	    DEBUG0 (("Newly null killed: %d\n", c)) ;
-	    Col [c].shared2.order = --n_col2 ;
-	    KILL_PRINCIPAL_COL (c) ;
-	}
-	else
-	{
-	    /* set column length and set score */
-	    assert (score >= 0) ;
-	    assert (score <= n_col) ;
-	    Col [c].length = col_length ;
-	    Col [c].shared2.score = score ;
-	}
-    }
-    DEBUG0 (("Dense, null, and newly-null columns killed: %d\n",n_col-n_col2)) ;
-
-    /* At this point, all empty rows and columns are dead.  All live columns */
-    /* are "clean" (containing no dead rows) and simplicial (no supercolumns */
-    /* yet).  Rows may contain dead columns, but all live rows contain at */
-    /* least one live column. */
-
-#ifndef NDEBUG
-    debug_structures (n_row, n_col, Row, Col, A, n_col2) ;
-#endif
-
-    /* === Initialize degree lists ========================================== */
-
-#ifndef NDEBUG
-    debug_count = 0 ;
-#endif
-
-    /* clear the hash buckets */
-    for (c = 0 ; c <= n_col ; c++)
-    {
-	head [c] = EMPTY ;
-    }
-    min_score = n_col ;
-    /* place in reverse order, so low column indices are at the front */
-    /* of the lists.  This is to encourage natural tie-breaking */
-    for (c = n_col-1 ; c >= 0 ; c--)
-    {
-	/* only add principal columns to degree lists */
-	if (COL_IS_ALIVE (c))
-	{
-	    DEBUG4 (("place %d score %d minscore %d ncol %d\n",
-		c, Col [c].shared2.score, min_score, n_col)) ;
-
-	    /* === Add columns score to DList =============================== */
-
-	    score = Col [c].shared2.score ;
-
-	    assert (min_score >= 0) ;
-	    assert (min_score <= n_col) ;
-	    assert (score >= 0) ;
-	    assert (score <= n_col) ;
-	    assert (head [score] >= EMPTY) ;
-
-	    /* now add this column to dList at proper score location */
-	    next_col = head [score] ;
-	    Col [c].shared3.prev = EMPTY ;
-	    Col [c].shared4.degree_next = next_col ;
-
-	    /* if there already was a column with the same score, set its */
-	    /* previous pointer to this new column */
-	    if (next_col != EMPTY)
-	    {
-		Col [next_col].shared3.prev = c ;
-	    }
-	    head [score] = c ;
-
-	    /* see if this score is less than current min */
-	    min_score = MIN (min_score, score) ;
-
-#ifndef NDEBUG
-	    debug_count++ ;
-#endif
-	}
-    }
-
-#ifndef NDEBUG
-    DEBUG0 (("Live cols %d out of %d, non-princ: %d\n",
-	debug_count, n_col, n_col-debug_count)) ;
-    assert (debug_count == n_col2) ;
-    debug_deg_lists (n_row, n_col, Row, Col, head, min_score, n_col2, max_deg) ;
-#endif
-
-    /* === Return number of remaining columns, and max row degree =========== */
-
-    *p_n_col2 = n_col2 ;
-    *p_n_row2 = n_row2 ;
-    *p_max_deg = max_deg ;
-}
-
-
-/* ========================================================================== */
-/* === find_ordering ======================================================== */
-/* ========================================================================== */
-
-/*
-    Order the principal columns of the supercolumn form of the matrix
-    (no supercolumns on input).  Uses a minimum approximate column minimum
-    degree ordering method.  Not user-callable.
-*/
-
-PRIVATE int find_ordering	/* return the number of garbage collections */
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,			/* number of rows of A */
-    int n_col,			/* number of columns of A */
-    int Alen,			/* size of A, 2*nnz + elbow_room or larger */
-    RowInfo Row [],		/* of size n_row+1 */
-    ColInfo Col [],		/* of size n_col+1 */
-    int A [],			/* column form and row form of A */
-    int head [],		/* of size n_col+1 */
-    int n_col2,			/* Remaining columns to order */
-    int max_deg,		/* Maximum row degree */
-    int pfree			/* index of first free slot (2*nnz on entry) */
-)
-{
-    /* === Local variables ================================================== */
-
-    int k ;			/* current pivot ordering step */
-    int pivot_col ;		/* current pivot column */
-    int *cp ;			/* a column pointer */
-    int *rp ;			/* a row pointer */
-    int pivot_row ;		/* current pivot row */
-    int *new_cp ;		/* modified column pointer */
-    int *new_rp ;		/* modified row pointer */
-    int pivot_row_start ;	/* pointer to start of pivot row */
-    int pivot_row_degree ;	/* # of columns in pivot row */
-    int pivot_row_length ;	/* # of supercolumns in pivot row */
-    int pivot_col_score ;	/* score of pivot column */
-    int needed_memory ;		/* free space needed for pivot row */
-    int *cp_end ;		/* pointer to the end of a column */
-    int *rp_end ;		/* pointer to the end of a row */
-    int row ;			/* a row index */
-    int col ;			/* a column index */
-    int max_score ;		/* maximum possible score */
-    int cur_score ;		/* score of current column */
-    unsigned int hash ;		/* hash value for supernode detection */
-    int head_column ;		/* head of hash bucket */
-    int first_col ;		/* first column in hash bucket */
-    int tag_mark ;		/* marker value for mark array */
-    int row_mark ;		/* Row [row].shared2.mark */
-    int set_difference ;	/* set difference size of row with pivot row */
-    int min_score ;		/* smallest column score */
-    int col_thickness ;		/* "thickness" (# of columns in a supercol) */
-    int max_mark ;		/* maximum value of tag_mark */
-    int pivot_col_thickness ;	/* number of columns represented by pivot col */
-    int prev_col ;		/* Used by Dlist operations. */
-    int next_col ;		/* Used by Dlist operations. */
-    int ngarbage ;		/* number of garbage collections performed */
-#ifndef NDEBUG
-    int debug_d ;		/* debug loop counter */
-    int debug_step = 0 ;	/* debug loop counter */
-#endif
-
-    /* === Initialization and clear mark ==================================== */
-
-    max_mark = INT_MAX - n_col ;	/* INT_MAX defined in <limits.h> */
-    tag_mark = clear_mark (n_row, Row) ;
-    min_score = 0 ;
-    ngarbage = 0 ;
-    DEBUG0 (("Ordering.. n_col2=%d\n", n_col2)) ;
-
-    /* === Order the columns ================================================ */
-
-    for (k = 0 ; k < n_col2 ; /* 'k' is incremented below */)
-    {
-
-#ifndef NDEBUG
-	if (debug_step % 100 == 0)
-	{
-	    DEBUG0 (("\n...       Step k: %d out of n_col2: %d\n", k, n_col2)) ;
-	}
-	else
-	{
-	    DEBUG1 (("\n----------Step k: %d out of n_col2: %d\n", k, n_col2)) ;
-	}
-	debug_step++ ;
-	debug_deg_lists (n_row, n_col, Row, Col, head,
-		min_score, n_col2-k, max_deg) ;
-	debug_matrix (n_row, n_col, Row, Col, A) ;
-#endif
-
-	/* === Select pivot column, and order it ============================ */
-
-	/* make sure degree list isn't empty */
-	assert (min_score >= 0) ;
-	assert (min_score <= n_col) ;
-	assert (head [min_score] >= EMPTY) ;
-
-#ifndef NDEBUG
-	for (debug_d = 0 ; debug_d < min_score ; debug_d++)
-	{
-	    assert (head [debug_d] == EMPTY) ;
-	}
-#endif
-
-	/* get pivot column from head of minimum degree list */
-	while (head [min_score] == EMPTY && min_score < n_col)
-	{
-	    min_score++ ;
-	}
-	pivot_col = head [min_score] ;
-	assert (pivot_col >= 0 && pivot_col <= n_col) ;
-	next_col = Col [pivot_col].shared4.degree_next ;
-	head [min_score] = next_col ;
-	if (next_col != EMPTY)
-	{
-	    Col [next_col].shared3.prev = EMPTY ;
-	}
-
-	assert (COL_IS_ALIVE (pivot_col)) ;
-	DEBUG3 (("Pivot col: %d\n", pivot_col)) ;
-
-	/* remember score for defrag check */
-	pivot_col_score = Col [pivot_col].shared2.score ;
-
-	/* the pivot column is the kth column in the pivot order */
-	Col [pivot_col].shared2.order = k ;
-
-	/* increment order count by column thickness */
-	pivot_col_thickness = Col [pivot_col].shared1.thickness ;
-	k += pivot_col_thickness ;
-	assert (pivot_col_thickness > 0) ;
-
-	/* === Garbage_collection, if necessary ============================= */
-
-	needed_memory = MIN (pivot_col_score, n_col - k) ;
-	if (pfree + needed_memory >= Alen)
-	{
-	    pfree = garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ;
-	    ngarbage++ ;
-	    /* after garbage collection we will have enough */
-	    assert (pfree + needed_memory < Alen) ;
-	    /* garbage collection has wiped out the Row[].shared2.mark array */
-	    tag_mark = clear_mark (n_row, Row) ;
-#ifndef NDEBUG
-	    debug_matrix (n_row, n_col, Row, Col, A) ;
-#endif
-	}
-
-	/* === Compute pivot row pattern ==================================== */
-
-	/* get starting location for this new merged row */
-	pivot_row_start = pfree ;
-
-	/* initialize new row counts to zero */
-	pivot_row_degree = 0 ;
-
-	/* tag pivot column as having been visited so it isn't included */
-	/* in merged pivot row */
-	Col [pivot_col].shared1.thickness = -pivot_col_thickness ;
-
-	/* pivot row is the union of all rows in the pivot column pattern */
-	cp = &A [Col [pivot_col].start] ;
-	cp_end = cp + Col [pivot_col].length ;
-	while (cp < cp_end)
-	{
-	    /* get a row */
-	    row = *cp++ ;
-	    DEBUG4 (("Pivot col pattern %d %d\n", ROW_IS_ALIVE (row), row)) ;
-	    /* skip if row is dead */
-	    if (ROW_IS_DEAD (row))
-	    {
-		continue ;
-	    }
-	    rp = &A [Row [row].start] ;
-	    rp_end = rp + Row [row].length ;
-	    while (rp < rp_end)
-	    {
-		/* get a column */
-		col = *rp++ ;
-		/* add the column, if alive and untagged */
-		col_thickness = Col [col].shared1.thickness ;
-		if (col_thickness > 0 && COL_IS_ALIVE (col))
-		{
-		    /* tag column in pivot row */
-		    Col [col].shared1.thickness = -col_thickness ;
-		    assert (pfree < Alen) ;
-		    /* place column in pivot row */
-		    A [pfree++] = col ;
-		    pivot_row_degree += col_thickness ;
-		}
-	    }
-	}
-
-	/* clear tag on pivot column */
-	Col [pivot_col].shared1.thickness = pivot_col_thickness ;
-	max_deg = MAX (max_deg, pivot_row_degree) ;
-
-#ifndef NDEBUG
-	DEBUG3 (("check2\n")) ;
-	debug_mark (n_row, Row, tag_mark, max_mark) ;
-#endif
-
-	/* === Kill all rows used to construct pivot row ==================== */
-
-	/* also kill pivot row, temporarily */
-	cp = &A [Col [pivot_col].start] ;
-	cp_end = cp + Col [pivot_col].length ;
-	while (cp < cp_end)
-	{
-	    /* may be killing an already dead row */
-	    row = *cp++ ;
-	    DEBUG2 (("Kill row in pivot col: %d\n", row)) ;
-	    KILL_ROW (row) ;
-	}
-
-	/* === Select a row index to use as the new pivot row =============== */
-
-	pivot_row_length = pfree - pivot_row_start ;
-	if (pivot_row_length > 0)
-	{
-	    /* pick the "pivot" row arbitrarily (first row in col) */
-	    pivot_row = A [Col [pivot_col].start] ;
-	    DEBUG2 (("Pivotal row is %d\n", pivot_row)) ;
-	}
-	else
-	{
-	    /* there is no pivot row, since it is of zero length */
-	    pivot_row = EMPTY ;
-	    assert (pivot_row_length == 0) ;
-	}
-	assert (Col [pivot_col].length > 0 || pivot_row_length == 0) ;
-
-	/* === Approximate degree computation =============================== */
-
-	/* Here begins the computation of the approximate degree.  The column */
-	/* score is the sum of the pivot row "length", plus the size of the */
-	/* set differences of each row in the column minus the pattern of the */
-	/* pivot row itself.  The column ("thickness") itself is also */
-	/* excluded from the column score (we thus use an approximate */
-	/* external degree). */
-
-	/* The time taken by the following code (compute set differences, and */
-	/* add them up) is proportional to the size of the data structure */
-	/* being scanned - that is, the sum of the sizes of each column in */
-	/* the pivot row.  Thus, the amortized time to compute a column score */
-	/* is proportional to the size of that column (where size, in this */
-	/* context, is the column "length", or the number of row indices */
-	/* in that column).  The number of row indices in a column is */
-	/* monotonically non-decreasing, from the length of the original */
-	/* column on input to colamd. */
-
-	/* === Compute set differences ====================================== */
-
-	DEBUG1 (("** Computing set differences phase. **\n")) ;
-
-	/* pivot row is currently dead - it will be revived later. */
-
-	DEBUG2 (("Pivot row: ")) ;
-	/* for each column in pivot row */
-	rp = &A [pivot_row_start] ;
-	rp_end = rp + pivot_row_length ;
-	while (rp < rp_end)
-	{
-	    col = *rp++ ;
-	    assert (COL_IS_ALIVE (col) && col != pivot_col) ;
-	    DEBUG2 (("Col: %d\n", col)) ;
-
-	    /* clear tags used to construct pivot row pattern */
-	    col_thickness = -Col [col].shared1.thickness ;
-	    assert (col_thickness > 0) ;
-	    Col [col].shared1.thickness = col_thickness ;
-
-	    /* === Remove column from degree list =========================== */
-
-	    cur_score = Col [col].shared2.score ;
-	    prev_col = Col [col].shared3.prev ;
-	    next_col = Col [col].shared4.degree_next ;
-	    assert (cur_score >= 0) ;
-	    assert (cur_score <= n_col) ;
-	    assert (cur_score >= EMPTY) ;
-	    if (prev_col == EMPTY)
-	    {
-		head [cur_score] = next_col ;
-	    }
-	    else
-	    {
-		Col [prev_col].shared4.degree_next = next_col ;
-	    }
-	    if (next_col != EMPTY)
-	    {
-		Col [next_col].shared3.prev = prev_col ;
-	    }
-
-	    /* === Scan the column ========================================== */
-
-	    cp = &A [Col [col].start] ;
-	    cp_end = cp + Col [col].length ;
-	    while (cp < cp_end)
-	    {
-		/* get a row */
-		row = *cp++ ;
-		row_mark = Row [row].shared2.mark ;
-		/* skip if dead */
-		if (ROW_IS_MARKED_DEAD (row_mark))
-		{
-		    continue ;
-		}
-		assert (row != pivot_row) ;
-		set_difference = row_mark - tag_mark ;
-		/* check if the row has been seen yet */
-		if (set_difference < 0)
-		{
-		    assert (Row [row].shared1.degree <= max_deg) ;
-		    set_difference = Row [row].shared1.degree ;
-		}
-		/* subtract column thickness from this row's set difference */
-		set_difference -= col_thickness ;
-		assert (set_difference >= 0) ;
-		/* absorb this row if the set difference becomes zero */
-		if (set_difference == 0)
-		{
-		    DEBUG1 (("aggressive absorption. Row: %d\n", row)) ;
-		    KILL_ROW (row) ;
-		}
-		else
-		{
-		    /* save the new mark */
-		    Row [row].shared2.mark = set_difference + tag_mark ;
-		}
-	    }
-	}
-
-#ifndef NDEBUG
-	debug_deg_lists (n_row, n_col, Row, Col, head,
-		min_score, n_col2-k-pivot_row_degree, max_deg) ;
-#endif
-
-	/* === Add up set differences for each column ======================= */
-
-	DEBUG1 (("** Adding set differences phase. **\n")) ;
-
-	/* for each column in pivot row */
-	rp = &A [pivot_row_start] ;
-	rp_end = rp + pivot_row_length ;
-	while (rp < rp_end)
-	{
-	    /* get a column */
-	    col = *rp++ ;
-	    assert (COL_IS_ALIVE (col) && col != pivot_col) ;
-	    hash = 0 ;
-	    cur_score = 0 ;
-	    cp = &A [Col [col].start] ;
-	    /* compact the column */
-	    new_cp = cp ;
-	    cp_end = cp + Col [col].length ;
-
-	    DEBUG2 (("Adding set diffs for Col: %d.\n", col)) ;
-
-	    while (cp < cp_end)
-	    {
-		/* get a row */
-		row = *cp++ ;
-		assert(row >= 0 && row < n_row) ;
-		row_mark = Row [row].shared2.mark ;
-		/* skip if dead */
-		if (ROW_IS_MARKED_DEAD (row_mark))
-		{
-		    continue ;
-		}
-		assert (row_mark > tag_mark) ;
-		/* compact the column */
-		*new_cp++ = row ;
-		/* compute hash function */
-		hash += row ;
-		/* add set difference */
-		cur_score += row_mark - tag_mark ;
-		/* integer overflow... */
-		cur_score = MIN (cur_score, n_col) ;
-	    }
-
-	    /* recompute the column's length */
-	    Col [col].length = (int) (new_cp - &A [Col [col].start]) ;
-
-	    /* === Further mass elimination ================================= */
-
-	    if (Col [col].length == 0)
-	    {
-		DEBUG1 (("further mass elimination. Col: %d\n", col)) ;
-		/* nothing left but the pivot row in this column */
-		KILL_PRINCIPAL_COL (col) ;
-		pivot_row_degree -= Col [col].shared1.thickness ;
-		assert (pivot_row_degree >= 0) ;
-		/* order it */
-		Col [col].shared2.order = k ;
-		/* increment order count by column thickness */
-		k += Col [col].shared1.thickness ;
-	    }
-	    else
-	    {
-		/* === Prepare for supercolumn detection ==================== */
-
-		DEBUG2 (("Preparing supercol detection for Col: %d.\n", col)) ;
-
-		/* save score so far */
-		Col [col].shared2.score = cur_score ;
-
-		/* add column to hash table, for supercolumn detection */
-		hash %= n_col + 1 ;
-
-		DEBUG2 ((" Hash = %d, n_col = %d.\n", hash, n_col)) ;
-		assert (hash <= n_col) ;
-
-		head_column = head [hash] ;
-		if (head_column > EMPTY)
-		{
-		    /* degree list "hash" is non-empty, use prev (shared3) of */
-		    /* first column in degree list as head of hash bucket */
-		    first_col = Col [head_column].shared3.headhash ;
-		    Col [head_column].shared3.headhash = col ;
-		}
-		else
-		{
-		    /* degree list "hash" is empty, use head as hash bucket */
-		    first_col = - (head_column + 2) ;
-		    head [hash] = - (col + 2) ;
-		}
-		Col [col].shared4.hash_next = first_col ;
-
-		/* save hash function in Col [col].shared3.hash */
-		Col [col].shared3.hash = (int) hash ;
-		assert (COL_IS_ALIVE (col)) ;
-	    }
-	}
-
-	/* The approximate external column degree is now computed.  */
-
-	/* === Supercolumn detection ======================================== */
-
-	DEBUG1 (("** Supercolumn detection phase. **\n")) ;
-
-	detect_super_cols (
-#ifndef NDEBUG
-		n_col, Row,
-#endif
-		Col, A, head, pivot_row_start, pivot_row_length) ;
-
-	/* === Kill the pivotal column ====================================== */
-
-	KILL_PRINCIPAL_COL (pivot_col) ;
-
-	/* === Clear mark =================================================== */
-
-	tag_mark += (max_deg + 1) ;
-	if (tag_mark >= max_mark)
-	{
-	    DEBUG1 (("clearing tag_mark\n")) ;
-	    tag_mark = clear_mark (n_row, Row) ;
-	}
-#ifndef NDEBUG
-	DEBUG3 (("check3\n")) ;
-	debug_mark (n_row, Row, tag_mark, max_mark) ;
-#endif
-
-	/* === Finalize the new pivot row, and column scores ================ */
-
-	DEBUG1 (("** Finalize scores phase. **\n")) ;
-
-	/* for each column in pivot row */
-	rp = &A [pivot_row_start] ;
-	/* compact the pivot row */
-	new_rp = rp ;
-	rp_end = rp + pivot_row_length ;
-	while (rp < rp_end)
-	{
-	    col = *rp++ ;
-	    /* skip dead columns */
-	    if (COL_IS_DEAD (col))
-	    {
-		continue ;
-	    }
-	    *new_rp++ = col ;
-	    /* add new pivot row to column */
-	    A [Col [col].start + (Col [col].length++)] = pivot_row ;
-
-	    /* retrieve score so far and add on pivot row's degree. */
-	    /* (we wait until here for this in case the pivot */
-	    /* row's degree was reduced due to mass elimination). */
-	    cur_score = Col [col].shared2.score + pivot_row_degree ;
-
-	    /* calculate the max possible score as the number of */
-	    /* external columns minus the 'k' value minus the */
-	    /* columns thickness */
-	    max_score = n_col - k - Col [col].shared1.thickness ;
-
-	    /* make the score the external degree of the union-of-rows */
-	    cur_score -= Col [col].shared1.thickness ;
-
-	    /* make sure score is less or equal than the max score */
-	    cur_score = MIN (cur_score, max_score) ;
-	    assert (cur_score >= 0) ;
-
-	    /* store updated score */
-	    Col [col].shared2.score = cur_score ;
-
-	    /* === Place column back in degree list ========================= */
-
-	    assert (min_score >= 0) ;
-	    assert (min_score <= n_col) ;
-	    assert (cur_score >= 0) ;
-	    assert (cur_score <= n_col) ;
-	    assert (head [cur_score] >= EMPTY) ;
-	    next_col = head [cur_score] ;
-	    Col [col].shared4.degree_next = next_col ;
-	    Col [col].shared3.prev = EMPTY ;
-	    if (next_col != EMPTY)
-	    {
-		Col [next_col].shared3.prev = col ;
-	    }
-	    head [cur_score] = col ;
-
-	    /* see if this score is less than current min */
-	    min_score = MIN (min_score, cur_score) ;
-
-	}
-
-#ifndef NDEBUG
-	debug_deg_lists (n_row, n_col, Row, Col, head,
-		min_score, n_col2-k, max_deg) ;
-#endif
-
-	/* === Resurrect the new pivot row ================================== */
-
-	if (pivot_row_degree > 0)
-	{
-	    /* update pivot row length to reflect any cols that were killed */
-	    /* during super-col detection and mass elimination */
-	    Row [pivot_row].start  = pivot_row_start ;
-	    Row [pivot_row].length = (int) (new_rp - &A[pivot_row_start]) ;
-	    Row [pivot_row].shared1.degree = pivot_row_degree ;
-	    Row [pivot_row].shared2.mark = 0 ;
-	    /* pivot row is no longer dead */
-	}
-    }
-
-    /* === All principal columns have now been ordered ====================== */
-
-    return (ngarbage) ;
-}
-
-
-/* ========================================================================== */
-/* === order_children ======================================================= */
-/* ========================================================================== */
-
-/*
-    The find_ordering routine has ordered all of the principal columns (the
-    representatives of the supercolumns).  The non-principal columns have not
-    yet been ordered.  This routine orders those columns by walking up the
-    parent tree (a column is a child of the column which absorbed it).  The
-    final permutation vector is then placed in p [0 ... n_col-1], with p [0]
-    being the first column, and p [n_col-1] being the last.  It doesn't look
-    like it at first glance, but be assured that this routine takes time linear
-    in the number of columns.  Although not immediately obvious, the time
-    taken by this routine is O (n_col), that is, linear in the number of
-    columns.  Not user-callable.
-*/
-
-PRIVATE void order_children
-(
-    /* === Parameters ======================================================= */
-
-    int n_col,			/* number of columns of A */
-    ColInfo Col [],		/* of size n_col+1 */
-    int p []			/* p [0 ... n_col-1] is the column permutation*/
-)
-{
-    /* === Local variables ================================================== */
-
-    int i ;			/* loop counter for all columns */
-    int c ;			/* column index */
-    int parent ;		/* index of column's parent */
-    int order ;			/* column's order */
-
-    /* === Order each non-principal column ================================== */
-
-    for (i = 0 ; i < n_col ; i++)
-    {
-	/* find an un-ordered non-principal column */
-	assert (COL_IS_DEAD (i)) ;
-	if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == EMPTY)
-	{
-	    parent = i ;
-	    /* once found, find its principal parent */
-	    do
-	    {
-		parent = Col [parent].shared1.parent ;
-	    } while (!COL_IS_DEAD_PRINCIPAL (parent)) ;
-
-	    /* now, order all un-ordered non-principal columns along path */
-	    /* to this parent.  collapse tree at the same time */
-	    c = i ;
-	    /* get order of parent */
-	    order = Col [parent].shared2.order ;
-
-	    do
-	    {
-		assert (Col [c].shared2.order == EMPTY) ;
-
-		/* order this column */
-		Col [c].shared2.order = order++ ;
-		/* collaps tree */
-		Col [c].shared1.parent = parent ;
-
-		/* get immediate parent of this column */
-		c = Col [c].shared1.parent ;
-
-		/* continue until we hit an ordered column.  There are */
-		/* guarranteed not to be anymore unordered columns */
-		/* above an ordered column */
-	    } while (Col [c].shared2.order == EMPTY) ;
-
-	    /* re-order the super_col parent to largest order for this group */
-	    Col [parent].shared2.order = order ;
-	}
-    }
-
-    /* === Generate the permutation ========================================= */
-
-    for (c = 0 ; c < n_col ; c++)
-    {
-	p [Col [c].shared2.order] = c ;
-    }
-}
-
-
-/* ========================================================================== */
-/* === detect_super_cols ==================================================== */
-/* ========================================================================== */
-
-/*
-    Detects supercolumns by finding matches between columns in the hash buckets.
-    Check amongst columns in the set A [row_start ... row_start + row_length-1].
-    The columns under consideration are currently *not* in the degree lists,
-    and have already been placed in the hash buckets.
-
-    The hash bucket for columns whose hash function is equal to h is stored
-    as follows:
-
-	if head [h] is >= 0, then head [h] contains a degree list, so:
-
-		head [h] is the first column in degree bucket h.
-		Col [head [h]].headhash gives the first column in hash bucket h.
-
-	otherwise, the degree list is empty, and:
-
-		-(head [h] + 2) is the first column in hash bucket h.
-
-    For a column c in a hash bucket, Col [c].shared3.prev is NOT a "previous
-    column" pointer.  Col [c].shared3.hash is used instead as the hash number
-    for that column.  The value of Col [c].shared4.hash_next is the next column
-    in the same hash bucket.
-
-    Assuming no, or "few" hash collisions, the time taken by this routine is
-    linear in the sum of the sizes (lengths) of each column whose score has
-    just been computed in the approximate degree computation.
-    Not user-callable.
-*/
-
-PRIVATE void detect_super_cols
-(
-    /* === Parameters ======================================================= */
-
-#ifndef NDEBUG
-    /* these two parameters are only needed when debugging is enabled: */
-    int n_col,			/* number of columns of A */
-    RowInfo Row [],		/* of size n_row+1 */
-#endif
-    ColInfo Col [],		/* of size n_col+1 */
-    int A [],			/* row indices of A */
-    int head [],		/* head of degree lists and hash buckets */
-    int row_start,		/* pointer to set of columns to check */
-    int row_length		/* number of columns to check */
-)
-{
-    /* === Local variables ================================================== */
-
-    int hash ;			/* hash # for a column */
-    int *rp ;			/* pointer to a row */
-    int c ;			/* a column index */
-    int super_c ;		/* column index of the column to absorb into */
-    int *cp1 ;			/* column pointer for column super_c */
-    int *cp2 ;			/* column pointer for column c */
-    int length ;		/* length of column super_c */
-    int prev_c ;		/* column preceding c in hash bucket */
-    int i ;			/* loop counter */
-    int *rp_end ;		/* pointer to the end of the row */
-    int col ;			/* a column index in the row to check */
-    int head_column ;		/* first column in hash bucket or degree list */
-    int first_col ;		/* first column in hash bucket */
-
-    /* === Consider each column in the row ================================== */
-
-    rp = &A [row_start] ;
-    rp_end = rp + row_length ;
-    while (rp < rp_end)
-    {
-	col = *rp++ ;
-	if (COL_IS_DEAD (col))
-	{
-	    continue ;
-	}
-
-	/* get hash number for this column */
-	hash = Col [col].shared3.hash ;
-	assert (hash <= n_col) ;
-
-	/* === Get the first column in this hash bucket ===================== */
-
-	head_column = head [hash] ;
-	if (head_column > EMPTY)
-	{
-	    first_col = Col [head_column].shared3.headhash ;
-	}
-	else
-	{
-	    first_col = - (head_column + 2) ;
-	}
-
-	/* === Consider each column in the hash bucket ====================== */
-
-	for (super_c = first_col ; super_c != EMPTY ;
-	    super_c = Col [super_c].shared4.hash_next)
-	{
-	    assert (COL_IS_ALIVE (super_c)) ;
-	    assert (Col [super_c].shared3.hash == hash) ;
-	    length = Col [super_c].length ;
-
-	    /* prev_c is the column preceding column c in the hash bucket */
-	    prev_c = super_c ;
-
-	    /* === Compare super_c with all columns after it ================ */
-
-	    for (c = Col [super_c].shared4.hash_next ;
-		 c != EMPTY ; c = Col [c].shared4.hash_next)
-	    {
-		assert (c != super_c) ;
-		assert (COL_IS_ALIVE (c)) ;
-		assert (Col [c].shared3.hash == hash) ;
-
-		/* not identical if lengths or scores are different */
-		if (Col [c].length != length ||
-		    Col [c].shared2.score != Col [super_c].shared2.score)
-		{
-		    prev_c = c ;
-		    continue ;
-		}
-
-		/* compare the two columns */
-		cp1 = &A [Col [super_c].start] ;
-		cp2 = &A [Col [c].start] ;
-
-		for (i = 0 ; i < length ; i++)
-		{
-		    /* the columns are "clean" (no dead rows) */
-		    assert (ROW_IS_ALIVE (*cp1))  ;
-		    assert (ROW_IS_ALIVE (*cp2))  ;
-		    /* row indices will same order for both supercols, */
-		    /* no gather scatter nessasary */
-		    if (*cp1++ != *cp2++)
-		    {
-			break ;
-		    }
-		}
-
-		/* the two columns are different if the for-loop "broke" */
-		if (i != length)
-		{
-		    prev_c = c ;
-		    continue ;
-		}
-
-		/* === Got it!  two columns are identical =================== */
-
-		assert (Col [c].shared2.score == Col [super_c].shared2.score) ;
-
-		Col [super_c].shared1.thickness += Col [c].shared1.thickness ;
-		Col [c].shared1.parent = super_c ;
-		KILL_NON_PRINCIPAL_COL (c) ;
-		/* order c later, in order_children() */
-		Col [c].shared2.order = EMPTY ;
-		/* remove c from hash bucket */
-		Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ;
-	    }
-	}
-
-	/* === Empty this hash bucket ======================================= */
-
-	if (head_column > EMPTY)
-	{
-	    /* corresponding degree list "hash" is not empty */
-	    Col [head_column].shared3.headhash = EMPTY ;
-	}
-	else
-	{
-	    /* corresponding degree list "hash" is empty */
-	    head [hash] = EMPTY ;
-	}
-    }
-}
-
-
-/* ========================================================================== */
-/* === garbage_collection =================================================== */
-/* ========================================================================== */
-
-/*
-    Defragments and compacts columns and rows in the workspace A.  Used when
-    all avaliable memory has been used while performing row merging.  Returns
-    the index of the first free position in A, after garbage collection.  The
-    time taken by this routine is linear is the size of the array A, which is
-    itself linear in the number of nonzeros in the input matrix.
-    Not user-callable.
-*/
-
-PRIVATE int garbage_collection  /* returns the new value of pfree */
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,			/* number of rows */
-    int n_col,			/* number of columns */
-    RowInfo Row [],		/* row info */
-    ColInfo Col [],		/* column info */
-    int A [],			/* A [0 ... Alen-1] holds the matrix */
-    int *pfree			/* &A [0] ... pfree is in use */
-)
-{
-    /* === Local variables ================================================== */
-
-    int *psrc ;			/* source pointer */
-    int *pdest ;		/* destination pointer */
-    int j ;			/* counter */
-    int r ;			/* a row index */
-    int c ;			/* a column index */
-    int length ;		/* length of a row or column */
-
-#ifndef NDEBUG
-    int debug_rows ;
-    DEBUG0 (("Defrag..\n")) ;
-    for (psrc = &A[0] ; psrc < pfree ; psrc++) assert (*psrc >= 0) ;
-    debug_rows = 0 ;
-#endif
-
-    /* === Defragment the columns =========================================== */
-
-    pdest = &A[0] ;
-    for (c = 0 ; c < n_col ; c++)
-    {
-	if (COL_IS_ALIVE (c))
-	{
-	    psrc = &A [Col [c].start] ;
-
-	    /* move and compact the column */
-	    assert (pdest <= psrc) ;
-	    Col [c].start = (int) (pdest - &A [0]) ;
-	    length = Col [c].length ;
-	    for (j = 0 ; j < length ; j++)
-	    {
-		r = *psrc++ ;
-		if (ROW_IS_ALIVE (r))
-		{
-		    *pdest++ = r ;
-		}
-	    }
-	    Col [c].length = (int) (pdest - &A [Col [c].start]) ;
-	}
-    }
-
-    /* === Prepare to defragment the rows =================================== */
-
-    for (r = 0 ; r < n_row ; r++)
-    {
-	if (ROW_IS_ALIVE (r))
-	{
-	    if (Row [r].length == 0)
-	    {
-		/* this row is of zero length.  cannot compact it, so kill it */
-		DEBUG0 (("Defrag row kill\n")) ;
-		KILL_ROW (r) ;
-	    }
-	    else
-	    {
-		/* save first column index in Row [r].shared2.first_column */
-		psrc = &A [Row [r].start] ;
-		Row [r].shared2.first_column = *psrc ;
-		assert (ROW_IS_ALIVE (r)) ;
-		/* flag the start of the row with the one's complement of row */
-		*psrc = ONES_COMPLEMENT (r) ;
-#ifndef NDEBUG
-		debug_rows++ ;
-#endif
-	    }
-	}
-    }
-
-    /* === Defragment the rows ============================================== */
-
-    psrc = pdest ;
-    while (psrc < pfree)
-    {
-	/* find a negative number ... the start of a row */
-	if (*psrc++ < 0)
-	{
-	    psrc-- ;
-	    /* get the row index */
-	    r = ONES_COMPLEMENT (*psrc) ;
-	    assert (r >= 0 && r < n_row) ;
-	    /* restore first column index */
-	    *psrc = Row [r].shared2.first_column ;
-	    assert (ROW_IS_ALIVE (r)) ;
-
-	    /* move and compact the row */
-	    assert (pdest <= psrc) ;
-	    Row [r].start = (int) (pdest - &A [0]) ;
-	    length = Row [r].length ;
-	    for (j = 0 ; j < length ; j++)
-	    {
-		c = *psrc++ ;
-		if (COL_IS_ALIVE (c))
-		{
-		    *pdest++ = c ;
-		}
-	    }
-	    Row [r].length = (int) (pdest - &A [Row [r].start]) ;
-#ifndef NDEBUG
-	    debug_rows-- ;
-#endif
-	}
-    }
-    /* ensure we found all the rows */
-    assert (debug_rows == 0) ;
-
-    /* === Return the new value of pfree ==================================== */
-
-    return ((int) (pdest - &A [0])) ;
-}
-
-
-/* ========================================================================== */
-/* === clear_mark =========================================================== */
-/* ========================================================================== */
-
-/*
-    Clears the Row [].shared2.mark array, and returns the new tag_mark.
-    Return value is the new tag_mark.  Not user-callable.
-*/
-
-PRIVATE int clear_mark	/* return the new value for tag_mark */
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,		/* number of rows in A */
-    RowInfo Row []	/* Row [0 ... n_row-1].shared2.mark is set to zero */
-)
-{
-    /* === Local variables ================================================== */
-
-    int r ;
-
-    DEBUG0 (("Clear mark\n")) ;
-    for (r = 0 ; r < n_row ; r++)
-    {
-	if (ROW_IS_ALIVE (r))
-	{
-	    Row [r].shared2.mark = 0 ;
-	}
-    }
-    return (1) ;
-}
-
-
-/* ========================================================================== */
-/* === debugging routines =================================================== */
-/* ========================================================================== */
-
-/* When debugging is disabled, the remainder of this file is ignored. */
-
-#ifndef NDEBUG
-
-
-/* ========================================================================== */
-/* === debug_structures ===================================================== */
-/* ========================================================================== */
-
-/*
-    At this point, all empty rows and columns are dead.  All live columns
-    are "clean" (containing no dead rows) and simplicial (no supercolumns
-    yet).  Rows may contain dead columns, but all live rows contain at
-    least one live column.
-*/
-
-PRIVATE void debug_structures
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,
-    int n_col,
-    RowInfo Row [],
-    ColInfo Col [],
-    int A [],
-    int n_col2
-)
-{
-    /* === Local variables ================================================== */
-
-    int i ;
-    int c ;
-    int *cp ;
-    int *cp_end ;
-    int len ;
-    int score ;
-    int r ;
-    int *rp ;
-    int *rp_end ;
-    int deg ;
-
-    /* === Check A, Row, and Col ============================================ */
-
-    for (c = 0 ; c < n_col ; c++)
-    {
-	if (COL_IS_ALIVE (c))
-	{
-	    len = Col [c].length ;
-	    score = Col [c].shared2.score ;
-	    DEBUG4 (("initial live col %5d %5d %5d\n", c, len, score)) ;
-	    assert (len > 0) ;
-	    assert (score >= 0) ;
-	    assert (Col [c].shared1.thickness == 1) ;
-	    cp = &A [Col [c].start] ;
-	    cp_end = cp + len ;
-	    while (cp < cp_end)
-	    {
-		r = *cp++ ;
-		assert (ROW_IS_ALIVE (r)) ;
-	    }
-	}
-	else
-	{
-	    i = Col [c].shared2.order ;
-	    assert (i >= n_col2 && i < n_col) ;
-	}
-    }
-
-    for (r = 0 ; r < n_row ; r++)
-    {
-	if (ROW_IS_ALIVE (r))
-	{
-	    i = 0 ;
-	    len = Row [r].length ;
-	    deg = Row [r].shared1.degree ;
-	    assert (len > 0) ;
-	    assert (deg > 0) ;
-	    rp = &A [Row [r].start] ;
-	    rp_end = rp + len ;
-	    while (rp < rp_end)
-	    {
-		c = *rp++ ;
-		if (COL_IS_ALIVE (c))
-		{
-		    i++ ;
-		}
-	    }
-	    assert (i > 0) ;
-	}
-    }
-}
-
-
-/* ========================================================================== */
-/* === debug_deg_lists ====================================================== */
-/* ========================================================================== */
-
-/*
-    Prints the contents of the degree lists.  Counts the number of columns
-    in the degree list and compares it to the total it should have.  Also
-    checks the row degrees.
-*/
-
-PRIVATE void debug_deg_lists
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,
-    int n_col,
-    RowInfo Row [],
-    ColInfo Col [],
-    int head [],
-    int min_score,
-    int should,
-    int max_deg
-)
-{
-    /* === Local variables ================================================== */
-
-    int deg ;
-    int col ;
-    int have ;
-    int row ;
-
-    /* === Check the degree lists =========================================== */
-
-    if (n_col > 10000 && debug_colamd <= 0)
-    {
-	return ;
-    }
-    have = 0 ;
-    DEBUG4 (("Degree lists: %d\n", min_score)) ;
-    for (deg = 0 ; deg <= n_col ; deg++)
-    {
-	col = head [deg] ;
-	if (col == EMPTY)
-	{
-	    continue ;
-	}
-	DEBUG4 (("%d:", deg)) ;
-	while (col != EMPTY)
-	{
-	    DEBUG4 ((" %d", col)) ;
-	    have += Col [col].shared1.thickness ;
-	    assert (COL_IS_ALIVE (col)) ;
-	    col = Col [col].shared4.degree_next ;
-	}
-	DEBUG4 (("\n")) ;
-    }
-    DEBUG4 (("should %d have %d\n", should, have)) ;
-    assert (should == have) ;
-
-    /* === Check the row degrees ============================================ */
-
-    if (n_row > 10000 && debug_colamd <= 0)
-    {
-	return ;
-    }
-    for (row = 0 ; row < n_row ; row++)
-    {
-	if (ROW_IS_ALIVE (row))
-	{
-	    assert (Row [row].shared1.degree <= max_deg) ;
-	}
-    }
-}
-
-
-/* ========================================================================== */
-/* === debug_mark =========================================================== */
-/* ========================================================================== */
-
-/*
-    Ensures that the tag_mark is less that the maximum and also ensures that
-    each entry in the mark array is less than the tag mark.
-*/
-
-PRIVATE void debug_mark
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,
-    RowInfo Row [],
-    int tag_mark,
-    int max_mark
-)
-{
-    /* === Local variables ================================================== */
-
-    int r ;
-
-    /* === Check the Row marks ============================================== */
-
-    assert (tag_mark > 0 && tag_mark <= max_mark) ;
-    if (n_row > 10000 && debug_colamd <= 0)
-    {
-	return ;
-    }
-    for (r = 0 ; r < n_row ; r++)
-    {
-	assert (Row [r].shared2.mark < tag_mark) ;
-    }
-}
-
-
-/* ========================================================================== */
-/* === debug_matrix ========================================================= */
-/* ========================================================================== */
-
-/*
-    Prints out the contents of the columns and the rows.
-*/
-
-PRIVATE void debug_matrix
-(
-    /* === Parameters ======================================================= */
-
-    int n_row,
-    int n_col,
-    RowInfo Row [],
-    ColInfo Col [],
-    int A []
-)
-{
-    /* === Local variables ================================================== */
-
-    int r ;
-    int c ;
-    int *rp ;
-    int *rp_end ;
-    int *cp ;
-    int *cp_end ;
-
-    /* === Dump the rows and columns of the matrix ========================== */
-
-    if (debug_colamd < 3)
-    {
-	return ;
-    }
-    DEBUG3 (("DUMP MATRIX:\n")) ;
-    for (r = 0 ; r < n_row ; r++)
-    {
-	DEBUG3 (("Row %d alive? %d\n", r, ROW_IS_ALIVE (r))) ;
-	if (ROW_IS_DEAD (r))
-	{
-	    continue ;
-	}
-	DEBUG3 (("start %d length %d degree %d\n",
-		Row [r].start, Row [r].length, Row [r].shared1.degree)) ;
-	rp = &A [Row [r].start] ;
-	rp_end = rp + Row [r].length ;
-	while (rp < rp_end)
-	{
-	    c = *rp++ ;
-	    DEBUG3 (("	%d col %d\n", COL_IS_ALIVE (c), c)) ;
-	}
-    }
-
-    for (c = 0 ; c < n_col ; c++)
-    {
-	DEBUG3 (("Col %d alive? %d\n", c, COL_IS_ALIVE (c))) ;
-	if (COL_IS_DEAD (c))
-	{
-	    continue ;
-	}
-	DEBUG3 (("start %d length %d shared1 %d shared2 %d\n",
-		Col [c].start, Col [c].length,
-		Col [c].shared1.thickness, Col [c].shared2.score)) ;
-	cp = &A [Col [c].start] ;
-	cp_end = cp + Col [c].length ;
-	while (cp < cp_end)
-	{
-	    r = *cp++ ;
-	    DEBUG3 (("	%d row %d\n", ROW_IS_ALIVE (r), r)) ;
-	}
-    }
-}
-
-#endif
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/old_colamd.h hypre-2.13.0/src/FEI_mv/SuperLU/SRC/old_colamd.h
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/old_colamd.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/old_colamd.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,67 +0,0 @@
-/* ========================================================================== */
-/* === colamd prototypes and definitions ==================================== */
-/* ========================================================================== */
-
-/*
-    This is the colamd include file,
-
-	http://www.cise.ufl.edu/~davis/colamd/colamd.h
-
-    for use in the colamd.c, colamdmex.c, and symamdmex.c files located at
-
-	http://www.cise.ufl.edu/~davis/colamd/
-
-    See those files for a description of colamd and symamd, and for the
-    copyright notice, which also applies to this file.
-
-    August 3, 1998.  Version 1.0.
-*/
-
-/* ========================================================================== */
-/* === Definitions ========================================================== */
-/* ========================================================================== */
-
-/* size of the knobs [ ] array.  Only knobs [0..1] are currently used. */
-#define COLAMD_KNOBS 20
-
-/* number of output statistics.  Only A [0..2] are currently used. */
-#define COLAMD_STATS 20
-
-/* knobs [0] and A [0]: dense row knob and output statistic. */
-#define COLAMD_DENSE_ROW 0
-
-/* knobs [1] and A [1]: dense column knob and output statistic. */
-#define COLAMD_DENSE_COL 1
-
-/* A [2]: memory defragmentation count output statistic */
-#define COLAMD_DEFRAG_COUNT 2
-
-/* A [3]: whether or not the input columns were jumbled or had duplicates */
-#define COLAMD_JUMBLED_COLS 3
-
-/* ========================================================================== */
-/* === Prototypes of user-callable routines ================================= */
-/* ========================================================================== */
-
-int colamd_recommended		/* returns recommended value of Alen */
-(
-    int nnz,			/* nonzeros in A */
-    int n_row,			/* number of rows in A */
-    int n_col			/* number of columns in A */
-) ;
-
-void colamd_set_defaults	/* sets default parameters */
-(				/* knobs argument is modified on output */
-    double knobs [COLAMD_KNOBS]	/* parameter settings for colamd */
-) ;
-
-int colamd			/* returns TRUE if successful, FALSE otherwise*/
-(				/* A and p arguments are modified on output */
-    int n_row,			/* number of rows in A */
-    int n_col,			/* number of columns in A */
-    int Alen,			/* size of the array A */
-    int A [],			/* row indices of A, of size Alen */
-    int p [],			/* column pointers of A, of size n_col+1 */
-    double knobs [COLAMD_KNOBS]	/* parameter settings for colamd */
-) ;
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/relax_snode.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/relax_snode.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/relax_snode.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/relax_snode.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,71 +0,0 @@
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_ddefs.h"
-
-void
-relax_snode (
-	     const     int n,
-	     int       *et,           /* column elimination tree */
-	     const int relax_columns, /* max no of columns allowed in a
-					 relaxed snode */
-	     int       *descendants,  /* no of descendants of each node
-					 in the etree */
-	     int       *relax_end     /* last column in a supernode */
-	     )
-{
-/*
- * Purpose
- * =======
- *    relax_snode() - Identify the initial relaxed supernodes, assuming that 
- *    the matrix has been reordered according to the postorder of the etree.
- *
- */ 
-    register int j, parent;
-    register int snode_start;	/* beginning of a snode */
-    
-    ifill (relax_end, n, EMPTY);
-    for (j = 0; j < n; j++) descendants[j] = 0;
-
-    /* Compute the number of descendants of each node in the etree */
-    for (j = 0; j < n; j++) {
-	parent = et[j];
-	if ( parent != n )  /* not the dummy root */
-	    descendants[parent] += descendants[j] + 1;
-    }
-
-    /* Identify the relaxed supernodes by postorder traversal of the etree. */
-    for (j = 0; j < n; ) { 
-     	parent = et[j];
-        snode_start = j;
- 	while ( parent != n && descendants[parent] < relax_columns ) {
-	    j = parent;
-	    parent = et[j];
-	}
-	/* Found a supernode with j being the last column. */
-	relax_end[snode_start] = j;		/* Last column is recorded */
-	j++;
-	/* Search for a new leaf */
-	while ( descendants[j] != 0 && j < n ) j++;
-    }
-
-    /*printf("No of relaxed snodes: %d; relaxed columns: %d\n", 
-		nsuper, no_relaxed_col); */
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/scolumn_bmod.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/scolumn_bmod.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/scolumn_bmod.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/scolumn_bmod.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,348 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "slu_sdefs.h"
-
-/* 
- * Function prototypes 
- */
-void susolve(int, int, float*, float*);
-void slsolve(int, int, float*, float*);
-void smatvec(int, int, int, float*, float*, float*);
-
-
-
-/* Return value:   0 - successful return
- *               > 0 - number of bytes allocated when run out of space
- */
-int
-scolumn_bmod (
-	     const int  jcol,	  /* in */
-	     const int  nseg,	  /* in */
-	     float     *dense,	  /* in */
-	     float     *tempv,	  /* working array */
-	     int        *segrep,  /* in */
-	     int        *repfnz,  /* in */
-	     int        fpanelc,  /* in -- first column in the current panel */
-	     GlobalLU_t *Glu,     /* modified */
-	     SuperLUStat_t *stat  /* output */
-	     )
-{
-/*
- * Purpose:
- * ========
- *    Performs numeric block updates (sup-col) in topological order.
- *    It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
- *    Special processing on the supernodal portion of L\U[*,j]
- *
- */
-#ifdef _CRAY
-    _fcd ftcs1 = _cptofcd("L", strlen("L")),
-         ftcs2 = _cptofcd("N", strlen("N")),
-         ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-    int         incx = 1, incy = 1;
-    float      alpha, beta;
-    
-    /* krep = representative of current k-th supernode
-     * fsupc = first supernodal column
-     * nsupc = no of columns in supernode
-     * nsupr = no of rows in supernode (used as leading dimension)
-     * luptr = location of supernodal LU-block in storage
-     * kfnz = first nonz in the k-th supernodal segment
-     * no_zeros = no of leading zeros in a supernodal U-segment
-     */
-    float       ukj, ukj1, ukj2;
-    int          luptr, luptr1, luptr2;
-    int          fsupc, nsupc, nsupr, segsze;
-    int          nrow;	  /* No of rows in the matrix of matrix-vector */
-    int          jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno;
-    register int lptr, kfnz, isub, irow, i;
-    register int no_zeros, new_next; 
-    int          ufirst, nextlu;
-    int          fst_col; /* First column within small LU update */
-    int          d_fsupc; /* Distance between the first column of the current
-			     panel and the first column of the current snode. */
-    int          *xsup, *supno;
-    int          *lsub, *xlsub;
-    float       *lusup;
-    int          *xlusup;
-    int          nzlumax;
-    float       *tempv1;
-    float      zero = 0.0;
-    float      one = 1.0;
-    float      none = -1.0;
-    int          mem_error;
-    flops_t      *ops = stat->ops;
-
-    xsup    = Glu->xsup;
-    supno   = Glu->supno;
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    lusup   = Glu->lusup;
-    xlusup  = Glu->xlusup;
-    nzlumax = Glu->nzlumax;
-    jcolp1 = jcol + 1;
-    jsupno = supno[jcol];
-    
-    /* 
-     * For each nonz supernode segment of U[*,j] in topological order 
-     */
-    k = nseg - 1;
-    for (ksub = 0; ksub < nseg; ksub++) {
-
-	krep = segrep[k];
-	k--;
-	ksupno = supno[krep];
-	if ( jsupno != ksupno ) { /* Outside the rectangular supernode */
-
-	    fsupc = xsup[ksupno];
-	    fst_col = SUPERLU_MAX ( fsupc, fpanelc );
-
-  	    /* Distance from the current supernode to the current panel; 
-	       d_fsupc=0 if fsupc > fpanelc. */
-  	    d_fsupc = fst_col - fsupc; 
-
-	    luptr = xlusup[fst_col] + d_fsupc;
-	    lptr = xlsub[fsupc] + d_fsupc;
-
-	    kfnz = repfnz[krep];
-	    kfnz = SUPERLU_MAX ( kfnz, fpanelc );
-
-	    segsze = krep - kfnz + 1;
-	    nsupc = krep - fst_col + 1;
-	    nsupr = xlsub[fsupc+1] - xlsub[fsupc];	/* Leading dimension */
-	    nrow = nsupr - d_fsupc - nsupc;
-	    krep_ind = lptr + nsupc - 1;
-
-	    ops[TRSV] += segsze * (segsze - 1);
-	    ops[GEMV] += 2 * nrow * segsze;
-
-
-	    /* 
-	     * Case 1: Update U-segment of size 1 -- col-col update 
-	     */
-	    if ( segsze == 1 ) {
-	  	ukj = dense[lsub[krep_ind]];
-		luptr += nsupr*(nsupc-1) + nsupc;
-
-		for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-		    irow = lsub[i];
-		    dense[irow] -=  ukj*lusup[luptr];
-		    luptr++;
-		}
-
-	    } else if ( segsze <= 3 ) {
-		ukj = dense[lsub[krep_ind]];
-		luptr += nsupr*(nsupc-1) + nsupc-1;
-		ukj1 = dense[lsub[krep_ind - 1]];
-		luptr1 = luptr - nsupr;
-
-		if ( segsze == 2 ) { /* Case 2: 2cols-col update */
-		    ukj -= ukj1 * lusup[luptr1];
-		    dense[lsub[krep_ind]] = ukj;
-		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-		    	irow = lsub[i];
-		    	luptr++;
-		    	luptr1++;
-		    	dense[irow] -= ( ukj*lusup[luptr]
-					+ ukj1*lusup[luptr1] );
-		    }
-		} else { /* Case 3: 3cols-col update */
-		    ukj2 = dense[lsub[krep_ind - 2]];
-		    luptr2 = luptr1 - nsupr;
-		    ukj1 -= ukj2 * lusup[luptr2-1];
-		    ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
-		    dense[lsub[krep_ind]] = ukj;
-		    dense[lsub[krep_ind-1]] = ukj1;
-		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-		    	irow = lsub[i];
-		    	luptr++;
-		    	luptr1++;
-			luptr2++;
-		    	dense[irow] -= ( ukj*lusup[luptr]
-			     + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
-		    }
-		}
-
-
-
-	    } else {
-	  	/*
-		 * Case: sup-col update
-		 * Perform a triangular solve and block update,
-		 * then scatter the result of sup-col update to dense
-		 */
-
-		no_zeros = kfnz - fst_col;
-
-	        /* Copy U[*,j] segment from dense[*] to tempv[*] */
-	        isub = lptr + no_zeros;
-	        for (i = 0; i < segsze; i++) {
-	  	    irow = lsub[isub];
-		    tempv[i] = dense[irow];
-		    ++isub; 
-	        }
-
-	        /* Dense triangular solve -- start effective triangle */
-		luptr += nsupr * no_zeros + no_zeros; 
-		
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
-		       &nsupr, tempv, &incx );
-#else		
-		strsv_( "L", "N", "U", &segsze, &lusup[luptr], 
-		       &nsupr, tempv, &incx );
-#endif		
- 		luptr += segsze;  /* Dense matrix-vector */
-		tempv1 = &tempv[segsze];
-                alpha = one;
-                beta = zero;
-#ifdef _CRAY
-		SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
-		       &nsupr, tempv, &incx, &beta, tempv1, &incy );
-#else
-		sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], 
-		       &nsupr, tempv, &incx, &beta, tempv1, &incy );
-#endif
-#else
-		slsolve ( nsupr, segsze, &lusup[luptr], tempv );
-
- 		luptr += segsze;  /* Dense matrix-vector */
-		tempv1 = &tempv[segsze];
-		smatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1);
-#endif
-		
-		
-                /* Scatter tempv[] into SPA dense[] as a temporary storage */
-                isub = lptr + no_zeros;
-                for (i = 0; i < segsze; i++) {
-                    irow = lsub[isub];
-                    dense[irow] = tempv[i];
-                    tempv[i] = zero;
-                    ++isub;
-                }
-
-		/* Scatter tempv1[] into SPA dense[] */
-		for (i = 0; i < nrow; i++) {
-		    irow = lsub[isub];
-		    dense[irow] -= tempv1[i];
-		    tempv1[i] = zero;
-		    ++isub;
-		}
-	    }
-	    
-	} /* if jsupno ... */
-
-    } /* for each segment... */
-
-    /*
-     *	Process the supernodal portion of L\U[*,j]
-     */
-    nextlu = xlusup[jcol];
-    fsupc = xsup[jsupno];
-
-    /* Copy the SPA dense into L\U[*,j] */
-    new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc];
-    while ( new_next > nzlumax ) {
-	if (mem_error = sLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu))
-	    return (mem_error);
-	lusup = Glu->lusup;
-	lsub = Glu->lsub;
-    }
-
-    for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) {
-  	irow = lsub[isub];
-	lusup[nextlu] = dense[irow];
-        dense[irow] = zero;
-	++nextlu;
-    }
-
-    xlusup[jcolp1] = nextlu;	/* Close L\U[*,jcol] */
-
-    /* For more updates within the panel (also within the current supernode), 
-     * should start from the first column of the panel, or the first column 
-     * of the supernode, whichever is bigger. There are 2 cases:
-     *    1) fsupc < fpanelc, then fst_col := fpanelc
-     *    2) fsupc >= fpanelc, then fst_col := fsupc
-     */
-    fst_col = SUPERLU_MAX ( fsupc, fpanelc );
-
-    if ( fst_col < jcol ) {
-
-  	/* Distance between the current supernode and the current panel.
-	   d_fsupc=0 if fsupc >= fpanelc. */
-  	d_fsupc = fst_col - fsupc;
-
-	lptr = xlsub[fsupc] + d_fsupc;
-	luptr = xlusup[fst_col] + d_fsupc;
-	nsupr = xlsub[fsupc+1] - xlsub[fsupc];	/* Leading dimension */
-	nsupc = jcol - fst_col;	/* Excluding jcol */
-	nrow = nsupr - d_fsupc - nsupc;
-
-	/* Points to the beginning of jcol in snode L\U(jsupno) */
-	ufirst = xlusup[jcol] + d_fsupc;	
-
-	ops[TRSV] += nsupc * (nsupc - 1);
-	ops[GEMV] += 2 * nrow * nsupc;
-	
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-	STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], 
-	       &nsupr, &lusup[ufirst], &incx );
-#else
-	strsv_( "L", "N", "U", &nsupc, &lusup[luptr], 
-	       &nsupr, &lusup[ufirst], &incx );
-#endif
-	
-	alpha = none; beta = one; /* y := beta*y + alpha*A*x */
-
-#ifdef _CRAY
-	SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
-	       &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
-#else
-	sgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
-	       &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
-#endif
-#else
-	slsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );
-
-	smatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc],
-		&lusup[ufirst], tempv );
-	
-        /* Copy updates from tempv[*] into lusup[*] */
-	isub = ufirst + nsupc;
-	for (i = 0; i < nrow; i++) {
-	    lusup[isub] -= tempv[i];
-	    tempv[i] = 0.0;
-	    ++isub;
-	}
-
-#endif
-	
-	
-    } /* if fst_col < jcol ... */ 
-
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/scolumn_dfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/scolumn_dfs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/scolumn_dfs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/scolumn_dfs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,269 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_sdefs.h"
-
-/* What type of supernodes we want */
-#define T2_SUPER
-
-int
-scolumn_dfs(
-	   const int  m,         /* in - number of rows in the matrix */
-	   const int  jcol,      /* in */
-	   int        *perm_r,   /* in */
-	   int        *nseg,     /* modified - with new segments appended */
-	   int        *lsub_col, /* in - defines the RHS vector to start the dfs */
-	   int        *segrep,   /* modified - with new segments appended */
-	   int        *repfnz,   /* modified */
-	   int        *xprune,   /* modified */
-	   int        *marker,   /* modified */
-	   int        *parent,	 /* working array */
-	   int        *xplore,   /* working array */
-	   GlobalLU_t *Glu       /* modified */
-	   )
-{
-/* 
- * Purpose
- * =======
- *   "column_dfs" performs a symbolic factorization on column jcol, and
- *   decide the supernode boundary.
- *
- *   This routine does not use numeric values, but only use the RHS 
- *   row indices to start the dfs.
- *
- *   A supernode representative is the last column of a supernode.
- *   The nonzeros in U[*,j] are segments that end at supernodal
- *   representatives. The routine returns a list of such supernodal 
- *   representatives in topological order of the dfs that generates them.
- *   The location of the first nonzero in each such supernodal segment
- *   (supernodal entry location) is also returned.
- *
- * Local parameters
- * ================
- *   nseg: no of segments in current U[*,j]
- *   jsuper: jsuper=EMPTY if column j does not belong to the same
- *	supernode as j-1. Otherwise, jsuper=nsuper.
- *
- *   marker2: A-row --> A-row/col (0/1)
- *   repfnz: SuperA-col --> PA-row
- *   parent: SuperA-col --> SuperA-col
- *   xplore: SuperA-col --> index to L-structure
- *
- * Return value
- * ============
- *     0  success;
- *   > 0  number of bytes allocated when run out of space.
- *
- */
-    int     jcolp1, jcolm1, jsuper, nsuper, nextl;
-    int     k, krep, krow, kmark, kperm;
-    int     *marker2;           /* Used for small panel LU */
-    int	    fsupc;		/* First column of a snode */
-    int     myfnz;		/* First nonz column of a U-segment */
-    int	    chperm, chmark, chrep, kchild;
-    int     xdfs, maxdfs, kpar, oldrep;
-    int     jptr, jm1ptr;
-    int     ito, ifrom, istop;	/* Used to compress row subscripts */
-    int     mem_error;
-    int     *xsup, *supno, *lsub, *xlsub;
-    int     nzlmax;
-    static  int  first = 1, maxsuper;
-    
-    xsup    = Glu->xsup;
-    supno   = Glu->supno;
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    nzlmax  = Glu->nzlmax;
-
-    if ( first ) {
-	maxsuper = sp_ienv(3);
-	first = 0;
-    }
-    jcolp1  = jcol + 1;
-    jcolm1  = jcol - 1;
-    nsuper  = supno[jcol];
-    jsuper  = nsuper;
-    nextl   = xlsub[jcol];
-    marker2 = &marker[2*m];
-
-
-    /* For each nonzero in A[*,jcol] do dfs */
-    for (k = 0; lsub_col[k] != EMPTY; k++) {
-
-	krow = lsub_col[k];
-    	lsub_col[k] = EMPTY;
-	kmark = marker2[krow];    	
-
-	/* krow was visited before, go to the next nonz */
-        if ( kmark == jcol ) continue; 
-
-	/* For each unmarked nbr krow of jcol
-	 *	krow is in L: place it in structure of L[*,jcol]
-	 */
-	marker2[krow] = jcol;
-	kperm = perm_r[krow];
-
-   	if ( kperm == EMPTY ) {
-	    lsub[nextl++] = krow; 	/* krow is indexed into A */
-	    if ( nextl >= nzlmax ) {
-		if ( mem_error = sLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) )
-		    return (mem_error);
-		lsub = Glu->lsub;
-	    }
-            if ( kmark != jcolm1 ) jsuper = EMPTY;/* Row index subset testing */
-  	} else {
-	    /*	krow is in U: if its supernode-rep krep
-	     *	has been explored, update repfnz[*]
-	     */
-	    krep = xsup[supno[kperm]+1] - 1;
-	    myfnz = repfnz[krep];
-
-	    if ( myfnz != EMPTY ) {	/* Visited before */
-	    	if ( myfnz > kperm ) repfnz[krep] = kperm;
-		/* continue; */
-	    }
-	    else {
-		/* Otherwise, perform dfs starting at krep */
-		oldrep = EMPTY;
-	 	parent[krep] = oldrep;
-	  	repfnz[krep] = kperm;
-		xdfs = xlsub[krep];
-	  	maxdfs = xprune[krep];
-
-		do {
-		    /* 
-		     * For each unmarked kchild of krep 
-		     */
-		    while ( xdfs < maxdfs ) {
-
-		   	kchild = lsub[xdfs];
-			xdfs++;
-		  	chmark = marker2[kchild];
-
-		   	if ( chmark != jcol ) { /* Not reached yet */
-		   	    marker2[kchild] = jcol;
-		   	    chperm = perm_r[kchild];
-
-		   	    /* Case kchild is in L: place it in L[*,k] */
-		   	    if ( chperm == EMPTY ) {
-			    	lsub[nextl++] = kchild;
-				if ( nextl >= nzlmax ) {
-				    if ( mem_error =
-					 sLUMemXpand(jcol,nextl,LSUB,&nzlmax,Glu) )
-					return (mem_error);
-				    lsub = Glu->lsub;
-				}
-				if ( chmark != jcolm1 ) jsuper = EMPTY;
-			    } else {
-		    	    	/* Case kchild is in U: 
-				 *   chrep = its supernode-rep. If its rep has 
-			         *   been explored, update its repfnz[*]
-			         */
-		   	    	chrep = xsup[supno[chperm]+1] - 1;
-		   		myfnz = repfnz[chrep];
-		   		if ( myfnz != EMPTY ) { /* Visited before */
-				    if ( myfnz > chperm )
-     				  	repfnz[chrep] = chperm;
-				} else {
-		        	    /* Continue dfs at super-rep of kchild */
-		   		    xplore[krep] = xdfs;	
-		   		    oldrep = krep;
-		   		    krep = chrep; /* Go deeper down G(L^t) */
-				    parent[krep] = oldrep;
-		    		    repfnz[krep] = chperm;
-		   		    xdfs = xlsub[krep];     
-				    maxdfs = xprune[krep];
-				} /* else */
-
-			   } /* else */
-
-			} /* if */
-
-		    } /* while */
-
-		    /* krow has no more unexplored nbrs;
-	   	     *    place supernode-rep krep in postorder DFS.
-	   	     *    backtrack dfs to its parent
-		     */
-		    segrep[*nseg] = krep;
-		    ++(*nseg);
-		    kpar = parent[krep]; /* Pop from stack, mimic recursion */
-		    if ( kpar == EMPTY ) break; /* dfs done */
-		    krep = kpar;
-		    xdfs = xplore[krep];
-		    maxdfs = xprune[krep];
-
-		} while ( kpar != EMPTY ); 	/* Until empty stack */
-
-	    } /* else */
-
-	} /* else */
-
-    } /* for each nonzero ... */
-
-    /* Check to see if j belongs in the same supernode as j-1 */
-    if ( jcol == 0 ) { /* Do nothing for column 0 */
-	nsuper = supno[0] = 0;
-    } else {
-   	fsupc = xsup[nsuper];
-	jptr = xlsub[jcol];	/* Not compressed yet */
-	jm1ptr = xlsub[jcolm1];
-
-#ifdef T2_SUPER
-	if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = EMPTY;
-#endif
-	/* Make sure the number of columns in a supernode doesn't
-	   exceed threshold. */
-	if ( jcol - fsupc >= maxsuper ) jsuper = EMPTY;
-
-	/* If jcol starts a new supernode, reclaim storage space in
-	 * lsub from the previous supernode. Note we only store
-	 * the subscript set of the first and last columns of
-   	 * a supernode. (first for num values, last for pruning)
-	 */
-	if ( jsuper == EMPTY ) {	/* starts a new supernode */
-	    if ( (fsupc < jcolm1-1) ) {	/* >= 3 columns in nsuper */
-#ifdef CHK_COMPRESS
-		printf("  Compress lsub[] at super %d-%d\n", fsupc, jcolm1);
-#endif
-	        ito = xlsub[fsupc+1];
-		xlsub[jcolm1] = ito;
-		istop = ito + jptr - jm1ptr;
-		xprune[jcolm1] = istop; /* Initialize xprune[jcol-1] */
-		xlsub[jcol] = istop;
-		for (ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito)
-		    lsub[ito] = lsub[ifrom];
-		nextl = ito;            /* = istop + length(jcol) */
-	    }
-	    nsuper++;
-	    supno[jcol] = nsuper;
-	} /* if a new supernode */
-
-    }	/* else: jcol > 0 */ 
-    
-    /* Tidy up the pointers before exit */
-    xsup[nsuper+1] = jcolp1;
-    supno[jcolp1]  = nsuper;
-    xprune[jcol]   = nextl;	/* Initialize upper bound for pruning */
-    xlsub[jcolp1]  = nextl;
-
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/scomplex.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/scomplex.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/scomplex.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/scomplex.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,106 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * This file defines common arithmetic operations for complex type.
- */
-#include <math.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include "slu_scomplex.h"
-
-
-/* Complex Division c = a/b */
-void c_div(complex *c, complex *a, complex *b)
-{
-    float ratio, den;
-    float abr, abi, cr, ci;
-  
-    if( (abr = b->r) < 0.)
-	abr = - abr;
-    if( (abi = b->i) < 0.)
-	abi = - abi;
-    if( abr <= abi ) {
-	if (abi == 0) {
-	    fprintf(stderr, "z_div.c: division by zero\n");
-            exit(-1);
-	}	  
-	ratio = b->r / b->i ;
-	den = b->i * (1 + ratio*ratio);
-	cr = (a->r*ratio + a->i) / den;
-	ci = (a->i*ratio - a->r) / den;
-    } else {
-	ratio = b->i / b->r ;
-	den = b->r * (1 + ratio*ratio);
-	cr = (a->r + a->i*ratio) / den;
-	ci = (a->i - a->r*ratio) / den;
-    }
-    c->r = cr;
-    c->i = ci;
-}
-
-
-/* Returns sqrt(z.r^2 + z.i^2) */
-double c_abs(complex *z)
-{
-    float temp;
-    float real = z->r;
-    float imag = z->i;
-
-    if (real < 0) real = -real;
-    if (imag < 0) imag = -imag;
-    if (imag > real) {
-	temp = real;
-	real = imag;
-	imag = temp;
-    }
-    if ((real+imag) == real) return(real);
-  
-    temp = imag/real;
-    temp = real*sqrt(1.0 + temp*temp);  /*overflow!!*/
-    return (temp);
-}
-
-
-/* Approximates the abs */
-/* Returns abs(z.r) + abs(z.i) */
-double c_abs1(complex *z)
-{
-    float real = z->r;
-    float imag = z->i;
-  
-    if (real < 0) real = -real;
-    if (imag < 0) imag = -imag;
-
-    return (real + imag);
-}
-
-/* Return the exponentiation */
-void c_exp(complex *r, complex *z)
-{
-    float expx;
-
-    expx = exp(z->r);
-    r->r = expx * cos(z->i);
-    r->i = expx * sin(z->i);
-}
-
-/* Return the complex conjugate */
-void r_cnjg(complex *r, complex *z)
-{
-    r->r = z->r;
-    r->i = -z->i;
-}
-
-/* Return the imaginary part */
-double r_imag(complex *z)
-{
-    return (z->i);
-}
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/scopy_to_ucol.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/scopy_to_ucol.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/scopy_to_ucol.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/scopy_to_ucol.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,103 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_sdefs.h"
-
-int
-scopy_to_ucol(
-	      int        jcol,	  /* in */
-	      int        nseg,	  /* in */
-	      int        *segrep,  /* in */
-	      int        *repfnz,  /* in */
-	      int        *perm_r,  /* in */
-	      float     *dense,   /* modified - reset to zero on return */
-	      GlobalLU_t *Glu      /* modified */
-	      )
-{
-/* 
- * Gather from SPA dense[*] to global ucol[*].
- */
-    int ksub, krep, ksupno;
-    int i, k, kfnz, segsze;
-    int fsupc, isub, irow;
-    int jsupno, nextu;
-    int new_next, mem_error;
-    int       *xsup, *supno;
-    int       *lsub, *xlsub;
-    float    *ucol;
-    int       *usub, *xusub;
-    int       nzumax;
-
-    float zero = 0.0;
-
-    xsup    = Glu->xsup;
-    supno   = Glu->supno;
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    ucol    = Glu->ucol;
-    usub    = Glu->usub;
-    xusub   = Glu->xusub;
-    nzumax  = Glu->nzumax;
-    
-    jsupno = supno[jcol];
-    nextu  = xusub[jcol];
-    k = nseg - 1;
-    for (ksub = 0; ksub < nseg; ksub++) {
-	krep = segrep[k--];
-	ksupno = supno[krep];
-
-	if ( ksupno != jsupno ) { /* Should go into ucol[] */
-	    kfnz = repfnz[krep];
-	    if ( kfnz != EMPTY ) {	/* Nonzero U-segment */
-
-	    	fsupc = xsup[ksupno];
-	        isub = xlsub[fsupc] + kfnz - fsupc;
-	        segsze = krep - kfnz + 1;
-
-		new_next = nextu + segsze;
-		while ( new_next > nzumax ) {
-		    if (mem_error = sLUMemXpand(jcol, nextu, UCOL, &nzumax, Glu))
-			return (mem_error);
-		    ucol = Glu->ucol;
-		    if (mem_error = sLUMemXpand(jcol, nextu, USUB, &nzumax, Glu))
-			return (mem_error);
-		    usub = Glu->usub;
-		    lsub = Glu->lsub;
-		}
-		
-		for (i = 0; i < segsze; i++) {
-		    irow = lsub[isub];
-		    usub[nextu] = perm_r[irow];
-		    ucol[nextu] = dense[irow];
-		    dense[irow] = zero;
-		    nextu++;
-		    isub++;
-		} 
-
-	    }
-
-	}
-
-    } /* for each segment... */
-
-    xusub[jcol + 1] = nextu;      /* Close U[*,jcol] */
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/scsum1.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/scsum1.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/scsum1.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/scsum1.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,96 +0,0 @@
-#include "slu_Cnames.h"
-#include "slu_scomplex.h"
-
-double scsum1_(int *n, complex *cx, int *incx)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    SCSUM1 takes the sum of the absolute values of a complex   
-    vector and returns a single precision result.   
-
-    Based on SCASUM from the Level 1 BLAS.   
-    The change is to use the 'genuine' absolute value.   
-
-    Contributed by Nick Higham for use with CLACON.   
-
-    Arguments   
-    =========   
-
-    N       (input) INT
-            The number of elements in the vector CX.   
-
-    CX      (input) COMPLEX array, dimension (N)   
-            The vector whose elements will be summed.   
-
-    INCX    (input) INT
-            The spacing between successive values of CX.  INCX > 0.   
-
-    ===================================================================== 
-  
-
-
-    
-   Parameter adjustments   
-       Function Body */
-    /* System generated locals */
-    int i__1, i__2;
-    float ret_val;
-    /* Builtin functions */
-    double c_abs(complex *);
-    /* Local variables */
-    static int i, nincx;
-    static float stemp;
-
-
-#define CX(I) cx[(I)-1]
-
-
-    ret_val = 0.f;
-    stemp = 0.f;
-    if (*n <= 0) {
-	return ret_val;
-    }
-    if (*incx == 1) {
-	goto L20;
-    }
-
-/*     CODE FOR INCREMENT NOT EQUAL TO 1 */
-
-    nincx = *n * *incx;
-    i__1 = nincx;
-    i__2 = *incx;
-    for (i = 1; *incx < 0 ? i >= nincx : i <= nincx; i += *incx) {
-
-/*        NEXT LINE MODIFIED. */
-
-	stemp += c_abs(&CX(i));
-/* L10: */
-    }
-    ret_val = stemp;
-    return ret_val;
-
-/*     CODE FOR INCREMENT EQUAL TO 1 */
-
-L20:
-    i__2 = *n;
-    for (i = 1; i <= *n; ++i) {
-
-/*        NEXT LINE MODIFIED. */
-
-	stemp += c_abs(&CX(i));
-/* L30: */
-    }
-    ret_val = stemp;
-    return ret_val;
-
-/*     End of SCSUM1 */
-
-} /* scsum1_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgscon.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgscon.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgscon.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgscon.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,146 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
- * File name:	sgscon.c
- * History:     Modified from lapack routines SGECON.
- */
-#include <math.h>
-#include "slu_sdefs.h"
-
-void
-sgscon(char *norm, SuperMatrix *L, SuperMatrix *U,
-       float anorm, float *rcond, SuperLUStat_t *stat, int *info)
-{
-/*
-    Purpose   
-    =======   
-
-    SGSCON estimates the reciprocal of the condition number of a general 
-    real matrix A, in either the 1-norm or the infinity-norm, using   
-    the LU factorization computed by SGETRF.   
-
-    An estimate is obtained for norm(inv(A)), and the reciprocal of the   
-    condition number is computed as   
-       RCOND = 1 / ( norm(A) * norm(inv(A)) ).   
-
-    See supermatrix.h for the definition of 'SuperMatrix' structure.
- 
-    Arguments   
-    =========   
-
-    NORM    (input) char*
-            Specifies whether the 1-norm condition number or the   
-            infinity-norm condition number is required:   
-            = '1' or 'O':  1-norm;   
-            = 'I':         Infinity-norm.
-	    
-    L       (input) SuperMatrix*
-            The factor L from the factorization Pr*A*Pc=L*U as computed by
-            sgstrf(). Use compressed row subscripts storage for supernodes,
-            i.e., L has types: Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU.
- 
-    U       (input) SuperMatrix*
-            The factor U from the factorization Pr*A*Pc=L*U as computed by
-            sgstrf(). Use column-wise storage scheme, i.e., U has types:
-            Stype = SLU_NC, Dtype = SLU_S, Mtype = TRU.
-	    
-    ANORM   (input) float
-            If NORM = '1' or 'O', the 1-norm of the original matrix A.   
-            If NORM = 'I', the infinity-norm of the original matrix A.
-	    
-    RCOND   (output) float*
-            The reciprocal of the condition number of the matrix A,   
-            computed as RCOND = 1/(norm(A) * norm(inv(A))).
-	    
-    INFO    (output) int*
-            = 0:  successful exit   
-            < 0:  if INFO = -i, the i-th argument had an illegal value   
-
-    ===================================================================== 
-*/
-
-    /* Local variables */
-    int    kase, kase1, onenrm, i;
-    float ainvnm;
-    float *work;
-    int    *iwork;
-    extern int srscl_(int *, float *, float *, int *);
-
-    extern int slacon_(int *, float *, float *, int *, float *, int *);
-
-    
-    /* Test the input parameters. */
-    *info = 0;
-    onenrm = *(unsigned char *)norm == '1' || lsame_(norm, "O");
-    if (! onenrm && ! lsame_(norm, "I")) *info = -1;
-    else if (L->nrow < 0 || L->nrow != L->ncol ||
-             L->Stype != SLU_SC || L->Dtype != SLU_S || L->Mtype != SLU_TRLU)
-	 *info = -2;
-    else if (U->nrow < 0 || U->nrow != U->ncol ||
-             U->Stype != SLU_NC || U->Dtype != SLU_S || U->Mtype != SLU_TRU) 
-	*info = -3;
-    if (*info != 0) {
-	i = -(*info);
-	xerbla_("sgscon", &i);
-	return;
-    }
-
-    /* Quick return if possible */
-    *rcond = 0.;
-    if ( L->nrow == 0 || U->nrow == 0) {
-	*rcond = 1.;
-	return;
-    }
-
-    work = floatCalloc( 3*L->nrow );
-    iwork = intMalloc( L->nrow );
-
-
-    if ( !work || !iwork )
-	ABORT("Malloc fails for work arrays in sgscon.");
-    
-    /* Estimate the norm of inv(A). */
-    ainvnm = 0.;
-    if ( onenrm ) kase1 = 1;
-    else kase1 = 2;
-    kase = 0;
-
-    do {
-	slacon_(&L->nrow, &work[L->nrow], &work[0], &iwork[0], &ainvnm, &kase);
-
-	if (kase == 0) break;
-
-	if (kase == kase1) {
-	    /* Multiply by inv(L). */
-	    sp_strsv("L", "No trans", "Unit", L, U, &work[0], stat, info);
-
-	    /* Multiply by inv(U). */
-	    sp_strsv("U", "No trans", "Non-unit", L, U, &work[0], stat, info);
-	    
-	} else {
-
-	    /* Multiply by inv(U'). */
-	    sp_strsv("U", "Transpose", "Non-unit", L, U, &work[0], stat, info);
-
-	    /* Multiply by inv(L'). */
-	    sp_strsv("L", "Transpose", "Unit", L, U, &work[0], stat, info);
-	    
-	}
-
-    } while ( kase != 0 );
-
-    /* Compute the estimate of the reciprocal condition number. */
-    if (ainvnm != 0.) *rcond = (1. / ainvnm) / anorm;
-
-    SUPERLU_FREE (work);
-    SUPERLU_FREE (iwork);
-    return;
-
-} /* sgscon */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgsequ.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgsequ.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgsequ.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgsequ.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,184 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * File name:	sgsequ.c
- * History:     Modified from LAPACK routine SGEEQU
- */
-#include <math.h>
-#include "slu_sdefs.h"
-
-void
-sgsequ(SuperMatrix *A, float *r, float *c, float *rowcnd,
-	float *colcnd, float *amax, int *info)
-{
-/*    
-    Purpose   
-    =======   
-
-    SGSEQU computes row and column scalings intended to equilibrate an   
-    M-by-N sparse matrix A and reduce its condition number. R returns the row
-    scale factors and C the column scale factors, chosen to try to make   
-    the largest element in each row and column of the matrix B with   
-    elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.   
-
-    R(i) and C(j) are restricted to be between SMLNUM = smallest safe   
-    number and BIGNUM = largest safe number.  Use of these scaling   
-    factors is not guaranteed to reduce the condition number of A but   
-    works well in practice.   
-
-    See supermatrix.h for the definition of 'SuperMatrix' structure.
- 
-    Arguments   
-    =========   
-
-    A       (input) SuperMatrix*
-            The matrix of dimension (A->nrow, A->ncol) whose equilibration
-            factors are to be computed. The type of A can be:
-            Stype = SLU_NC; Dtype = SLU_S; Mtype = SLU_GE.
-	    
-    R       (output) float*, size A->nrow
-            If INFO = 0 or INFO > M, R contains the row scale factors   
-            for A.
-	    
-    C       (output) float*, size A->ncol
-            If INFO = 0,  C contains the column scale factors for A.
-	    
-    ROWCND  (output) float*
-            If INFO = 0 or INFO > M, ROWCND contains the ratio of the   
-            smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and   
-            AMAX is neither too large nor too small, it is not worth   
-            scaling by R.
-	    
-    COLCND  (output) float*
-            If INFO = 0, COLCND contains the ratio of the smallest   
-            C(i) to the largest C(i).  If COLCND >= 0.1, it is not   
-            worth scaling by C.
-	    
-    AMAX    (output) float*
-            Absolute value of largest matrix element.  If AMAX is very   
-            close to overflow or very close to underflow, the matrix   
-            should be scaled.
-	    
-    INFO    (output) int*
-            = 0:  successful exit   
-            < 0:  if INFO = -i, the i-th argument had an illegal value   
-            > 0:  if INFO = i,  and i is   
-                  <= A->nrow:  the i-th row of A is exactly zero   
-                  >  A->ncol:  the (i-M)-th column of A is exactly zero   
-
-    ===================================================================== 
-*/
-
-    /* Local variables */
-    NCformat *Astore;
-    float   *Aval;
-    int i, j, irow;
-    float rcmin, rcmax;
-    float bignum, smlnum;
-    extern double slamch_(char *);
-    
-    /* Test the input parameters. */
-    *info = 0;
-    if ( A->nrow < 0 || A->ncol < 0 ||
-	 A->Stype != SLU_NC || A->Dtype != SLU_S || A->Mtype != SLU_GE )
-	*info = -1;
-    if (*info != 0) {
-	i = -(*info);
-	xerbla_("sgsequ", &i);
-	return;
-    }
-
-    /* Quick return if possible */
-    if ( A->nrow == 0 || A->ncol == 0 ) {
-	*rowcnd = 1.;
-	*colcnd = 1.;
-	*amax = 0.;
-	return;
-    }
-
-    Astore = A->Store;
-    Aval = Astore->nzval;
-    
-    /* Get machine constants. */
-    smlnum = slamch_("S");
-    bignum = 1. / smlnum;
-
-    /* Compute row scale factors. */
-    for (i = 0; i < A->nrow; ++i) r[i] = 0.;
-
-    /* Find the maximum element in each row. */
-    for (j = 0; j < A->ncol; ++j)
-	for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-	    irow = Astore->rowind[i];
-	    r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[i]) );
-	}
-
-    /* Find the maximum and minimum scale factors. */
-    rcmin = bignum;
-    rcmax = 0.;
-    for (i = 0; i < A->nrow; ++i) {
-	rcmax = SUPERLU_MAX(rcmax, r[i]);
-	rcmin = SUPERLU_MIN(rcmin, r[i]);
-    }
-    *amax = rcmax;
-
-    if (rcmin == 0.) {
-	/* Find the first zero scale factor and return an error code. */
-	for (i = 0; i < A->nrow; ++i)
-	    if (r[i] == 0.) {
-		*info = i + 1;
-		return;
-	    }
-    } else {
-	/* Invert the scale factors. */
-	for (i = 0; i < A->nrow; ++i)
-	    r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum );
-	/* Compute ROWCND = min(R(I)) / max(R(I)) */
-	*rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
-    }
-
-    /* Compute column scale factors */
-    for (j = 0; j < A->ncol; ++j) c[j] = 0.;
-
-    /* Find the maximum element in each column, assuming the row
-       scalings computed above. */
-    for (j = 0; j < A->ncol; ++j)
-	for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-	    irow = Astore->rowind[i];
-	    c[j] = SUPERLU_MAX( c[j], fabs(Aval[i]) * r[irow] );
-	}
-
-    /* Find the maximum and minimum scale factors. */
-    rcmin = bignum;
-    rcmax = 0.;
-    for (j = 0; j < A->ncol; ++j) {
-	rcmax = SUPERLU_MAX(rcmax, c[j]);
-	rcmin = SUPERLU_MIN(rcmin, c[j]);
-    }
-
-    if (rcmin == 0.) {
-	/* Find the first zero scale factor and return an error code. */
-	for (j = 0; j < A->ncol; ++j)
-	    if ( c[j] == 0. ) {
-		*info = A->nrow + j + 1;
-		return;
-	    }
-    } else {
-	/* Invert the scale factors. */
-	for (j = 0; j < A->ncol; ++j)
-	    c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum);
-	/* Compute COLCND = min(C(J)) / max(C(J)) */
-	*colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
-    }
-
-    return;
-
-} /* sgsequ */
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgsrfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgsrfs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgsrfs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgsrfs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,437 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
- * File name:	sgsrfs.c
- * History:     Modified from lapack routine SGERFS
- */
-#include <math.h>
-#include "slu_sdefs.h"
-
-void
-sgsrfs(trans_t trans, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U,
-       int *perm_c, int *perm_r, char *equed, float *R, float *C,
-       SuperMatrix *B, SuperMatrix *X, float *ferr, float *berr,
-       SuperLUStat_t *stat, int *info)
-{
-/*
- *   Purpose   
- *   =======   
- *
- *   SGSRFS improves the computed solution to a system of linear   
- *   equations and provides error bounds and backward error estimates for 
- *   the solution.   
- *
- *   If equilibration was performed, the system becomes:
- *           (diag(R)*A_original*diag(C)) * X = diag(R)*B_original.
- *
- *   See supermatrix.h for the definition of 'SuperMatrix' structure.
- *
- *   Arguments   
- *   =========   
- *
- * trans   (input) trans_t
- *          Specifies the form of the system of equations:
- *          = NOTRANS: A * X = B  (No transpose)
- *          = TRANS:   A'* X = B  (Transpose)
- *          = CONJ:    A**H * X = B  (Conjugate transpose)
- *   
- *   A       (input) SuperMatrix*
- *           The original matrix A in the system, or the scaled A if
- *           equilibration was done. The type of A can be:
- *           Stype = SLU_NC, Dtype = SLU_S, Mtype = SLU_GE.
- *    
- *   L       (input) SuperMatrix*
- *	     The factor L from the factorization Pr*A*Pc=L*U. Use
- *           compressed row subscripts storage for supernodes, 
- *           i.e., L has types: Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU.
- * 
- *   U       (input) SuperMatrix*
- *           The factor U from the factorization Pr*A*Pc=L*U as computed by
- *           sgstrf(). Use column-wise storage scheme, 
- *           i.e., U has types: Stype = SLU_NC, Dtype = SLU_S, Mtype = SLU_TRU.
- *
- *   perm_c  (input) int*, dimension (A->ncol)
- *	     Column permutation vector, which defines the 
- *           permutation matrix Pc; perm_c[i] = j means column i of A is 
- *           in position j in A*Pc.
- *
- *   perm_r  (input) int*, dimension (A->nrow)
- *           Row permutation vector, which defines the permutation matrix Pr;
- *           perm_r[i] = j means row i of A is in position j in Pr*A.
- *
- *   equed   (input) Specifies the form of equilibration that was done.
- *           = 'N': No equilibration.
- *           = 'R': Row equilibration, i.e., A was premultiplied by diag(R).
- *           = 'C': Column equilibration, i.e., A was postmultiplied by
- *                  diag(C).
- *           = 'B': Both row and column equilibration, i.e., A was replaced 
- *                  by diag(R)*A*diag(C).
- *
- *   R       (input) float*, dimension (A->nrow)
- *           The row scale factors for A.
- *           If equed = 'R' or 'B', A is premultiplied by diag(R).
- *           If equed = 'N' or 'C', R is not accessed.
- * 
- *   C       (input) float*, dimension (A->ncol)
- *           The column scale factors for A.
- *           If equed = 'C' or 'B', A is postmultiplied by diag(C).
- *           If equed = 'N' or 'R', C is not accessed.
- *
- *   B       (input) SuperMatrix*
- *           B has types: Stype = SLU_DN, Dtype = SLU_S, Mtype = SLU_GE.
- *           The right hand side matrix B.
- *           if equed = 'R' or 'B', B is premultiplied by diag(R).
- *
- *   X       (input/output) SuperMatrix*
- *           X has types: Stype = SLU_DN, Dtype = SLU_S, Mtype = SLU_GE.
- *           On entry, the solution matrix X, as computed by sgstrs().
- *           On exit, the improved solution matrix X.
- *           if *equed = 'C' or 'B', X should be premultiplied by diag(C)
- *               in order to obtain the solution to the original system.
- *
- *   FERR    (output) float*, dimension (B->ncol)   
- *           The estimated forward error bound for each solution vector   
- *           X(j) (the j-th column of the solution matrix X).   
- *           If XTRUE is the true solution corresponding to X(j), FERR(j) 
- *           is an estimated upper bound for the magnitude of the largest 
- *           element in (X(j) - XTRUE) divided by the magnitude of the   
- *           largest element in X(j).  The estimate is as reliable as   
- *           the estimate for RCOND, and is almost always a slight   
- *           overestimate of the true error.
- *
- *   BERR    (output) float*, dimension (B->ncol)   
- *           The componentwise relative backward error of each solution   
- *           vector X(j) (i.e., the smallest relative change in   
- *           any element of A or B that makes X(j) an exact solution).
- *
- *   stat     (output) SuperLUStat_t*
- *            Record the statistics on runtime and floating-point operation count.
- *            See util.h for the definition of 'SuperLUStat_t'.
- *
- *   info    (output) int*   
- *           = 0:  successful exit   
- *            < 0:  if INFO = -i, the i-th argument had an illegal value   
- *
- *    Internal Parameters   
- *    ===================   
- *
- *    ITMAX is the maximum number of steps of iterative refinement.   
- *
- */  
-
-#define ITMAX 5
-    
-    /* Table of constant values */
-    int    ione = 1;
-    float ndone = -1.;
-    float done = 1.;
-    
-    /* Local variables */
-    NCformat *Astore;
-    float   *Aval;
-    SuperMatrix Bjcol;
-    DNformat *Bstore, *Xstore, *Bjcol_store;
-    float   *Bmat, *Xmat, *Bptr, *Xptr;
-    int      kase;
-    float   safe1, safe2;
-    int      i, j, k, irow, nz, count, notran, rowequ, colequ;
-    int      ldb, ldx, nrhs;
-    float   s, xk, lstres, eps, safmin;
-    char     transc[1];
-    trans_t  transt;
-    float   *work;
-    float   *rwork;
-    int      *iwork;
-    extern double slamch_(char *);
-    extern int slacon_(int *, float *, float *, int *, float *, int *);
-#ifdef _CRAY
-    extern int SCOPY(int *, float *, int *, float *, int *);
-    extern int SSAXPY(int *, float *, float *, int *, float *, int *);
-#else
-    extern int scopy_(int *, float *, int *, float *, int *);
-    extern int saxpy_(int *, float *, float *, int *, float *, int *);
-#endif
-
-    Astore = A->Store;
-    Aval   = Astore->nzval;
-    Bstore = B->Store;
-    Xstore = X->Store;
-    Bmat   = Bstore->nzval;
-    Xmat   = Xstore->nzval;
-    ldb    = Bstore->lda;
-    ldx    = Xstore->lda;
-    nrhs   = B->ncol;
-    
-    /* Test the input parameters */
-    *info = 0;
-    notran = (trans == NOTRANS);
-    if ( !notran && trans != TRANS && trans != CONJ ) *info = -1;
-    else if ( A->nrow != A->ncol || A->nrow < 0 ||
-	      A->Stype != SLU_NC || A->Dtype != SLU_S || A->Mtype != SLU_GE )
-	*info = -2;
-    else if ( L->nrow != L->ncol || L->nrow < 0 ||
- 	      L->Stype != SLU_SC || L->Dtype != SLU_S || L->Mtype != SLU_TRLU )
-	*info = -3;
-    else if ( U->nrow != U->ncol || U->nrow < 0 ||
- 	      U->Stype != SLU_NC || U->Dtype != SLU_S || U->Mtype != SLU_TRU )
-	*info = -4;
-    else if ( ldb < SUPERLU_MAX(0, A->nrow) ||
- 	      B->Stype != SLU_DN || B->Dtype != SLU_S || B->Mtype != SLU_GE )
-        *info = -10;
-    else if ( ldx < SUPERLU_MAX(0, A->nrow) ||
- 	      X->Stype != SLU_DN || X->Dtype != SLU_S || X->Mtype != SLU_GE )
-	*info = -11;
-    if (*info != 0) {
-	i = -(*info);
-	xerbla_("sgsrfs", &i);
-	return;
-    }
-
-    /* Quick return if possible */
-    if ( A->nrow == 0 || nrhs == 0) {
-	for (j = 0; j < nrhs; ++j) {
-	    ferr[j] = 0.;
-	    berr[j] = 0.;
-	}
-	return;
-    }
-
-    rowequ = lsame_(equed, "R") || lsame_(equed, "B");
-    colequ = lsame_(equed, "C") || lsame_(equed, "B");
-    
-    /* Allocate working space */
-    work = floatMalloc(2*A->nrow);
-    rwork = (float *) SUPERLU_MALLOC( A->nrow * sizeof(float) );
-    iwork = intMalloc(2*A->nrow);
-    if ( !work || !rwork || !iwork ) 
-        ABORT("Malloc fails for work/rwork/iwork.");
-    
-    if ( notran ) {
-	*(unsigned char *)transc = 'N';
-        transt = TRANS;
-    } else {
-	*(unsigned char *)transc = 'T';
-	transt = NOTRANS;
-    }
-
-    /* NZ = maximum number of nonzero elements in each row of A, plus 1 */
-    nz     = A->ncol + 1;
-    eps    = slamch_("Epsilon");
-    safmin = slamch_("Safe minimum");
-    safe1  = nz * safmin;
-    safe2  = safe1 / eps;
-
-    /* Compute the number of nonzeros in each row (or column) of A */
-    for (i = 0; i < A->nrow; ++i) iwork[i] = 0;
-    if ( notran ) {
-	for (k = 0; k < A->ncol; ++k)
-	    for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) 
-		++iwork[Astore->rowind[i]];
-    } else {
-	for (k = 0; k < A->ncol; ++k)
-	    iwork[k] = Astore->colptr[k+1] - Astore->colptr[k];
-    }	
-
-    /* Copy one column of RHS B into Bjcol. */
-    Bjcol.Stype = B->Stype;
-    Bjcol.Dtype = B->Dtype;
-    Bjcol.Mtype = B->Mtype;
-    Bjcol.nrow  = B->nrow;
-    Bjcol.ncol  = 1;
-    Bjcol.Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) );
-    if ( !Bjcol.Store ) ABORT("SUPERLU_MALLOC fails for Bjcol.Store");
-    Bjcol_store = Bjcol.Store;
-    Bjcol_store->lda = ldb;
-    Bjcol_store->nzval = work; /* address aliasing */
-	
-    /* Do for each right hand side ... */
-    for (j = 0; j < nrhs; ++j) {
-	count = 0;
-	lstres = 3.;
-	Bptr = &Bmat[j*ldb];
-	Xptr = &Xmat[j*ldx];
-
-	while (1) { /* Loop until stopping criterion is satisfied. */
-
-	    /* Compute residual R = B - op(A) * X,   
-	       where op(A) = A, A**T, or A**H, depending on TRANS. */
-	    
-#ifdef _CRAY
-	    SCOPY(&A->nrow, Bptr, &ione, work, &ione);
-#else
-	    scopy_(&A->nrow, Bptr, &ione, work, &ione);
-#endif
-	    sp_sgemv(transc, ndone, A, Xptr, ione, done, work, ione);
-
-	    /* Compute componentwise relative backward error from formula 
-	       max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) )   
-	       where abs(Z) is the componentwise absolute value of the matrix
-	       or vector Z.  If the i-th component of the denominator is less
-	       than SAFE2, then SAFE1 is added to the i-th component of the   
-	       numerator and denominator before dividing. */
-
-	    for (i = 0; i < A->nrow; ++i) rwork[i] = fabs( Bptr[i] );
-	    
-	    /* Compute abs(op(A))*abs(X) + abs(B). */
-	    if (notran) {
-		for (k = 0; k < A->ncol; ++k) {
-		    xk = fabs( Xptr[k] );
-		    for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i)
-			rwork[Astore->rowind[i]] += fabs(Aval[i]) * xk;
-		}
-	    } else {
-		for (k = 0; k < A->ncol; ++k) {
-		    s = 0.;
-		    for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) {
-			irow = Astore->rowind[i];
-			s += fabs(Aval[i]) * fabs(Xptr[irow]);
-		    }
-		    rwork[k] += s;
-		}
-	    }
-	    s = 0.;
-	    for (i = 0; i < A->nrow; ++i) {
-		if (rwork[i] > safe2)
-		    s = SUPERLU_MAX( s, fabs(work[i]) / rwork[i] );
-		else
-		    s = SUPERLU_MAX( s, (fabs(work[i]) + safe1) / 
-				(rwork[i] + safe1) );
-	    }
-	    berr[j] = s;
-
-	    /* Test stopping criterion. Continue iterating if   
-	       1) The residual BERR(J) is larger than machine epsilon, and   
-	       2) BERR(J) decreased by at least a factor of 2 during the   
-	          last iteration, and   
-	       3) At most ITMAX iterations tried. */
-
-	    if (berr[j] > eps && berr[j] * 2. <= lstres && count < ITMAX) {
-		/* Update solution and try again. */
-		sgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info);
-		
-#ifdef _CRAY
-		SAXPY(&A->nrow, &done, work, &ione,
-		       &Xmat[j*ldx], &ione);
-#else
-		saxpy_(&A->nrow, &done, work, &ione,
-		       &Xmat[j*ldx], &ione);
-#endif
-		lstres = berr[j];
-		++count;
-	    } else {
-		break;
-	    }
-        
-	} /* end while */
-
-	stat->RefineSteps = count;
-
-	/* Bound error from formula:
-	   norm(X - XTRUE) / norm(X) .le. FERR = norm( abs(inv(op(A)))*   
-	   ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X)   
-          where   
-            norm(Z) is the magnitude of the largest component of Z   
-            inv(op(A)) is the inverse of op(A)   
-            abs(Z) is the componentwise absolute value of the matrix or
-	       vector Z   
-            NZ is the maximum number of nonzeros in any row of A, plus 1   
-            EPS is machine epsilon   
-
-          The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B))   
-          is incremented by SAFE1 if the i-th component of   
-          abs(op(A))*abs(X) + abs(B) is less than SAFE2.   
-
-          Use SLACON to estimate the infinity-norm of the matrix   
-             inv(op(A)) * diag(W),   
-          where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */
-	
-	for (i = 0; i < A->nrow; ++i) rwork[i] = fabs( Bptr[i] );
-	
-	/* Compute abs(op(A))*abs(X) + abs(B). */
-	if ( notran ) {
-	    for (k = 0; k < A->ncol; ++k) {
-		xk = fabs( Xptr[k] );
-		for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i)
-		    rwork[Astore->rowind[i]] += fabs(Aval[i]) * xk;
-	    }
-	} else {
-	    for (k = 0; k < A->ncol; ++k) {
-		s = 0.;
-		for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) {
-		    irow = Astore->rowind[i];
-		    xk = fabs( Xptr[irow] );
-		    s += fabs(Aval[i]) * xk;
-		}
-		rwork[k] += s;
-	    }
-	}
-	
-	for (i = 0; i < A->nrow; ++i)
-	    if (rwork[i] > safe2)
-		rwork[i] = fabs(work[i]) + (iwork[i]+1)*eps*rwork[i];
-	    else
-		rwork[i] = fabs(work[i])+(iwork[i]+1)*eps*rwork[i]+safe1;
-
-	kase = 0;
-
-	do {
-	    slacon_(&A->nrow, &work[A->nrow], work,
-		    &iwork[A->nrow], &ferr[j], &kase);
-	    if (kase == 0) break;
-
-	    if (kase == 1) {
-		/* Multiply by diag(W)*inv(op(A)**T)*(diag(C) or diag(R)). */
-		if ( notran && colequ )
-		    for (i = 0; i < A->ncol; ++i) work[i] *= C[i];
-		else if ( !notran && rowequ )
-		    for (i = 0; i < A->nrow; ++i) work[i] *= R[i];
-		
-		sgstrs (transt, L, U, perm_c, perm_r, &Bjcol, stat, info);
-		
-		for (i = 0; i < A->nrow; ++i) work[i] *= rwork[i];
-	    } else {
-		/* Multiply by (diag(C) or diag(R))*inv(op(A))*diag(W). */
-		for (i = 0; i < A->nrow; ++i) work[i] *= rwork[i];
-		
-		sgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info);
-		
-		if ( notran && colequ )
-		    for (i = 0; i < A->ncol; ++i) work[i] *= C[i];
-		else if ( !notran && rowequ )
-		    for (i = 0; i < A->ncol; ++i) work[i] *= R[i];  
-	    }
-	    
-	} while ( kase != 0 );
-
-
-	/* Normalize error. */
-	lstres = 0.;
- 	if ( notran && colequ ) {
-	    for (i = 0; i < A->nrow; ++i)
-	    	lstres = SUPERLU_MAX( lstres, C[i] * fabs( Xptr[i]) );
-  	} else if ( !notran && rowequ ) {
-	    for (i = 0; i < A->nrow; ++i)
-	    	lstres = SUPERLU_MAX( lstres, R[i] * fabs( Xptr[i]) );
-	} else {
-	    for (i = 0; i < A->nrow; ++i)
-	    	lstres = SUPERLU_MAX( lstres, fabs( Xptr[i]) );
-	}
-	if ( lstres != 0. )
-	    ferr[j] /= lstres;
-
-    } /* for each RHS j ... */
-    
-    SUPERLU_FREE(work);
-    SUPERLU_FREE(rwork);
-    SUPERLU_FREE(iwork);
-    SUPERLU_FREE(Bjcol.Store);
-
-    return;
-
-} /* sgsrfs */
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgssv.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgssv.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgssv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgssv.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,221 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-#include "slu_sdefs.h"
-
-void
-sgssv(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r,
-      SuperMatrix *L, SuperMatrix *U, SuperMatrix *B,
-      SuperLUStat_t *stat, int *info )
-{
-/*
- * Purpose
- * =======
- *
- * SGSSV solves the system of linear equations A*X=B, using the
- * LU factorization from SGSTRF. It performs the following steps:
- *
- *   1. If A is stored column-wise (A->Stype = SLU_NC):
- *
- *      1.1. Permute the columns of A, forming A*Pc, where Pc
- *           is a permutation matrix. For more details of this step, 
- *           see sp_preorder.c.
- *
- *      1.2. Factor A as Pr*A*Pc=L*U with the permutation Pr determined
- *           by Gaussian elimination with partial pivoting.
- *           L is unit lower triangular with offdiagonal entries
- *           bounded by 1 in magnitude, and U is upper triangular.
- *
- *      1.3. Solve the system of equations A*X=B using the factored
- *           form of A.
- *
- *   2. If A is stored row-wise (A->Stype = SLU_NR), apply the
- *      above algorithm to the transpose of A:
- *
- *      2.1. Permute columns of transpose(A) (rows of A),
- *           forming transpose(A)*Pc, where Pc is a permutation matrix. 
- *           For more details of this step, see sp_preorder.c.
- *
- *      2.2. Factor A as Pr*transpose(A)*Pc=L*U with the permutation Pr
- *           determined by Gaussian elimination with partial pivoting.
- *           L is unit lower triangular with offdiagonal entries
- *           bounded by 1 in magnitude, and U is upper triangular.
- *
- *      2.3. Solve the system of equations A*X=B using the factored
- *           form of A.
- *
- *   See supermatrix.h for the definition of 'SuperMatrix' structure.
- * 
- * Arguments
- * =========
- *
- * options (input) superlu_options_t*
- *         The structure defines the input parameters to control
- *         how the LU decomposition will be performed and how the
- *         system will be solved.
- *
- * A       (input) SuperMatrix*
- *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
- *         of linear equations is A->nrow. Currently, the type of A can be:
- *         Stype = SLU_NC or SLU_NR; Dtype = SLU_S; Mtype = SLU_GE.
- *         In the future, more general A may be handled.
- *
- * perm_c  (input/output) int*
- *         If A->Stype = SLU_NC, column permutation vector of size A->ncol
- *         which defines the permutation matrix Pc; perm_c[i] = j means 
- *         column i of A is in position j in A*Pc.
- *         If A->Stype = SLU_NR, column permutation vector of size A->nrow
- *         which describes permutation of columns of transpose(A) 
- *         (rows of A) as described above.
- * 
- *         If options->ColPerm = MY_PERMC or options->Fact = SamePattern or
- *            options->Fact = SamePattern_SameRowPerm, it is an input argument.
- *            On exit, perm_c may be overwritten by the product of the input
- *            perm_c and a permutation that postorders the elimination tree
- *            of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree
- *            is already in postorder.
- *         Otherwise, it is an output argument.
- * 
- * perm_r  (input/output) int*
- *         If A->Stype = SLU_NC, row permutation vector of size A->nrow, 
- *         which defines the permutation matrix Pr, and is determined 
- *         by partial pivoting.  perm_r[i] = j means row i of A is in 
- *         position j in Pr*A.
- *         If A->Stype = SLU_NR, permutation vector of size A->ncol, which
- *         determines permutation of rows of transpose(A)
- *         (columns of A) as described above.
- *
- *         If options->RowPerm = MY_PERMR or
- *            options->Fact = SamePattern_SameRowPerm, perm_r is an
- *            input argument.
- *         otherwise it is an output argument.
- *
- * L       (output) SuperMatrix*
- *         The factor L from the factorization 
- *             Pr*A*Pc=L*U              (if A->Stype = SLU_NC) or
- *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
- *         Uses compressed row subscripts storage for supernodes, i.e.,
- *         L has types: Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU.
- *         
- * U       (output) SuperMatrix*
- *	   The factor U from the factorization 
- *             Pr*A*Pc=L*U              (if A->Stype = SLU_NC) or
- *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
- *         Uses column-wise storage scheme, i.e., U has types:
- *         Stype = SLU_NC, Dtype = SLU_S, Mtype = SLU_TRU.
- *
- * B       (input/output) SuperMatrix*
- *         B has types: Stype = SLU_DN, Dtype = SLU_S, Mtype = SLU_GE.
- *         On entry, the right hand side matrix.
- *         On exit, the solution matrix if info = 0;
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics on runtime and floating-point operation count.
- *        See util.h for the definition of 'SuperLUStat_t'.
- *
- * info    (output) int*
- *	   = 0: successful exit
- *         > 0: if info = i, and i is
- *             <= A->ncol: U(i,i) is exactly zero. The factorization has
- *                been completed, but the factor U is exactly singular,
- *                so the solution could not be computed.
- *             > A->ncol: number of bytes allocated when memory allocation
- *                failure occurred, plus A->ncol.
- *   
- */
-    DNformat *Bstore;
-    SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/
-    SuperMatrix AC; /* Matrix postmultiplied by Pc */
-    int      lwork = 0, *etree, i;
-    
-    /* Set default values for some parameters */
-    float   drop_tol = 0.;
-    int      panel_size;     /* panel size */
-    int      relax;          /* no of columns in a relaxed snodes */
-    int      permc_spec;
-    trans_t  trans = NOTRANS;
-    double   *utime;
-    double   t;	/* Temporary time */
-
-    /* Test the input parameters ... */
-    *info = 0;
-    Bstore = B->Store;
-    if ( options->Fact != DOFACT ) *info = -1;
-    else if ( A->nrow != A->ncol || A->nrow < 0 ||
-	 (A->Stype != SLU_NC && A->Stype != SLU_NR) ||
-	 A->Dtype != SLU_S || A->Mtype != SLU_GE )
-	*info = -2;
-    else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) ||
-	B->Stype != SLU_DN || B->Dtype != SLU_S || B->Mtype != SLU_GE )
-	*info = -7;
-    if ( *info != 0 ) {
-	i = -(*info);
-	xerbla_("sgssv", &i);
-	return;
-    }
-
-    utime = stat->utime;
-
-    /* Convert A to SLU_NC format when necessary. */
-    if ( A->Stype == SLU_NR ) {
-	NRformat *Astore = A->Store;
-	AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
-	sCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, 
-			       Astore->nzval, Astore->colind, Astore->rowptr,
-			       SLU_NC, A->Dtype, A->Mtype);
-	trans = TRANS;
-    } else {
-        if ( A->Stype == SLU_NC ) AA = A;
-    }
-
-    t = SuperLU_timer_();
-    /*
-     * Get column permutation vector perm_c[], according to permc_spec:
-     *   permc_spec = NATURAL:  natural ordering 
-     *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
-     *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
-     *   permc_spec = COLAMD:   approximate minimum degree column ordering
-     *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
-     */
-    permc_spec = options->ColPerm;
-    if ( permc_spec != MY_PERMC && options->Fact == DOFACT )
-      get_perm_c(permc_spec, AA, perm_c);
-    utime[COLPERM] = SuperLU_timer_() - t;
-
-    etree = intMalloc(A->ncol);
-
-    t = SuperLU_timer_();
-    sp_preorder(options, AA, perm_c, etree, &AC);
-    utime[ETREE] = SuperLU_timer_() - t;
-
-    panel_size = sp_ienv(1);
-    relax = sp_ienv(2);
-
-    /*printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", 
-	  relax, panel_size, sp_ienv(3), sp_ienv(4));*/
-    t = SuperLU_timer_(); 
-    /* Compute the LU factorization of A. */
-    sgstrf(options, &AC, drop_tol, relax, panel_size,
-	   etree, NULL, lwork, perm_c, perm_r, L, U, stat, info);
-    utime[FACT] = SuperLU_timer_() - t;
-
-    t = SuperLU_timer_();
-    if ( *info == 0 ) {
-        /* Solve the system A*X=B, overwriting B with X. */
-        sgstrs (trans, L, U, perm_c, perm_r, B, stat, info);
-    }
-    utime[SOLVE] = SuperLU_timer_() - t;
-
-    SUPERLU_FREE (etree);
-    Destroy_CompCol_Permuted(&AC);
-    if ( A->Stype == SLU_NR ) {
-	Destroy_SuperMatrix_Store(AA);
-	SUPERLU_FREE(AA);
-    }
-
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgssvx.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgssvx.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgssvx.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgssvx.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,614 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-#include "slu_sdefs.h"
-
-void
-sgssvx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r,
-       int *etree, char *equed, float *R, float *C,
-       SuperMatrix *L, SuperMatrix *U, void *work, int lwork,
-       SuperMatrix *B, SuperMatrix *X, float *recip_pivot_growth, 
-       float *rcond, float *ferr, float *berr, 
-       mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info )
-{
-/*
- * Purpose
- * =======
- *
- * SGSSVX solves the system of linear equations A*X=B or A'*X=B, using
- * the LU factorization from sgstrf(). Error bounds on the solution and
- * a condition estimate are also provided. It performs the following steps:
- *
- *   1. If A is stored column-wise (A->Stype = SLU_NC):
- *  
- *      1.1. If options->Equil = YES, scaling factors are computed to
- *           equilibrate the system:
- *           options->Trans = NOTRANS:
- *               diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B
- *           options->Trans = TRANS:
- *               (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
- *           options->Trans = CONJ:
- *               (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
- *           Whether or not the system will be equilibrated depends on the
- *           scaling of the matrix A, but if equilibration is used, A is
- *           overwritten by diag(R)*A*diag(C) and B by diag(R)*B
- *           (if options->Trans=NOTRANS) or diag(C)*B (if options->Trans
- *           = TRANS or CONJ).
- *
- *      1.2. Permute columns of A, forming A*Pc, where Pc is a permutation
- *           matrix that usually preserves sparsity.
- *           For more details of this step, see sp_preorder.c.
- *
- *      1.3. If options->Fact != FACTORED, the LU decomposition is used to
- *           factor the matrix A (after equilibration if options->Equil = YES)
- *           as Pr*A*Pc = L*U, with Pr determined by partial pivoting.
- *
- *      1.4. Compute the reciprocal pivot growth factor.
- *
- *      1.5. If some U(i,i) = 0, so that U is exactly singular, then the
- *           routine returns with info = i. Otherwise, the factored form of 
- *           A is used to estimate the condition number of the matrix A. If
- *           the reciprocal of the condition number is less than machine
- *           precision, info = A->ncol+1 is returned as a warning, but the
- *           routine still goes on to solve for X and computes error bounds
- *           as described below.
- *
- *      1.6. The system of equations is solved for X using the factored form
- *           of A.
- *
- *      1.7. If options->IterRefine != NOREFINE, iterative refinement is
- *           applied to improve the computed solution matrix and calculate
- *           error bounds and backward error estimates for it.
- *
- *      1.8. If equilibration was used, the matrix X is premultiplied by
- *           diag(C) (if options->Trans = NOTRANS) or diag(R)
- *           (if options->Trans = TRANS or CONJ) so that it solves the
- *           original system before equilibration.
- *
- *   2. If A is stored row-wise (A->Stype = SLU_NR), apply the above algorithm
- *      to the transpose of A:
- *
- *      2.1. If options->Equil = YES, scaling factors are computed to
- *           equilibrate the system:
- *           options->Trans = NOTRANS:
- *               diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B
- *           options->Trans = TRANS:
- *               (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
- *           options->Trans = CONJ:
- *               (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
- *           Whether or not the system will be equilibrated depends on the
- *           scaling of the matrix A, but if equilibration is used, A' is
- *           overwritten by diag(R)*A'*diag(C) and B by diag(R)*B 
- *           (if trans='N') or diag(C)*B (if trans = 'T' or 'C').
- *
- *      2.2. Permute columns of transpose(A) (rows of A), 
- *           forming transpose(A)*Pc, where Pc is a permutation matrix that 
- *           usually preserves sparsity.
- *           For more details of this step, see sp_preorder.c.
- *
- *      2.3. If options->Fact != FACTORED, the LU decomposition is used to
- *           factor the transpose(A) (after equilibration if 
- *           options->Fact = YES) as Pr*transpose(A)*Pc = L*U with the
- *           permutation Pr determined by partial pivoting.
- *
- *      2.4. Compute the reciprocal pivot growth factor.
- *
- *      2.5. If some U(i,i) = 0, so that U is exactly singular, then the
- *           routine returns with info = i. Otherwise, the factored form 
- *           of transpose(A) is used to estimate the condition number of the
- *           matrix A. If the reciprocal of the condition number
- *           is less than machine precision, info = A->nrow+1 is returned as
- *           a warning, but the routine still goes on to solve for X and
- *           computes error bounds as described below.
- *
- *      2.6. The system of equations is solved for X using the factored form
- *           of transpose(A).
- *
- *      2.7. If options->IterRefine != NOREFINE, iterative refinement is
- *           applied to improve the computed solution matrix and calculate
- *           error bounds and backward error estimates for it.
- *
- *      2.8. If equilibration was used, the matrix X is premultiplied by
- *           diag(C) (if options->Trans = NOTRANS) or diag(R) 
- *           (if options->Trans = TRANS or CONJ) so that it solves the
- *           original system before equilibration.
- *
- *   See supermatrix.h for the definition of 'SuperMatrix' structure.
- *
- * Arguments
- * =========
- *
- * options (input) superlu_options_t*
- *         The structure defines the input parameters to control
- *         how the LU decomposition will be performed and how the
- *         system will be solved.
- *
- * A       (input/output) SuperMatrix*
- *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
- *         of the linear equations is A->nrow. Currently, the type of A can be:
- *         Stype = SLU_NC or SLU_NR, Dtype = SLU_D, Mtype = SLU_GE.
- *         In the future, more general A may be handled.
- *
- *         On entry, If options->Fact = FACTORED and equed is not 'N', 
- *         then A must have been equilibrated by the scaling factors in
- *         R and/or C.  
- *         On exit, A is not modified if options->Equil = NO, or if 
- *         options->Equil = YES but equed = 'N' on exit.
- *         Otherwise, if options->Equil = YES and equed is not 'N',
- *         A is scaled as follows:
- *         If A->Stype = SLU_NC:
- *           equed = 'R':  A := diag(R) * A
- *           equed = 'C':  A := A * diag(C)
- *           equed = 'B':  A := diag(R) * A * diag(C).
- *         If A->Stype = SLU_NR:
- *           equed = 'R':  transpose(A) := diag(R) * transpose(A)
- *           equed = 'C':  transpose(A) := transpose(A) * diag(C)
- *           equed = 'B':  transpose(A) := diag(R) * transpose(A) * diag(C).
- *
- * perm_c  (input/output) int*
- *	   If A->Stype = SLU_NC, Column permutation vector of size A->ncol,
- *         which defines the permutation matrix Pc; perm_c[i] = j means
- *         column i of A is in position j in A*Pc.
- *         On exit, perm_c may be overwritten by the product of the input
- *         perm_c and a permutation that postorders the elimination tree
- *         of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree
- *         is already in postorder.
- *
- *         If A->Stype = SLU_NR, column permutation vector of size A->nrow,
- *         which describes permutation of columns of transpose(A) 
- *         (rows of A) as described above.
- * 
- * perm_r  (input/output) int*
- *         If A->Stype = SLU_NC, row permutation vector of size A->nrow, 
- *         which defines the permutation matrix Pr, and is determined
- *         by partial pivoting.  perm_r[i] = j means row i of A is in 
- *         position j in Pr*A.
- *
- *         If A->Stype = SLU_NR, permutation vector of size A->ncol, which
- *         determines permutation of rows of transpose(A)
- *         (columns of A) as described above.
- *
- *         If options->Fact = SamePattern_SameRowPerm, the pivoting routine
- *         will try to use the input perm_r, unless a certain threshold
- *         criterion is violated. In that case, perm_r is overwritten by a
- *         new permutation determined by partial pivoting or diagonal
- *         threshold pivoting.
- *         Otherwise, perm_r is output argument.
- * 
- * etree   (input/output) int*,  dimension (A->ncol)
- *         Elimination tree of Pc'*A'*A*Pc.
- *         If options->Fact != FACTORED and options->Fact != DOFACT,
- *         etree is an input argument, otherwise it is an output argument.
- *         Note: etree is a vector of parent pointers for a forest whose
- *         vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
- *
- * equed   (input/output) char*
- *         Specifies the form of equilibration that was done.
- *         = 'N': No equilibration.
- *         = 'R': Row equilibration, i.e., A was premultiplied by diag(R).
- *         = 'C': Column equilibration, i.e., A was postmultiplied by diag(C).
- *         = 'B': Both row and column equilibration, i.e., A was replaced 
- *                by diag(R)*A*diag(C).
- *         If options->Fact = FACTORED, equed is an input argument,
- *         otherwise it is an output argument.
- *
- * R       (input/output) float*, dimension (A->nrow)
- *         The row scale factors for A or transpose(A).
- *         If equed = 'R' or 'B', A (if A->Stype = SLU_NC) or transpose(A)
- *             (if A->Stype = SLU_NR) is multiplied on the left by diag(R).
- *         If equed = 'N' or 'C', R is not accessed.
- *         If options->Fact = FACTORED, R is an input argument,
- *             otherwise, R is output.
- *         If options->zFact = FACTORED and equed = 'R' or 'B', each element
- *             of R must be positive.
- * 
- * C       (input/output) float*, dimension (A->ncol)
- *         The column scale factors for A or transpose(A).
- *         If equed = 'C' or 'B', A (if A->Stype = SLU_NC) or transpose(A)
- *             (if A->Stype = SLU_NR) is multiplied on the right by diag(C).
- *         If equed = 'N' or 'R', C is not accessed.
- *         If options->Fact = FACTORED, C is an input argument,
- *             otherwise, C is output.
- *         If options->Fact = FACTORED and equed = 'C' or 'B', each element
- *             of C must be positive.
- *         
- * L       (output) SuperMatrix*
- *	   The factor L from the factorization
- *             Pr*A*Pc=L*U              (if A->Stype SLU_= NC) or
- *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
- *         Uses compressed row subscripts storage for supernodes, i.e.,
- *         L has types: Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU.
- *
- * U       (output) SuperMatrix*
- *	   The factor U from the factorization
- *             Pr*A*Pc=L*U              (if A->Stype = SLU_NC) or
- *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
- *         Uses column-wise storage scheme, i.e., U has types:
- *         Stype = SLU_NC, Dtype = SLU_S, Mtype = SLU_TRU.
- *
- * work    (workspace/output) void*, size (lwork) (in bytes)
- *         User supplied workspace, should be large enough
- *         to hold data structures for factors L and U.
- *         On exit, if fact is not 'F', L and U point to this array.
- *
- * lwork   (input) int
- *         Specifies the size of work array in bytes.
- *         = 0:  allocate space internally by system malloc;
- *         > 0:  use user-supplied work array of length lwork in bytes,
- *               returns error if space runs out.
- *         = -1: the routine guesses the amount of space needed without
- *               performing the factorization, and returns it in
- *               mem_usage->total_needed; no other side effects.
- *
- *         See argument 'mem_usage' for memory usage statistics.
- *
- * B       (input/output) SuperMatrix*
- *         B has types: Stype = SLU_DN, Dtype = SLU_S, Mtype = SLU_GE.
- *         On entry, the right hand side matrix.
- *         If B->ncol = 0, only LU decomposition is performed, the triangular
- *                         solve is skipped.
- *         On exit,
- *            if equed = 'N', B is not modified; otherwise
- *            if A->Stype = SLU_NC:
- *               if options->Trans = NOTRANS and equed = 'R' or 'B',
- *                  B is overwritten by diag(R)*B;
- *               if options->Trans = TRANS or CONJ and equed = 'C' of 'B',
- *                  B is overwritten by diag(C)*B;
- *            if A->Stype = SLU_NR:
- *               if options->Trans = NOTRANS and equed = 'C' or 'B',
- *                  B is overwritten by diag(C)*B;
- *               if options->Trans = TRANS or CONJ and equed = 'R' of 'B',
- *                  B is overwritten by diag(R)*B.
- *
- * X       (output) SuperMatrix*
- *         X has types: Stype = SLU_DN, Dtype = SLU_S, Mtype = SLU_GE. 
- *         If info = 0 or info = A->ncol+1, X contains the solution matrix
- *         to the original system of equations. Note that A and B are modified
- *         on exit if equed is not 'N', and the solution to the equilibrated
- *         system is inv(diag(C))*X if options->Trans = NOTRANS and
- *         equed = 'C' or 'B', or inv(diag(R))*X if options->Trans = 'T' or 'C'
- *         and equed = 'R' or 'B'.
- *
- * recip_pivot_growth (output) float*
- *         The reciprocal pivot growth factor max_j( norm(A_j)/norm(U_j) ).
- *         The infinity norm is used. If recip_pivot_growth is much less
- *         than 1, the stability of the LU factorization could be poor.
- *
- * rcond   (output) float*
- *         The estimate of the reciprocal condition number of the matrix A
- *         after equilibration (if done). If rcond is less than the machine
- *         precision (in particular, if rcond = 0), the matrix is singular
- *         to working precision. This condition is indicated by a return
- *         code of info > 0.
- *
- * FERR    (output) float*, dimension (B->ncol)   
- *         The estimated forward error bound for each solution vector   
- *         X(j) (the j-th column of the solution matrix X).   
- *         If XTRUE is the true solution corresponding to X(j), FERR(j) 
- *         is an estimated upper bound for the magnitude of the largest 
- *         element in (X(j) - XTRUE) divided by the magnitude of the   
- *         largest element in X(j).  The estimate is as reliable as   
- *         the estimate for RCOND, and is almost always a slight   
- *         overestimate of the true error.
- *         If options->IterRefine = NOREFINE, ferr = 1.0.
- *
- * BERR    (output) float*, dimension (B->ncol)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
- *         any element of A or B that makes X(j) an exact solution).
- *         If options->IterRefine = NOREFINE, berr = 1.0.
- *
- * mem_usage (output) mem_usage_t*
- *         Record the memory usage statistics, consisting of following fields:
- *         - for_lu (float)
- *           The amount of space used in bytes for L\U data structures.
- *         - total_needed (float)
- *           The amount of space needed in bytes to perform factorization.
- *         - expansions (int)
- *           The number of memory expansions during the LU factorization.
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics on runtime and floating-point operation count.
- *        See util.h for the definition of 'SuperLUStat_t'.
- *
- * info    (output) int*
- *         = 0: successful exit   
- *         < 0: if info = -i, the i-th argument had an illegal value   
- *         > 0: if info = i, and i is   
- *              <= A->ncol: U(i,i) is exactly zero. The factorization has   
- *                    been completed, but the factor U is exactly   
- *                    singular, so the solution and error bounds   
- *                    could not be computed.   
- *              = A->ncol+1: U is nonsingular, but RCOND is less than machine
- *                    precision, meaning that the matrix is singular to
- *                    working precision. Nevertheless, the solution and
- *                    error bounds are computed because there are a number
- *                    of situations where the computed solution can be more
- *                    accurate than the value of RCOND would suggest.   
- *              > A->ncol+1: number of bytes allocated when memory allocation
- *                    failure occurred, plus A->ncol.
- *
- */
-
-    DNformat  *Bstore, *Xstore;
-    float    *Bmat, *Xmat;
-    int       ldb, ldx, nrhs;
-    SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/
-    SuperMatrix AC; /* Matrix postmultiplied by Pc */
-    int       colequ, equil, nofact, notran, rowequ, permc_spec;
-    trans_t   trant;
-    char      norm[1];
-    int       i, j, info1;
-    float    amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin;
-    int       relax, panel_size;
-    float    diag_pivot_thresh, drop_tol;
-    double    t0;      /* temporary time */
-    double    *utime;
-
-    /* External functions */
-    extern float slangs(char *, SuperMatrix *);
-    extern double slamch_(char *);
-
-    Bstore = B->Store;
-    Xstore = X->Store;
-    Bmat   = Bstore->nzval;
-    Xmat   = Xstore->nzval;
-    ldb    = Bstore->lda;
-    ldx    = Xstore->lda;
-    nrhs   = B->ncol;
-
-    *info = 0;
-    nofact = (options->Fact != FACTORED);
-    equil = (options->Equil == YES);
-    notran = (options->Trans == NOTRANS);
-    if ( nofact ) {
-	*(unsigned char *)equed = 'N';
-	rowequ = FALSE;
-	colequ = FALSE;
-    } else {
-	rowequ = lsame_(equed, "R") || lsame_(equed, "B");
-	colequ = lsame_(equed, "C") || lsame_(equed, "B");
-	smlnum = slamch_("Safe minimum");
-	bignum = 1. / smlnum;
-    }
-
-#if 0
-printf("dgssvx: Fact=%4d, Trans=%4d, equed=%c\n",
-       options->Fact, options->Trans, *equed);
-#endif
-
-    /* Test the input parameters */
-    if (!nofact && options->Fact != DOFACT && options->Fact != SamePattern &&
-	options->Fact != SamePattern_SameRowPerm &&
-	!notran && options->Trans != TRANS && options->Trans != CONJ &&
-	!equil && options->Equil != NO)
-	*info = -1;
-    else if ( A->nrow != A->ncol || A->nrow < 0 ||
-	      (A->Stype != SLU_NC && A->Stype != SLU_NR) ||
-	      A->Dtype != SLU_S || A->Mtype != SLU_GE )
-	*info = -2;
-    else if (options->Fact == FACTORED &&
-	     !(rowequ || colequ || lsame_(equed, "N")))
-	*info = -6;
-    else {
-	if (rowequ) {
-	    rcmin = bignum;
-	    rcmax = 0.;
-	    for (j = 0; j < A->nrow; ++j) {
-		rcmin = SUPERLU_MIN(rcmin, R[j]);
-		rcmax = SUPERLU_MAX(rcmax, R[j]);
-	    }
-	    if (rcmin <= 0.) *info = -7;
-	    else if ( A->nrow > 0)
-		rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum);
-	    else rowcnd = 1.;
-	}
-	if (colequ && *info == 0) {
-	    rcmin = bignum;
-	    rcmax = 0.;
-	    for (j = 0; j < A->nrow; ++j) {
-		rcmin = SUPERLU_MIN(rcmin, C[j]);
-		rcmax = SUPERLU_MAX(rcmax, C[j]);
-	    }
-	    if (rcmin <= 0.) *info = -8;
-	    else if (A->nrow > 0)
-		colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum);
-	    else colcnd = 1.;
-	}
-	if (*info == 0) {
-	    if ( lwork < -1 ) *info = -12;
-	    else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) ||
-		      B->Stype != SLU_DN || B->Dtype != SLU_S || 
-		      B->Mtype != SLU_GE )
-		*info = -13;
-	    else if ( X->ncol < 0 || Xstore->lda < SUPERLU_MAX(0, A->nrow) ||
-		      (B->ncol != 0 && B->ncol != X->ncol) ||
-                      X->Stype != SLU_DN ||
-		      X->Dtype != SLU_S || X->Mtype != SLU_GE )
-		*info = -14;
-	}
-    }
-    if (*info != 0) {
-	i = -(*info);
-	xerbla_("sgssvx", &i);
-	return;
-    }
-    
-    /* Initialization for factor parameters */
-    panel_size = sp_ienv(1);
-    relax      = sp_ienv(2);
-    diag_pivot_thresh = options->DiagPivotThresh;
-    drop_tol   = 0.0;
-
-    utime = stat->utime;
-    
-    /* Convert A to SLU_NC format when necessary. */
-    if ( A->Stype == SLU_NR ) {
-	NRformat *Astore = A->Store;
-	AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
-	sCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, 
-			       Astore->nzval, Astore->colind, Astore->rowptr,
-			       SLU_NC, A->Dtype, A->Mtype);
-	if ( notran ) { /* Reverse the transpose argument. */
-	    trant = TRANS;
-	    notran = 0;
-	} else {
-	    trant = NOTRANS;
-	    notran = 1;
-	}
-    } else { /* A->Stype == SLU_NC */
-	trant = options->Trans;
-	AA = A;
-    }
-
-    if ( nofact && equil ) {
-	t0 = SuperLU_timer_();
-	/* Compute row and column scalings to equilibrate the matrix A. */
-	sgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1);
-	
-	if ( info1 == 0 ) {
-	    /* Equilibrate matrix A. */
-	    slaqgs(AA, R, C, rowcnd, colcnd, amax, equed);
-	    rowequ = lsame_(equed, "R") || lsame_(equed, "B");
-	    colequ = lsame_(equed, "C") || lsame_(equed, "B");
-	}
-	utime[EQUIL] = SuperLU_timer_() - t0;
-    }
-
-    if ( nrhs > 0 ) {
-        /* Scale the right hand side if equilibration was performed. */
-        if ( notran ) {
-	    if ( rowequ ) {
-	        for (j = 0; j < nrhs; ++j)
-		    for (i = 0; i < A->nrow; ++i) {
-		        Bmat[i + j*ldb] *= R[i];
-	            }
-	    }
-        } else if ( colequ ) {
-	    for (j = 0; j < nrhs; ++j)
-	        for (i = 0; i < A->nrow; ++i) {
-	            Bmat[i + j*ldb] *= C[i];
-	        }
-        }
-    }
-
-    if ( nofact ) {
-	
-        t0 = SuperLU_timer_();
-	/*
-	 * Gnet column permutation vector perm_c[], according to permc_spec:
-	 *   permc_spec = NATURAL:  natural ordering 
-	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
-	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
-	 *   permc_spec = COLAMD:   approximate minimum degree column ordering
-	 *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
-	 */
-	permc_spec = options->ColPerm;
-	if ( permc_spec != MY_PERMC && options->Fact == DOFACT )
-            get_perm_c(permc_spec, AA, perm_c);
-	utime[COLPERM] = SuperLU_timer_() - t0;
-
-	t0 = SuperLU_timer_();
-	sp_preorder(options, AA, perm_c, etree, &AC);
-	utime[ETREE] = SuperLU_timer_() - t0;
-    
-/*	printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", 
-	       relax, panel_size, sp_ienv(3), sp_ienv(4));
-	fflush(stdout); */
-	
-	/* Compute the LU factorization of A*Pc. */
-	t0 = SuperLU_timer_();
-	sgstrf(options, &AC, drop_tol, relax, panel_size,
-	       etree, work, lwork, perm_c, perm_r, L, U, stat, info);
-	utime[FACT] = SuperLU_timer_() - t0;
-	
-	if ( lwork == -1 ) {
-	    mem_usage->total_needed = *info - A->ncol;
-	    return;
-	}
-    }
-
-    if ( options->PivotGrowth ) {
-        if ( *info > 0 ) {
-	    if ( *info <= A->ncol ) {
-	        /* Compute the reciprocal pivot growth factor of the leading
-	           rank-deficient *info columns of A. */
-	        *recip_pivot_growth = sPivotGrowth(*info, AA, perm_c, L, U);
-	    }
-	    return;
-        }
-
-        /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */
-        *recip_pivot_growth = sPivotGrowth(A->ncol, AA, perm_c, L, U);
-    }
-
-    if ( options->ConditionNumber ) {
-        /* Estimate the reciprocal of the condition number of A. */
-        t0 = SuperLU_timer_();
-        if ( notran ) {
-	    *(unsigned char *)norm = '1';
-        } else {
-	    *(unsigned char *)norm = 'I';
-        }
-        anorm = slangs(norm, AA);
-        sgscon(norm, L, U, anorm, rcond, stat, info);
-        utime[RCOND] = SuperLU_timer_() - t0;
-    }
-    
-    if ( nrhs > 0 ) {
-        /* Compute the solution matrix X. */
-        for (j = 0; j < nrhs; j++)  /* Save a copy of the right hand sides */
-            for (i = 0; i < B->nrow; i++)
-	        Xmat[i + j*ldx] = Bmat[i + j*ldb];
-    
-        t0 = SuperLU_timer_();
-        sgstrs (trant, L, U, perm_c, perm_r, X, stat, info);
-        utime[SOLVE] = SuperLU_timer_() - t0;
-    
-        /* Use iterative refinement to improve the computed solution and compute
-           error bounds and backward error estimates for it. */
-        t0 = SuperLU_timer_();
-        if ( options->IterRefine != NOREFINE ) {
-            sgsrfs(trant, AA, L, U, perm_c, perm_r, equed, R, C, B,
-                   X, ferr, berr, stat, info);
-        } else {
-            for (j = 0; j < nrhs; ++j) ferr[j] = berr[j] = 1.0;
-        }
-        utime[REFINE] = SuperLU_timer_() - t0;
-
-        /* Transform the solution matrix X to a solution of the original system. */
-        if ( notran ) {
-	    if ( colequ ) {
-	        for (j = 0; j < nrhs; ++j)
-		    for (i = 0; i < A->nrow; ++i) {
-                        Xmat[i + j*ldx] *= C[i];
-	            }
-	    }
-        } else if ( rowequ ) {
-	    for (j = 0; j < nrhs; ++j)
-	        for (i = 0; i < A->nrow; ++i) {
-	            Xmat[i + j*ldx] *= R[i];
-                }
-        }
-    } /* end if nrhs > 0 */
-
-    if ( options->ConditionNumber ) {
-        /* Set INFO = A->ncol+1 if the matrix is singular to working precision. */
-        if ( *rcond < slamch_("E") ) *info = A->ncol + 1;
-    }
-
-    if ( nofact ) {
-        sQuerySpace(L, U, mem_usage);
-        Destroy_CompCol_Permuted(&AC);
-    }
-    if ( A->Stype == SLU_NR ) {
-	Destroy_SuperMatrix_Store(AA);
-	SUPERLU_FREE(AA);
-    }
-
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgstrf.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgstrf.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgstrf.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgstrf.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,433 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_sdefs.h"
-
-void
-sgstrf (superlu_options_t *options, SuperMatrix *A, float drop_tol,
-        int relax, int panel_size, int *etree, void *work, int lwork,
-        int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U,
-        SuperLUStat_t *stat, int *info)
-{
-/*
- * Purpose
- * =======
- *
- * SGSTRF computes an LU factorization of a general sparse m-by-n
- * matrix A using partial pivoting with row interchanges.
- * The factorization has the form
- *     Pr * A = L * U
- * where Pr is a row permutation matrix, L is lower triangular with unit
- * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is upper 
- * triangular (upper trapezoidal if A->nrow < A->ncol).
- *
- * See supermatrix.h for the definition of 'SuperMatrix' structure.
- *
- * Arguments
- * =========
- *
- * options (input) superlu_options_t*
- *         The structure defines the input parameters to control
- *         how the LU decomposition will be performed.
- *
- * A        (input) SuperMatrix*
- *	    Original matrix A, permuted by columns, of dimension
- *          (A->nrow, A->ncol). The type of A can be:
- *          Stype = SLU_NCP; Dtype = SLU_S; Mtype = SLU_GE.
- *
- * drop_tol (input) float (NOT IMPLEMENTED)
- *	    Drop tolerance parameter. At step j of the Gaussian elimination,
- *          if abs(A_ij)/(max_i abs(A_ij)) < drop_tol, drop entry A_ij.
- *          0 <= drop_tol <= 1. The default value of drop_tol is 0.
- *
- * relax    (input) int
- *          To control degree of relaxing supernodes. If the number
- *          of nodes (columns) in a subtree of the elimination tree is less
- *          than relax, this subtree is considered as one supernode,
- *          regardless of the row structures of those columns.
- *
- * panel_size (input) int
- *          A panel consists of at most panel_size consecutive columns.
- *
- * etree    (input) int*, dimension (A->ncol)
- *          Elimination tree of A'*A.
- *          Note: etree is a vector of parent pointers for a forest whose
- *          vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
- *          On input, the columns of A should be permuted so that the
- *          etree is in a certain postorder.
- *
- * work     (input/output) void*, size (lwork) (in bytes)
- *          User-supplied work space and space for the output data structures.
- *          Not referenced if lwork = 0;
- *
- * lwork   (input) int
- *         Specifies the size of work array in bytes.
- *         = 0:  allocate space internally by system malloc;
- *         > 0:  use user-supplied work array of length lwork in bytes,
- *               returns error if space runs out.
- *         = -1: the routine guesses the amount of space needed without
- *               performing the factorization, and returns it in
- *               *info; no other side effects.
- *
- * perm_c   (input) int*, dimension (A->ncol)
- *	    Column permutation vector, which defines the 
- *          permutation matrix Pc; perm_c[i] = j means column i of A is 
- *          in position j in A*Pc.
- *          When searching for diagonal, perm_c[*] is applied to the
- *          row subscripts of A, so that diagonal threshold pivoting
- *          can find the diagonal of A, rather than that of A*Pc.
- *
- * perm_r   (input/output) int*, dimension (A->nrow)
- *          Row permutation vector which defines the permutation matrix Pr,
- *          perm_r[i] = j means row i of A is in position j in Pr*A.
- *          If options->Fact = SamePattern_SameRowPerm, the pivoting routine
- *             will try to use the input perm_r, unless a certain threshold
- *             criterion is violated. In that case, perm_r is overwritten by
- *             a new permutation determined by partial pivoting or diagonal
- *             threshold pivoting.
- *          Otherwise, perm_r is output argument;
- *
- * L        (output) SuperMatrix*
- *          The factor L from the factorization Pr*A=L*U; use compressed row 
- *          subscripts storage for supernodes, i.e., L has type: 
- *          Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU.
- *
- * U        (output) SuperMatrix*
- *	    The factor U from the factorization Pr*A*Pc=L*U. Use column-wise
- *          storage scheme, i.e., U has types: Stype = SLU_NC, 
- *          Dtype = SLU_S, Mtype = SLU_TRU.
- *
- * stat     (output) SuperLUStat_t*
- *          Record the statistics on runtime and floating-point operation count.
- *          See util.h for the definition of 'SuperLUStat_t'.
- *
- * info     (output) int*
- *          = 0: successful exit
- *          < 0: if info = -i, the i-th argument had an illegal value
- *          > 0: if info = i, and i is
- *             <= A->ncol: U(i,i) is exactly zero. The factorization has
- *                been completed, but the factor U is exactly singular,
- *                and division by zero will occur if it is used to solve a
- *                system of equations.
- *             > A->ncol: number of bytes allocated when memory allocation
- *                failure occurred, plus A->ncol. If lwork = -1, it is
- *                the estimated amount of space needed, plus A->ncol.
- *
- * ======================================================================
- *
- * Local Working Arrays: 
- * ======================
- *   m = number of rows in the matrix
- *   n = number of columns in the matrix
- *
- *   xprune[0:n-1]: xprune[*] points to locations in subscript 
- *	vector lsub[*]. For column i, xprune[i] denotes the point where 
- *	structural pruning begins. I.e. only xlsub[i],..,xprune[i]-1 need 
- *	to be traversed for symbolic factorization.
- *
- *   marker[0:3*m-1]: marker[i] = j means that node i has been 
- *	reached when working on column j.
- *	Storage: relative to original row subscripts
- *	NOTE: There are 3 of them: marker/marker1 are used for panel dfs, 
- *	      see spanel_dfs.c; marker2 is used for inner-factorization,
- *            see scolumn_dfs.c.
- *
- *   parent[0:m-1]: parent vector used during dfs
- *      Storage: relative to new row subscripts
- *
- *   xplore[0:m-1]: xplore[i] gives the location of the next (dfs) 
- *	unexplored neighbor of i in lsub[*]
- *
- *   segrep[0:nseg-1]: contains the list of supernodal representatives
- *	in topological order of the dfs. A supernode representative is the 
- *	last column of a supernode.
- *      The maximum size of segrep[] is n.
- *
- *   repfnz[0:W*m-1]: for a nonzero segment U[*,j] that ends at a 
- *	supernodal representative r, repfnz[r] is the location of the first 
- *	nonzero in this segment.  It is also used during the dfs: repfnz[r]>0
- *	indicates the supernode r has been explored.
- *	NOTE: There are W of them, each used for one column of a panel. 
- *
- *   panel_lsub[0:W*m-1]: temporary for the nonzeros row indices below 
- *      the panel diagonal. These are filled in during spanel_dfs(), and are
- *      used later in the inner LU factorization within the panel.
- *	panel_lsub[]/dense[] pair forms the SPA data structure.
- *	NOTE: There are W of them.
- *
- *   dense[0:W*m-1]: sparse accumulating (SPA) vector for intermediate values;
- *	    	   NOTE: there are W of them.
- *
- *   tempv[0:*]: real temporary used for dense numeric kernels;
- *	The size of this array is defined by NUM_TEMPV() in ssp_defs.h.
- *
- */
-    /* Local working arrays */
-    NCPformat *Astore;
-    int       *iperm_r = NULL; /* inverse of perm_r; used when 
-                                  options->Fact == SamePattern_SameRowPerm */
-    int       *iperm_c; /* inverse of perm_c */
-    int       *iwork;
-    float    *swork;
-    int	      *segrep, *repfnz, *parent, *xplore;
-    int	      *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */
-    int	      *xprune;
-    int	      *marker;
-    float    *dense, *tempv;
-    int       *relax_end;
-    float    *a;
-    int       *asub;
-    int       *xa_begin, *xa_end;
-    int       *xsup, *supno;
-    int       *xlsub, *xlusup, *xusub;
-    int       nzlumax;
-    static GlobalLU_t Glu; /* persistent to facilitate multiple factors. */
-
-    /* Local scalars */
-    fact_t    fact = options->Fact;
-    double    diag_pivot_thresh = options->DiagPivotThresh;
-    int       pivrow;   /* pivotal row number in the original matrix A */
-    int       nseg1;	/* no of segments in U-column above panel row jcol */
-    int       nseg;	/* no of segments in each U-column */
-    register int jcol;	
-    register int kcol;	/* end column of a relaxed snode */
-    register int icol;
-    register int i, k, jj, new_next, iinfo;
-    int       m, n, min_mn, jsupno, fsupc, nextlu, nextu;
-    int       w_def;	/* upper bound on panel width */
-    int       usepr, iperm_r_allocated = 0;
-    int       nnzL, nnzU;
-    int       *panel_histo = stat->panel_histo;
-    flops_t   *ops = stat->ops;
-
-    iinfo    = 0;
-    m        = A->nrow;
-    n        = A->ncol;
-    min_mn   = SUPERLU_MIN(m, n);
-    Astore   = A->Store;
-    a        = Astore->nzval;
-    asub     = Astore->rowind;
-    xa_begin = Astore->colbeg;
-    xa_end   = Astore->colend;
-
-    /* Allocate storage common to the factor routines */
-    *info = sLUMemInit(fact, work, lwork, m, n, Astore->nnz,
-                       panel_size, L, U, &Glu, &iwork, &swork);
-    if ( *info ) return;
-    
-    xsup    = Glu.xsup;
-    supno   = Glu.supno;
-    xlsub   = Glu.xlsub;
-    xlusup  = Glu.xlusup;
-    xusub   = Glu.xusub;
-    
-    SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore,
-	     &repfnz, &panel_lsub, &xprune, &marker);
-    sSetRWork(m, panel_size, swork, &dense, &tempv);
-    
-    usepr = (fact == SamePattern_SameRowPerm);
-    if ( usepr ) {
-	/* Compute the inverse of perm_r */
-	iperm_r = (int *) intMalloc(m);
-	for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k;
-	iperm_r_allocated = 1;
-    }
-    iperm_c = (int *) intMalloc(n);
-    for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k;
-
-    /* Identify relaxed snodes */
-    relax_end = (int *) intMalloc(n);
-    if ( options->SymmetricMode == YES ) {
-        heap_relax_snode(n, etree, relax, marker, relax_end); 
-    } else {
-        relax_snode(n, etree, relax, marker, relax_end); 
-    }
-    
-    ifill (perm_r, m, EMPTY);
-    ifill (marker, m * NO_MARKER, EMPTY);
-    supno[0] = -1;
-    xsup[0]  = xlsub[0] = xusub[0] = xlusup[0] = 0;
-    w_def    = panel_size;
-
-    /* 
-     * Work on one "panel" at a time. A panel is one of the following: 
-     *	   (a) a relaxed supernode at the bottom of the etree, or
-     *	   (b) panel_size contiguous columns, defined by the user
-     */
-    for (jcol = 0; jcol < min_mn; ) {
-
-	if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */
-   	    kcol = relax_end[jcol];	  /* end of the relaxed snode */
-	    panel_histo[kcol-jcol+1]++;
-
-	    /* --------------------------------------
-	     * Factorize the relaxed supernode(jcol:kcol) 
-	     * -------------------------------------- */
-	    /* Determine the union of the row structure of the snode */
-	    if ( (*info = ssnode_dfs(jcol, kcol, asub, xa_begin, xa_end,
-				    xprune, marker, &Glu)) != 0 )
-		return;
-
-            nextu    = xusub[jcol];
-	    nextlu   = xlusup[jcol];
-	    jsupno   = supno[jcol];
-	    fsupc    = xsup[jsupno];
-	    new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1);
-	    nzlumax = Glu.nzlumax;
-	    while ( new_next > nzlumax ) {
-		if ( (*info = sLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, &Glu)) )
-		    return;
-	    }
-    
-	    for (icol = jcol; icol<= kcol; icol++) {
-		xusub[icol+1] = nextu;
-		
-    		/* Scatter into SPA dense[*] */
-    		for (k = xa_begin[icol]; k < xa_end[icol]; k++)
-        	    dense[asub[k]] = a[k];
-
-	       	/* Numeric update within the snode */
-	        ssnode_bmod(icol, jsupno, fsupc, dense, tempv, &Glu, stat);
-
-		if ( (*info = spivotL(icol, diag_pivot_thresh, &usepr, perm_r,
-				      iperm_r, iperm_c, &pivrow, &Glu, stat)) )
-		    if ( iinfo == 0 ) iinfo = *info;
-		
-#ifdef DEBUG
-		sprint_lu_col("[1]: ", icol, pivrow, xprune, &Glu);
-#endif
-
-	    }
-
-	    jcol = icol;
-
-	} else { /* Work on one panel of panel_size columns */
-	    
-	    /* Adjust panel_size so that a panel won't overlap with the next 
-	     * relaxed snode.
-	     */
-	    panel_size = w_def;
-	    for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++) 
-		if ( relax_end[k] != EMPTY ) {
-		    panel_size = k - jcol;
-		    break;
-		}
-	    if ( k == min_mn ) panel_size = min_mn - jcol;
-	    panel_histo[panel_size]++;
-
-	    /* symbolic factor on a panel of columns */
-	    spanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1,
-		      dense, panel_lsub, segrep, repfnz, xprune,
-		      marker, parent, xplore, &Glu);
-	    
-	    /* numeric sup-panel updates in topological order */
-	    spanel_bmod(m, panel_size, jcol, nseg1, dense,
-		        tempv, segrep, repfnz, &Glu, stat);
-	    
-	    /* Sparse LU within the panel, and below panel diagonal */
-    	    for ( jj = jcol; jj < jcol + panel_size; jj++) {
- 		k = (jj - jcol) * m; /* column index for w-wide arrays */
-
-		nseg = nseg1;	/* Begin after all the panel segments */
-
-	    	if ((*info = scolumn_dfs(m, jj, perm_r, &nseg, &panel_lsub[k],
-					segrep, &repfnz[k], xprune, marker,
-					parent, xplore, &Glu)) != 0) return;
-
-	      	/* Numeric updates */
-	    	if ((*info = scolumn_bmod(jj, (nseg - nseg1), &dense[k],
-					 tempv, &segrep[nseg1], &repfnz[k],
-					 jcol, &Glu, stat)) != 0) return;
-		
-	        /* Copy the U-segments to ucol[*] */
-		if ((*info = scopy_to_ucol(jj, nseg, segrep, &repfnz[k],
-					  perm_r, &dense[k], &Glu)) != 0)
-		    return;
-
-	    	if ( (*info = spivotL(jj, diag_pivot_thresh, &usepr, perm_r,
-				      iperm_r, iperm_c, &pivrow, &Glu, stat)) )
-		    if ( iinfo == 0 ) iinfo = *info;
-
-		/* Prune columns (0:jj-1) using column jj */
-	    	spruneL(jj, perm_r, pivrow, nseg, segrep,
-                        &repfnz[k], xprune, &Glu);
-
-		/* Reset repfnz[] for this column */
-	    	resetrep_col (nseg, segrep, &repfnz[k]);
-		
-#ifdef DEBUG
-		sprint_lu_col("[2]: ", jj, pivrow, xprune, &Glu);
-#endif
-
-	    }
-
-   	    jcol += panel_size;	/* Move to the next panel */
-
-	} /* else */
-
-    } /* for */
-
-    *info = iinfo;
-    
-    if ( m > n ) {
-	k = 0;
-        for (i = 0; i < m; ++i) 
-            if ( perm_r[i] == EMPTY ) {
-    		perm_r[i] = n + k;
-		++k;
-	    }
-    }
-
-    countnz(min_mn, xprune, &nnzL, &nnzU, &Glu);
-    fixupL(min_mn, perm_r, &Glu);
-
-    sLUWorkFree(iwork, swork, &Glu); /* Free work space and compress storage */
-
-    if ( fact == SamePattern_SameRowPerm ) {
-        /* L and U structures may have changed due to possibly different
-	   pivoting, even though the storage is available.
-	   There could also be memory expansions, so the array locations
-           may have changed, */
-        ((SCformat *)L->Store)->nnz = nnzL;
-	((SCformat *)L->Store)->nsuper = Glu.supno[n];
-	((SCformat *)L->Store)->nzval = Glu.lusup;
-	((SCformat *)L->Store)->nzval_colptr = Glu.xlusup;
-	((SCformat *)L->Store)->rowind = Glu.lsub;
-	((SCformat *)L->Store)->rowind_colptr = Glu.xlsub;
-	((NCformat *)U->Store)->nnz = nnzU;
-	((NCformat *)U->Store)->nzval = Glu.ucol;
-	((NCformat *)U->Store)->rowind = Glu.usub;
-	((NCformat *)U->Store)->colptr = Glu.xusub;
-    } else {
-        sCreate_SuperNode_Matrix(L, A->nrow, min_mn, nnzL, Glu.lusup, 
-	                         Glu.xlusup, Glu.lsub, Glu.xlsub, Glu.supno,
-			         Glu.xsup, SLU_SC, SLU_S, SLU_TRLU);
-    	sCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu.ucol, 
-			       Glu.usub, Glu.xusub, SLU_NC, SLU_S, SLU_TRU);
-    }
-    
-    ops[FACT] += ops[TRSV] + ops[GEMV];	
-    
-    if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r);
-    SUPERLU_FREE (iperm_c);
-    SUPERLU_FREE (relax_end);
-
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgstrs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgstrs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sgstrs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sgstrs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,332 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_sdefs.h"
-
-
-/* 
- * Function prototypes 
- */
-void susolve(int, int, float*, float*);
-void slsolve(int, int, float*, float*);
-void smatvec(int, int, int, float*, float*, float*);
-
-
-void
-sgstrs (trans_t trans, SuperMatrix *L, SuperMatrix *U,
-        int *perm_c, int *perm_r, SuperMatrix *B,
-        SuperLUStat_t *stat, int *info)
-{
-/*
- * Purpose
- * =======
- *
- * SGSTRS solves a system of linear equations A*X=B or A'*X=B
- * with A sparse and B dense, using the LU factorization computed by
- * SGSTRF.
- *
- * See supermatrix.h for the definition of 'SuperMatrix' structure.
- *
- * Arguments
- * =========
- *
- * trans   (input) trans_t
- *          Specifies the form of the system of equations:
- *          = NOTRANS: A * X = B  (No transpose)
- *          = TRANS:   A'* X = B  (Transpose)
- *          = CONJ:    A**H * X = B  (Conjugate transpose)
- *
- * L       (input) SuperMatrix*
- *         The factor L from the factorization Pr*A*Pc=L*U as computed by
- *         sgstrf(). Use compressed row subscripts storage for supernodes,
- *         i.e., L has types: Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU.
- *
- * U       (input) SuperMatrix*
- *         The factor U from the factorization Pr*A*Pc=L*U as computed by
- *         sgstrf(). Use column-wise storage scheme, i.e., U has types:
- *         Stype = SLU_NC, Dtype = SLU_S, Mtype = SLU_TRU.
- *
- * perm_c  (input) int*, dimension (L->ncol)
- *	   Column permutation vector, which defines the 
- *         permutation matrix Pc; perm_c[i] = j means column i of A is 
- *         in position j in A*Pc.
- *
- * perm_r  (input) int*, dimension (L->nrow)
- *         Row permutation vector, which defines the permutation matrix Pr; 
- *         perm_r[i] = j means row i of A is in position j in Pr*A.
- *
- * B       (input/output) SuperMatrix*
- *         B has types: Stype = SLU_DN, Dtype = SLU_S, Mtype = SLU_GE.
- *         On entry, the right hand side matrix.
- *         On exit, the solution matrix if info = 0;
- *
- * stat     (output) SuperLUStat_t*
- *          Record the statistics on runtime and floating-point operation count.
- *          See util.h for the definition of 'SuperLUStat_t'.
- *
- * info    (output) int*
- * 	   = 0: successful exit
- *	   < 0: if info = -i, the i-th argument had an illegal value
- *
- */
-#ifdef _CRAY
-    _fcd ftcs1, ftcs2, ftcs3, ftcs4;
-#endif
-    int      incx = 1, incy = 1;
-#ifdef USE_VENDOR_BLAS
-    float   alpha = 1.0, beta = 1.0;
-    float   *work_col;
-#endif
-    DNformat *Bstore;
-    float   *Bmat;
-    SCformat *Lstore;
-    NCformat *Ustore;
-    float   *Lval, *Uval;
-    int      fsupc, nrow, nsupr, nsupc, luptr, istart, irow;
-    int      i, j, k, iptr, jcol, n, ldb, nrhs;
-    float   *work, *rhs_work, *soln;
-    flops_t  solve_ops;
-    void sprint_soln();
-
-    /* Test input parameters ... */
-    *info = 0;
-    Bstore = B->Store;
-    ldb = Bstore->lda;
-    nrhs = B->ncol;
-    if ( trans != NOTRANS && trans != TRANS && trans != CONJ ) *info = -1;
-    else if ( L->nrow != L->ncol || L->nrow < 0 ||
-	      L->Stype != SLU_SC || L->Dtype != SLU_S || L->Mtype != SLU_TRLU )
-	*info = -2;
-    else if ( U->nrow != U->ncol || U->nrow < 0 ||
-	      U->Stype != SLU_NC || U->Dtype != SLU_S || U->Mtype != SLU_TRU )
-	*info = -3;
-    else if ( ldb < SUPERLU_MAX(0, L->nrow) ||
-	      B->Stype != SLU_DN || B->Dtype != SLU_S || B->Mtype != SLU_GE )
-	*info = -6;
-    if ( *info ) {
-	i = -(*info);
-	xerbla_("sgstrs", &i);
-	return;
-    }
-
-    n = L->nrow;
-    work = floatCalloc(n * nrhs);
-    if ( !work ) ABORT("Malloc fails for local work[].");
-    soln = floatMalloc(n);
-    if ( !soln ) ABORT("Malloc fails for local soln[].");
-
-    Bmat = Bstore->nzval;
-    Lstore = L->Store;
-    Lval = Lstore->nzval;
-    Ustore = U->Store;
-    Uval = Ustore->nzval;
-    solve_ops = 0;
-    
-    if ( trans == NOTRANS ) {
-	/* Permute right hand sides to form Pr*B */
-	for (i = 0; i < nrhs; i++) {
-	    rhs_work = &Bmat[i*ldb];
-	    for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k];
-	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
-	}
-	
-	/* Forward solve PLy=Pb. */
-	for (k = 0; k <= Lstore->nsuper; k++) {
-	    fsupc = L_FST_SUPC(k);
-	    istart = L_SUB_START(fsupc);
-	    nsupr = L_SUB_START(fsupc+1) - istart;
-	    nsupc = L_FST_SUPC(k+1) - fsupc;
-	    nrow = nsupr - nsupc;
-
-	    solve_ops += nsupc * (nsupc - 1) * nrhs;
-	    solve_ops += 2 * nrow * nsupc * nrhs;
-	    
-	    if ( nsupc == 1 ) {
-		for (j = 0; j < nrhs; j++) {
-		    rhs_work = &Bmat[j*ldb];
-	    	    luptr = L_NZ_START(fsupc);
-		    for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); iptr++){
-			irow = L_SUB(iptr);
-			++luptr;
-			rhs_work[irow] -= rhs_work[fsupc] * Lval[luptr];
-		    }
-		}
-	    } else {
-	    	luptr = L_NZ_START(fsupc);
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		ftcs1 = _cptofcd("L", strlen("L"));
-		ftcs2 = _cptofcd("N", strlen("N"));
-		ftcs3 = _cptofcd("U", strlen("U"));
-		STRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha,
-		       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
-		
-		SGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha, 
-			&Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, 
-			&beta, &work[0], &n );
-#else
-		strsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha,
-		       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
-		
-		sgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha, 
-			&Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, 
-			&beta, &work[0], &n );
-#endif
-		for (j = 0; j < nrhs; j++) {
-		    rhs_work = &Bmat[j*ldb];
-		    work_col = &work[j*n];
-		    iptr = istart + nsupc;
-		    for (i = 0; i < nrow; i++) {
-			irow = L_SUB(iptr);
-			rhs_work[irow] -= work_col[i]; /* Scatter */
-			work_col[i] = 0.0;
-			iptr++;
-		    }
-		}
-#else		
-		for (j = 0; j < nrhs; j++) {
-		    rhs_work = &Bmat[j*ldb];
-		    slsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]);
-		    smatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc],
-			    &rhs_work[fsupc], &work[0] );
-
-		    iptr = istart + nsupc;
-		    for (i = 0; i < nrow; i++) {
-			irow = L_SUB(iptr);
-			rhs_work[irow] -= work[i];
-			work[i] = 0.0;
-			iptr++;
-		    }
-		}
-#endif		    
-	    } /* else ... */
-	} /* for L-solve */
-
-#ifdef DEBUG
-  	printf("After L-solve: y=\n");
-	sprint_soln(n, nrhs, Bmat);
-#endif
-
-	/*
-	 * Back solve Ux=y.
-	 */
-	for (k = Lstore->nsuper; k >= 0; k--) {
-	    fsupc = L_FST_SUPC(k);
-	    istart = L_SUB_START(fsupc);
-	    nsupr = L_SUB_START(fsupc+1) - istart;
-	    nsupc = L_FST_SUPC(k+1) - fsupc;
-	    luptr = L_NZ_START(fsupc);
-
-	    solve_ops += nsupc * (nsupc + 1) * nrhs;
-
-	    if ( nsupc == 1 ) {
-		rhs_work = &Bmat[0];
-		for (j = 0; j < nrhs; j++) {
-		    rhs_work[fsupc] /= Lval[luptr];
-		    rhs_work += ldb;
-		}
-	    } else {
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		ftcs1 = _cptofcd("L", strlen("L"));
-		ftcs2 = _cptofcd("U", strlen("U"));
-		ftcs3 = _cptofcd("N", strlen("N"));
-		STRSM( ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha,
-		       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
-#else
-		strsm_("L", "U", "N", "N", &nsupc, &nrhs, &alpha,
-		       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
-#endif
-#else		
-		for (j = 0; j < nrhs; j++)
-		    susolve ( nsupr, nsupc, &Lval[luptr], &Bmat[fsupc+j*ldb] );
-#endif		
-	    }
-
-	    for (j = 0; j < nrhs; ++j) {
-		rhs_work = &Bmat[j*ldb];
-		for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) {
-		    solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
-		    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++ ){
-			irow = U_SUB(i);
-			rhs_work[irow] -= rhs_work[jcol] * Uval[i];
-		    }
-		}
-	    }
-	    
-	} /* for U-solve */
-
-#ifdef DEBUG
-  	printf("After U-solve: x=\n");
-	sprint_soln(n, nrhs, Bmat);
-#endif
-
-	/* Compute the final solution X := Pc*X. */
-	for (i = 0; i < nrhs; i++) {
-	    rhs_work = &Bmat[i*ldb];
-	    for (k = 0; k < n; k++) soln[k] = rhs_work[perm_c[k]];
-	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
-	}
-	
-        stat->ops[SOLVE] = solve_ops;
-
-    } else { /* Solve A'*X=B or CONJ(A)*X=B */
-	/* Permute right hand sides to form Pc'*B. */
-	for (i = 0; i < nrhs; i++) {
-	    rhs_work = &Bmat[i*ldb];
-	    for (k = 0; k < n; k++) soln[perm_c[k]] = rhs_work[k];
-	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
-	}
-
-	stat->ops[SOLVE] = 0;
-	for (k = 0; k < nrhs; ++k) {
-	    
-	    /* Multiply by inv(U'). */
-	    sp_strsv("U", "T", "N", L, U, &Bmat[k*ldb], stat, info);
-	    
-	    /* Multiply by inv(L'). */
-	    sp_strsv("L", "T", "U", L, U, &Bmat[k*ldb], stat, info);
-	    
-	}
-	/* Compute the final solution X := Pr'*X (=inv(Pr)*X) */
-	for (i = 0; i < nrhs; i++) {
-	    rhs_work = &Bmat[i*ldb];
-	    for (k = 0; k < n; k++) soln[k] = rhs_work[perm_r[k]];
-	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
-	}
-
-    }
-
-    SUPERLU_FREE(work);
-    SUPERLU_FREE(soln);
-}
-
-/*
- * Diagnostic print of the solution vector 
- */
-void
-sprint_soln(int n, int nrhs, float *soln)
-{
-    int i;
-
-    for (i = 0; i < n; i++) 
-  	printf("\t%d: %.4f\n", i, soln[i]);
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slacon.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slacon.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slacon.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slacon.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,228 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-#include <math.h>
-#include "slu_Cnames.h"
-
-int
-slacon_(int *n, float *v, float *x, int *isgn, float *est, int *kase)
-
-{
-/*
-    Purpose   
-    =======   
-
-    SLACON estimates the 1-norm of a square matrix A.   
-    Reverse communication is used for evaluating matrix-vector products. 
-  
-
-    Arguments   
-    =========   
-
-    N      (input) INT
-           The order of the matrix.  N >= 1.   
-
-    V      (workspace) FLOAT PRECISION array, dimension (N)   
-           On the final return, V = A*W,  where  EST = norm(V)/norm(W)   
-           (W is not returned).   
-
-    X      (input/output) FLOAT PRECISION array, dimension (N)   
-           On an intermediate return, X should be overwritten by   
-                 A * X,   if KASE=1,   
-                 A' * X,  if KASE=2,
-           and SLACON must be re-called with all the other parameters   
-           unchanged.   
-
-    ISGN   (workspace) INT array, dimension (N)
-
-    EST    (output) FLOAT PRECISION   
-           An estimate (a lower bound) for norm(A).   
-
-    KASE   (input/output) INT
-           On the initial call to SLACON, KASE should be 0.   
-           On an intermediate return, KASE will be 1 or 2, indicating   
-           whether X should be overwritten by A * X  or A' * X.   
-           On the final return from SLACON, KASE will again be 0.   
-
-    Further Details   
-    ======= =======   
-
-    Contributed by Nick Higham, University of Manchester.   
-    Originally named CONEST, dated March 16, 1988.   
-
-    Reference: N.J. Higham, "FORTRAN codes for estimating the one-norm of 
-    a real or complex matrix, with applications to condition estimation", 
-    ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988.   
-    ===================================================================== 
-*/
-
-    /* Table of constant values */
-    int c__1 = 1;
-    float      zero = 0.0;
-    float      one = 1.0;
-    
-    /* Local variables */
-    static int iter;
-    static int jump, jlast;
-    static float altsgn, estold;
-    static int i, j;
-    float temp;
-#ifdef _CRAY
-    extern int ISAMAX(int *, float *, int *);
-    extern float SASUM(int *, float *, int *);
-    extern int SCOPY(int *, float *, int *, float *, int *);
-#else
-    extern int isamax_(int *, float *, int *);
-    extern float sasum_(int *, float *, int *);
-    extern int scopy_(int *, float *, int *, float *, int *);
-#endif
-#define d_sign(a, b) (b >= 0 ? fabs(a) : -fabs(a))    /* Copy sign */
-#define i_dnnt(a) \
-	( a>=0 ? floor(a+.5) : -floor(.5-a) ) /* Round to nearest integer */
-
-    if ( *kase == 0 ) {
-	for (i = 0; i < *n; ++i) {
-	    x[i] = 1. / (float) (*n);
-	}
-	*kase = 1;
-	jump = 1;
-	return 0;
-    }
-
-    switch (jump) {
-	case 1:  goto L20;
-	case 2:  goto L40;
-	case 3:  goto L70;
-	case 4:  goto L110;
-	case 5:  goto L140;
-    }
-
-    /*     ................ ENTRY   (JUMP = 1)   
-	   FIRST ITERATION.  X HAS BEEN OVERWRITTEN BY A*X. */
-  L20:
-    if (*n == 1) {
-	v[0] = x[0];
-	*est = fabs(v[0]);
-	/*        ... QUIT */
-	goto L150;
-    }
-#ifdef _CRAY
-    *est = SASUM(n, x, &c__1);
-#else
-    *est = sasum_(n, x, &c__1);
-#endif
-
-    for (i = 0; i < *n; ++i) {
-	x[i] = d_sign(one, x[i]);
-	isgn[i] = i_dnnt(x[i]);
-    }
-    *kase = 2;
-    jump = 2;
-    return 0;
-
-    /*     ................ ENTRY   (JUMP = 2)   
-	   FIRST ITERATION.  X HAS BEEN OVERWRITTEN BY TRANSPOSE(A)*X. */
-L40:
-#ifdef _CRAY
-    j = ISAMAX(n, &x[0], &c__1);
-#else
-    j = isamax_(n, &x[0], &c__1);
-#endif
-    --j;
-    iter = 2;
-
-    /*     MAIN LOOP - ITERATIONS 2,3,...,ITMAX. */
-L50:
-    for (i = 0; i < *n; ++i) x[i] = zero;
-    x[j] = one;
-    *kase = 1;
-    jump = 3;
-    return 0;
-
-    /*     ................ ENTRY   (JUMP = 3)   
-	   X HAS BEEN OVERWRITTEN BY A*X. */
-L70:
-#ifdef _CRAY
-    SCOPY(n, x, &c__1, v, &c__1);
-#else
-    scopy_(n, x, &c__1, v, &c__1);
-#endif
-    estold = *est;
-#ifdef _CRAY
-    *est = SASUM(n, v, &c__1);
-#else
-    *est = sasum_(n, v, &c__1);
-#endif
-
-    for (i = 0; i < *n; ++i)
-	if (i_dnnt(d_sign(one, x[i])) != isgn[i])
-	    goto L90;
-
-    /*     REPEATED SIGN VECTOR DETECTED, HENCE ALGORITHM HAS CONVERGED. */
-    goto L120;
-
-L90:
-    /*     TEST FOR CYCLING. */
-    if (*est <= estold) goto L120;
-
-    for (i = 0; i < *n; ++i) {
-	x[i] = d_sign(one, x[i]);
-	isgn[i] = i_dnnt(x[i]);
-    }
-    *kase = 2;
-    jump = 4;
-    return 0;
-
-    /*     ................ ENTRY   (JUMP = 4)   
-	   X HAS BEEN OVERWRITTEN BY TRANDPOSE(A)*X. */
-L110:
-    jlast = j;
-#ifdef _CRAY
-    j = ISAMAX(n, &x[0], &c__1);
-#else
-    j = isamax_(n, &x[0], &c__1);
-#endif
-    --j;
-    if (x[jlast] != fabs(x[j]) && iter < 5) {
-	++iter;
-	goto L50;
-    }
-
-    /*     ITERATION COMPLETE.  FINAL STAGE. */
-L120:
-    altsgn = 1.;
-    for (i = 1; i <= *n; ++i) {
-	x[i-1] = altsgn * ((float)(i - 1) / (float)(*n - 1) + 1.);
-	altsgn = -altsgn;
-    }
-    *kase = 1;
-    jump = 5;
-    return 0;
-    
-    /*     ................ ENTRY   (JUMP = 5)   
-	   X HAS BEEN OVERWRITTEN BY A*X. */
-L140:
-#ifdef _CRAY
-    temp = SASUM(n, x, &c__1) / (float)(*n * 3) * 2.;
-#else
-    temp = sasum_(n, x, &c__1) / (float)(*n * 3) * 2.;
-#endif
-    if (temp > *est) {
-#ifdef _CRAY
-	SCOPY(n, &x[0], &c__1, &v[0], &c__1);
-#else
-	scopy_(n, &x[0], &c__1, &v[0], &c__1);
-#endif
-	*est = temp;
-    }
-
-L150:
-    *kase = 0;
-    return 0;
-
-} /* slacon_ */
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slamch.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slamch.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slamch.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slamch.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,984 +0,0 @@
-#include <stdio.h>
-#include "slu_Cnames.h"
-
-#define TRUE_ (1)
-#define FALSE_ (0)
-#define min(a,b) ((a) <= (b) ? (a) : (b))
-#define max(a,b) ((a) >= (b) ? (a) : (b))
-#define abs(x) ((x) >= 0 ? (x) : -(x))
-#define dabs(x) (double)abs(x)
-
-double slamch_(char *cmach)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    SLAMCH determines single precision machine parameters.   
-
-    Arguments   
-    =========   
-
-    CMACH   (input) CHARACTER*1   
-            Specifies the value to be returned by SLAMCH:   
-            = 'E' or 'e',   SLAMCH := eps   
-            = 'S' or 's ,   SLAMCH := sfmin   
-            = 'B' or 'b',   SLAMCH := base   
-            = 'P' or 'p',   SLAMCH := eps*base   
-            = 'N' or 'n',   SLAMCH := t   
-            = 'R' or 'r',   SLAMCH := rnd   
-            = 'M' or 'm',   SLAMCH := emin   
-            = 'U' or 'u',   SLAMCH := rmin   
-            = 'L' or 'l',   SLAMCH := emax   
-            = 'O' or 'o',   SLAMCH := rmax   
-
-            where   
-
-            eps   = relative machine precision   
-            sfmin = safe minimum, such that 1/sfmin does not overflow   
-            base  = base of the machine   
-            prec  = eps*base   
-            t     = number of (base) digits in the mantissa   
-            rnd   = 1.0 when rounding occurs in addition, 0.0 otherwise   
-            emin  = minimum exponent before (gradual) underflow   
-            rmin  = underflow threshold - base**(emin-1)   
-            emax  = largest exponent before overflow   
-            rmax  = overflow threshold  - (base**emax)*(1-eps)   
-
-   ===================================================================== 
-*/
-/* >>Start of File<<   
-       Initialized data */
-    static int first = TRUE_;
-    /* System generated locals */
-    int i__1;
-    float ret_val;
-    /* Builtin functions */
-    double pow_ri(float *, int *);
-    /* Local variables */
-    static float base;
-    static int beta;
-    static float emin, prec, emax;
-    static int imin, imax;
-    static int lrnd;
-    static float rmin, rmax, t, rmach;
-    extern int lsame_(char *, char *);
-    static float small, sfmin;
-    extern /* Subroutine */ int slamc2_(int *, int *, int *, float 
-	    *, int *, float *, int *, float *);
-    static int it;
-    static float rnd, eps;
-
-
-
-    if (first) {
-	first = FALSE_;
-	slamc2_(&beta, &it, &lrnd, &eps, &imin, &rmin, &imax, &rmax);
-	base = (float) beta;
-	t = (float) it;
-	if (lrnd) {
-	    rnd = 1.f;
-	    i__1 = 1 - it;
-	    eps = pow_ri(&base, &i__1) / 2;
-	} else {
-	    rnd = 0.f;
-	    i__1 = 1 - it;
-	    eps = pow_ri(&base, &i__1);
-	}
-	prec = eps * base;
-	emin = (float) imin;
-	emax = (float) imax;
-	sfmin = rmin;
-	small = 1.f / rmax;
-	if (small >= sfmin) {
-
-/*           Use SMALL plus a bit, to avoid the possibility of rou
-nding   
-             causing overflow when computing  1/sfmin. */
-
-	    sfmin = small * (eps + 1.f);
-	}
-    }
-
-    if (lsame_(cmach, "E")) {
-	rmach = eps;
-    } else if (lsame_(cmach, "S")) {
-	rmach = sfmin;
-    } else if (lsame_(cmach, "B")) {
-	rmach = base;
-    } else if (lsame_(cmach, "P")) {
-	rmach = prec;
-    } else if (lsame_(cmach, "N")) {
-	rmach = t;
-    } else if (lsame_(cmach, "R")) {
-	rmach = rnd;
-    } else if (lsame_(cmach, "M")) {
-	rmach = emin;
-    } else if (lsame_(cmach, "U")) {
-	rmach = rmin;
-    } else if (lsame_(cmach, "L")) {
-	rmach = emax;
-    } else if (lsame_(cmach, "O")) {
-	rmach = rmax;
-    }
-
-    ret_val = rmach;
-    return ret_val;
-
-/*     End of SLAMCH */
-
-} /* slamch_ */
-
-
-/* Subroutine */ int slamc1_(int *beta, int *t, int *rnd, int 
-	*ieee1)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    SLAMC1 determines the machine parameters given by BETA, T, RND, and   
-    IEEE1.   
-
-    Arguments   
-    =========   
-
-    BETA    (output) INT   
-            The base of the machine.   
-
-    T       (output) INT   
-            The number of ( BETA ) digits in the mantissa.   
-
-    RND     (output) INT   
-            Specifies whether proper rounding  ( RND = .TRUE. )  or   
-            chopping  ( RND = .FALSE. )  occurs in addition. This may not 
-  
-            be a reliable guide to the way in which the machine performs 
-  
-            its arithmetic.   
-
-    IEEE1   (output) INT   
-            Specifies whether rounding appears to be done in the IEEE   
-            'round to nearest' style.   
-
-    Further Details   
-    ===============   
-
-    The routine is based on the routine  ENVRON  by Malcolm and   
-    incorporates suggestions by Gentleman and Marovich. See   
-
-       Malcolm M. A. (1972) Algorithms to reveal properties of   
-          floating-point arithmetic. Comms. of the ACM, 15, 949-951.   
-
-       Gentleman W. M. and Marovich S. B. (1974) More on algorithms   
-          that reveal properties of floating point arithmetic units.   
-          Comms. of the ACM, 17, 276-277.   
-
-   ===================================================================== 
-*/
-    /* Initialized data */
-    static int first = TRUE_;
-    /* System generated locals */
-    float r__1, r__2;
-    /* Local variables */
-    static int lrnd;
-    static float a, b, c, f;
-    static int lbeta;
-    static float savec;
-    static int lieee1;
-    static float t1, t2;
-    extern double slamc3_(float *, float *);
-    static int lt;
-    static float one, qtr;
-
-
-
-    if (first) {
-	first = FALSE_;
-	one = 1.f;
-
-/*        LBETA,  LIEEE1,  LT and  LRND  are the  local values  of  BE
-TA,   
-          IEEE1, T and RND.   
-
-          Throughout this routine  we use the function  SLAMC3  to ens
-ure   
-          that relevant values are  stored and not held in registers, 
- or   
-          are not affected by optimizers.   
-
-          Compute  a = 2.0**m  with the  smallest positive integer m s
-uch   
-          that   
-
-             fl( a + 1.0 ) = a. */
-
-	a = 1.f;
-	c = 1.f;
-
-/* +       WHILE( C.EQ.ONE )LOOP */
-L10:
-	if (c == one) {
-	    a *= 2;
-	    c = slamc3_(&a, &one);
-	    r__1 = -(double)a;
-	    c = slamc3_(&c, &r__1);
-	    goto L10;
-	}
-/* +       END WHILE   
-
-          Now compute  b = 2.0**m  with the smallest positive integer 
-m   
-          such that   
-
-             fl( a + b ) .gt. a. */
-
-	b = 1.f;
-	c = slamc3_(&a, &b);
-
-/* +       WHILE( C.EQ.A )LOOP */
-L20:
-	if (c == a) {
-	    b *= 2;
-	    c = slamc3_(&a, &b);
-	    goto L20;
-	}
-/* +       END WHILE   
-
-          Now compute the base.  a and c  are neighbouring floating po
-int   
-          numbers  in the  interval  ( beta**t, beta**( t + 1 ) )  and
- so   
-          their difference is beta. Adding 0.25 to c is to ensure that
- it   
-          is truncated to beta and not ( beta - 1 ). */
-
-	qtr = one / 4;
-	savec = c;
-	r__1 = -(double)a;
-	c = slamc3_(&c, &r__1);
-	lbeta = c + qtr;
-
-/*        Now determine whether rounding or chopping occurs,  by addin
-g a   
-          bit  less  than  beta/2  and a  bit  more  than  beta/2  to 
- a. */
-
-	b = (float) lbeta;
-	r__1 = b / 2;
-	r__2 = -(double)b / 100;
-	f = slamc3_(&r__1, &r__2);
-	c = slamc3_(&f, &a);
-	if (c == a) {
-	    lrnd = TRUE_;
-	} else {
-	    lrnd = FALSE_;
-	}
-	r__1 = b / 2;
-	r__2 = b / 100;
-	f = slamc3_(&r__1, &r__2);
-	c = slamc3_(&f, &a);
-	if (lrnd && c == a) {
-	    lrnd = FALSE_;
-	}
-
-/*        Try and decide whether rounding is done in the  IEEE  'round
- to   
-          nearest' style. B/2 is half a unit in the last place of the 
-two   
-          numbers A and SAVEC. Furthermore, A is even, i.e. has last  
-bit   
-          zero, and SAVEC is odd. Thus adding B/2 to A should not  cha
-nge   
-          A, but adding B/2 to SAVEC should change SAVEC. */
-
-	r__1 = b / 2;
-	t1 = slamc3_(&r__1, &a);
-	r__1 = b / 2;
-	t2 = slamc3_(&r__1, &savec);
-	lieee1 = t1 == a && t2 > savec && lrnd;
-
-/*        Now find  the  mantissa, t.  It should  be the  integer part
- of   
-          log to the base beta of a,  however it is safer to determine
-  t   
-          by powering.  So we find t as the smallest positive integer 
-for   
-          which   
-
-             fl( beta**t + 1.0 ) = 1.0. */
-
-	lt = 0;
-	a = 1.f;
-	c = 1.f;
-
-/* +       WHILE( C.EQ.ONE )LOOP */
-L30:
-	if (c == one) {
-	    ++lt;
-	    a *= lbeta;
-	    c = slamc3_(&a, &one);
-	    r__1 = -(double)a;
-	    c = slamc3_(&c, &r__1);
-	    goto L30;
-	}
-/* +       END WHILE */
-
-    }
-
-    *beta = lbeta;
-    *t = lt;
-    *rnd = lrnd;
-    *ieee1 = lieee1;
-    return 0;
-
-/*     End of SLAMC1 */
-
-} /* slamc1_ */
-
-
-/* Subroutine */ int slamc2_(int *beta, int *t, int *rnd, float *
-	eps, int *emin, float *rmin, int *emax, float *rmax)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    SLAMC2 determines the machine parameters specified in its argument   
-    list.   
-
-    Arguments   
-    =========   
-
-    BETA    (output) INT   
-            The base of the machine.   
-
-    T       (output) INT   
-            The number of ( BETA ) digits in the mantissa.   
-
-    RND     (output) INT   
-            Specifies whether proper rounding  ( RND = .TRUE. )  or   
-            chopping  ( RND = .FALSE. )  occurs in addition. This may not 
-  
-            be a reliable guide to the way in which the machine performs 
-  
-            its arithmetic.   
-
-    EPS     (output) FLOAT   
-            The smallest positive number such that   
-
-               fl( 1.0 - EPS ) .LT. 1.0,   
-
-            where fl denotes the computed value.   
-
-    EMIN    (output) INT   
-            The minimum exponent before (gradual) underflow occurs.   
-
-    RMIN    (output) FLOAT   
-            The smallest normalized number for the machine, given by   
-            BASE**( EMIN - 1 ), where  BASE  is the floating point value 
-  
-            of BETA.   
-
-    EMAX    (output) INT   
-            The maximum exponent before overflow occurs.   
-
-    RMAX    (output) FLOAT   
-            The largest positive number for the machine, given by   
-            BASE**EMAX * ( 1 - EPS ), where  BASE  is the floating point 
-  
-            value of BETA.   
-
-    Further Details   
-    ===============   
-
-    The computation of  EPS  is based on a routine PARANOIA by   
-    W. Kahan of the University of California at Berkeley.   
-
-   ===================================================================== 
-*/
-    /* Table of constant values */
-    static int c__1 = 1;
-    
-    /* Initialized data */
-    static int first = TRUE_;
-    static int iwarn = FALSE_;
-    /* System generated locals */
-    int i__1;
-    float r__1, r__2, r__3, r__4, r__5;
-    /* Builtin functions */
-    double pow_ri(float *, int *);
-    /* Local variables */
-    static int ieee;
-    static float half;
-    static int lrnd;
-    static float leps, zero, a, b, c;
-    static int i, lbeta;
-    static float rbase;
-    static int lemin, lemax, gnmin;
-    static float small;
-    static int gpmin;
-    static float third, lrmin, lrmax, sixth;
-    static int lieee1;
-    extern /* Subroutine */ int slamc1_(int *, int *, int *, 
-	    int *);
-    extern double slamc3_(float *, float *);
-    extern /* Subroutine */ int slamc4_(int *, float *, int *), 
-	    slamc5_(int *, int *, int *, int *, int *, 
-	    float *);
-    static int lt, ngnmin, ngpmin;
-    static float one, two;
-
-
-
-    if (first) {
-	first = FALSE_;
-	zero = 0.f;
-	one = 1.f;
-	two = 2.f;
-
-/*        LBETA, LT, LRND, LEPS, LEMIN and LRMIN  are the local values
- of   
-          BETA, T, RND, EPS, EMIN and RMIN.   
-
-          Throughout this routine  we use the function  SLAMC3  to ens
-ure   
-          that relevant values are stored  and not held in registers, 
- or   
-          are not affected by optimizers.   
-
-          SLAMC1 returns the parameters  LBETA, LT, LRND and LIEEE1. 
-*/
-
-	slamc1_(&lbeta, &lt, &lrnd, &lieee1);
-
-/*        Start to find EPS. */
-
-	b = (float) lbeta;
-	i__1 = -lt;
-	a = pow_ri(&b, &i__1);
-	leps = a;
-
-/*        Try some tricks to see whether or not this is the correct  E
-PS. */
-
-	b = two / 3;
-	half = one / 2;
-	r__1 = -(double)half;
-	sixth = slamc3_(&b, &r__1);
-	third = slamc3_(&sixth, &sixth);
-	r__1 = -(double)half;
-	b = slamc3_(&third, &r__1);
-	b = slamc3_(&b, &sixth);
-	b = dabs(b);
-	if (b < leps) {
-	    b = leps;
-	}
-
-	leps = 1.f;
-
-/* +       WHILE( ( LEPS.GT.B ).AND.( B.GT.ZERO ) )LOOP */
-L10:
-	if (leps > b && b > zero) {
-	    leps = b;
-	    r__1 = half * leps;
-/* Computing 5th power */
-	    r__3 = two, r__4 = r__3, r__3 *= r__3;
-/* Computing 2nd power */
-	    r__5 = leps;
-	    r__2 = r__4 * (r__3 * r__3) * (r__5 * r__5);
-	    c = slamc3_(&r__1, &r__2);
-	    r__1 = -(double)c;
-	    c = slamc3_(&half, &r__1);
-	    b = slamc3_(&half, &c);
-	    r__1 = -(double)b;
-	    c = slamc3_(&half, &r__1);
-	    b = slamc3_(&half, &c);
-	    goto L10;
-	}
-/* +       END WHILE */
-
-	if (a < leps) {
-	    leps = a;
-	}
-
-/*        Computation of EPS complete.   
-
-          Now find  EMIN.  Let A = + or - 1, and + or - (1 + BASE**(-3
-)).   
-          Keep dividing  A by BETA until (gradual) underflow occurs. T
-his   
-          is detected when we cannot recover the previous A. */
-
-	rbase = one / lbeta;
-	small = one;
-	for (i = 1; i <= 3; ++i) {
-	    r__1 = small * rbase;
-	    small = slamc3_(&r__1, &zero);
-/* L20: */
-	}
-	a = slamc3_(&one, &small);
-	slamc4_(&ngpmin, &one, &lbeta);
-	r__1 = -(double)one;
-	slamc4_(&ngnmin, &r__1, &lbeta);
-	slamc4_(&gpmin, &a, &lbeta);
-	r__1 = -(double)a;
-	slamc4_(&gnmin, &r__1, &lbeta);
-	ieee = FALSE_;
-
-	if (ngpmin == ngnmin && gpmin == gnmin) {
-	    if (ngpmin == gpmin) {
-		lemin = ngpmin;
-/*            ( Non twos-complement machines, no gradual under
-flow;   
-                e.g.,  VAX ) */
-	    } else if (gpmin - ngpmin == 3) {
-		lemin = ngpmin - 1 + lt;
-		ieee = TRUE_;
-/*            ( Non twos-complement machines, with gradual und
-erflow;   
-                e.g., IEEE standard followers ) */
-	    } else {
-		lemin = min(ngpmin,gpmin);
-/*            ( A guess; no known machine ) */
-		iwarn = TRUE_;
-	    }
-
-	} else if (ngpmin == gpmin && ngnmin == gnmin) {
-	    if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1) {
-		lemin = max(ngpmin,ngnmin);
-/*            ( Twos-complement machines, no gradual underflow
-;   
-                e.g., CYBER 205 ) */
-	    } else {
-		lemin = min(ngpmin,ngnmin);
-/*            ( A guess; no known machine ) */
-		iwarn = TRUE_;
-	    }
-
-	} else if ((i__1 = ngpmin - ngnmin, abs(i__1)) == 1 && gpmin == gnmin)
-		 {
-	    if (gpmin - min(ngpmin,ngnmin) == 3) {
-		lemin = max(ngpmin,ngnmin) - 1 + lt;
-/*            ( Twos-complement machines with gradual underflo
-w;   
-                no known machine ) */
-	    } else {
-		lemin = min(ngpmin,ngnmin);
-/*            ( A guess; no known machine ) */
-		iwarn = TRUE_;
-	    }
-
-	} else {
-/* Computing MIN */
-	    i__1 = min(ngpmin,ngnmin), i__1 = min(i__1,gpmin);
-	    lemin = min(i__1,gnmin);
-/*         ( A guess; no known machine ) */
-	    iwarn = TRUE_;
-	}
-/* **   
-   Comment out this if block if EMIN is ok */
-	if (iwarn) {
-	    first = TRUE_;
-	    printf("\n\n WARNING. The value EMIN may be incorrect:- ");
-	    printf("EMIN = %8i\n",lemin);
-	    printf("If, after inspection, the value EMIN looks acceptable");
-            printf("please comment out \n the IF block as marked within the"); 
-            printf("code of routine SLAMC2, \n otherwise supply EMIN"); 
-            printf("explicitly.\n");
-	}
-/* **   
-
-          Assume IEEE arithmetic if we found denormalised  numbers abo
-ve,   
-          or if arithmetic seems to round in the  IEEE style,  determi
-ned   
-          in routine SLAMC1. A true IEEE machine should have both  thi
-ngs   
-          true; however, faulty machines may have one or the other. */
-
-	ieee = ieee || lieee1;
-
-/*        Compute  RMIN by successive division by  BETA. We could comp
-ute   
-          RMIN as BASE**( EMIN - 1 ),  but some machines underflow dur
-ing   
-          this computation. */
-
-	lrmin = 1.f;
-	i__1 = 1 - lemin;
-	for (i = 1; i <= 1-lemin; ++i) {
-	    r__1 = lrmin * rbase;
-	    lrmin = slamc3_(&r__1, &zero);
-/* L30: */
-	}
-
-/*        Finally, call SLAMC5 to compute EMAX and RMAX. */
-
-	slamc5_(&lbeta, &lt, &lemin, &ieee, &lemax, &lrmax);
-    }
-
-    *beta = lbeta;
-    *t = lt;
-    *rnd = lrnd;
-    *eps = leps;
-    *emin = lemin;
-    *rmin = lrmin;
-    *emax = lemax;
-    *rmax = lrmax;
-
-    return 0;
-
-
-/*     End of SLAMC2 */
-
-} /* slamc2_ */
-
-
-double slamc3_(float *a, float *b)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    SLAMC3  is intended to force  A  and  B  to be stored prior to doing 
-  
-    the addition of  A  and  B ,  for use in situations where optimizers 
-  
-    might hold one of these in a register.   
-
-    Arguments   
-    =========   
-
-    A, B    (input) FLOAT   
-            The values A and B.   
-
-   ===================================================================== 
-*/
-/* >>Start of File<<   
-       System generated locals */
-    float ret_val;
-
-
-
-    ret_val = *a + *b;
-
-    return ret_val;
-
-/*     End of SLAMC3 */
-
-} /* slamc3_ */
-
-
-/* Subroutine */ int slamc4_(int *emin, float *start, int *base)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    SLAMC4 is a service routine for SLAMC2.   
-
-    Arguments   
-    =========   
-
-    EMIN    (output) EMIN   
-            The minimum exponent before (gradual) underflow, computed by 
-  
-            setting A = START and dividing by BASE until the previous A   
-            can not be recovered.   
-
-    START   (input) FLOAT   
-            The starting point for determining EMIN.   
-
-    BASE    (input) INT   
-            The base of the machine.   
-
-   ===================================================================== 
-*/
-    /* System generated locals */
-    int i__1;
-    float r__1;
-    /* Local variables */
-    static float zero, a;
-    static int i;
-    static float rbase, b1, b2, c1, c2, d1, d2;
-    extern double slamc3_(float *, float *);
-    static float one;
-
-
-
-    a = *start;
-    one = 1.f;
-    rbase = one / *base;
-    zero = 0.f;
-    *emin = 1;
-    r__1 = a * rbase;
-    b1 = slamc3_(&r__1, &zero);
-    c1 = a;
-    c2 = a;
-    d1 = a;
-    d2 = a;
-/* +    WHILE( ( C1.EQ.A ).AND.( C2.EQ.A ).AND.   
-      $       ( D1.EQ.A ).AND.( D2.EQ.A )      )LOOP */
-L10:
-    if (c1 == a && c2 == a && d1 == a && d2 == a) {
-	--(*emin);
-	a = b1;
-	r__1 = a / *base;
-	b1 = slamc3_(&r__1, &zero);
-	r__1 = b1 * *base;
-	c1 = slamc3_(&r__1, &zero);
-	d1 = zero;
-	i__1 = *base;
-	for (i = 1; i <= *base; ++i) {
-	    d1 += b1;
-/* L20: */
-	}
-	r__1 = a * rbase;
-	b2 = slamc3_(&r__1, &zero);
-	r__1 = b2 / rbase;
-	c2 = slamc3_(&r__1, &zero);
-	d2 = zero;
-	i__1 = *base;
-	for (i = 1; i <= *base; ++i) {
-	    d2 += b2;
-/* L30: */
-	}
-	goto L10;
-    }
-/* +    END WHILE */
-
-    return 0;
-
-/*     End of SLAMC4 */
-
-} /* slamc4_ */
-
-
-/* Subroutine */ int slamc5_(int *beta, int *p, int *emin, 
-	int *ieee, int *emax, float *rmax)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       October 31, 1992   
-
-
-    Purpose   
-    =======   
-
-    SLAMC5 attempts to compute RMAX, the largest machine floating-point   
-    number, without overflow.  It assumes that EMAX + abs(EMIN) sum   
-    approximately to a power of 2.  It will fail on machines where this   
-    assumption does not hold, for example, the Cyber 205 (EMIN = -28625, 
-  
-    EMAX = 28718).  It will also fail if the value supplied for EMIN is   
-    too large (i.e. too close to zero), probably with overflow.   
-
-    Arguments   
-    =========   
-
-    BETA    (input) INT   
-            The base of floating-point arithmetic.   
-
-    P       (input) INT   
-            The number of base BETA digits in the mantissa of a   
-            floating-point value.   
-
-    EMIN    (input) INT   
-            The minimum exponent before (gradual) underflow.   
-
-    IEEE    (input) INT   
-            A logical flag specifying whether or not the arithmetic   
-            system is thought to comply with the IEEE standard.   
-
-    EMAX    (output) INT   
-            The largest exponent before overflow   
-
-    RMAX    (output) FLOAT   
-            The largest machine floating-point number.   
-
-   ===================================================================== 
-  
-
-
-       First compute LEXP and UEXP, two powers of 2 that bound   
-       abs(EMIN). We then assume that EMAX + abs(EMIN) will sum   
-       approximately to the bound that is closest to abs(EMIN).   
-       (EMAX is the exponent of the required number RMAX). */
-    /* Table of constant values */
-    static float c_b5 = 0.f;
-    
-    /* System generated locals */
-    int i__1;
-    float r__1;
-    /* Local variables */
-    static int lexp;
-    static float oldy;
-    static int uexp, i;
-    static float y, z;
-    static int nbits;
-    extern double slamc3_(float *, float *);
-    static float recbas;
-    static int exbits, expsum, try__;
-
-
-
-    lexp = 1;
-    exbits = 1;
-L10:
-    try__ = lexp << 1;
-    if (try__ <= -(*emin)) {
-	lexp = try__;
-	++exbits;
-	goto L10;
-    }
-    if (lexp == -(*emin)) {
-	uexp = lexp;
-    } else {
-	uexp = try__;
-	++exbits;
-    }
-
-/*     Now -LEXP is less than or equal to EMIN, and -UEXP is greater   
-       than or equal to EMIN. EXBITS is the number of bits needed to   
-       store the exponent. */
-
-    if (uexp + *emin > -lexp - *emin) {
-	expsum = lexp << 1;
-    } else {
-	expsum = uexp << 1;
-    }
-
-/*     EXPSUM is the exponent range, approximately equal to   
-       EMAX - EMIN + 1 . */
-
-    *emax = expsum + *emin - 1;
-    nbits = exbits + 1 + *p;
-
-/*     NBITS is the total number of bits needed to store a   
-       floating-point number. */
-
-    if (nbits % 2 == 1 && *beta == 2) {
-
-/*        Either there are an odd number of bits used to store a   
-          floating-point number, which is unlikely, or some bits are 
-  
-          not used in the representation of numbers, which is possible
-,   
-          (e.g. Cray machines) or the mantissa has an implicit bit,   
-          (e.g. IEEE machines, Dec Vax machines), which is perhaps the
-   
-          most likely. We have to assume the last alternative.   
-          If this is true, then we need to reduce EMAX by one because 
-  
-          there must be some way of representing zero in an implicit-b
-it   
-          system. On machines like Cray, we are reducing EMAX by one 
-  
-          unnecessarily. */
-
-	--(*emax);
-    }
-
-    if (*ieee) {
-
-/*        Assume we are on an IEEE machine which reserves one exponent
-   
-          for infinity and NaN. */
-
-	--(*emax);
-    }
-
-/*     Now create RMAX, the largest machine number, which should   
-       be equal to (1.0 - BETA**(-P)) * BETA**EMAX .   
-
-       First compute 1.0 - BETA**(-P), being careful that the   
-       result is less than 1.0 . */
-
-    recbas = 1.f / *beta;
-    z = *beta - 1.f;
-    y = 0.f;
-    i__1 = *p;
-    for (i = 1; i <= *p; ++i) {
-	z *= recbas;
-	if (y < 1.f) {
-	    oldy = y;
-	}
-	y = slamc3_(&y, &z);
-/* L20: */
-    }
-    if (y >= 1.f) {
-	y = oldy;
-    }
-
-/*     Now multiply by BETA**EMAX to get RMAX. */
-
-    i__1 = *emax;
-    for (i = 1; i <= *emax; ++i) {
-	r__1 = y * *beta;
-	y = slamc3_(&r__1, &c_b5);
-/* L30: */
-    }
-
-    *rmax = y;
-    return 0;
-
-/*     End of SLAMC5 */
-
-} /* slamc5_ */
-
-
-double pow_ri(float *ap, int *bp)
-{
-double pow, x;
-int n;
-
-pow = 1;
-x = *ap;
-n = *bp;
-
-if(n != 0)
-	{
-	if(n < 0)
-		{
-		n = -n;
-		x = 1/x;
-		}
-	for( ; ; )
-		{
-		if(n & 01)
-			pow *= x;
-		if(n >>= 1)
-			x *= x;
-		else
-			break;
-		}
-	}
-return(pow);
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slangs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slangs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slangs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slangs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,110 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * File name:	slangs.c
- * History:     Modified from lapack routine SLANGE
- */
-#include <math.h>
-#include "slu_sdefs.h"
-
-float slangs(char *norm, SuperMatrix *A)
-{
-/* 
-    Purpose   
-    =======   
-
-    SLANGS returns the value of the one norm, or the Frobenius norm, or 
-    the infinity norm, or the element of largest absolute value of a 
-    real matrix A.   
-
-    Description   
-    ===========   
-
-    SLANGE returns the value   
-
-       SLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'   
-                (   
-                ( norm1(A),         NORM = '1', 'O' or 'o'   
-                (   
-                ( normI(A),         NORM = 'I' or 'i'   
-                (   
-                ( normF(A),         NORM = 'F', 'f', 'E' or 'e'   
-
-    where  norm1  denotes the  one norm of a matrix (maximum column sum), 
-    normI  denotes the  infinity norm  of a matrix  (maximum row sum) and 
-    normF  denotes the  Frobenius norm of a matrix (square root of sum of 
-    squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.   
-
-    Arguments   
-    =========   
-
-    NORM    (input) CHARACTER*1   
-            Specifies the value to be returned in SLANGE as described above.   
-    A       (input) SuperMatrix*
-            The M by N sparse matrix A. 
-
-   ===================================================================== 
-*/
-    
-    /* Local variables */
-    NCformat *Astore;
-    float   *Aval;
-    int      i, j, irow;
-    float   value, sum;
-    float   *rwork;
-
-    Astore = A->Store;
-    Aval   = Astore->nzval;
-    
-    if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) {
-	value = 0.;
-	
-    } else if (lsame_(norm, "M")) {
-	/* Find max(abs(A(i,j))). */
-	value = 0.;
-	for (j = 0; j < A->ncol; ++j)
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++)
-		value = SUPERLU_MAX( value, fabs( Aval[i]) );
-	
-    } else if (lsame_(norm, "O") || *(unsigned char *)norm == '1') {
-	/* Find norm1(A). */
-	value = 0.;
-	for (j = 0; j < A->ncol; ++j) {
-	    sum = 0.;
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) 
-		sum += fabs(Aval[i]);
-	    value = SUPERLU_MAX(value,sum);
-	}
-	
-    } else if (lsame_(norm, "I")) {
-	/* Find normI(A). */
-	if ( !(rwork = (float *) SUPERLU_MALLOC(A->nrow * sizeof(float))) )
-	    ABORT("SUPERLU_MALLOC fails for rwork.");
-	for (i = 0; i < A->nrow; ++i) rwork[i] = 0.;
-	for (j = 0; j < A->ncol; ++j)
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) {
-		irow = Astore->rowind[i];
-		rwork[irow] += fabs(Aval[i]);
-	    }
-	value = 0.;
-	for (i = 0; i < A->nrow; ++i)
-	    value = SUPERLU_MAX(value, rwork[i]);
-	
-	SUPERLU_FREE (rwork);
-	
-    } else if (lsame_(norm, "F") || lsame_(norm, "E")) {
-	/* Find normF(A). */
-	ABORT("Not implemented.");
-    } else
-	ABORT("Illegal norm specified.");
-
-    return (value);
-
-} /* slangs */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slaqgs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slaqgs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slaqgs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slaqgs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,136 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * File name:	slaqgs.c
- * History:     Modified from LAPACK routine SLAQGE
- */
-#include <math.h>
-#include "slu_sdefs.h"
-
-void
-slaqgs(SuperMatrix *A, float *r, float *c, 
-	float rowcnd, float colcnd, float amax, char *equed)
-{
-/*
-    Purpose   
-    =======   
-
-    SLAQGS equilibrates a general sparse M by N matrix A using the row and   
-    scaling factors in the vectors R and C.   
-
-    See supermatrix.h for the definition of 'SuperMatrix' structure.
-
-    Arguments   
-    =========   
-
-    A       (input/output) SuperMatrix*
-            On exit, the equilibrated matrix.  See EQUED for the form of 
-            the equilibrated matrix. The type of A can be:
-	    Stype = NC; Dtype = SLU_S; Mtype = GE.
-	    
-    R       (input) float*, dimension (A->nrow)
-            The row scale factors for A.
-	    
-    C       (input) float*, dimension (A->ncol)
-            The column scale factors for A.
-	    
-    ROWCND  (input) float
-            Ratio of the smallest R(i) to the largest R(i).
-	    
-    COLCND  (input) float
-            Ratio of the smallest C(i) to the largest C(i).
-	    
-    AMAX    (input) float
-            Absolute value of largest matrix entry.
-	    
-    EQUED   (output) char*
-            Specifies the form of equilibration that was done.   
-            = 'N':  No equilibration   
-            = 'R':  Row equilibration, i.e., A has been premultiplied by  
-                    diag(R).   
-            = 'C':  Column equilibration, i.e., A has been postmultiplied  
-                    by diag(C).   
-            = 'B':  Both row and column equilibration, i.e., A has been
-                    replaced by diag(R) * A * diag(C).   
-
-    Internal Parameters   
-    ===================   
-
-    THRESH is a threshold value used to decide if row or column scaling   
-    should be done based on the ratio of the row or column scaling   
-    factors.  If ROWCND < THRESH, row scaling is done, and if   
-    COLCND < THRESH, column scaling is done.   
-
-    LARGE and SMALL are threshold values used to decide if row scaling   
-    should be done based on the absolute size of the largest matrix   
-    element.  If AMAX > LARGE or AMAX < SMALL, row scaling is done.   
-
-    ===================================================================== 
-*/
-
-#define THRESH    (0.1)
-    
-    /* Local variables */
-    NCformat *Astore;
-    float   *Aval;
-    int i, j, irow;
-    float large, small, cj;
-    extern double slamch_(char *);
-
-
-    /* Quick return if possible */
-    if (A->nrow <= 0 || A->ncol <= 0) {
-	*(unsigned char *)equed = 'N';
-	return;
-    }
-
-    Astore = A->Store;
-    Aval = Astore->nzval;
-    
-    /* Initialize LARGE and SMALL. */
-    small = slamch_("Safe minimum") / slamch_("Precision");
-    large = 1. / small;
-
-    if (rowcnd >= THRESH && amax >= small && amax <= large) {
-	if (colcnd >= THRESH)
-	    *(unsigned char *)equed = 'N';
-	else {
-	    /* Column scaling */
-	    for (j = 0; j < A->ncol; ++j) {
-		cj = c[j];
-		for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-		    Aval[i] *= cj;
-                }
-	    }
-	    *(unsigned char *)equed = 'C';
-	}
-    } else if (colcnd >= THRESH) {
-	/* Row scaling, no column scaling */
-	for (j = 0; j < A->ncol; ++j)
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-		irow = Astore->rowind[i];
-		Aval[i] *= r[irow];
-	    }
-	*(unsigned char *)equed = 'R';
-    } else {
-	/* Row and column scaling */
-	for (j = 0; j < A->ncol; ++j) {
-	    cj = c[j];
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-		irow = Astore->rowind[i];
-		Aval[i] *= cj * r[irow];
-	    }
-	}
-	*(unsigned char *)equed = 'B';
-    }
-
-    return;
-
-} /* slaqgs */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_cdefs.h hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_cdefs.h
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_cdefs.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_cdefs.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,237 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-#ifndef __SUPERLU_cSP_DEFS /* allow multiple inclusions */
-#define __SUPERLU_cSP_DEFS
-
-/*
- * File name:		csp_defs.h
- * Purpose:             Sparse matrix types and function prototypes
- * History:
- */
-
-#ifdef _CRAY
-#include <fortran.h>
-#include <string.h>
-#endif
-
-/* Define my integer type int_t */
-typedef int int_t; /* default */
-
-#include "slu_Cnames.h"
-#include "supermatrix.h"
-#include "slu_util.h"
-#include "slu_scomplex.h"
-
-
-/*
- * Global data structures used in LU factorization -
- * 
- *   nsuper: #supernodes = nsuper + 1, numbered [0, nsuper].
- *   (xsup,supno): supno[i] is the supernode no to which i belongs;
- *	xsup(s) points to the beginning of the s-th supernode.
- *	e.g.   supno 0 1 2 2 3 3 3 4 4 4 4 4   (n=12)
- *	        xsup 0 1 2 4 7 12
- *	Note: dfs will be performed on supernode rep. relative to the new 
- *	      row pivoting ordering
- *
- *   (xlsub,lsub): lsub[*] contains the compressed subscript of
- *	rectangular supernodes; xlsub[j] points to the starting
- *	location of the j-th column in lsub[*]. Note that xlsub 
- *	is indexed by column.
- *	Storage: original row subscripts
- *
- *      During the course of sparse LU factorization, we also use
- *	(xlsub,lsub) for the purpose of symmetric pruning. For each
- *	supernode {s,s+1,...,t=s+r} with first column s and last
- *	column t, the subscript set
- *		lsub[j], j=xlsub[s], .., xlsub[s+1]-1
- *	is the structure of column s (i.e. structure of this supernode).
- *	It is used for the storage of numerical values.
- *	Furthermore,
- *		lsub[j], j=xlsub[t], .., xlsub[t+1]-1
- *	is the structure of the last column t of this supernode.
- *	It is for the purpose of symmetric pruning. Therefore, the
- *	structural subscripts can be rearranged without making physical
- *	interchanges among the numerical values.
- *
- *	However, if the supernode has only one column, then we
- *	only keep one set of subscripts. For any subscript interchange
- *	performed, similar interchange must be done on the numerical
- *	values.
- *
- *	The last column structures (for pruning) will be removed
- *	after the numercial LU factorization phase.
- *
- *   (xlusup,lusup): lusup[*] contains the numerical values of the
- *	rectangular supernodes; xlusup[j] points to the starting
- *	location of the j-th column in storage vector lusup[*]
- *	Note: xlusup is indexed by column.
- *	Each rectangular supernode is stored by column-major
- *	scheme, consistent with Fortran 2-dim array storage.
- *
- *   (xusub,ucol,usub): ucol[*] stores the numerical values of
- *	U-columns outside the rectangular supernodes. The row
- *	subscript of nonzero ucol[k] is stored in usub[k].
- *	xusub[i] points to the starting location of column i in ucol.
- *	Storage: new row subscripts; that is subscripts of PA.
- */
-typedef struct {
-    int     *xsup;    /* supernode and column mapping */
-    int     *supno;   
-    int     *lsub;    /* compressed L subscripts */
-    int	    *xlsub;
-    complex  *lusup;   /* L supernodes */
-    int     *xlusup;
-    complex  *ucol;    /* U columns */
-    int     *usub;
-    int	    *xusub;
-    int     nzlmax;   /* current max size of lsub */
-    int     nzumax;   /*    "    "    "      ucol */
-    int     nzlumax;  /*    "    "    "     lusup */
-    int     n;        /* number of columns in the matrix */
-    LU_space_t MemModel; /* 0 - system malloc'd; 1 - user provided */
-} GlobalLU_t;
-
-typedef struct {
-    float for_lu;
-    float total_needed;
-    int   expansions;
-} mem_usage_t;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* Driver routines */
-extern void
-cgssv(superlu_options_t *, SuperMatrix *, int *, int *, SuperMatrix *,
-      SuperMatrix *, SuperMatrix *, SuperLUStat_t *, int *);
-extern void
-cgssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *,
-       char *, float *, float *, SuperMatrix *, SuperMatrix *,
-       void *, int, SuperMatrix *, SuperMatrix *,
-       float *, float *, float *, float *,
-       mem_usage_t *, SuperLUStat_t *, int *);
-
-/* Supernodal LU factor related */
-extern void
-cCreate_CompCol_Matrix(SuperMatrix *, int, int, int, complex *,
-		       int *, int *, Stype_t, Dtype_t, Mtype_t);
-extern void
-cCreate_CompRow_Matrix(SuperMatrix *, int, int, int, complex *,
-		       int *, int *, Stype_t, Dtype_t, Mtype_t);
-extern void
-cCopy_CompCol_Matrix(SuperMatrix *, SuperMatrix *);
-extern void
-cCreate_Dense_Matrix(SuperMatrix *, int, int, complex *, int,
-		     Stype_t, Dtype_t, Mtype_t);
-extern void
-cCreate_SuperNode_Matrix(SuperMatrix *, int, int, int, complex *, 
-		         int *, int *, int *, int *, int *,
-			 Stype_t, Dtype_t, Mtype_t);
-extern void
-cCopy_Dense_Matrix(int, int, complex *, int, complex *, int);
-
-extern void    countnz (const int, int *, int *, int *, GlobalLU_t *);
-extern void    fixupL (const int, const int *, GlobalLU_t *);
-
-extern void    callocateA (int, int, complex **, int **, int **);
-extern void    cgstrf (superlu_options_t*, SuperMatrix*, float, 
-                       int, int, int*, void *, int, int *, int *, 
-                       SuperMatrix *, SuperMatrix *, SuperLUStat_t*, int *);
-extern int     csnode_dfs (const int, const int, const int *, const int *,
-			     const int *, int *, int *, GlobalLU_t *);
-extern int     csnode_bmod (const int, const int, const int, complex *,
-                              complex *, GlobalLU_t *, SuperLUStat_t*);
-extern void    cpanel_dfs (const int, const int, const int, SuperMatrix *,
-			   int *, int *, complex *, int *, int *, int *,
-			   int *, int *, int *, int *, GlobalLU_t *);
-extern void    cpanel_bmod (const int, const int, const int, const int,
-                           complex *, complex *, int *, int *,
-			   GlobalLU_t *, SuperLUStat_t*);
-extern int     ccolumn_dfs (const int, const int, int *, int *, int *, int *,
-			   int *, int *, int *, int *, int *, GlobalLU_t *);
-extern int     ccolumn_bmod (const int, const int, complex *,
-			   complex *, int *, int *, int,
-                           GlobalLU_t *, SuperLUStat_t*);
-extern int     ccopy_to_ucol (int, int, int *, int *, int *,
-                              complex *, GlobalLU_t *);         
-extern int     cpivotL (const int, const float, int *, int *, 
-                         int *, int *, int *, GlobalLU_t *, SuperLUStat_t*);
-extern void    cpruneL (const int, const int *, const int, const int,
-			  const int *, const int *, int *, GlobalLU_t *);
-extern void    creadmt (int *, int *, int *, complex **, int **, int **);
-extern void    cGenXtrue (int, int, complex *, int);
-extern void    cFillRHS (trans_t, int, complex *, int, SuperMatrix *,
-			  SuperMatrix *);
-extern void    cgstrs (trans_t, SuperMatrix *, SuperMatrix *, int *, int *,
-                        SuperMatrix *, SuperLUStat_t*, int *);
-
-
-/* Driver related */
-
-extern void    cgsequ (SuperMatrix *, float *, float *, float *,
-			float *, float *, int *);
-extern void    claqgs (SuperMatrix *, float *, float *, float,
-                        float, float, char *);
-extern void    cgscon (char *, SuperMatrix *, SuperMatrix *, 
-		         float, float *, SuperLUStat_t*, int *);
-extern float   cPivotGrowth(int, SuperMatrix *, int *, 
-                            SuperMatrix *, SuperMatrix *);
-extern void    cgsrfs (trans_t, SuperMatrix *, SuperMatrix *,
-                       SuperMatrix *, int *, int *, char *, float *, 
-                       float *, SuperMatrix *, SuperMatrix *,
-                       float *, float *, SuperLUStat_t*, int *);
-
-extern int     sp_ctrsv (char *, char *, char *, SuperMatrix *,
-			SuperMatrix *, complex *, SuperLUStat_t*, int *);
-extern int     sp_cgemv (char *, complex, SuperMatrix *, complex *,
-			int, complex, complex *, int);
-
-extern int     sp_cgemm (char *, char *, int, int, int, complex,
-			SuperMatrix *, complex *, int, complex, 
-			complex *, int);
-
-/* Memory-related */
-extern int     cLUMemInit (fact_t, void *, int, int, int, int, int,
-			     SuperMatrix *, SuperMatrix *,
-			     GlobalLU_t *, int **, complex **);
-extern void    cSetRWork (int, int, complex *, complex **, complex **);
-extern void    cLUWorkFree (int *, complex *, GlobalLU_t *);
-extern int     cLUMemXpand (int, int, MemType, int *, GlobalLU_t *);
-
-extern complex  *complexMalloc(int);
-extern complex  *complexCalloc(int);
-extern float  *floatMalloc(int);
-extern float  *floatCalloc(int);
-extern int     cmemory_usage(const int, const int, const int, const int);
-extern int     cQuerySpace (SuperMatrix *, SuperMatrix *, mem_usage_t *);
-
-/* Auxiliary routines */
-extern void    creadhb(int *, int *, int *, complex **, int **, int **);
-extern void    cCompRow_to_CompCol(int, int, int, complex*, int*, int*,
-		                   complex **, int **, int **);
-extern void    cfill (complex *, int, complex);
-extern void    cinf_norm_error (int, SuperMatrix *, complex *);
-extern void    PrintPerf (SuperMatrix *, SuperMatrix *, mem_usage_t *,
-			 complex, complex, complex *, complex *, char *);
-
-/* Routines for debugging */
-extern void    cPrint_CompCol_Matrix(char *, SuperMatrix *);
-extern void    cPrint_SuperNode_Matrix(char *, SuperMatrix *);
-extern void    cPrint_Dense_Matrix(char *, SuperMatrix *);
-extern void    print_lu_col(char *, int, int, int *, GlobalLU_t *);
-extern void    check_tempv(int, complex *);
-
-#ifdef __cplusplus
-  }
-#endif
-
-#endif /* __SUPERLU_cSP_DEFS */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_Cnames.h hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_Cnames.h
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_Cnames.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_Cnames.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,335 +0,0 @@
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 1, 1997
- *
- */
-#ifndef __SUPERLU_CNAMES /* allow multiple inclusions */
-#define __SUPERLU_CNAMES
-
-/*
- * These macros define how C routines will be called.  ADD_ assumes that
- * they will be called by fortran, which expects C routines to have an
- * underscore postfixed to the name (Suns, and the Intel expect this).
- * NOCHANGE indicates that fortran will be calling, and that it expects
- * the name called by fortran to be identical to that compiled by the C
- * (RS6K's do this).  UPCASE says it expects C routines called by fortran
- * to be in all upcase (CRAY wants this). 
- */
-
-#define ADD_       0
-#define ADD__      1
-#define NOCHANGE   2
-#define UPCASE     3
-#define C_CALL     4
-
-#ifdef UpCase
-#define F77_CALL_C UPCASE
-#endif
-
-#ifdef NoChange
-#define F77_CALL_C NOCHANGE
-#endif
-
-#ifdef Add_
-#define F77_CALL_C ADD_
-#endif
-
-#ifdef Add__
-#define F77_CALL_C ADD__
-#endif
-
-/* Default */
-#ifndef F77_CALL_C
-#define F77_CALL_C ADD_
-#endif
-
-
-#if (F77_CALL_C == ADD_)
-/*
- * These defines set up the naming scheme required to have a fortran 77
- * routine call a C routine
- * No redefinition necessary to have following Fortran to C interface:
- *           FORTRAN CALL               C DECLARATION
- *           call dgemm(...)           void dgemm_(...)
- *
- * This is the default.
- */
-
-#endif
-
-#if (F77_CALL_C == ADD__)
-/*
- * These defines set up the naming scheme required to have a fortran 77
- * routine call a C routine 
- * for following Fortran to C interface:
- *           FORTRAN CALL               C DECLARATION
- *           call dgemm(...)           void dgemm__(...)
- */
-/* BLAS */
-#define sasum_    sasum__
-#define isamax_   isamax__
-#define scopy_    scopy__
-#define sscal_    sscal__
-#define sger_     sger__
-#define snrm2_    snrm2__
-#define ssymv_    ssymv__
-#define sdot_     sdot__
-#define saxpy_    saxpy__
-#define ssyr2_    ssyr2__
-#define srot_     srot__
-#define sgemv_    sgemv__
-#define strsv_    strsv__
-#define sgemm_    sgemm__
-#define strsm_    strsm__
-
-#define dasum_    dasum__
-#define idamax_   idamax__
-#define dcopy_    dcopy__
-#define dscal_    dscal__
-#define dger_     dger__
-#define dnrm2_    dnrm2__
-#define dsymv_    dsymv__
-#define ddot_     ddot__
-#define daxpy_    daxpy__
-#define dsyr2_    dsyr2__
-#define drot_     drot__
-#define dgemv_    dgemv__
-#define dtrsv_    dtrsv__
-#define dgemm_    dgemm__
-#define dtrsm_    dtrsm__
-
-#define scasum_   scasum__
-#define icamax_   icamax__
-#define ccopy_    ccopy__
-#define cscal_    cscal__
-#define scnrm2_   scnrm2__
-#define caxpy_    caxpy__
-#define cgemv_    cgemv__
-#define ctrsv_    ctrsv__
-#define cgemm_    cgemm__
-#define ctrsm_    ctrsm__
-#define cgerc_    cgerc__
-#define chemv_    chemv__
-#define cher2_    cher2__
-
-#define dzasum_   dzasum__
-#define izamax_   izamax__
-#define zcopy_    zcopy__
-#define zscal_    zscal__
-#define dznrm2_   dznrm2__
-#define zaxpy_    zaxpy__
-#define zgemv_    zgemv__
-#define ztrsv_    ztrsv__
-#define zgemm_    zgemm__
-#define ztrsm_    ztrsm__
-#define zgerc_    zgerc__
-#define zhemv_    zhemv__
-#define zher2_    zher2__
-
-/* LAPACK */
-#define dlamch_   dlamch__
-#define slamch_   slamch__
-#define xerbla_   xerbla__
-#define lsame_    lsame__
-#define dlacon_   dlacon__
-#define slacon_   slacon__
-#define icmax1_   icmax1__
-#define scsum1_   scsum1__
-#define clacon_   clacon__
-#define dzsum1_   dzsum1__
-#define izmax1_   izmax1__
-#define zlacon_   zlacon__
-
-/* Fortran interface */
-#define c_bridge_dgssv_ c_bridge_dgssv__
-#define c_fortran_sgssv_ c_fortran_sgssv__
-#define c_fortran_dgssv_ c_fortran_dgssv__
-#define c_fortran_cgssv_ c_fortran_cgssv__
-#define c_fortran_zgssv_ c_fortran_zgssv__
-#endif
-
-#if (F77_CALL_C == UPCASE)
-/*
- * These defines set up the naming scheme required to have a fortran 77
- * routine call a C routine 
- * following Fortran to C interface:
- *           FORTRAN CALL               C DECLARATION
- *           call dgemm(...)           void DGEMM(...)
- */
-/* BLAS */
-#define sasum_    SASUM
-#define isamax_   ISAMAX
-#define scopy_    SCOPY
-#define sscal_    SSCAL
-#define sger_     SGER
-#define snrm2_    SNRM2
-#define ssymv_    SSYMV
-#define sdot_     SDOT
-#define saxpy_    SAXPY
-#define ssyr2_    SSYR2
-#define srot_     SROT
-#define sgemv_    SGEMV
-#define strsv_    STRSV
-#define sgemm_    SGEMM
-#define strsm_    STRSM
-
-#define dasum_    SASUM
-#define idamax_   ISAMAX
-#define dcopy_    SCOPY
-#define dscal_    SSCAL
-#define dger_     SGER
-#define dnrm2_    SNRM2
-#define dsymv_    SSYMV
-#define ddot_     SDOT
-#define daxpy_    SAXPY
-#define dsyr2_    SSYR2
-#define drot_     SROT
-#define dgemv_    SGEMV
-#define dtrsv_    STRSV
-#define dgemm_    SGEMM
-#define dtrsm_    STRSM
-
-#define scasum_   SCASUM
-#define icamax_   ICAMAX
-#define ccopy_    CCOPY
-#define cscal_    CSCAL
-#define scnrm2_   SCNRM2
-#define caxpy_    CAXPY
-#define cgemv_    CGEMV
-#define ctrsv_    CTRSV
-#define cgemm_    CGEMM
-#define ctrsm_    CTRSM
-#define cgerc_    CGERC
-#define chemv_    CHEMV
-#define cher2_    CHER2
-
-#define dzasum_   SCASUM
-#define izamax_   ICAMAX
-#define zcopy_    CCOPY
-#define zscal_    CSCAL
-#define dznrm2_   SCNRM2
-#define zaxpy_    CAXPY
-#define zgemv_    CGEMV
-#define ztrsv_    CTRSV
-#define zgemm_    CGEMM
-#define ztrsm_    CTRSM
-#define zgerc_    CGERC
-#define zhemv_    CHEMV
-#define zher2_    CHER2
-
-/* LAPACK */
-#define dlamch_   DLAMCH
-#define slamch_   SLAMCH
-#define xerbla_   XERBLA
-#define lsame_    LSAME
-#define dlacon_   DLACON
-#define slacon_   SLACON
-#define icmax1_   ICMAX1
-#define scsum1_   SCSUM1
-#define clacon_   CLACON
-#define dzsum1_   DZSUM1
-#define izmax1_   IZMAX1
-#define zlacon_   ZLACON
-
-/* Fortran interface */
-#define c_bridge_dgssv_ C_BRIDGE_DGSSV
-#define c_fortran_sgssv_ C_FORTRAN_SGSSV
-#define c_fortran_dgssv_ C_FORTRAN_DGSSV
-#define c_fortran_cgssv_ C_FORTRAN_CGSSV
-#define c_fortran_zgssv_ C_FORTRAN_ZGSSV
-#endif
-
-#if (F77_CALL_C == NOCHANGE)
-/*
- * These defines set up the naming scheme required to have a fortran 77
- * routine call a C routine 
- * for following Fortran to C interface:
- *           FORTRAN CALL               C DECLARATION
- *           call dgemm(...)           void dgemm(...)
- */
-/* BLAS */
-#define sasum_    sasum
-#define isamax_   isamax
-#define scopy_    scopy
-#define sscal_    sscal
-#define sger_     sger
-#define snrm2_    snrm2
-#define ssymv_    ssymv
-#define sdot_     sdot
-#define saxpy_    saxpy
-#define ssyr2_    ssyr2
-#define srot_     srot
-#define sgemv_    sgemv
-#define strsv_    strsv
-#define sgemm_    sgemm
-#define strsm_    strsm
-
-#define dasum_    dasum
-#define idamax_   idamax
-#define dcopy_    dcopy
-#define dscal_    dscal
-#define dger_     dger
-#define dnrm2_    dnrm2
-#define dsymv_    dsymv
-#define ddot_     ddot
-#define daxpy_    daxpy
-#define dsyr2_    dsyr2
-#define drot_     drot
-#define dgemv_    dgemv
-#define dtrsv_    dtrsv
-#define dgemm_    dgemm
-#define dtrsm_    dtrsm
-
-#define scasum_   scasum
-#define icamax_   icamax
-#define ccopy_    ccopy
-#define cscal_    cscal
-#define scnrm2_   scnrm2
-#define caxpy_    caxpy
-#define cgemv_    cgemv
-#define ctrsv_    ctrsv
-#define cgemm_    cgemm
-#define ctrsm_    ctrsm
-#define cgerc_    cgerc
-#define chemv_    chemv
-#define cher2_    cher2
-
-#define dzasum_   dzasum
-#define izamax_   izamax
-#define zcopy_    zcopy
-#define zscal_    zscal
-#define dznrm2_   dznrm2
-#define zaxpy_    zaxpy
-#define zgemv_    zgemv
-#define ztrsv_    ztrsv
-#define zgemm_    zgemm
-#define ztrsm_    ztrsm
-#define zgerc_    zgerc
-#define zhemv_    zhemv
-#define zher2_    zher2
-
-/* LAPACK */
-#define dlamch_   dlamch
-#define slamch_   slamch
-#define xerbla_   xerbla
-#define lsame_    lsame
-#define dlacon_   dlacon
-#define slacon_   slacon
-#define icmax1_   icmax1
-#define scsum1_   scsum1
-#define clacon_   clacon
-#define dzsum1_   dzsum1
-#define izmax1_   izmax1
-#define zlacon_   zlacon
-
-/* Fortran interface */
-#define c_bridge_dgssv_ c_bridge_dgssv
-#define c_fortran_sgssv_ c_fortran_sgssv
-#define c_fortran_dgssv_ c_fortran_dgssv
-#define c_fortran_cgssv_ c_fortran_cgssv
-#define c_fortran_zgssv_ c_fortran_zgssv
-#endif
-
-#endif /* __SUPERLU_CNAMES */
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_dcomplex.h hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_dcomplex.h
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_dcomplex.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_dcomplex.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,72 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-#ifndef __SUPERLU_DCOMPLEX /* allow multiple inclusions */
-#define __SUPERLU_DCOMPLEX
-
-/* 
- * This header file is to be included in source files z*.c
- */
-#ifndef DCOMPLEX_INCLUDE
-#define DCOMPLEX_INCLUDE
-
-typedef struct { double r, i; } doublecomplex;
-
-
-/* Macro definitions */
-
-/* Complex Addition c = a + b */
-#define z_add(c, a, b) { (c)->r = (a)->r + (b)->r; \
-			 (c)->i = (a)->i + (b)->i; }
-
-/* Complex Subtraction c = a - b */
-#define z_sub(c, a, b) { (c)->r = (a)->r - (b)->r; \
-			 (c)->i = (a)->i - (b)->i; }
-
-/* Complex-Double Multiplication */
-#define zd_mult(c, a, b) { (c)->r = (a)->r * (b); \
-                           (c)->i = (a)->i * (b); }
-
-/* Complex-Complex Multiplication */
-#define zz_mult(c, a, b) { \
-	double cr, ci; \
-    	cr = (a)->r * (b)->r - (a)->i * (b)->i; \
-    	ci = (a)->i * (b)->r + (a)->r * (b)->i; \
-    	(c)->r = cr; \
-    	(c)->i = ci; \
-    }
-
-#define zz_conj(a, b) { \
-        (a)->r = (b)->r; \
-        (a)->i = -((b)->i); \
-    }
-
-/* Complex equality testing */
-#define z_eq(a, b)  ( (a)->r == (b)->r && (a)->i == (b)->i )
-
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* Prototypes for functions in dcomplex.c */
-void z_div(doublecomplex *, doublecomplex *, doublecomplex *);
-double z_abs(doublecomplex *);     /* exact */
-double z_abs1(doublecomplex *);    /* approximate */
-void z_exp(doublecomplex *, doublecomplex *);
-void d_cnjg(doublecomplex *r, doublecomplex *z);
-double d_imag(doublecomplex *);
-
-
-#ifdef __cplusplus
-  }
-#endif
-
-#endif
-
-#endif  /* __SUPERLU_DCOMPLEX */
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_ddefs.h hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_ddefs.h
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_ddefs.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_ddefs.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,240 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-#ifndef __SUPERLU_dSP_DEFS /* allow multiple inclusions */
-#define __SUPERLU_dSP_DEFS
-
-/*
- * File name:		dsp_defs.h
- * Purpose:             Sparse matrix types and function prototypes
- * History:
- */
-/*
-  This file has been modified to be compatible with the HYPRE
-  linear solver
-*/
-
-#include <HYPRE_utilities.h>
-#include <fortran.h>
-#ifdef _CRAY
-#include <string.h>
-#endif
-
-/* Define my integer type int_t */
-typedef int int_t; /* default */
-
-/* Added by Rob Falgout temporarily */
-//typedef int logical;
-
-#include "supermatrix.h"
-#include "slu_util.h"
-
-/*
- * Global data structures used in LU factorization -
- * 
- *   nsuper: #supernodes = nsuper + 1, numbered [0, nsuper].
- *   (xsup,supno): supno[i] is the supernode no to which i belongs;
- *	xsup(s) points to the beginning of the s-th supernode.
- *	e.g.   supno 0 1 2 2 3 3 3 4 4 4 4 4   (n=12)
- *	        xsup 0 1 2 4 7 12
- *	Note: dfs will be performed on supernode rep. relative to the new 
- *	      row pivoting ordering
- *
- *   (xlsub,lsub): lsub[*] contains the compressed subscript of
- *	rectangular supernodes; xlsub[j] points to the starting
- *	location of the j-th column in lsub[*]. Note that xlsub 
- *	is indexed by column.
- *	Storage: original row subscripts
- *
- *      During the course of sparse LU factorization, we also use
- *	(xlsub,lsub) for the purpose of symmetric pruning. For each
- *	supernode {s,s+1,...,t=s+r} with first column s and last
- *	column t, the subscript set
- *		lsub[j], j=xlsub[s], .., xlsub[s+1]-1
- *	is the structure of column s (i.e. structure of this supernode).
- *	It is used for the storage of numerical values.
- *	Furthermore,
- *		lsub[j], j=xlsub[t], .., xlsub[t+1]-1
- *	is the structure of the last column t of this supernode.
- *	It is for the purpose of symmetric pruning. Therefore, the
- *	structural subscripts can be rearranged without making physical
- *	interchanges among the numerical values.
- *
- *	However, if the supernode has only one column, then we
- *	only keep one set of subscripts. For any subscript interchange
- *	performed, similar interchange must be done on the numerical
- *	values.
- *
- *	The last column structures (for pruning) will be removed
- *	after the numercial LU factorization phase.
- *
- *   (xlusup,lusup): lusup[*] contains the numerical values of the
- *	rectangular supernodes; xlusup[j] points to the starting
- *	location of the j-th column in storage vector lusup[*]
- *	Note: xlusup is indexed by column.
- *	Each rectangular supernode is stored by column-major
- *	scheme, consistent with Fortran 2-dim array storage.
- *
- *   (xusub,ucol,usub): ucol[*] stores the numerical values of
- *	U-columns outside the rectangular supernodes. The row
- *	subscript of nonzero ucol[k] is stored in usub[k].
- *	xusub[i] points to the starting location of column i in ucol.
- *	Storage: new row subscripts; that is subscripts of PA.
- */
-typedef struct {
-    int     *xsup;    /* supernode and column mapping */
-    int     *supno;   
-    int     *lsub;    /* compressed L subscripts */
-    int	    *xlsub;
-    double  *lusup;   /* L supernodes */
-    int     *xlusup;
-    double  *ucol;    /* U columns */
-    int     *usub;
-    int	    *xusub;
-    int     nzlmax;   /* current max size of lsub */
-    int     nzumax;   /*    "    "    "      ucol */
-    int     nzlumax;  /*    "    "    "     lusup */
-    int     n;        /* number of columns in the matrix */
-    LU_space_t MemModel; /* 0 - system malloc'd; 1 - user provided */
-} GlobalLU_t;
-
-typedef struct {
-    float for_lu;
-    float total_needed;
-    int   expansions;
-} mem_usage_t;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* Driver routines */
-extern void
-dgssv(superlu_options_t *, SuperMatrix *, int *, int *, SuperMatrix *,
-      SuperMatrix *, SuperMatrix *, SuperLUStat_t *, int *);
-extern void
-dgssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *,
-       char *, double *, double *, SuperMatrix *, SuperMatrix *,
-       void *, int, SuperMatrix *, SuperMatrix *,
-       double *, double *, double *, double *,
-       mem_usage_t *, SuperLUStat_t *, int *);
-
-/* Supernodal LU factor related */
-extern void
-dCreate_CompCol_Matrix(SuperMatrix *, int, int, int, double *,
-		       int *, int *, Stype_t, Dtype_t, Mtype_t);
-extern void
-dCreate_CompRow_Matrix(SuperMatrix *, int, int, int, double *,
-		       int *, int *, Stype_t, Dtype_t, Mtype_t);
-extern void
-dCopy_CompCol_Matrix(SuperMatrix *, SuperMatrix *);
-extern void
-dCreate_Dense_Matrix(SuperMatrix *, int, int, double *, int,
-		     Stype_t, Dtype_t, Mtype_t);
-extern void
-dCreate_SuperNode_Matrix(SuperMatrix *, int, int, int, double *, 
-		         int *, int *, int *, int *, int *,
-			 Stype_t, Dtype_t, Mtype_t);
-extern void
-dCopy_Dense_Matrix(int, int, double *, int, double *, int);
-
-extern void    countnz (const int, int *, int *, int *, GlobalLU_t *);
-extern void    fixupL (const int, const int *, GlobalLU_t *);
-
-extern void    dallocateA (int, int, double **, int **, int **);
-extern void    dgstrf (superlu_options_t*, SuperMatrix*, double, 
-                       int, int, int*, void *, int, int *, int *, 
-                       SuperMatrix *, SuperMatrix *, SuperLUStat_t*, int *);
-extern int     dsnode_dfs (const int, const int, const int *, const int *,
-			     const int *, int *, int *, GlobalLU_t *);
-extern int     dsnode_bmod (const int, const int, const int, double *,
-                              double *, GlobalLU_t *, SuperLUStat_t*);
-extern void    dpanel_dfs (const int, const int, const int, SuperMatrix *,
-			   int *, int *, double *, int *, int *, int *,
-			   int *, int *, int *, int *, GlobalLU_t *);
-extern void    dpanel_bmod (const int, const int, const int, const int,
-                           double *, double *, int *, int *,
-			   GlobalLU_t *, SuperLUStat_t*);
-extern int     dcolumn_dfs (const int, const int, int *, int *, int *, int *,
-			   int *, int *, int *, int *, int *, GlobalLU_t *);
-extern int     dcolumn_bmod (const int, const int, double *,
-			   double *, int *, int *, int,
-                           GlobalLU_t *, SuperLUStat_t*);
-extern int     dcopy_to_ucol (int, int, int *, int *, int *,
-                              double *, GlobalLU_t *);         
-extern int     dpivotL (const int, const double, int *, int *, 
-                         int *, int *, int *, GlobalLU_t *, SuperLUStat_t*);
-extern void    dpruneL (const int, const int *, const int, const int,
-			  const int *, const int *, int *, GlobalLU_t *);
-extern void    dreadmt (int *, int *, int *, double **, int **, int **);
-extern void    dGenXtrue (int, int, double *, int);
-extern void    dFillRHS (trans_t, int, double *, int, SuperMatrix *,
-			  SuperMatrix *);
-extern void    dgstrs (trans_t, SuperMatrix *, SuperMatrix *, int *, int *,
-                        SuperMatrix *, SuperLUStat_t*, int *);
-
-
-/* Driver related */
-
-extern void    dgsequ (SuperMatrix *, double *, double *, double *,
-			double *, double *, int *);
-extern void    dlaqgs (SuperMatrix *, double *, double *, double,
-                        double, double, char *);
-extern void    dgscon (char *, SuperMatrix *, SuperMatrix *, 
-		         double, double *, SuperLUStat_t*, int *);
-extern double   dPivotGrowth(int, SuperMatrix *, int *, 
-                            SuperMatrix *, SuperMatrix *);
-extern void    dgsrfs (trans_t, SuperMatrix *, SuperMatrix *,
-                       SuperMatrix *, int *, int *, char *, double *, 
-                       double *, SuperMatrix *, SuperMatrix *,
-                       double *, double *, SuperLUStat_t*, int *);
-
-extern int     sp_dtrsv (const char *,const char *,const char *, SuperMatrix *,
-			SuperMatrix *, double *, SuperLUStat_t*, int *);
-extern int     sp_dgemv (const char *, double, SuperMatrix *, double *,
-			int, double, double *, int);
-
-extern int     sp_dgemm (const char *,const char *, int, int, int, double,
-			SuperMatrix *, double *, int, double, 
-			double *, int);
-
-/* Memory-related */
-extern int     dLUMemInit (fact_t, void *, int, int, int, int, int,
-			     SuperMatrix *, SuperMatrix *,
-			     GlobalLU_t *, int **, double **);
-extern void    dSetRWork (int, int, double *, double **, double **);
-extern void    dLUWorkFree (int *, double *, GlobalLU_t *);
-extern int     dLUMemXpand (int, int, MemType, int *, GlobalLU_t *);
-
-extern double  *doubleMalloc(int);
-extern double  *doubleCalloc(int);
-extern int     dmemory_usage(const int, const int, const int, const int);
-extern int     dQuerySpace (SuperMatrix *, SuperMatrix *, mem_usage_t *);
-
-/* Auxiliary routines */
-extern void    dreadhb(int *, int *, int *, double **, int **, int **);
-extern void    dCompRow_to_CompCol(int, int, int, double*, int*, int*,
-		                   double **, int **, int **);
-extern void    dfill (double *, int, double);
-extern void    dinf_norm_error (int, SuperMatrix *, double *);
-extern void    PrintPerf (SuperMatrix *, SuperMatrix *, mem_usage_t *,
-			 double, double, double *, double *, char *);
-
-/* Routines for debugging */
-extern void    dPrint_CompCol_Matrix(char *, SuperMatrix *);
-extern void    dPrint_SuperNode_Matrix(char *, SuperMatrix *);
-extern void    dPrint_Dense_Matrix(char *, SuperMatrix *);
-extern void    print_lu_col(char *, int, int, int *, GlobalLU_t *);
-extern void    check_tempv(int, double *);
-
-#ifdef __cplusplus
-  }
-#endif
-
-#endif /* __SUPERLU_dSP_DEFS */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_scomplex.h hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_scomplex.h
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_scomplex.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_scomplex.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,72 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-#ifndef __SUPERLU_SCOMPLEX /* allow multiple inclusions */
-#define __SUPERLU_SCOMPLEX
-
-/* 
- * This header file is to be included in source files c*.c
- */
-#ifndef SCOMPLEX_INCLUDE
-#define SCOMPLEX_INCLUDE
-
-typedef struct { float r, i; } complex;
-
-
-/* Macro definitions */
-
-/* Complex Addition c = a + b */
-#define c_add(c, a, b) { (c)->r = (a)->r + (b)->r; \
-			 (c)->i = (a)->i + (b)->i; }
-
-/* Complex Subtraction c = a - b */
-#define c_sub(c, a, b) { (c)->r = (a)->r - (b)->r; \
-			 (c)->i = (a)->i - (b)->i; }
-
-/* Complex-Double Multiplication */
-#define cs_mult(c, a, b) { (c)->r = (a)->r * (b); \
-                           (c)->i = (a)->i * (b); }
-
-/* Complex-Complex Multiplication */
-#define cc_mult(c, a, b) { \
-	float cr, ci; \
-    	cr = (a)->r * (b)->r - (a)->i * (b)->i; \
-    	ci = (a)->i * (b)->r + (a)->r * (b)->i; \
-    	(c)->r = cr; \
-    	(c)->i = ci; \
-    }
-
-#define cc_conj(a, b) { \
-        (a)->r = (b)->r; \
-        (a)->i = -((b)->i); \
-    }
-
-/* Complex equality testing */
-#define c_eq(a, b)  ( (a)->r == (b)->r && (a)->i == (b)->i )
-
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* Prototypes for functions in scomplex.c */
-void c_div(complex *, complex *, complex *);
-double c_abs(complex *);     /* exact */
-double c_abs1(complex *);    /* approximate */
-void c_exp(complex *, complex *);
-void r_cnjg(complex *, complex *);
-double r_imag(complex *);
-
-
-#ifdef __cplusplus
-  }
-#endif
-
-#endif
-
-#endif  /* __SUPERLU_SCOMPLEX */
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_sdefs.h hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_sdefs.h
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_sdefs.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_sdefs.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,234 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-#ifndef __SUPERLU_sSP_DEFS /* allow multiple inclusions */
-#define __SUPERLU_sSP_DEFS
-
-/*
- * File name:		ssp_defs.h
- * Purpose:             Sparse matrix types and function prototypes
- * History:
- */
-
-#ifdef _CRAY
-#include <fortran.h>
-#include <string.h>
-#endif
-
-/* Define my integer type int_t */
-typedef int int_t; /* default */
-
-#include "slu_Cnames.h"
-#include "supermatrix.h"
-#include "slu_util.h"
-
-
-/*
- * Global data structures used in LU factorization -
- * 
- *   nsuper: #supernodes = nsuper + 1, numbered [0, nsuper].
- *   (xsup,supno): supno[i] is the supernode no to which i belongs;
- *	xsup(s) points to the beginning of the s-th supernode.
- *	e.g.   supno 0 1 2 2 3 3 3 4 4 4 4 4   (n=12)
- *	        xsup 0 1 2 4 7 12
- *	Note: dfs will be performed on supernode rep. relative to the new 
- *	      row pivoting ordering
- *
- *   (xlsub,lsub): lsub[*] contains the compressed subscript of
- *	rectangular supernodes; xlsub[j] points to the starting
- *	location of the j-th column in lsub[*]. Note that xlsub 
- *	is indexed by column.
- *	Storage: original row subscripts
- *
- *      During the course of sparse LU factorization, we also use
- *	(xlsub,lsub) for the purpose of symmetric pruning. For each
- *	supernode {s,s+1,...,t=s+r} with first column s and last
- *	column t, the subscript set
- *		lsub[j], j=xlsub[s], .., xlsub[s+1]-1
- *	is the structure of column s (i.e. structure of this supernode).
- *	It is used for the storage of numerical values.
- *	Furthermore,
- *		lsub[j], j=xlsub[t], .., xlsub[t+1]-1
- *	is the structure of the last column t of this supernode.
- *	It is for the purpose of symmetric pruning. Therefore, the
- *	structural subscripts can be rearranged without making physical
- *	interchanges among the numerical values.
- *
- *	However, if the supernode has only one column, then we
- *	only keep one set of subscripts. For any subscript interchange
- *	performed, similar interchange must be done on the numerical
- *	values.
- *
- *	The last column structures (for pruning) will be removed
- *	after the numercial LU factorization phase.
- *
- *   (xlusup,lusup): lusup[*] contains the numerical values of the
- *	rectangular supernodes; xlusup[j] points to the starting
- *	location of the j-th column in storage vector lusup[*]
- *	Note: xlusup is indexed by column.
- *	Each rectangular supernode is stored by column-major
- *	scheme, consistent with Fortran 2-dim array storage.
- *
- *   (xusub,ucol,usub): ucol[*] stores the numerical values of
- *	U-columns outside the rectangular supernodes. The row
- *	subscript of nonzero ucol[k] is stored in usub[k].
- *	xusub[i] points to the starting location of column i in ucol.
- *	Storage: new row subscripts; that is subscripts of PA.
- */
-typedef struct {
-    int     *xsup;    /* supernode and column mapping */
-    int     *supno;   
-    int     *lsub;    /* compressed L subscripts */
-    int	    *xlsub;
-    float  *lusup;   /* L supernodes */
-    int     *xlusup;
-    float  *ucol;    /* U columns */
-    int     *usub;
-    int	    *xusub;
-    int     nzlmax;   /* current max size of lsub */
-    int     nzumax;   /*    "    "    "      ucol */
-    int     nzlumax;  /*    "    "    "     lusup */
-    int     n;        /* number of columns in the matrix */
-    LU_space_t MemModel; /* 0 - system malloc'd; 1 - user provided */
-} GlobalLU_t;
-
-typedef struct {
-    float for_lu;
-    float total_needed;
-    int   expansions;
-} mem_usage_t;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* Driver routines */
-extern void
-sgssv(superlu_options_t *, SuperMatrix *, int *, int *, SuperMatrix *,
-      SuperMatrix *, SuperMatrix *, SuperLUStat_t *, int *);
-extern void
-sgssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *,
-       char *, float *, float *, SuperMatrix *, SuperMatrix *,
-       void *, int, SuperMatrix *, SuperMatrix *,
-       float *, float *, float *, float *,
-       mem_usage_t *, SuperLUStat_t *, int *);
-
-/* Supernodal LU factor related */
-extern void
-sCreate_CompCol_Matrix(SuperMatrix *, int, int, int, float *,
-		       int *, int *, Stype_t, Dtype_t, Mtype_t);
-extern void
-sCreate_CompRow_Matrix(SuperMatrix *, int, int, int, float *,
-		       int *, int *, Stype_t, Dtype_t, Mtype_t);
-extern void
-sCopy_CompCol_Matrix(SuperMatrix *, SuperMatrix *);
-extern void
-sCreate_Dense_Matrix(SuperMatrix *, int, int, float *, int,
-		     Stype_t, Dtype_t, Mtype_t);
-extern void
-sCreate_SuperNode_Matrix(SuperMatrix *, int, int, int, float *, 
-		         int *, int *, int *, int *, int *,
-			 Stype_t, Dtype_t, Mtype_t);
-extern void
-sCopy_Dense_Matrix(int, int, float *, int, float *, int);
-
-extern void    countnz (const int, int *, int *, int *, GlobalLU_t *);
-extern void    fixupL (const int, const int *, GlobalLU_t *);
-
-extern void    sallocateA (int, int, float **, int **, int **);
-extern void    sgstrf (superlu_options_t*, SuperMatrix*, float, 
-                       int, int, int*, void *, int, int *, int *, 
-                       SuperMatrix *, SuperMatrix *, SuperLUStat_t*, int *);
-extern int     ssnode_dfs (const int, const int, const int *, const int *,
-			     const int *, int *, int *, GlobalLU_t *);
-extern int     ssnode_bmod (const int, const int, const int, float *,
-                              float *, GlobalLU_t *, SuperLUStat_t*);
-extern void    spanel_dfs (const int, const int, const int, SuperMatrix *,
-			   int *, int *, float *, int *, int *, int *,
-			   int *, int *, int *, int *, GlobalLU_t *);
-extern void    spanel_bmod (const int, const int, const int, const int,
-                           float *, float *, int *, int *,
-			   GlobalLU_t *, SuperLUStat_t*);
-extern int     scolumn_dfs (const int, const int, int *, int *, int *, int *,
-			   int *, int *, int *, int *, int *, GlobalLU_t *);
-extern int     scolumn_bmod (const int, const int, float *,
-			   float *, int *, int *, int,
-                           GlobalLU_t *, SuperLUStat_t*);
-extern int     scopy_to_ucol (int, int, int *, int *, int *,
-                              float *, GlobalLU_t *);         
-extern int     spivotL (const int, const float, int *, int *, 
-                         int *, int *, int *, GlobalLU_t *, SuperLUStat_t*);
-extern void    spruneL (const int, const int *, const int, const int,
-			  const int *, const int *, int *, GlobalLU_t *);
-extern void    sreadmt (int *, int *, int *, float **, int **, int **);
-extern void    sGenXtrue (int, int, float *, int);
-extern void    sFillRHS (trans_t, int, float *, int, SuperMatrix *,
-			  SuperMatrix *);
-extern void    sgstrs (trans_t, SuperMatrix *, SuperMatrix *, int *, int *,
-                        SuperMatrix *, SuperLUStat_t*, int *);
-
-
-/* Driver related */
-
-extern void    sgsequ (SuperMatrix *, float *, float *, float *,
-			float *, float *, int *);
-extern void    slaqgs (SuperMatrix *, float *, float *, float,
-                        float, float, char *);
-extern void    sgscon (char *, SuperMatrix *, SuperMatrix *, 
-		         float, float *, SuperLUStat_t*, int *);
-extern float   sPivotGrowth(int, SuperMatrix *, int *, 
-                            SuperMatrix *, SuperMatrix *);
-extern void    sgsrfs (trans_t, SuperMatrix *, SuperMatrix *,
-                       SuperMatrix *, int *, int *, char *, float *, 
-                       float *, SuperMatrix *, SuperMatrix *,
-                       float *, float *, SuperLUStat_t*, int *);
-
-extern int     sp_strsv (const char *,const char *,const char *, SuperMatrix *,
-			SuperMatrix *, float *, SuperLUStat_t*, int *);
-extern int     sp_sgemv (char *, float, SuperMatrix *, float *,
-			int, float, float *, int);
-
-extern int     sp_sgemm (char *, char *, int, int, int, float,
-			SuperMatrix *, float *, int, float, 
-			float *, int);
-
-/* Memory-related */
-extern int     sLUMemInit (fact_t, void *, int, int, int, int, int,
-			     SuperMatrix *, SuperMatrix *,
-			     GlobalLU_t *, int **, float **);
-extern void    sSetRWork (int, int, float *, float **, float **);
-extern void    sLUWorkFree (int *, float *, GlobalLU_t *);
-extern int     sLUMemXpand (int, int, MemType, int *, GlobalLU_t *);
-
-extern float  *floatMalloc(int);
-extern float  *floatCalloc(int);
-extern int     smemory_usage(const int, const int, const int, const int);
-extern int     sQuerySpace (SuperMatrix *, SuperMatrix *, mem_usage_t *);
-
-/* Auxiliary routines */
-extern void    sreadhb(int *, int *, int *, float **, int **, int **);
-extern void    sCompRow_to_CompCol(int, int, int, float*, int*, int*,
-		                   float **, int **, int **);
-extern void    sfill (float *, int, float);
-extern void    sinf_norm_error (int, SuperMatrix *, float *);
-extern void    PrintPerf (SuperMatrix *, SuperMatrix *, mem_usage_t *,
-			 float, float, float *, float *, char *);
-
-/* Routines for debugging */
-extern void    sPrint_CompCol_Matrix(char *, SuperMatrix *);
-extern void    sPrint_SuperNode_Matrix(char *, SuperMatrix *);
-extern void    sPrint_Dense_Matrix(char *, SuperMatrix *);
-extern void    print_lu_col(char *, int, int, int *, GlobalLU_t *);
-extern void    check_tempv(int, float *);
-
-#ifdef __cplusplus
-  }
-#endif
-
-#endif /* __SUPERLU_sSP_DEFS */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_util.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_util.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_util.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_util.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,736 +0,0 @@
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-/*
-  This file has been modified to be compatible with the HYPRE
-  linear solver
-*/
-
-#include <math.h>
-#include "slu_ddefs.h"
-
-/* 
- * Global statistics variale
- */
-
-void superlu_abort_and_exit(char* msg)
-{
-    fprintf(stderr, "%s", msg);
-    exit (-1);
-}
-
-/*
- * Set the default values for the options argument.
- */
-void set_default_options(superlu_options_t *options)
-{
-    options->Fact = DOFACT;
-    options->Equil = YES;
-    options->ColPerm = COLAMD;
-    options->DiagPivotThresh = 1.0;
-    options->Trans = NOTRANS;
-    options->IterRefine = NOREFINE;
-    options->SymmetricMode = NO;
-    options->PivotGrowth = NO;
-    options->ConditionNumber = NO;
-    options->PrintStat = YES;
-}
-
-/*
- * Print the options setting.
- */
-void print_options(superlu_options_t *options)
-{
-    printf(".. options:\n");
-    printf("\tFact\t %8d\n", options->Fact);
-    printf("\tEquil\t %8d\n", options->Equil);
-    printf("\tColPerm\t %8d\n", options->ColPerm);
-    printf("\tDiagPivotThresh %8.4f\n", options->DiagPivotThresh);
-    printf("\tTrans\t %8d\n", options->Trans);
-    printf("\tIterRefine\t%4d\n", options->IterRefine);
-    printf("\tSymmetricMode\t%4d\n", options->SymmetricMode);
-    printf("\tPivotGrowth\t%4d\n", options->PivotGrowth);
-    printf("\tConditionNumber\t%4d\n", options->ConditionNumber);
-    printf("..\n");
-}
-
-/* Deallocate the structure pointing to the actual storage of the matrix. */
-void
-Destroy_SuperMatrix_Store(SuperMatrix *A)
-{
-    SUPERLU_FREE ( A->Store );
-}
-
-void
-Destroy_CompCol_Matrix(SuperMatrix *A)
-{
-    SUPERLU_FREE( ((NCformat *)A->Store)->rowind );
-    SUPERLU_FREE( ((NCformat *)A->Store)->colptr );
-    SUPERLU_FREE( ((NCformat *)A->Store)->nzval );
-    SUPERLU_FREE( A->Store );
-}
-
-void
-Destroy_CompRow_Matrix(SuperMatrix *A)
-{
-    SUPERLU_FREE( ((NRformat *)A->Store)->colind );
-    SUPERLU_FREE( ((NRformat *)A->Store)->rowptr );
-    SUPERLU_FREE( ((NRformat *)A->Store)->nzval );
-    SUPERLU_FREE( A->Store );
-}
-
-void
-Destroy_SuperNode_Matrix(SuperMatrix *A)
-{
-    SUPERLU_FREE ( ((SCformat *)A->Store)->rowind );
-    SUPERLU_FREE ( ((SCformat *)A->Store)->rowind_colptr );
-    SUPERLU_FREE ( ((SCformat *)A->Store)->nzval );
-    SUPERLU_FREE ( ((SCformat *)A->Store)->nzval_colptr );
-    SUPERLU_FREE ( ((SCformat *)A->Store)->col_to_sup );
-    SUPERLU_FREE ( ((SCformat *)A->Store)->sup_to_col );
-    SUPERLU_FREE ( A->Store );
-}
-
-/* A is of type Stype==NCP */
-void
-Destroy_CompCol_Permuted(SuperMatrix *A)
-{
-    SUPERLU_FREE ( ((NCPformat *)A->Store)->colbeg );
-    SUPERLU_FREE ( ((NCPformat *)A->Store)->colend );
-    SUPERLU_FREE ( A->Store );
-}
-
-/* A is of type Stype==DN */
-void
-Destroy_Dense_Matrix(SuperMatrix *A)
-{
-    DNformat* Astore = (DNformat*) A->Store;
-    SUPERLU_FREE (Astore->nzval);
-    SUPERLU_FREE ( A->Store );
-}
-
-/*
- * Reset repfnz[] for the current column 
- */
-void
-resetrep_col (const int nseg, const int *segrep, int *repfnz)
-{
-    int i, irep;
-    
-    for (i = 0; i < nseg; i++) {
-	irep = segrep[i];
-	repfnz[irep] = EMPTY;
-    }
-}
-
-
-/*
- * Count the total number of nonzeros in factors L and U,  and in the 
- * symmetrically reduced L. 
- */
-void
-countnz(const int n, int *xprune, int *nnzL, int *nnzU, GlobalLU_t *Glu)
-{
-    int          nsuper, fsupc, i, j;
-    int          nnzL0, jlen, irep;
-    int          *xsup, *xlsub;
-
-    xsup   = Glu->xsup;
-    xlsub  = Glu->xlsub;
-    *nnzL  = 0;
-    *nnzU  = (Glu->xusub)[n];
-    nnzL0  = 0;
-    nsuper = (Glu->supno)[n];
-
-    if ( n <= 0 ) return;
-
-    /* 
-     * For each supernode
-     */
-    for (i = 0; i <= nsuper; i++) {
-	fsupc = xsup[i];
-	jlen = xlsub[fsupc+1] - xlsub[fsupc];
-
-	for (j = fsupc; j < xsup[i+1]; j++) {
-	    *nnzL += jlen;
-	    *nnzU += j - fsupc + 1;
-	    jlen--;
-	}
-	irep = xsup[i+1] - 1;
-	nnzL0 += xprune[irep] - xlsub[irep];
-    }
-    
-    /* printf("\tNo of nonzeros in symm-reduced L = %d\n", nnzL0);*/
-}
-
-
-
-/*
- * Fix up the data storage lsub for L-subscripts. It removes the subscript
- * sets for structural pruning,	and applies permuation to the remaining
- * subscripts.
- */
-void
-fixupL(const int n, const int *perm_r, GlobalLU_t *Glu)
-{
-    register int nsuper, fsupc, nextl, i, j, k, jstrt;
-    int          *xsup, *lsub, *xlsub;
-
-    if ( n <= 1 ) return;
-
-    xsup   = Glu->xsup;
-    lsub   = Glu->lsub;
-    xlsub  = Glu->xlsub;
-    nextl  = 0;
-    nsuper = (Glu->supno)[n];
-    
-    /* 
-     * For each supernode ...
-     */
-    for (i = 0; i <= nsuper; i++) {
-	fsupc = xsup[i];
-	jstrt = xlsub[fsupc];
-	xlsub[fsupc] = nextl;
-	for (j = jstrt; j < xlsub[fsupc+1]; j++) {
-	    lsub[nextl] = perm_r[lsub[j]]; /* Now indexed into P*A */
-	    nextl++;
-  	}
-	for (k = fsupc+1; k < xsup[i+1]; k++) 
-	    	xlsub[k] = nextl;	/* Other columns in supernode i */
-
-    }
-
-    xlsub[n] = nextl;
-}
-
-
-/*
- * Diagnostic print of segment info after panel_dfs().
- */
-void print_panel_seg(int n, int w, int jcol, int nseg, 
-		     int *segrep, int *repfnz)
-{
-    int j, k;
-    
-    for (j = jcol; j < jcol+w; j++) {
-	printf("\tcol %d:\n", j);
-	for (k = 0; k < nseg; k++)
-	    printf("\t\tseg %d, segrep %d, repfnz %d\n", k, 
-			segrep[k], repfnz[(j-jcol)*n + segrep[k]]);
-    }
-
-}
-
-
-void
-StatInit(SuperLUStat_t *stat)
-{
-    register int i, w, panel_size, relax;
-
-    panel_size = sp_ienv(1);
-    relax = sp_ienv(2);
-    w = SUPERLU_MAX(panel_size, relax);
-    stat->panel_histo = intCalloc(w+1);
-    stat->utime = (double *) SUPERLU_MALLOC(NPHASES * sizeof(double));
-    if (!stat->utime) ABORT("SUPERLU_MALLOC fails for stat->utime");
-    stat->ops = (flops_t *) SUPERLU_MALLOC(NPHASES * sizeof(flops_t));
-    if (!stat->ops) ABORT("SUPERLU_MALLOC fails for stat->ops");
-    for (i = 0; i < NPHASES; ++i) {
-        stat->utime[i] = 0.;
-        stat->ops[i] = 0.;
-    }
-}
-
-
-void
-StatPrint(SuperLUStat_t *stat)
-{
-    double         *utime;
-    flops_t        *ops;
-
-    utime = stat->utime;
-    ops   = stat->ops;
-    printf("Factor time  = %8.2f\n", utime[FACT]);
-    if ( utime[FACT] != 0.0 )
-      printf("Factor flops = %e\tMflops = %8.2f\n", ops[FACT],
-	     ops[FACT]*1e-6/utime[FACT]);
-
-    printf("Solve time   = %8.2f\n", utime[SOLVE]);
-    if ( utime[SOLVE] != 0.0 )
-      printf("Solve flops = %e\tMflops = %8.2f\n", ops[SOLVE],
-	     ops[SOLVE]*1e-6/utime[SOLVE]);
-
-}
-
-
-void
-StatFree(SuperLUStat_t *stat)
-{
-    SUPERLU_FREE(stat->panel_histo);
-    SUPERLU_FREE(stat->utime);
-    SUPERLU_FREE(stat->ops);
-}
-
-
-flops_t
-LUFactFlops(SuperLUStat_t *stat)
-{
-    return (stat->ops[FACT]);
-}
-
-flops_t
-LUSolveFlops(SuperLUStat_t *stat)
-{
-    return (stat->ops[SOLVE]);
-}
-
-
-
-
-
-/* 
- * Fills an integer array with a given value.
- */
-void ifill(int *a, int alen, int ival)
-{
-    register int i;
-    for (i = 0; i < alen; i++) a[i] = ival;
-}
-
-
-
-/* 
- * Get the statistics of the supernodes 
- */
-#define NBUCKS 10
-static 	int	max_sup_size;
-
-void super_stats(int nsuper, int *xsup)
-{
-    register int nsup1 = 0;
-    int          i, isize, whichb, bl, bh;
-    int          bucket[NBUCKS];
-
-    max_sup_size = 0;
-
-    for (i = 0; i <= nsuper; i++) {
-	isize = xsup[i+1] - xsup[i];
-	if ( isize == 1 ) nsup1++;
-	if ( max_sup_size < isize ) max_sup_size = isize;	
-    }
-
-    printf("    Supernode statistics:\n\tno of super = %d\n", nsuper+1);
-    printf("\tmax supernode size = %d\n", max_sup_size);
-    printf("\tno of size 1 supernodes = %d\n", nsup1);
-
-    /* Histogram of the supernode sizes */
-    ifill (bucket, NBUCKS, 0);
-
-    for (i = 0; i <= nsuper; i++) {
-        isize = xsup[i+1] - xsup[i];
-        whichb = (float) isize / max_sup_size * NBUCKS;
-        if (whichb >= NBUCKS) whichb = NBUCKS - 1;
-        bucket[whichb]++;
-    }
-    
-    printf("\tHistogram of supernode sizes:\n");
-    for (i = 0; i < NBUCKS; i++) {
-        bl = (float) i * max_sup_size / NBUCKS;
-        bh = (float) (i+1) * max_sup_size / NBUCKS;
-        printf("\tsnode: %d-%d\t\t%d\n", bl+1, bh, bucket[i]);
-    }
-
-}
-
-
-float SpaSize(int n, int np, float sum_npw)
-{
-    return (sum_npw*8 + np*8 + n*4)/1024.;
-}
-
-float DenseSize(int n, float sum_nw)
-{
-    return (sum_nw*8 + n*8)/1024.;;
-}
-
-
-
-/*
- * Check whether repfnz[] == EMPTY after reset.
- */
-void check_repfnz(int n, int w, int jcol, int *repfnz)
-{
-    int jj, k;
-
-    for (jj = jcol; jj < jcol+w; jj++) 
-	for (k = 0; k < n; k++)
-	    if ( repfnz[(jj-jcol)*n + k] != EMPTY ) {
-		fprintf(stderr, "col %d, repfnz_col[%d] = %d\n", jj,
-			k, repfnz[(jj-jcol)*n + k]);
-		ABORT("check_repfnz");
-	    }
-}
-
-
-/* Print a summary of the testing results. */
-void
-PrintSumm(char *type, int nfail, int nrun, int nerrs)
-{
-    if ( nfail > 0 )
-	printf("%3s driver: %d out of %d tests failed to pass the threshold\n",
-	       type, nfail, nrun);
-    else
-	printf("All tests for %3s driver passed the threshold (%6d tests run)\n", type, nrun);
-
-    if ( nerrs > 0 )
-	printf("%6d error messages recorded\n", nerrs);
-}
-
-
-int print_int_vec(char *what, int n, int *vec)
-{
-    int i;
-    printf("%s\n", what);
-    for (i = 0; i < n; ++i) printf("%d\t%d\n", i, vec[i]);
-    return 0;
-}
-
-int superlu_lsame(const char *ca,const char *cb)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
-       Courant Institute, Argonne National Lab, and Rice University
-       September 30, 1994
-
-    Purpose
-    =======
-
-    LSAME returns .TRUE. if CA is the same letter as CB regardless of case.
-
-    Arguments
-    =========
-
-    CA      (input) CHARACTER*1
-    CB      (input) CHARACTER*1
-            CA and CB specify the single characters to be compared.
-
-   =====================================================================
-*/
-
-    /* System generated locals */
-    int ret_val;
-
-    /* Local variables */
-    int inta, intb, zcode;
-
-    ret_val = *(unsigned char *)ca == *(unsigned char *)cb;
-    if (ret_val) {
-        return ret_val;
-    }
-
-    /* Now test for equivalence if both characters are alphabetic. */
-
-    zcode = 'Z';
-
-    /* Use 'Z' rather than 'A' so that ASCII can be detected on Prime
-       machines, on which ICHAR returns a value with bit 8 set.
-       ICHAR('A') on Prime machines returns 193 which is the same as
-       ICHAR('A') on an EBCDIC machine. */
-
-    inta = *(unsigned char *)ca;
-    intb = *(unsigned char *)cb;
-
-    if (zcode == 90 || zcode == 122) {
-        /* ASCII is assumed - ZCODE is the ASCII code of either lower or
-          upper case 'Z'. */
-        if (inta >= 97 && inta <= 122) inta += -32;
-        if (intb >= 97 && intb <= 122) intb += -32;
-
-    } else if (zcode == 233 || zcode == 169) {
-        /* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or
-          upper case 'Z'. */
-        if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) ||
-                (inta >= 162 && inta <= 169))
-            inta += 64;
-        if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) ||
-                (intb >= 162 && intb <= 169))
-            intb += 64;
-    } else if (zcode == 218 || zcode == 250) {
-        /* ASCII is assumed, on Prime machines - ZCODE is the ASCII code
-          plus 128 of either lower or upper case 'Z'. */
-        if (inta >= 225 && inta <= 250) inta += -32;
-        if (intb >= 225 && intb <= 250) intb += -32;
-    }
-    ret_val = inta == intb;
-    return ret_val;
-
-} /* superlu_lsame */
-
-/* Subroutine */ int superlu_xerbla(const char *srname, int *info)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
-       Courant Institute, Argonne National Lab, and Rice University
-       September 30, 1994
-
-
-    Purpose
-    =======
-
-    XERBLA  is an error handler for the LAPACK routines.
-    It is called by an LAPACK routine if an input parameter has an
-    invalid value.  A message is printed and execution stops.
-
-    Installers may consider modifying the STOP statement in order to
-    call system-specific exception-handling facilities.
-
-    Arguments
-    =========
-
-    SRNAME  (input) CHARACTER*6
-            The name of the routine which called XERBLA.
-
-    INFO    (input) INT
-            The position of the invalid parameter in the parameter list
-
-            of the calling routine.
-
-   =====================================================================
-*/
-
-    printf("** On entry to %6s, parameter number %2d had an illegal value\n",
-                srname, *info);
-
-/*     End of XERBLA */
-
-    return 0;
-} /* superlu_xerbla */
-
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * File name:		dmyblas2.c
- * Purpose:
- *     Level 2 BLAS operations: solves and matvec, written in C.
- * Note:
- *     This is only used when the system lacks an efficient BLAS library.
- */
-
-/*
- * Solves a dense UNIT lower triangular system. The unit lower 
- * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). 
- * The solution will be returned in the rhs vector.
- */
-void sludlsolve ( int ldm, int ncol, double *M, double *rhs )
-{
-    int k;
-    double x0, x1, x2, x3, x4, x5, x6, x7;
-    double *M0;
-    register double *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7;
-    register int firstcol = 0;
-
-    M0 = &M[0];
-
-    while ( firstcol < ncol - 7 ) { /* Do 8 columns */
-      Mki0 = M0 + 1;
-      Mki1 = Mki0 + ldm + 1;
-      Mki2 = Mki1 + ldm + 1;
-      Mki3 = Mki2 + ldm + 1;
-      Mki4 = Mki3 + ldm + 1;
-      Mki5 = Mki4 + ldm + 1;
-      Mki6 = Mki5 + ldm + 1;
-      Mki7 = Mki6 + ldm + 1;
-
-      x0 = rhs[firstcol];
-      x1 = rhs[firstcol+1] - x0 * *Mki0++;
-      x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++;
-      x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++;
-      x4 = rhs[firstcol+4] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
-	                   - x3 * *Mki3++;
-      x5 = rhs[firstcol+5] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
-	                   - x3 * *Mki3++ - x4 * *Mki4++;
-      x6 = rhs[firstcol+6] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
-	                   - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++;
-      x7 = rhs[firstcol+7] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
-	                   - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++
-			   - x6 * *Mki6++;
-
-      rhs[++firstcol] = x1;
-      rhs[++firstcol] = x2;
-      rhs[++firstcol] = x3;
-      rhs[++firstcol] = x4;
-      rhs[++firstcol] = x5;
-      rhs[++firstcol] = x6;
-      rhs[++firstcol] = x7;
-      ++firstcol;
-    
-      for (k = firstcol; k < ncol; k++)
-	rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++
-	                - x2 * *Mki2++ - x3 * *Mki3++
-                        - x4 * *Mki4++ - x5 * *Mki5++
-			- x6 * *Mki6++ - x7 * *Mki7++;
- 
-      M0 += 8 * ldm + 8;
-    }
-
-    while ( firstcol < ncol - 3 ) { /* Do 4 columns */
-      Mki0 = M0 + 1;
-      Mki1 = Mki0 + ldm + 1;
-      Mki2 = Mki1 + ldm + 1;
-      Mki3 = Mki2 + ldm + 1;
-
-      x0 = rhs[firstcol];
-      x1 = rhs[firstcol+1] - x0 * *Mki0++;
-      x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++;
-      x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++;
-
-      rhs[++firstcol] = x1;
-      rhs[++firstcol] = x2;
-      rhs[++firstcol] = x3;
-      ++firstcol;
-    
-      for (k = firstcol; k < ncol; k++)
-	rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++
-	                - x2 * *Mki2++ - x3 * *Mki3++;
- 
-      M0 += 4 * ldm + 4;
-    }
-
-    if ( firstcol < ncol - 1 ) { /* Do 2 columns */
-      Mki0 = M0 + 1;
-      Mki1 = Mki0 + ldm + 1;
-
-      x0 = rhs[firstcol];
-      x1 = rhs[firstcol+1] - x0 * *Mki0++;
-
-      rhs[++firstcol] = x1;
-      ++firstcol;
-    
-      for (k = firstcol; k < ncol; k++)
-	rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++;
- 
-    }
-    
-}
-
-/*
- * Solves a dense upper triangular system. The upper triangular matrix is
- * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned
- * in the rhs vector.
- */
-void
-sludusolve (int ldm,int ncol,double *M,double *rhs )
-{
-    double xj;
-    int jcol, j, irow;
-
-    jcol = ncol - 1;
-
-    for (j = 0; j < ncol; j++) {
-
-	xj = rhs[jcol] / M[jcol + jcol*ldm]; 		/* M(jcol, jcol) */
-	rhs[jcol] = xj;
-	
-	for (irow = 0; irow < jcol; irow++)
-	    rhs[irow] -= xj * M[irow + jcol*ldm];	/* M(irow, jcol) */
-
-	jcol--;
-
-    }
-}
-
-
-/*
- * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec.
- * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[].
- */
-void sludmatvec (int ldm,int nrow,int ncol,double *M,double *vec,double *Mxvec )
-{
-    double vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7;
-    double *M0;
-    register double *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7;
-    register int firstcol = 0;
-    int k;
-
-    M0 = &M[0];
-    while ( firstcol < ncol - 7 ) {	/* Do 8 columns */
-
-	Mki0 = M0;
-	Mki1 = Mki0 + ldm;
-        Mki2 = Mki1 + ldm;
-        Mki3 = Mki2 + ldm;
-	Mki4 = Mki3 + ldm;
-	Mki5 = Mki4 + ldm;
-	Mki6 = Mki5 + ldm;
-	Mki7 = Mki6 + ldm;
-
-	vi0 = vec[firstcol++];
-	vi1 = vec[firstcol++];
-	vi2 = vec[firstcol++];
-	vi3 = vec[firstcol++];	
-	vi4 = vec[firstcol++];
-	vi5 = vec[firstcol++];
-	vi6 = vec[firstcol++];
-	vi7 = vec[firstcol++];	
-
-	for (k = 0; k < nrow; k++) 
-	    Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++
-		      + vi2 * *Mki2++ + vi3 * *Mki3++ 
-		      + vi4 * *Mki4++ + vi5 * *Mki5++
-		      + vi6 * *Mki6++ + vi7 * *Mki7++;
-
-	M0 += 8 * ldm;
-    }
-
-    while ( firstcol < ncol - 3 ) {	/* Do 4 columns */
-
-	Mki0 = M0;
-	Mki1 = Mki0 + ldm;
-	Mki2 = Mki1 + ldm;
-	Mki3 = Mki2 + ldm;
-
-	vi0 = vec[firstcol++];
-	vi1 = vec[firstcol++];
-	vi2 = vec[firstcol++];
-	vi3 = vec[firstcol++];	
-	for (k = 0; k < nrow; k++) 
-	    Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++
-		      + vi2 * *Mki2++ + vi3 * *Mki3++ ;
-
-	M0 += 4 * ldm;
-    }
-
-    while ( firstcol < ncol ) {		/* Do 1 column */
-
- 	Mki0 = M0;
-	vi0 = vec[firstcol++];
-	for (k = 0; k < nrow; k++)
-	    Mxvec[k] += vi0 * *Mki0++;
-
-	M0 += ldm;
-    }
-	
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_util.h hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_util.h
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_util.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_util.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,271 +0,0 @@
-/*
-  This file has been modified to be compatible with the HYPRE
-  linear solver
-*/
-
-
-#ifndef __SUPERLU_UTIL /* allow multiple inclusions */
-#define __SUPERLU_UTIL
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-/*
-#ifndef __STDC__
-#include <malloc.h>
-#endif
-*/
-#include <assert.h>
-
-/***********************************************************************
- * Macros
- ***********************************************************************/
-#define FIRSTCOL_OF_SNODE(i)	(xsup[i])
-/* No of marker arrays used in the symbolic factorization,
-   each of size n */
-#define NO_MARKER     3
-#define NUM_TEMPV(m,w,t,b)  ( SUPERLU_MAX(m, (t + b)*w) )
-
-#ifndef USER_ABORT
-#define USER_ABORT(msg) superlu_abort_and_exit(msg)
-#endif
-
-#define ABORT(err_msg) \
- { char msg[256];\
-   sprintf(msg,"%s at line %d in file %s\n",err_msg,__LINE__, __FILE__);\
-   USER_ABORT(msg); }
-
-
-#ifndef USER_MALLOC
-#if 1
-#define USER_MALLOC(size) superlu_malloc(size)
-#else
-/* The following may check out some uninitialized data */
-#define USER_MALLOC(size) memset (superlu_malloc(size), '\x0F', size)
-#endif
-#endif
-
-#define SUPERLU_MALLOC(size) USER_MALLOC(size)
-
-#ifndef USER_FREE
-#define USER_FREE(addr) superlu_free(addr)
-#endif
-
-#define SUPERLU_FREE(addr) USER_FREE(addr)
-
-#define CHECK_MALLOC(where) {                 \
-    extern int superlu_malloc_total;        \
-    printf("%s: malloc_total %d Bytes\n",     \
-	   where, superlu_malloc_total); \
-}
-
-#define SUPERLU_MAX(x, y) 	( (x) > (y) ? (x) : (y) )
-#define SUPERLU_MIN(x, y) 	( (x) < (y) ? (x) : (y) )
-
-/***********************************************************************
- * Constants 
- ***********************************************************************/
-#define EMPTY	(-1)
-/*#define NO	(-1)*/
-#define FALSE	0
-#define TRUE	1
-
-/***********************************************************************
- * Enumerate types
- ***********************************************************************/
-typedef enum {NO, YES}                                          yes_no_t;
-typedef enum {DOFACT, SamePattern, SamePattern_SameRowPerm, FACTORED} fact_t;
-typedef enum {NOROWPERM, LargeDiag, MY_PERMR}                   rowperm_t;
-typedef enum {NATURAL, MMD_ATA, MMD_AT_PLUS_A, COLAMD, MY_PERMC}colperm_t;
-typedef enum {NOTRANS, TRANS, CONJ}                             trans_t;
-typedef enum {NOEQUIL, ROW, COL, BOTH}                          DiagScale_t;
-typedef enum {NOREFINE, SINGLE=1, DOUBLE, EXTRA}                IterRefine_t;
-typedef enum {LUSUP, UCOL, LSUB, USUB}                          MemType;
-typedef enum {HEAD, TAIL}                                       stack_end_t;
-typedef enum {SYSTEM, USER}                                     LU_space_t;
-
-/* 
- * The following enumerate type is used by the statistics variable 
- * to keep track of flop count and time spent at various stages.
- *
- * Note that not all of the fields are disjoint.
- */
-typedef enum {
-    COLPERM, /* find a column ordering that minimizes fills */
-    RELAX,   /* find artificial supernodes */
-    ETREE,   /* compute column etree */
-    EQUIL,   /* equilibrate the original matrix */
-    FACT,    /* perform LU factorization */
-    RCOND,   /* estimate reciprocal condition number */
-    SOLVE,   /* forward and back solves */
-    REFINE,  /* perform iterative refinement */
-    TRSV,    /* fraction of FACT spent in xTRSV */
-    GEMV,    /* fraction of FACT spent in xGEMV */
-    FERR,    /* estimate error bounds after iterative refinement */
-    NPHASES  /* total number of phases */
-} PhaseType;
-
-
-/***********************************************************************
- * Type definitions
- ***********************************************************************/
-typedef float    flops_t;
-typedef unsigned char Logical;
-
-/* 
- *-- This contains the options used to control the solve process.
- *
- * Fact   (fact_t)
- *        Specifies whether or not the factored form of the matrix
- *        A is supplied on entry, and if not, how the matrix A should
- *        be factorizaed.
- *        = DOFACT: The matrix A will be factorized from scratch, and the
- *             factors will be stored in L and U.
- *        = SamePattern: The matrix A will be factorized assuming
- *             that a factorization of a matrix with the same sparsity
- *             pattern was performed prior to this one. Therefore, this
- *             factorization will reuse column permutation vector 
- *             ScalePermstruct->perm_c and the column elimination tree
- *             LUstruct->etree.
- *        = SamePattern_SameRowPerm: The matrix A will be factorized
- *             assuming that a factorization of a matrix with the same
- *             sparsity	pattern and similar numerical values was performed
- *             prior to this one. Therefore, this factorization will reuse
- *             both row and column scaling factors R and C, both row and
- *             column permutation vectors perm_r and perm_c, and the
- *             data structure set up from the previous symbolic factorization.
- *        = FACTORED: On entry, L, U, perm_r and perm_c contain the 
- *              factored form of A. If DiagScale is not NOEQUIL, the matrix
- *              A has been equilibrated with scaling factors R and C.
- *
- * Equil  (yes_no_t)
- *        Specifies whether to equilibrate the system (scale A's row and
- *        columns to have unit norm).
- *
- * ColPerm (colperm_t)
- *        Specifies what type of column permutation to use to reduce fill.
- *        = NATURAL: use the natural ordering 
- *        = MMD_ATA: use minimum degree ordering on structure of A'*A
- *        = MMD_AT_PLUS_A: use minimum degree ordering on structure of A'+A
- *        = COLAMD: use approximate minimum degree column ordering
- *        = MY_PERMC: use the ordering specified in ScalePermstruct->perm_c[]
- *         
- * Trans  (trans_t)
- *        Specifies the form of the system of equations:
- *        = NOTRANS: A * X = B        (No transpose)
- *        = TRANS:   A**T * X = B     (Transpose)
- *        = CONJ:    A**H * X = B     (Transpose)
- *
- * IterRefine (IterRefine_t)
- *        Specifies whether to perform iterative refinement.
- *        = NO: no iterative refinement
- *        = WorkingPrec: perform iterative refinement in working precision
- *        = ExtraPrec: perform iterative refinement in extra precision
- *
- * PrintStat (yes_no_t)
- *        Specifies whether to print the solver's statistics.
- *
- * DiagPivotThresh (double, in [0.0, 1.0]) (only for sequential SuperLU)
- *        Specifies the threshold used for a diagonal entry to be an
- *        acceptable pivot.
- *
- * PivotGrowth (yes_no_t)
- *        Specifies whether to compute the reciprocal pivot growth.
- *
- * ConditionNumber (ues_no_t)
- *        Specifies whether to compute the reciprocal condition number.
- *
- * RowPerm (rowperm_t) (only for SuperLU_DIST)
- *        Specifies whether to permute rows of the original matrix.
- *        = NO: not to permute the rows
- *        = LargeDiag: make the diagonal large relative to the off-diagonal
- *        = MY_PERMR: use the permutation given in ScalePermstruct->perm_r[]
- *           
- * ReplaceTinyPivot (yes_no_t) (only for SuperLU_DIST)
- *        Specifies whether to replace the tiny diagonals by
- *        sqrt(epsilon)*||A|| during LU factorization.
- *
- * SolveInitialized (yes_no_t) (only for SuperLU_DIST)
- *        Specifies whether the initialization has been performed to the
- *        triangular solve.
- *
- * RefineInitialized (yes_no_t) (only for SuperLU_DIST)
- *        Specifies whether the initialization has been performed to the
- *        sparse matrix-vector multiplication routine needed in iterative
- *        refinement.
- */
-typedef struct {
-    fact_t        Fact;
-    yes_no_t      Equil;
-    colperm_t     ColPerm;
-    trans_t       Trans;
-    IterRefine_t  IterRefine;
-    yes_no_t      PrintStat;
-    yes_no_t      SymmetricMode;
-    double        DiagPivotThresh;
-    yes_no_t      PivotGrowth;
-    yes_no_t      ConditionNumber;
-    rowperm_t     RowPerm;
-    yes_no_t      ReplaceTinyPivot;
-    yes_no_t      SolveInitialized;
-    yes_no_t      RefineInitialized;
-} superlu_options_t;
-
-typedef struct {
-    int     *panel_histo; /* histogram of panel size distribution */
-    double  *utime;       /* running time at various phases */
-    flops_t *ops;         /* operation count at various phases */
-    int     TinyPivots;   /* number of tiny pivots */
-    int     RefineSteps;  /* number of iterative refinement steps */
-} SuperLUStat_t;
-
-
-/***********************************************************************
- * Prototypes
- ***********************************************************************/
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-extern void    Destroy_SuperMatrix_Store(SuperMatrix *);
-extern void    Destroy_CompCol_Matrix(SuperMatrix *);
-extern void    Destroy_CompRow_Matrix(SuperMatrix *);
-extern void    Destroy_SuperNode_Matrix(SuperMatrix *);
-extern void    Destroy_CompCol_Permuted(SuperMatrix *);
-extern void    Destroy_Dense_Matrix(SuperMatrix *);
-extern void    get_perm_c(int, SuperMatrix *, int *);
-extern void    set_default_options(superlu_options_t *options);
-extern void    sp_preorder (superlu_options_t *, SuperMatrix*, int*, int*,
-			    SuperMatrix*);
-extern void    superlu_abort_and_exit(char*);
-extern void    *superlu_malloc (size_t);
-extern int     *intMalloc (int);
-extern int     *intCalloc (int);
-extern void    superlu_free (void*);
-extern void    SetIWork (int, int, int, int *, int **, int **, int **,
-                         int **, int **, int **, int **);
-extern int     sp_coletree (int *, int *, int *, int, int, int *);
-extern void    relax_snode (const int, int *, const int, int *, int *);
-extern void    heap_relax_snode (const int, int *, const int, int *, int *);
-extern void    resetrep_col (const int, const int *, int *);
-extern int     spcoletree (int *, int *, int *, int, int, int *);
-extern int     *TreePostorder (int, int *);
-extern double  SuperLU_timer_ ();
-extern int     sp_ienv (int);
-extern int     superlu_lsame (const char *,const char *);
-extern int     superlu_xerbla (const char *, int *);
-extern void    ifill (int *, int, int);
-extern void    snode_profile (int, int *);
-extern void    super_stats (int, int *);
-extern void    PrintSumm (char *, int, int, int);
-extern void    StatInit(SuperLUStat_t *);
-extern void    StatPrint (SuperLUStat_t *);
-extern void    StatFree(SuperLUStat_t *);
-extern void    print_panel_seg(int, int, int, int, int *, int *);
-extern void    check_repfnz(int, int, int, int *);
-
-#ifdef __cplusplus
-  }
-#endif
-
-#endif /* __SUPERLU_UTIL */
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_zdefs.h hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_zdefs.h
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/slu_zdefs.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/slu_zdefs.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,237 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-#ifndef __SUPERLU_zSP_DEFS /* allow multiple inclusions */
-#define __SUPERLU_zSP_DEFS
-
-/*
- * File name:		zsp_defs.h
- * Purpose:             Sparse matrix types and function prototypes
- * History:
- */
-
-#ifdef _CRAY
-#include <fortran.h>
-#include <string.h>
-#endif
-
-/* Define my integer type int_t */
-typedef int int_t; /* default */
-
-#include "slu_Cnames.h"
-#include "supermatrix.h"
-#include "slu_util.h"
-#include "slu_dcomplex.h"
-
-
-/*
- * Global data structures used in LU factorization -
- * 
- *   nsuper: #supernodes = nsuper + 1, numbered [0, nsuper].
- *   (xsup,supno): supno[i] is the supernode no to which i belongs;
- *	xsup(s) points to the beginning of the s-th supernode.
- *	e.g.   supno 0 1 2 2 3 3 3 4 4 4 4 4   (n=12)
- *	        xsup 0 1 2 4 7 12
- *	Note: dfs will be performed on supernode rep. relative to the new 
- *	      row pivoting ordering
- *
- *   (xlsub,lsub): lsub[*] contains the compressed subscript of
- *	rectangular supernodes; xlsub[j] points to the starting
- *	location of the j-th column in lsub[*]. Note that xlsub 
- *	is indexed by column.
- *	Storage: original row subscripts
- *
- *      During the course of sparse LU factorization, we also use
- *	(xlsub,lsub) for the purpose of symmetric pruning. For each
- *	supernode {s,s+1,...,t=s+r} with first column s and last
- *	column t, the subscript set
- *		lsub[j], j=xlsub[s], .., xlsub[s+1]-1
- *	is the structure of column s (i.e. structure of this supernode).
- *	It is used for the storage of numerical values.
- *	Furthermore,
- *		lsub[j], j=xlsub[t], .., xlsub[t+1]-1
- *	is the structure of the last column t of this supernode.
- *	It is for the purpose of symmetric pruning. Therefore, the
- *	structural subscripts can be rearranged without making physical
- *	interchanges among the numerical values.
- *
- *	However, if the supernode has only one column, then we
- *	only keep one set of subscripts. For any subscript interchange
- *	performed, similar interchange must be done on the numerical
- *	values.
- *
- *	The last column structures (for pruning) will be removed
- *	after the numercial LU factorization phase.
- *
- *   (xlusup,lusup): lusup[*] contains the numerical values of the
- *	rectangular supernodes; xlusup[j] points to the starting
- *	location of the j-th column in storage vector lusup[*]
- *	Note: xlusup is indexed by column.
- *	Each rectangular supernode is stored by column-major
- *	scheme, consistent with Fortran 2-dim array storage.
- *
- *   (xusub,ucol,usub): ucol[*] stores the numerical values of
- *	U-columns outside the rectangular supernodes. The row
- *	subscript of nonzero ucol[k] is stored in usub[k].
- *	xusub[i] points to the starting location of column i in ucol.
- *	Storage: new row subscripts; that is subscripts of PA.
- */
-typedef struct {
-    int     *xsup;    /* supernode and column mapping */
-    int     *supno;   
-    int     *lsub;    /* compressed L subscripts */
-    int	    *xlsub;
-    doublecomplex  *lusup;   /* L supernodes */
-    int     *xlusup;
-    doublecomplex  *ucol;    /* U columns */
-    int     *usub;
-    int	    *xusub;
-    int     nzlmax;   /* current max size of lsub */
-    int     nzumax;   /*    "    "    "      ucol */
-    int     nzlumax;  /*    "    "    "     lusup */
-    int     n;        /* number of columns in the matrix */
-    LU_space_t MemModel; /* 0 - system malloc'd; 1 - user provided */
-} GlobalLU_t;
-
-typedef struct {
-    float for_lu;
-    float total_needed;
-    int   expansions;
-} mem_usage_t;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* Driver routines */
-extern void
-zgssv(superlu_options_t *, SuperMatrix *, int *, int *, SuperMatrix *,
-      SuperMatrix *, SuperMatrix *, SuperLUStat_t *, int *);
-extern void
-zgssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *,
-       char *, double *, double *, SuperMatrix *, SuperMatrix *,
-       void *, int, SuperMatrix *, SuperMatrix *,
-       double *, double *, double *, double *,
-       mem_usage_t *, SuperLUStat_t *, int *);
-
-/* Supernodal LU factor related */
-extern void
-zCreate_CompCol_Matrix(SuperMatrix *, int, int, int, doublecomplex *,
-		       int *, int *, Stype_t, Dtype_t, Mtype_t);
-extern void
-zCreate_CompRow_Matrix(SuperMatrix *, int, int, int, doublecomplex *,
-		       int *, int *, Stype_t, Dtype_t, Mtype_t);
-extern void
-zCopy_CompCol_Matrix(SuperMatrix *, SuperMatrix *);
-extern void
-zCreate_Dense_Matrix(SuperMatrix *, int, int, doublecomplex *, int,
-		     Stype_t, Dtype_t, Mtype_t);
-extern void
-zCreate_SuperNode_Matrix(SuperMatrix *, int, int, int, doublecomplex *, 
-		         int *, int *, int *, int *, int *,
-			 Stype_t, Dtype_t, Mtype_t);
-extern void
-zCopy_Dense_Matrix(int, int, doublecomplex *, int, doublecomplex *, int);
-
-extern void    countnz (const int, int *, int *, int *, GlobalLU_t *);
-extern void    fixupL (const int, const int *, GlobalLU_t *);
-
-extern void    zallocateA (int, int, doublecomplex **, int **, int **);
-extern void    zgstrf (superlu_options_t*, SuperMatrix*, double, 
-                       int, int, int*, void *, int, int *, int *, 
-                       SuperMatrix *, SuperMatrix *, SuperLUStat_t*, int *);
-extern int     zsnode_dfs (const int, const int, const int *, const int *,
-			     const int *, int *, int *, GlobalLU_t *);
-extern int     zsnode_bmod (const int, const int, const int, doublecomplex *,
-                              doublecomplex *, GlobalLU_t *, SuperLUStat_t*);
-extern void    zpanel_dfs (const int, const int, const int, SuperMatrix *,
-			   int *, int *, doublecomplex *, int *, int *, int *,
-			   int *, int *, int *, int *, GlobalLU_t *);
-extern void    zpanel_bmod (const int, const int, const int, const int,
-                           doublecomplex *, doublecomplex *, int *, int *,
-			   GlobalLU_t *, SuperLUStat_t*);
-extern int     zcolumn_dfs (const int, const int, int *, int *, int *, int *,
-			   int *, int *, int *, int *, int *, GlobalLU_t *);
-extern int     zcolumn_bmod (const int, const int, doublecomplex *,
-			   doublecomplex *, int *, int *, int,
-                           GlobalLU_t *, SuperLUStat_t*);
-extern int     zcopy_to_ucol (int, int, int *, int *, int *,
-                              doublecomplex *, GlobalLU_t *);         
-extern int     zpivotL (const int, const double, int *, int *, 
-                         int *, int *, int *, GlobalLU_t *, SuperLUStat_t*);
-extern void    zpruneL (const int, const int *, const int, const int,
-			  const int *, const int *, int *, GlobalLU_t *);
-extern void    zreadmt (int *, int *, int *, doublecomplex **, int **, int **);
-extern void    zGenXtrue (int, int, doublecomplex *, int);
-extern void    zFillRHS (trans_t, int, doublecomplex *, int, SuperMatrix *,
-			  SuperMatrix *);
-extern void    zgstrs (trans_t, SuperMatrix *, SuperMatrix *, int *, int *,
-                        SuperMatrix *, SuperLUStat_t*, int *);
-
-
-/* Driver related */
-
-extern void    zgsequ (SuperMatrix *, double *, double *, double *,
-			double *, double *, int *);
-extern void    zlaqgs (SuperMatrix *, double *, double *, double,
-                        double, double, char *);
-extern void    zgscon (char *, SuperMatrix *, SuperMatrix *, 
-		         double, double *, SuperLUStat_t*, int *);
-extern double   zPivotGrowth(int, SuperMatrix *, int *, 
-                            SuperMatrix *, SuperMatrix *);
-extern void    zgsrfs (trans_t, SuperMatrix *, SuperMatrix *,
-                       SuperMatrix *, int *, int *, char *, double *, 
-                       double *, SuperMatrix *, SuperMatrix *,
-                       double *, double *, SuperLUStat_t*, int *);
-
-extern int     sp_ztrsv (char *, char *, char *, SuperMatrix *,
-			SuperMatrix *, doublecomplex *, SuperLUStat_t*, int *);
-extern int     sp_zgemv (char *, doublecomplex, SuperMatrix *, doublecomplex *,
-			int, doublecomplex, doublecomplex *, int);
-
-extern int     sp_zgemm (char *, char *, int, int, int, doublecomplex,
-			SuperMatrix *, doublecomplex *, int, doublecomplex, 
-			doublecomplex *, int);
-
-/* Memory-related */
-extern int     zLUMemInit (fact_t, void *, int, int, int, int, int,
-			     SuperMatrix *, SuperMatrix *,
-			     GlobalLU_t *, int **, doublecomplex **);
-extern void    zSetRWork (int, int, doublecomplex *, doublecomplex **, doublecomplex **);
-extern void    zLUWorkFree (int *, doublecomplex *, GlobalLU_t *);
-extern int     zLUMemXpand (int, int, MemType, int *, GlobalLU_t *);
-
-extern doublecomplex  *doublecomplexMalloc(int);
-extern doublecomplex  *doublecomplexCalloc(int);
-extern double  *doubleMalloc(int);
-extern double  *doubleCalloc(int);
-extern int     zmemory_usage(const int, const int, const int, const int);
-extern int     zQuerySpace (SuperMatrix *, SuperMatrix *, mem_usage_t *);
-
-/* Auxiliary routines */
-extern void    zreadhb(int *, int *, int *, doublecomplex **, int **, int **);
-extern void    zCompRow_to_CompCol(int, int, int, doublecomplex*, int*, int*,
-		                   doublecomplex **, int **, int **);
-extern void    zfill (doublecomplex *, int, doublecomplex);
-extern void    zinf_norm_error (int, SuperMatrix *, doublecomplex *);
-extern void    PrintPerf (SuperMatrix *, SuperMatrix *, mem_usage_t *,
-			 doublecomplex, doublecomplex, doublecomplex *, doublecomplex *, char *);
-
-/* Routines for debugging */
-extern void    zPrint_CompCol_Matrix(char *, SuperMatrix *);
-extern void    zPrint_SuperNode_Matrix(char *, SuperMatrix *);
-extern void    zPrint_Dense_Matrix(char *, SuperMatrix *);
-extern void    print_lu_col(char *, int, int, int *, GlobalLU_t *);
-extern void    check_tempv(int, doublecomplex *);
-
-#ifdef __cplusplus
-  }
-#endif
-
-#endif /* __SUPERLU_zSP_DEFS */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/smemory.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/smemory.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/smemory.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/smemory.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,680 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-#include "slu_sdefs.h"
-
-/* Constants */
-#define NO_MEMTYPE  4      /* 0: lusup;
-			      1: ucol;
-			      2: lsub;
-			      3: usub */
-#define GluIntArray(n)   (5 * (n) + 5)
-
-/* Internal prototypes */
-void  *sexpand (int *, MemType,int, int, GlobalLU_t *);
-int   sLUWorkInit (int, int, int, int **, float **, LU_space_t);
-void  copy_mem_float (int, void *, void *);
-void  sStackCompress (GlobalLU_t *);
-void  sSetupSpace (void *, int, LU_space_t *);
-void  *suser_malloc (int, int);
-void  suser_free (int, int);
-
-/* External prototypes (in memory.c - prec-indep) */
-extern void    copy_mem_int    (int, void *, void *);
-extern void    user_bcopy      (char *, char *, int);
-
-/* Headers for 4 types of dynamatically managed memory */
-typedef struct e_node {
-    int size;      /* length of the memory that has been used */
-    void *mem;     /* pointer to the new malloc'd store */
-} ExpHeader;
-
-typedef struct {
-    int  size;
-    int  used;
-    int  top1;  /* grow upward, relative to &array[0] */
-    int  top2;  /* grow downward */
-    void *array;
-} LU_stack_t;
-
-/* Variables local to this file */
-static ExpHeader *expanders = 0; /* Array of pointers to 4 types of memory */
-static LU_stack_t stack;
-static int no_expand;
-
-/* Macros to manipulate stack */
-#define StackFull(x)         ( x + stack.used >= stack.size )
-#define NotDoubleAlign(addr) ( (long int)addr & 7 )
-#define DoubleAlign(addr)    ( ((long int)addr + 7) & ~7L )
-#define TempSpace(m, w)      ( (2*w + 4 + NO_MARKER) * m * sizeof(int) + \
-			      (w + 1) * m * sizeof(float) )
-#define Reduce(alpha)        ((alpha + 1) / 2)  /* i.e. (alpha-1)/2 + 1 */
-
-
-
-
-/*
- * Setup the memory model to be used for factorization.
- *    lwork = 0: use system malloc;
- *    lwork > 0: use user-supplied work[] space.
- */
-void sSetupSpace(void *work, int lwork, LU_space_t *MemModel)
-{
-    if ( lwork == 0 ) {
-	*MemModel = SYSTEM; /* malloc/free */
-    } else if ( lwork > 0 ) {
-	*MemModel = USER;   /* user provided space */
-	stack.used = 0;
-	stack.top1 = 0;
-	stack.top2 = (lwork/4)*4; /* must be word addressable */
-	stack.size = stack.top2;
-	stack.array = (void *) work;
-    }
-}
-
-
-
-void *suser_malloc(int bytes, int which_end)
-{
-    void *buf;
-    
-    if ( StackFull(bytes) ) return (NULL);
-
-    if ( which_end == HEAD ) {
-	buf = (char*) stack.array + stack.top1;
-	stack.top1 += bytes;
-    } else {
-	stack.top2 -= bytes;
-	buf = (char*) stack.array + stack.top2;
-    }
-    
-    stack.used += bytes;
-    return buf;
-}
-
-
-void suser_free(int bytes, int which_end)
-{
-    if ( which_end == HEAD ) {
-	stack.top1 -= bytes;
-    } else {
-	stack.top2 += bytes;
-    }
-    stack.used -= bytes;
-}
-
-
-
-/*
- * mem_usage consists of the following fields:
- *    - for_lu (float)
- *      The amount of space used in bytes for the L\U data structures.
- *    - total_needed (float)
- *      The amount of space needed in bytes to perform factorization.
- *    - expansions (int)
- *      Number of memory expansions during the LU factorization.
- */
-int sQuerySpace(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage)
-{
-    SCformat *Lstore;
-    NCformat *Ustore;
-    register int n, iword, dword, panel_size = sp_ienv(1);
-
-    Lstore = L->Store;
-    Ustore = U->Store;
-    n = L->ncol;
-    iword = sizeof(int);
-    dword = sizeof(float);
-
-    /* For LU factors */
-    mem_usage->for_lu = (float)( (4*n + 3) * iword + Lstore->nzval_colptr[n] *
-				 dword + Lstore->rowind_colptr[n] * iword );
-    mem_usage->for_lu += (float)( (n + 1) * iword +
-				 Ustore->colptr[n] * (dword + iword) );
-
-    /* Working storage to support factorization */
-    mem_usage->total_needed = mem_usage->for_lu +
-	(float)( (2 * panel_size + 4 + NO_MARKER) * n * iword +
-		(panel_size + 1) * n * dword );
-
-    mem_usage->expansions = --no_expand;
-
-    return 0;
-} /* sQuerySpace */
-
-/*
- * Allocate storage for the data structures common to all factor routines.
- * For those unpredictable size, make a guess as FILL * nnz(A).
- * Return value:
- *     If lwork = -1, return the estimated amount of space required, plus n;
- *     otherwise, return the amount of space actually allocated when
- *     memory allocation failure occurred.
- */
-int
-sLUMemInit(fact_t fact, void *work, int lwork, int m, int n, int annz,
-	  int panel_size, SuperMatrix *L, SuperMatrix *U, GlobalLU_t *Glu,
-	  int **iwork, float **dwork)
-{
-    int      info, iword, dword;
-    SCformat *Lstore;
-    NCformat *Ustore;
-    int      *xsup, *supno;
-    int      *lsub, *xlsub;
-    float   *lusup;
-    int      *xlusup;
-    float   *ucol;
-    int      *usub, *xusub;
-    int      nzlmax, nzumax, nzlumax;
-    int      FILL = sp_ienv(6);
-    
-    Glu->n    = n;
-    no_expand = 0;
-    iword     = sizeof(int);
-    dword     = sizeof(float);
-
-    if ( !expanders )	
-        expanders = (ExpHeader*)SUPERLU_MALLOC(NO_MEMTYPE * sizeof(ExpHeader));
-    if ( !expanders ) ABORT("SUPERLU_MALLOC fails for expanders");
-    
-    if ( fact != SamePattern_SameRowPerm ) {
-	/* Guess for L\U factors */
-	nzumax = nzlumax = FILL * annz;
-	nzlmax = SUPERLU_MAX(1, FILL/4.) * annz;
-
-	if ( lwork == -1 ) {
-	    return ( GluIntArray(n) * iword + TempSpace(m, panel_size)
-		    + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n );
-        } else {
-	    sSetupSpace(work, lwork, &Glu->MemModel);
-	}
-	
-#if ( PRNTlevel >= 1 )
-	printf("sLUMemInit() called: FILL %ld, nzlmax %ld, nzumax %ld\n", 
-	       FILL, nzlmax, nzumax);
-	fflush(stdout);
-#endif	
-	
-	/* Integer pointers for L\U factors */
-	if ( Glu->MemModel == SYSTEM ) {
-	    xsup   = intMalloc(n+1);
-	    supno  = intMalloc(n+1);
-	    xlsub  = intMalloc(n+1);
-	    xlusup = intMalloc(n+1);
-	    xusub  = intMalloc(n+1);
-	} else {
-	    xsup   = (int *)suser_malloc((n+1) * iword, HEAD);
-	    supno  = (int *)suser_malloc((n+1) * iword, HEAD);
-	    xlsub  = (int *)suser_malloc((n+1) * iword, HEAD);
-	    xlusup = (int *)suser_malloc((n+1) * iword, HEAD);
-	    xusub  = (int *)suser_malloc((n+1) * iword, HEAD);
-	}
-
-	lusup = (float *) sexpand( &nzlumax, LUSUP, 0, 0, Glu );
-	ucol  = (float *) sexpand( &nzumax, UCOL, 0, 0, Glu );
-	lsub  = (int *)    sexpand( &nzlmax, LSUB, 0, 0, Glu );
-	usub  = (int *)    sexpand( &nzumax, USUB, 0, 1, Glu );
-
-	while ( !lusup || !ucol || !lsub || !usub ) {
-	    if ( Glu->MemModel == SYSTEM ) {
-		SUPERLU_FREE(lusup); 
-		SUPERLU_FREE(ucol); 
-		SUPERLU_FREE(lsub); 
-		SUPERLU_FREE(usub);
-	    } else {
-		suser_free((nzlumax+nzumax)*dword+(nzlmax+nzumax)*iword, HEAD);
-	    }
-	    nzlumax /= 2;
-	    nzumax /= 2;
-	    nzlmax /= 2;
-	    if ( nzlumax < annz ) {
-		printf("Not enough memory to perform factorization.\n");
-		return (smemory_usage(nzlmax, nzumax, nzlumax, n) + n);
-	    }
-#if ( PRNTlevel >= 1)
-	    printf("sLUMemInit() reduce size: nzlmax %ld, nzumax %ld\n", 
-		   nzlmax, nzumax);
-	    fflush(stdout);
-#endif
-	    lusup = (float *) sexpand( &nzlumax, LUSUP, 0, 0, Glu );
-	    ucol  = (float *) sexpand( &nzumax, UCOL, 0, 0, Glu );
-	    lsub  = (int *)    sexpand( &nzlmax, LSUB, 0, 0, Glu );
-	    usub  = (int *)    sexpand( &nzumax, USUB, 0, 1, Glu );
-	}
-	
-    } else {
-	/* fact == SamePattern_SameRowPerm */
-	Lstore   = L->Store;
-	Ustore   = U->Store;
-	xsup     = Lstore->sup_to_col;
-	supno    = Lstore->col_to_sup;
-	xlsub    = Lstore->rowind_colptr;
-	xlusup   = Lstore->nzval_colptr;
-	xusub    = Ustore->colptr;
-	nzlmax   = Glu->nzlmax;    /* max from previous factorization */
-	nzumax   = Glu->nzumax;
-	nzlumax  = Glu->nzlumax;
-	
-	if ( lwork == -1 ) {
-	    return ( GluIntArray(n) * iword + TempSpace(m, panel_size)
-		    + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n );
-        } else if ( lwork == 0 ) {
-	    Glu->MemModel = SYSTEM;
-	} else {
-	    Glu->MemModel = USER;
-	    stack.top2 = (lwork/4)*4; /* must be word-addressable */
-	    stack.size = stack.top2;
-	}
-	
-	lsub  = expanders[LSUB].mem  = Lstore->rowind;
-	lusup = expanders[LUSUP].mem = Lstore->nzval;
-	usub  = expanders[USUB].mem  = Ustore->rowind;
-	ucol  = expanders[UCOL].mem  = Ustore->nzval;;
-	expanders[LSUB].size         = nzlmax;
-	expanders[LUSUP].size        = nzlumax;
-	expanders[USUB].size         = nzumax;
-	expanders[UCOL].size         = nzumax;	
-    }
-
-    Glu->xsup    = xsup;
-    Glu->supno   = supno;
-    Glu->lsub    = lsub;
-    Glu->xlsub   = xlsub;
-    Glu->lusup   = lusup;
-    Glu->xlusup  = xlusup;
-    Glu->ucol    = ucol;
-    Glu->usub    = usub;
-    Glu->xusub   = xusub;
-    Glu->nzlmax  = nzlmax;
-    Glu->nzumax  = nzumax;
-    Glu->nzlumax = nzlumax;
-    
-    info = sLUWorkInit(m, n, panel_size, iwork, dwork, Glu->MemModel);
-    if ( info )
-	return ( info + smemory_usage(nzlmax, nzumax, nzlumax, n) + n);
-    
-    ++no_expand;
-    return 0;
-    
-} /* sLUMemInit */
-
-/* Allocate known working storage. Returns 0 if success, otherwise
-   returns the number of bytes allocated so far when failure occurred. */
-int
-sLUWorkInit(int m, int n, int panel_size, int **iworkptr, 
-            float **dworkptr, LU_space_t MemModel)
-{
-    int    isize, dsize, extra;
-    float *old_ptr;
-    int    maxsuper = sp_ienv(3),
-           rowblk   = sp_ienv(4);
-
-    isize = ( (2 * panel_size + 3 + NO_MARKER ) * m + n ) * sizeof(int);
-    dsize = (m * panel_size +
-	     NUM_TEMPV(m,panel_size,maxsuper,rowblk)) * sizeof(float);
-    
-    if ( MemModel == SYSTEM ) 
-	*iworkptr = (int *) intCalloc(isize/sizeof(int));
-    else
-	*iworkptr = (int *) suser_malloc(isize, TAIL);
-    if ( ! *iworkptr ) {
-	fprintf(stderr, "sLUWorkInit: malloc fails for local iworkptr[]\n");
-	return (isize + n);
-    }
-
-    if ( MemModel == SYSTEM )
-	*dworkptr = (float *) SUPERLU_MALLOC(dsize);
-    else {
-	*dworkptr = (float *) suser_malloc(dsize, TAIL);
-	if ( NotDoubleAlign(*dworkptr) ) {
-	    old_ptr = *dworkptr;
-	    *dworkptr = (float*) DoubleAlign(*dworkptr);
-	    *dworkptr = (float*) ((double*)*dworkptr - 1);
-	    extra = (char*)old_ptr - (char*)*dworkptr;
-#ifdef DEBUG	    
-	    printf("sLUWorkInit: not aligned, extra %d\n", extra);
-#endif	    
-	    stack.top2 -= extra;
-	    stack.used += extra;
-	}
-    }
-    if ( ! *dworkptr ) {
-	fprintf(stderr, "malloc fails for local dworkptr[].");
-	return (isize + dsize + n);
-    }
-	
-    return 0;
-}
-
-
-/*
- * Set up pointers for real working arrays.
- */
-void
-sSetRWork(int m, int panel_size, float *dworkptr,
-	 float **dense, float **tempv)
-{
-    float zero = 0.0;
-
-    int maxsuper = sp_ienv(3),
-        rowblk   = sp_ienv(4);
-    *dense = dworkptr;
-    *tempv = *dense + panel_size*m;
-    sfill (*dense, m * panel_size, zero);
-    sfill (*tempv, NUM_TEMPV(m,panel_size,maxsuper,rowblk), zero);     
-}
-	
-/*
- * Free the working storage used by factor routines.
- */
-void sLUWorkFree(int *iwork, float *dwork, GlobalLU_t *Glu)
-{
-    if ( Glu->MemModel == SYSTEM ) {
-	SUPERLU_FREE (iwork);
-	SUPERLU_FREE (dwork);
-    } else {
-	stack.used -= (stack.size - stack.top2);
-	stack.top2 = stack.size;
-/*	sStackCompress(Glu);  */
-    }
-    
-    SUPERLU_FREE (expanders);	
-    expanders = 0;
-}
-
-/* Expand the data structures for L and U during the factorization.
- * Return value:   0 - successful return
- *               > 0 - number of bytes allocated when run out of space
- */
-int
-sLUMemXpand(int jcol,
-	   int next,          /* number of elements currently in the factors */
-	   MemType mem_type,  /* which type of memory to expand  */
-	   int *maxlen,       /* modified - maximum length of a data structure */
-	   GlobalLU_t *Glu    /* modified - global LU data structures */
-	   )
-{
-    void   *new_mem;
-    
-#ifdef DEBUG    
-    printf("sLUMemXpand(): jcol %d, next %d, maxlen %d, MemType %d\n",
-	   jcol, next, *maxlen, mem_type);
-#endif    
-
-    if (mem_type == USUB) 
-    	new_mem = sexpand(maxlen, mem_type, next, 1, Glu);
-    else
-	new_mem = sexpand(maxlen, mem_type, next, 0, Glu);
-    
-    if ( !new_mem ) {
-	int    nzlmax  = Glu->nzlmax;
-	int    nzumax  = Glu->nzumax;
-	int    nzlumax = Glu->nzlumax;
-    	fprintf(stderr, "Can't expand MemType %d: jcol %d\n", mem_type, jcol);
-    	return (smemory_usage(nzlmax, nzumax, nzlumax, Glu->n) + Glu->n);
-    }
-
-    switch ( mem_type ) {
-      case LUSUP:
-	Glu->lusup   = (float *) new_mem;
-	Glu->nzlumax = *maxlen;
-	break;
-      case UCOL:
-	Glu->ucol   = (float *) new_mem;
-	Glu->nzumax = *maxlen;
-	break;
-      case LSUB:
-	Glu->lsub   = (int *) new_mem;
-	Glu->nzlmax = *maxlen;
-	break;
-      case USUB:
-	Glu->usub   = (int *) new_mem;
-	Glu->nzumax = *maxlen;
-	break;
-    }
-    
-    return 0;
-    
-}
-
-
-
-void
-copy_mem_float(int howmany, void *old, void *new)
-{
-    register int i;
-    float *dold = old;
-    float *dnew = new;
-    for (i = 0; i < howmany; i++) dnew[i] = dold[i];
-}
-
-/*
- * Expand the existing storage to accommodate more fill-ins.
- */
-void
-*sexpand (
-	 int *prev_len,   /* length used from previous call */
-	 MemType type,    /* which part of the memory to expand */
-	 int len_to_copy, /* size of the memory to be copied to new store */
-	 int keep_prev,   /* = 1: use prev_len;
-			     = 0: compute new_len to expand */
-	 GlobalLU_t *Glu  /* modified - global LU data structures */
-	)
-{
-    float    EXPAND = 1.5;
-    float    alpha;
-    void     *new_mem, *old_mem;
-    int      new_len, tries, lword, extra, bytes_to_copy;
-
-    alpha = EXPAND;
-
-    if ( no_expand == 0 || keep_prev ) /* First time allocate requested */
-        new_len = *prev_len;
-    else {
-	new_len = alpha * *prev_len;
-    }
-    
-    if ( type == LSUB || type == USUB ) lword = sizeof(int);
-    else lword = sizeof(float);
-
-    if ( Glu->MemModel == SYSTEM ) {
-	new_mem = (void *) SUPERLU_MALLOC((size_t)new_len * lword);
-	if ( no_expand != 0 ) {
-	    tries = 0;
-	    if ( keep_prev ) {
-		if ( !new_mem ) return (NULL);
-	    } else {
-		while ( !new_mem ) {
-		    if ( ++tries > 10 ) return (NULL);
-		    alpha = Reduce(alpha);
-		    new_len = alpha * *prev_len;
-		    new_mem = (void *) SUPERLU_MALLOC((size_t)new_len * lword);
-		}
-	    }
-	    if ( type == LSUB || type == USUB ) {
-		copy_mem_int(len_to_copy, expanders[type].mem, new_mem);
-	    } else {
-		copy_mem_float(len_to_copy, expanders[type].mem, new_mem);
-	    }
-	    SUPERLU_FREE (expanders[type].mem);
-	}
-	expanders[type].mem = (void *) new_mem;
-	
-    } else { /* MemModel == USER */
-	if ( no_expand == 0 ) {
-	    new_mem = suser_malloc(new_len * lword, HEAD);
-	    if ( NotDoubleAlign(new_mem) &&
-		(type == LUSUP || type == UCOL) ) {
-		old_mem = new_mem;
-		new_mem = (void *)DoubleAlign(new_mem);
-		extra = (char*)new_mem - (char*)old_mem;
-#ifdef DEBUG		
-		printf("expand(): not aligned, extra %d\n", extra);
-#endif		
-		stack.top1 += extra;
-		stack.used += extra;
-	    }
-	    expanders[type].mem = (void *) new_mem;
-	}
-	else {
-	    tries = 0;
-	    extra = (new_len - *prev_len) * lword;
-	    if ( keep_prev ) {
-		if ( StackFull(extra) ) return (NULL);
-	    } else {
-		while ( StackFull(extra) ) {
-		    if ( ++tries > 10 ) return (NULL);
-		    alpha = Reduce(alpha);
-		    new_len = alpha * *prev_len;
-		    extra = (new_len - *prev_len) * lword;	    
-		}
-	    }
-
-	    if ( type != USUB ) {
-		new_mem = (void*)((char*)expanders[type + 1].mem + extra);
-		bytes_to_copy = (char*)stack.array + stack.top1
-		    - (char*)expanders[type + 1].mem;
-		user_bcopy(expanders[type+1].mem, new_mem, bytes_to_copy);
-
-		if ( type < USUB ) {
-		    Glu->usub = expanders[USUB].mem =
-			(void*)((char*)expanders[USUB].mem + extra);
-		}
-		if ( type < LSUB ) {
-		    Glu->lsub = expanders[LSUB].mem =
-			(void*)((char*)expanders[LSUB].mem + extra);
-		}
-		if ( type < UCOL ) {
-		    Glu->ucol = expanders[UCOL].mem =
-			(void*)((char*)expanders[UCOL].mem + extra);
-		}
-		stack.top1 += extra;
-		stack.used += extra;
-		if ( type == UCOL ) {
-		    stack.top1 += extra;   /* Add same amount for USUB */
-		    stack.used += extra;
-		}
-		
-	    } /* if ... */
-
-	} /* else ... */
-    }
-
-    expanders[type].size = new_len;
-    *prev_len = new_len;
-    if ( no_expand ) ++no_expand;
-    
-    return (void *) expanders[type].mem;
-    
-} /* sexpand */
-
-
-/*
- * Compress the work[] array to remove fragmentation.
- */
-void
-sStackCompress(GlobalLU_t *Glu)
-{
-    register int iword, dword, ndim;
-    char    *last, *fragment;
-    int      *ifrom, *ito;
-    float   *dfrom, *dto;
-    int      *xlsub, *lsub, *xusub, *usub, *xlusup;
-    float   *ucol, *lusup;
-    
-    iword = sizeof(int);
-    dword = sizeof(float);
-    ndim = Glu->n;
-
-    xlsub  = Glu->xlsub;
-    lsub   = Glu->lsub;
-    xusub  = Glu->xusub;
-    usub   = Glu->usub;
-    xlusup = Glu->xlusup;
-    ucol   = Glu->ucol;
-    lusup  = Glu->lusup;
-    
-    dfrom = ucol;
-    dto = (float *)((char*)lusup + xlusup[ndim] * dword);
-    copy_mem_float(xusub[ndim], dfrom, dto);
-    ucol = dto;
-
-    ifrom = lsub;
-    ito = (int *) ((char*)ucol + xusub[ndim] * iword);
-    copy_mem_int(xlsub[ndim], ifrom, ito);
-    lsub = ito;
-    
-    ifrom = usub;
-    ito = (int *) ((char*)lsub + xlsub[ndim] * iword);
-    copy_mem_int(xusub[ndim], ifrom, ito);
-    usub = ito;
-    
-    last = (char*)usub + xusub[ndim] * iword;
-    fragment = (char*) (((char*)stack.array + stack.top1) - last);
-    stack.used -= (long int) fragment;
-    stack.top1 -= (long int) fragment;
-
-    Glu->ucol = ucol;
-    Glu->lsub = lsub;
-    Glu->usub = usub;
-    
-#ifdef DEBUG
-    printf("sStackCompress: fragment %d\n", fragment);
-    /* for (last = 0; last < ndim; ++last)
-	print_lu_col("After compress:", last, 0);*/
-#endif    
-    
-}
-
-/*
- * Allocate storage for original matrix A
- */
-void
-sallocateA(int n, int nnz, float **a, int **asub, int **xa)
-{
-    *a    = (float *) floatMalloc(nnz);
-    *asub = (int *) intMalloc(nnz);
-    *xa   = (int *) intMalloc(n+1);
-}
-
-
-float *floatMalloc(int n)
-{
-    float *buf;
-    buf = (float *) SUPERLU_MALLOC((size_t)n * sizeof(float)); 
-    if ( !buf ) {
-	ABORT("SUPERLU_MALLOC failed for buf in floatMalloc()\n");
-    }
-    return (buf);
-}
-
-float *floatCalloc(int n)
-{
-    float *buf;
-    register int i;
-    float zero = 0.0;
-    buf = (float *) SUPERLU_MALLOC((size_t)n * sizeof(float));
-    if ( !buf ) {
-	ABORT("SUPERLU_MALLOC failed for buf in floatCalloc()\n");
-    }
-    for (i = 0; i < n; ++i) buf[i] = zero;
-    return (buf);
-}
-
-
-int smemory_usage(const int nzlmax, const int nzumax, 
-		  const int nzlumax, const int n)
-{
-    register int iword, dword;
-
-    iword   = sizeof(int);
-    dword   = sizeof(float);
-    
-    return (10 * n * iword +
-	    nzlmax * iword + nzumax * (iword + dword) + nzlumax * dword);
-
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/smyblas2.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/smyblas2.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/smyblas2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/smyblas2.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,224 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * File name:		smyblas2.c
- * Purpose:
- *     Level 2 BLAS operations: solves and matvec, written in C.
- * Note:
- *     This is only used when the system lacks an efficient BLAS library.
- */
-
-/*
- * Solves a dense UNIT lower triangular system. The unit lower 
- * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). 
- * The solution will be returned in the rhs vector.
- */
-void slsolve ( int ldm, int ncol, float *M, float *rhs )
-{
-    int k;
-    float x0, x1, x2, x3, x4, x5, x6, x7;
-    float *M0;
-    register float *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7;
-    register int firstcol = 0;
-
-    M0 = &M[0];
-
-    while ( firstcol < ncol - 7 ) { /* Do 8 columns */
-      Mki0 = M0 + 1;
-      Mki1 = Mki0 + ldm + 1;
-      Mki2 = Mki1 + ldm + 1;
-      Mki3 = Mki2 + ldm + 1;
-      Mki4 = Mki3 + ldm + 1;
-      Mki5 = Mki4 + ldm + 1;
-      Mki6 = Mki5 + ldm + 1;
-      Mki7 = Mki6 + ldm + 1;
-
-      x0 = rhs[firstcol];
-      x1 = rhs[firstcol+1] - x0 * *Mki0++;
-      x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++;
-      x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++;
-      x4 = rhs[firstcol+4] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
-	                   - x3 * *Mki3++;
-      x5 = rhs[firstcol+5] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
-	                   - x3 * *Mki3++ - x4 * *Mki4++;
-      x6 = rhs[firstcol+6] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
-	                   - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++;
-      x7 = rhs[firstcol+7] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
-	                   - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++
-			   - x6 * *Mki6++;
-
-      rhs[++firstcol] = x1;
-      rhs[++firstcol] = x2;
-      rhs[++firstcol] = x3;
-      rhs[++firstcol] = x4;
-      rhs[++firstcol] = x5;
-      rhs[++firstcol] = x6;
-      rhs[++firstcol] = x7;
-      ++firstcol;
-    
-      for (k = firstcol; k < ncol; k++)
-	rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++
-	                - x2 * *Mki2++ - x3 * *Mki3++
-                        - x4 * *Mki4++ - x5 * *Mki5++
-			- x6 * *Mki6++ - x7 * *Mki7++;
- 
-      M0 += 8 * ldm + 8;
-    }
-
-    while ( firstcol < ncol - 3 ) { /* Do 4 columns */
-      Mki0 = M0 + 1;
-      Mki1 = Mki0 + ldm + 1;
-      Mki2 = Mki1 + ldm + 1;
-      Mki3 = Mki2 + ldm + 1;
-
-      x0 = rhs[firstcol];
-      x1 = rhs[firstcol+1] - x0 * *Mki0++;
-      x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++;
-      x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++;
-
-      rhs[++firstcol] = x1;
-      rhs[++firstcol] = x2;
-      rhs[++firstcol] = x3;
-      ++firstcol;
-    
-      for (k = firstcol; k < ncol; k++)
-	rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++
-	                - x2 * *Mki2++ - x3 * *Mki3++;
- 
-      M0 += 4 * ldm + 4;
-    }
-
-    if ( firstcol < ncol - 1 ) { /* Do 2 columns */
-      Mki0 = M0 + 1;
-      Mki1 = Mki0 + ldm + 1;
-
-      x0 = rhs[firstcol];
-      x1 = rhs[firstcol+1] - x0 * *Mki0++;
-
-      rhs[++firstcol] = x1;
-      ++firstcol;
-    
-      for (k = firstcol; k < ncol; k++)
-	rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++;
- 
-    }
-    
-}
-
-/*
- * Solves a dense upper triangular system. The upper triangular matrix is
- * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned
- * in the rhs vector.
- */
-void
-susolve ( ldm, ncol, M, rhs )
-int ldm;	/* in */
-int ncol;	/* in */
-float *M;	/* in */
-float *rhs;	/* modified */
-{
-    float xj;
-    int jcol, j, irow;
-
-    jcol = ncol - 1;
-
-    for (j = 0; j < ncol; j++) {
-
-	xj = rhs[jcol] / M[jcol + jcol*ldm]; 		/* M(jcol, jcol) */
-	rhs[jcol] = xj;
-	
-	for (irow = 0; irow < jcol; irow++)
-	    rhs[irow] -= xj * M[irow + jcol*ldm];	/* M(irow, jcol) */
-
-	jcol--;
-
-    }
-}
-
-
-/*
- * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec.
- * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[].
- */
-void smatvec ( ldm, nrow, ncol, M, vec, Mxvec )
-
-int ldm;	/* in -- leading dimension of M */
-int nrow;	/* in */ 
-int ncol;	/* in */
-float *M;	/* in */
-float *vec;	/* in */
-float *Mxvec;	/* in/out */
-
-{
-    float vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7;
-    float *M0;
-    register float *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7;
-    register int firstcol = 0;
-    int k;
-
-    M0 = &M[0];
-    while ( firstcol < ncol - 7 ) {	/* Do 8 columns */
-
-	Mki0 = M0;
-	Mki1 = Mki0 + ldm;
-        Mki2 = Mki1 + ldm;
-        Mki3 = Mki2 + ldm;
-	Mki4 = Mki3 + ldm;
-	Mki5 = Mki4 + ldm;
-	Mki6 = Mki5 + ldm;
-	Mki7 = Mki6 + ldm;
-
-	vi0 = vec[firstcol++];
-	vi1 = vec[firstcol++];
-	vi2 = vec[firstcol++];
-	vi3 = vec[firstcol++];	
-	vi4 = vec[firstcol++];
-	vi5 = vec[firstcol++];
-	vi6 = vec[firstcol++];
-	vi7 = vec[firstcol++];	
-
-	for (k = 0; k < nrow; k++) 
-	    Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++
-		      + vi2 * *Mki2++ + vi3 * *Mki3++ 
-		      + vi4 * *Mki4++ + vi5 * *Mki5++
-		      + vi6 * *Mki6++ + vi7 * *Mki7++;
-
-	M0 += 8 * ldm;
-    }
-
-    while ( firstcol < ncol - 3 ) {	/* Do 4 columns */
-
-	Mki0 = M0;
-	Mki1 = Mki0 + ldm;
-	Mki2 = Mki1 + ldm;
-	Mki3 = Mki2 + ldm;
-
-	vi0 = vec[firstcol++];
-	vi1 = vec[firstcol++];
-	vi2 = vec[firstcol++];
-	vi3 = vec[firstcol++];	
-	for (k = 0; k < nrow; k++) 
-	    Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++
-		      + vi2 * *Mki2++ + vi3 * *Mki3++ ;
-
-	M0 += 4 * ldm;
-    }
-
-    while ( firstcol < ncol ) {		/* Do 1 column */
-
- 	Mki0 = M0;
-	vi0 = vec[firstcol++];
-	for (k = 0; k < nrow; k++)
-	    Mxvec[k] += vi0 * *Mki0++;
-
-	M0 += ldm;
-    }
-	
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/spanel_bmod.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/spanel_bmod.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/spanel_bmod.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/spanel_bmod.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,450 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "slu_sdefs.h"
-
-/* 
- * Function prototypes 
- */
-void slsolve(int, int, float *, float *);
-void smatvec(int, int, int, float *, float *, float *);
-extern void scheck_tempv();
-
-void
-spanel_bmod (
-	    const int  m,          /* in - number of rows in the matrix */
-	    const int  w,          /* in */
-	    const int  jcol,       /* in */
-	    const int  nseg,       /* in */
-	    float     *dense,     /* out, of size n by w */
-	    float     *tempv,     /* working array */
-	    int        *segrep,    /* in */
-	    int        *repfnz,    /* in, of size n by w */
-	    GlobalLU_t *Glu,       /* modified */
-	    SuperLUStat_t *stat    /* output */
-	    )
-{
-/* 
- * Purpose
- * =======
- *
- *    Performs numeric block updates (sup-panel) in topological order.
- *    It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
- *    Special processing on the supernodal portion of L\U[*,j]
- *
- *    Before entering this routine, the original nonzeros in the panel 
- *    were already copied into the spa[m,w].
- *
- *    Updated/Output parameters-
- *	dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned 
- *      collectively in the m-by-w vector dense[*]. 
- *
- */
-
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-    _fcd ftcs1 = _cptofcd("L", strlen("L")),
-         ftcs2 = _cptofcd("N", strlen("N")),
-         ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-    int          incx = 1, incy = 1;
-    float       alpha, beta;
-#endif
-
-    register int k, ksub;
-    int          fsupc, nsupc, nsupr, nrow;
-    int          krep, krep_ind;
-    float       ukj, ukj1, ukj2;
-    int          luptr, luptr1, luptr2;
-    int          segsze;
-    int          block_nrow;  /* no of rows in a block row */
-    register int lptr;	      /* Points to the row subscripts of a supernode */
-    int          kfnz, irow, no_zeros; 
-    register int isub, isub1, i;
-    register int jj;	      /* Index through each column in the panel */
-    int          *xsup, *supno;
-    int          *lsub, *xlsub;
-    float       *lusup;
-    int          *xlusup;
-    int          *repfnz_col; /* repfnz[] for a column in the panel */
-    float       *dense_col;  /* dense[] for a column in the panel */
-    float       *tempv1;             /* Used in 1-D update */
-    float       *TriTmp, *MatvecTmp; /* used in 2-D update */
-    float      zero = 0.0;
-    float      one = 1.0;
-    register int ldaTmp;
-    register int r_ind, r_hi;
-    static   int first = 1, maxsuper, rowblk, colblk;
-    flops_t  *ops = stat->ops;
-    
-    xsup    = Glu->xsup;
-    supno   = Glu->supno;
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    lusup   = Glu->lusup;
-    xlusup  = Glu->xlusup;
-    
-    if ( first ) {
-	maxsuper = sp_ienv(3);
-	rowblk   = sp_ienv(4);
-	colblk   = sp_ienv(5);
-	first = 0;
-    }
-    ldaTmp = maxsuper + rowblk;
-
-    /* 
-     * For each nonz supernode segment of U[*,j] in topological order 
-     */
-    k = nseg - 1;
-    for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */
-
-	/* krep = representative of current k-th supernode
-	 * fsupc = first supernodal column
-	 * nsupc = no of columns in a supernode
-	 * nsupr = no of rows in a supernode
-	 */
-        krep = segrep[k--];
-	fsupc = xsup[supno[krep]];
-	nsupc = krep - fsupc + 1;
-	nsupr = xlsub[fsupc+1] - xlsub[fsupc];
-	nrow = nsupr - nsupc;
-	lptr = xlsub[fsupc];
-	krep_ind = lptr + nsupc - 1;
-
-	repfnz_col = repfnz;
-	dense_col = dense;
-	
-	if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */
-
-	    TriTmp = tempv;
-	
-	    /* Sequence through each column in panel -- triangular solves */
-	    for (jj = jcol; jj < jcol + w; jj++,
-		 repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) {
-
-		kfnz = repfnz_col[krep];
-		if ( kfnz == EMPTY ) continue;	/* Skip any zero segment */
-	    
-		segsze = krep - kfnz + 1;
-		luptr = xlusup[fsupc];
-
-		ops[TRSV] += segsze * (segsze - 1);
-		ops[GEMV] += 2 * nrow * segsze;
-	
-		/* Case 1: Update U-segment of size 1 -- col-col update */
-		if ( segsze == 1 ) {
-		    ukj = dense_col[lsub[krep_ind]];
-		    luptr += nsupr*(nsupc-1) + nsupc;
-
-		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
-			irow = lsub[i];
-			dense_col[irow] -= ukj * lusup[luptr];
-			++luptr;
-		    }
-
-		} else if ( segsze <= 3 ) {
-		    ukj = dense_col[lsub[krep_ind]];
-		    ukj1 = dense_col[lsub[krep_ind - 1]];
-		    luptr += nsupr*(nsupc-1) + nsupc-1;
-		    luptr1 = luptr - nsupr;
-
-		    if ( segsze == 2 ) {
-			ukj -= ukj1 * lusup[luptr1];
-			dense_col[lsub[krep_ind]] = ukj;
-			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-			    irow = lsub[i];
-			    luptr++; luptr1++;
-			    dense_col[irow] -= (ukj*lusup[luptr]
-						+ ukj1*lusup[luptr1]);
-			}
-		    } else {
-			ukj2 = dense_col[lsub[krep_ind - 2]];
-			luptr2 = luptr1 - nsupr;
-			ukj1 -= ukj2 * lusup[luptr2-1];
-			ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
-			dense_col[lsub[krep_ind]] = ukj;
-			dense_col[lsub[krep_ind-1]] = ukj1;
-			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-			    irow = lsub[i];
-			    luptr++; luptr1++; luptr2++;
-			    dense_col[irow] -= ( ukj*lusup[luptr]
-                             + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
-			}
-		    }
-
-		} else  {	/* segsze >= 4 */
-		    
-		    /* Copy U[*,j] segment from dense[*] to TriTmp[*], which
-		       holds the result of triangular solves.    */
-		    no_zeros = kfnz - fsupc;
-		    isub = lptr + no_zeros;
-		    for (i = 0; i < segsze; ++i) {
-			irow = lsub[isub];
-			TriTmp[i] = dense_col[irow]; /* Gather */
-			++isub;
-		    }
-		    
-		    /* start effective triangle */
-		    luptr += nsupr * no_zeros + no_zeros;
-
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		    STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
-			   &nsupr, TriTmp, &incx );
-#else
-		    strsv_( "L", "N", "U", &segsze, &lusup[luptr], 
-			   &nsupr, TriTmp, &incx );
-#endif
-#else		
-		    slsolve ( nsupr, segsze, &lusup[luptr], TriTmp );
-#endif
-		    
-
-		} /* else ... */
-	    
-	    }  /* for jj ... end tri-solves */
-
-	    /* Block row updates; push all the way into dense[*] block */
-	    for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) {
-		
-		r_hi = SUPERLU_MIN(nrow, r_ind + rowblk);
-		block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind);
-		luptr = xlusup[fsupc] + nsupc + r_ind;
-		isub1 = lptr + nsupc + r_ind;
-		
-		repfnz_col = repfnz;
-		TriTmp = tempv;
-		dense_col = dense;
-		
-		/* Sequence through each column in panel -- matrix-vector */
-		for (jj = jcol; jj < jcol + w; jj++,
-		     repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
-		    
-		    kfnz = repfnz_col[krep];
-		    if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
-		    
-		    segsze = krep - kfnz + 1;
-		    if ( segsze <= 3 ) continue;   /* skip unrolled cases */
-		    
-		    /* Perform a block update, and scatter the result of
-		       matrix-vector to dense[].		 */
-		    no_zeros = kfnz - fsupc;
-		    luptr1 = luptr + nsupr * no_zeros;
-		    MatvecTmp = &TriTmp[maxsuper];
-		    
-#ifdef USE_VENDOR_BLAS
-		    alpha = one; 
-                    beta = zero;
-#ifdef _CRAY
-		    SGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1], 
-			   &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
-#else
-		    sgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1], 
-			   &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
-#endif
-#else
-		    smatvec(nsupr, block_nrow, segsze, &lusup[luptr1],
-			   TriTmp, MatvecTmp);
-#endif
-		    
-		    /* Scatter MatvecTmp[*] into SPA dense[*] temporarily
-		     * such that MatvecTmp[*] can be re-used for the
-		     * the next blok row update. dense[] will be copied into 
-		     * global store after the whole panel has been finished.
-		     */
-		    isub = isub1;
-		    for (i = 0; i < block_nrow; i++) {
-			irow = lsub[isub];
-			dense_col[irow] -= MatvecTmp[i];
-			MatvecTmp[i] = zero;
-			++isub;
-		    }
-		    
-		} /* for jj ... */
-		
-	    } /* for each block row ... */
-	    
-	    /* Scatter the triangular solves into SPA dense[*] */
-	    repfnz_col = repfnz;
-	    TriTmp = tempv;
-	    dense_col = dense;
-	    
-	    for (jj = jcol; jj < jcol + w; jj++,
-		 repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
-		kfnz = repfnz_col[krep];
-		if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
-		
-		segsze = krep - kfnz + 1;
-		if ( segsze <= 3 ) continue; /* skip unrolled cases */
-		
-		no_zeros = kfnz - fsupc;		
-		isub = lptr + no_zeros;
-		for (i = 0; i < segsze; i++) {
-		    irow = lsub[isub];
-		    dense_col[irow] = TriTmp[i];
-		    TriTmp[i] = zero;
-		    ++isub;
-		}
-		
-	    } /* for jj ... */
-	    
-	} else { /* 1-D block modification */
-	    
-	    
-	    /* Sequence through each column in the panel */
-	    for (jj = jcol; jj < jcol + w; jj++,
-		 repfnz_col += m, dense_col += m) {
-		
-		kfnz = repfnz_col[krep];
-		if ( kfnz == EMPTY ) continue;	/* Skip any zero segment */
-		
-		segsze = krep - kfnz + 1;
-		luptr = xlusup[fsupc];
-
-		ops[TRSV] += segsze * (segsze - 1);
-		ops[GEMV] += 2 * nrow * segsze;
-		
-		/* Case 1: Update U-segment of size 1 -- col-col update */
-		if ( segsze == 1 ) {
-		    ukj = dense_col[lsub[krep_ind]];
-		    luptr += nsupr*(nsupc-1) + nsupc;
-
-		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
-			irow = lsub[i];
-			dense_col[irow] -= ukj * lusup[luptr];
-			++luptr;
-		    }
-
-		} else if ( segsze <= 3 ) {
-		    ukj = dense_col[lsub[krep_ind]];
-		    luptr += nsupr*(nsupc-1) + nsupc-1;
-		    ukj1 = dense_col[lsub[krep_ind - 1]];
-		    luptr1 = luptr - nsupr;
-
-		    if ( segsze == 2 ) {
-			ukj -= ukj1 * lusup[luptr1];
-			dense_col[lsub[krep_ind]] = ukj;
-			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-			    irow = lsub[i];
-			    ++luptr;  ++luptr1;
-			    dense_col[irow] -= (ukj*lusup[luptr]
-						+ ukj1*lusup[luptr1]);
-			}
-		    } else {
-			ukj2 = dense_col[lsub[krep_ind - 2]];
-			luptr2 = luptr1 - nsupr;
-			ukj1 -= ukj2 * lusup[luptr2-1];
-			ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
-			dense_col[lsub[krep_ind]] = ukj;
-			dense_col[lsub[krep_ind-1]] = ukj1;
-			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-			    irow = lsub[i];
-			    ++luptr; ++luptr1; ++luptr2;
-			    dense_col[irow] -= ( ukj*lusup[luptr]
-                             + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
-			}
-		    }
-
-		} else  { /* segsze >= 4 */
-		    /* 
-		     * Perform a triangular solve and block update,
-		     * then scatter the result of sup-col update to dense[].
-		     */
-		    no_zeros = kfnz - fsupc;
-		    
-		    /* Copy U[*,j] segment from dense[*] to tempv[*]: 
-		     *    The result of triangular solve is in tempv[*];
-		     *    The result of matrix vector update is in dense_col[*]
-		     */
-		    isub = lptr + no_zeros;
-		    for (i = 0; i < segsze; ++i) {
-			irow = lsub[isub];
-			tempv[i] = dense_col[irow]; /* Gather */
-			++isub;
-		    }
-		    
-		    /* start effective triangle */
-		    luptr += nsupr * no_zeros + no_zeros;
-		    
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		    STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
-			   &nsupr, tempv, &incx );
-#else
-		    strsv_( "L", "N", "U", &segsze, &lusup[luptr], 
-			   &nsupr, tempv, &incx );
-#endif
-		    
-		    luptr += segsze;	/* Dense matrix-vector */
-		    tempv1 = &tempv[segsze];
-                    alpha = one;
-                    beta = zero;
-#ifdef _CRAY
-		    SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
-			   &nsupr, tempv, &incx, &beta, tempv1, &incy );
-#else
-		    sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], 
-			   &nsupr, tempv, &incx, &beta, tempv1, &incy );
-#endif
-#else
-		    slsolve ( nsupr, segsze, &lusup[luptr], tempv );
-		    
-		    luptr += segsze;        /* Dense matrix-vector */
-		    tempv1 = &tempv[segsze];
-		    smatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1);
-#endif
-		    
-		    /* Scatter tempv[*] into SPA dense[*] temporarily, such
-		     * that tempv[*] can be used for the triangular solve of
-		     * the next column of the panel. They will be copied into 
-		     * ucol[*] after the whole panel has been finished.
-		     */
-		    isub = lptr + no_zeros;
-		    for (i = 0; i < segsze; i++) {
-			irow = lsub[isub];
-			dense_col[irow] = tempv[i];
-			tempv[i] = zero;
-			isub++;
-		    }
-		    
-		    /* Scatter the update from tempv1[*] into SPA dense[*] */
-		    /* Start dense rectangular L */
-		    for (i = 0; i < nrow; i++) {
-			irow = lsub[isub];
-			dense_col[irow] -= tempv1[i];
-			tempv1[i] = zero;
-			++isub;	
-		    }
-		    
-		} /* else segsze>=4 ... */
-		
-	    } /* for each column in the panel... */
-	    
-	} /* else 1-D update ... */
-
-    } /* for each updating supernode ... */
-
-}
-
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/spanel_dfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/spanel_dfs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/spanel_dfs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/spanel_dfs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,247 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_sdefs.h"
-
-void
-spanel_dfs (
-	   const int  m,           /* in - number of rows in the matrix */
-	   const int  w,           /* in */
-	   const int  jcol,        /* in */
-	   SuperMatrix *A,       /* in - original matrix */
-	   int        *perm_r,     /* in */
-	   int        *nseg,	   /* out */
-	   float     *dense,      /* out */
-	   int        *panel_lsub, /* out */
-	   int        *segrep,     /* out */
-	   int        *repfnz,     /* out */
-	   int        *xprune,     /* out */
-	   int        *marker,     /* out */     
-	   int        *parent,     /* working array */
-	   int        *xplore,     /* working array */
-	   GlobalLU_t *Glu         /* modified */
-	   )
-{
-/*
- * Purpose
- * =======
- *
- *   Performs a symbolic factorization on a panel of columns [jcol, jcol+w).
- *
- *   A supernode representative is the last column of a supernode.
- *   The nonzeros in U[*,j] are segments that end at supernodal
- *   representatives.
- *
- *   The routine returns one list of the supernodal representatives
- *   in topological order of the dfs that generates them. This list is
- *   a superset of the topological order of each individual column within
- *   the panel. 
- *   The location of the first nonzero in each supernodal segment
- *   (supernodal entry location) is also returned. Each column has a 
- *   separate list for this purpose.
- *
- *   Two marker arrays are used for dfs:
- *     marker[i] == jj, if i was visited during dfs of current column jj;
- *     marker1[i] >= jcol, if i was visited by earlier columns in this panel;
- *
- *   marker: A-row --> A-row/col (0/1)
- *   repfnz: SuperA-col --> PA-row
- *   parent: SuperA-col --> SuperA-col
- *   xplore: SuperA-col --> index to L-structure
- *
- */
-    NCPformat *Astore;
-    float    *a;
-    int       *asub;
-    int       *xa_begin, *xa_end;
-    int	      krep, chperm, chmark, chrep, oldrep, kchild, myfnz;
-    int       k, krow, kmark, kperm;
-    int       xdfs, maxdfs, kpar;
-    int       jj;	   /* index through each column in the panel */
-    int       *marker1;	   /* marker1[jj] >= jcol if vertex jj was visited 
-			      by a previous column within this panel.   */
-    int       *repfnz_col; /* start of each column in the panel */
-    float    *dense_col;  /* start of each column in the panel */
-    int       nextl_col;   /* next available position in panel_lsub[*,jj] */
-    int       *xsup, *supno;
-    int       *lsub, *xlsub;
-
-    /* Initialize pointers */
-    Astore     = A->Store;
-    a          = Astore->nzval;
-    asub       = Astore->rowind;
-    xa_begin   = Astore->colbeg;
-    xa_end     = Astore->colend;
-    marker1    = marker + m;
-    repfnz_col = repfnz;
-    dense_col  = dense;
-    *nseg      = 0;
-    xsup       = Glu->xsup;
-    supno      = Glu->supno;
-    lsub       = Glu->lsub;
-    xlsub      = Glu->xlsub;
-
-    /* For each column in the panel */
-    for (jj = jcol; jj < jcol + w; jj++) {
-	nextl_col = (jj - jcol) * m;
-
-#ifdef CHK_DFS
-	printf("\npanel col %d: ", jj);
-#endif
-
-	/* For each nonz in A[*,jj] do dfs */
-	for (k = xa_begin[jj]; k < xa_end[jj]; k++) {
-	    krow = asub[k];
-            dense_col[krow] = a[k];
-	    kmark = marker[krow];    	
-	    if ( kmark == jj ) 
-		continue;     /* krow visited before, go to the next nonzero */
-
-	    /* For each unmarked nbr krow of jj
-	     * krow is in L: place it in structure of L[*,jj]
-	     */
-	    marker[krow] = jj;
-	    kperm = perm_r[krow];
-	    
-	    if ( kperm == EMPTY ) {
-		panel_lsub[nextl_col++] = krow; /* krow is indexed into A */
-	    }
-	    /* 
-	     * krow is in U: if its supernode-rep krep
-	     * has been explored, update repfnz[*]
-	     */
-	    else {
-		
-		krep = xsup[supno[kperm]+1] - 1;
-		myfnz = repfnz_col[krep];
-		
-#ifdef CHK_DFS
-		printf("krep %d, myfnz %d, perm_r[%d] %d\n", krep, myfnz, krow, kperm);
-#endif
-		if ( myfnz != EMPTY ) {	/* Representative visited before */
-		    if ( myfnz > kperm ) repfnz_col[krep] = kperm;
-		    /* continue; */
-		}
-		else {
-		    /* Otherwise, perform dfs starting at krep */
-		    oldrep = EMPTY;
-		    parent[krep] = oldrep;
-		    repfnz_col[krep] = kperm;
-		    xdfs = xlsub[krep];
-		    maxdfs = xprune[krep];
-		    
-#ifdef CHK_DFS 
-		    printf("  xdfs %d, maxdfs %d: ", xdfs, maxdfs);
-		    for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]);
-		    printf("\n");
-#endif
-		    do {
-			/* 
-			 * For each unmarked kchild of krep 
-			 */
-			while ( xdfs < maxdfs ) {
-			    
-			    kchild = lsub[xdfs];
-			    xdfs++;
-			    chmark = marker[kchild];
-			    
-			    if ( chmark != jj ) { /* Not reached yet */
-				marker[kchild] = jj;
-				chperm = perm_r[kchild];
-			      
-				/* Case kchild is in L: place it in L[*,j] */
-				if ( chperm == EMPTY ) {
-				    panel_lsub[nextl_col++] = kchild;
-				} 
-				/* Case kchild is in U: 
-				 *   chrep = its supernode-rep. If its rep has 
-				 *   been explored, update its repfnz[*]
-				 */
-				else {
-				    
-				    chrep = xsup[supno[chperm]+1] - 1;
-				    myfnz = repfnz_col[chrep];
-#ifdef CHK_DFS
-				    printf("chrep %d,myfnz %d,perm_r[%d] %d\n",chrep,myfnz,kchild,chperm);
-#endif
-				    if ( myfnz != EMPTY ) { /* Visited before */
-					if ( myfnz > chperm )
-					    repfnz_col[chrep] = chperm;
-				    }
-				    else {
-					/* Cont. dfs at snode-rep of kchild */
-					xplore[krep] = xdfs;	
-					oldrep = krep;
-					krep = chrep; /* Go deeper down G(L) */
-					parent[krep] = oldrep;
-					repfnz_col[krep] = chperm;
-					xdfs = xlsub[krep];     
-					maxdfs = xprune[krep];
-#ifdef CHK_DFS 
-					printf("  xdfs %d, maxdfs %d: ", xdfs, maxdfs);
-					for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]);	
-					printf("\n");
-#endif
-				    } /* else */
-				  
-				} /* else */
-			      
-			    } /* if... */
-			    
-			} /* while xdfs < maxdfs */
-			
-			/* krow has no more unexplored nbrs:
-			 *    Place snode-rep krep in postorder DFS, if this 
-			 *    segment is seen for the first time. (Note that
-			 *    "repfnz[krep]" may change later.)
-			 *    Backtrack dfs to its parent.
-			 */
-			if ( marker1[krep] < jcol ) {
-			    segrep[*nseg] = krep;
-			    ++(*nseg);
-			    marker1[krep] = jj;
-			}
-			
-			kpar = parent[krep]; /* Pop stack, mimic recursion */
-			if ( kpar == EMPTY ) break; /* dfs done */
-			krep = kpar;
-			xdfs = xplore[krep];
-			maxdfs = xprune[krep];
-			
-#ifdef CHK_DFS 
-			printf("  pop stack: krep %d,xdfs %d,maxdfs %d: ", krep,xdfs,maxdfs);
-			for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]);
-			printf("\n");
-#endif
-		    } while ( kpar != EMPTY ); /* do-while - until empty stack */
-		    
-		} /* else */
-		
-	    } /* else */
-	    
-	} /* for each nonz in A[*,jj] */
-	
-	repfnz_col += m;    /* Move to next column */
-        dense_col += m;
-	
-    } /* for jj ... */
-    
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sp_coletree.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sp_coletree.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sp_coletree.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sp_coletree.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,332 +0,0 @@
-
-/*  Elimination tree computation and layout routines */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "slu_ddefs.h"
-
-/* 
- *  Implementation of disjoint set union routines.
- *  Elements are integers in 0..n-1, and the 
- *  names of the sets themselves are of type int.
- *  
- *  Calls are:
- *  initialize_disjoint_sets (n) initial call.
- *  s = make_set (i)             returns a set containing only i.
- *  s = splink (t, u)		 returns s = t union u, destroying t and u.
- *  s = find (i)		 return name of set containing i.
- *  finalize_disjoint_sets 	 final call.
- *
- *  This implementation uses path compression but not weighted union.
- *  See Tarjan's book for details.
- *  John Gilbert, CMI, 1987.
- *
- *  Implemented path-halving by XSL 07/05/95.
- */
-
-static int	*pp;		/* parent array for sets */
-
-static 
-int *mxCallocInt(int n)
-{
-    register int i;
-    int *buf;
-
-    buf = (int *) SUPERLU_MALLOC( n * sizeof(int) );
-    if ( !buf ) {
-         ABORT("SUPERLU_MALLOC fails for buf in mxCallocInt()");
-       }
-    for (i = 0; i < n; i++) buf[i] = 0;
-    return (buf);
-}
-      
-static
-void initialize_disjoint_sets (
-	int n
-	)
-{
-	pp = mxCallocInt(n);
-}
-
-
-static
-int make_set (
-	int i
-	)
-{
-	pp[i] = i;
-	return i;
-}
-
-
-static
-int splink (
-	int s,
-	int t
-	)
-{
-	pp[s] = t;
-	return t;
-}
-
-
-/* PATH HALVING */
-static
-int find (int i)
-{
-    register int p, gp;
-    
-    p = pp[i];
-    gp = pp[p];
-    while (gp != p) {
-	pp[i] = gp;
-	i = gp;
-	p = pp[i];
-	gp = pp[p];
-    }
-    return (p);
-}
-
-#if 0
-/* PATH COMPRESSION */
-static
-int find (
-	int i
-	)
-{
-	if (pp[i] != i) 
-		pp[i] = find (pp[i]);
-	return pp[i];
-}
-#endif
-
-static
-void finalize_disjoint_sets (
-	void
-	)
-{
-	SUPERLU_FREE(pp);
-}
-
-
-/*
- *      Find the elimination tree for A'*A.
- *      This uses something similar to Liu's algorithm. 
- *      It runs in time O(nz(A)*log n) and does not form A'*A.
- *
- *      Input:
- *        Sparse matrix A.  Numeric values are ignored, so any
- *        explicit zeros are treated as nonzero.
- *      Output:
- *        Integer array of parents representing the elimination
- *        tree of the symbolic product A'*A.  Each vertex is a
- *        column of A, and nc means a root of the elimination forest.
- *
- *      John R. Gilbert, Xerox, 10 Dec 1990
- *      Based on code by JRG dated 1987, 1988, and 1990.
- */
-
-/*
- * Nonsymmetric elimination tree
- */
-int
-sp_coletree(
-	    int *acolst, int *acolend, /* column start and end past 1 */
-	    int *arow,                 /* row indices of A */
-	    int nr, int nc,            /* dimension of A */
-	    int *parent	               /* parent in elim tree */
-	    )
-{
-	int	*root;			/* root of subtee of etree 	*/
-	int     *firstcol;		/* first nonzero col in each row*/
-	int	rset, cset;             
-	int	row, col;
-	int	rroot;
-	int	p;
-
-	root = mxCallocInt (nc);
-	initialize_disjoint_sets (nc);
-
-	/* Compute firstcol[row] = first nonzero column in row */
-
-	firstcol = mxCallocInt (nr);
-	for (row = 0; row < nr; firstcol[row++] = nc);
-	for (col = 0; col < nc; col++) 
-		for (p = acolst[col]; p < acolend[col]; p++) {
-			row = arow[p];
-			firstcol[row] = SUPERLU_MIN(firstcol[row], col);
-		}
-
-	/* Compute etree by Liu's algorithm for symmetric matrices,
-           except use (firstcol[r],c) in place of an edge (r,c) of A.
-	   Thus each row clique in A'*A is replaced by a star
-	   centered at its first vertex, which has the same fill. */
-
-	for (col = 0; col < nc; col++) {
-		cset = make_set (col);
-		root[cset] = col;
-		parent[col] = nc; /* Matlab */
-		for (p = acolst[col]; p < acolend[col]; p++) {
-			row = firstcol[arow[p]];
-			if (row >= col) continue;
-			rset = find (row);
-			rroot = root[rset];
-			if (rroot != col) {
-				parent[rroot] = col;
-				cset = splink (cset, rset);
-				root[cset] = col;
-			}
-		}
-	}
-
-	SUPERLU_FREE (root);
-	SUPERLU_FREE (firstcol);
-	finalize_disjoint_sets ();
-	return 0;
-}
-
-/*
- *  q = TreePostorder (n, p);
- *
- *	Postorder a tree.
- *	Input:
- *	  p is a vector of parent pointers for a forest whose
- *        vertices are the integers 0 to n-1; p[root]==n.
- *	Output:
- *	  q is a vector indexed by 0..n-1 such that q[i] is the
- *	  i-th vertex in a postorder numbering of the tree.
- *
- *        ( 2/7/95 modified by X.Li:
- *          q is a vector indexed by 0:n-1 such that vertex i is the
- *          q[i]-th vertex in a postorder numbering of the tree.
- *          That is, this is the inverse of the previous q. )
- *
- *	In the child structure, lower-numbered children are represented
- *	first, so that a tree which is already numbered in postorder
- *	will not have its order changed.
- *    
- *  Written by John Gilbert, Xerox, 10 Dec 1990.
- *  Based on code written by John Gilbert at CMI in 1987.
- */
-
-static int	*first_kid, *next_kid;	/* Linked list of children.	*/
-static int	*post, postnum;
-
-static
-/*
- * Depth-first search from vertex v.
- */
-void etdfs (
-	int	v
-	)
-{
-	int	w;
-
-	for (w = first_kid[v]; w != -1; w = next_kid[w]) {
-		etdfs (w);
-	}
-	/* post[postnum++] = v; in Matlab */
-	post[v] = postnum++;    /* Modified by X.Li on 2/14/95 */
-}
-
-
-/*
- * Post order a tree
- */
-int *TreePostorder(
-	int n,
-	int *parent
-)
-{
-	int	v, dad;
-
-	/* Allocate storage for working arrays and results	*/
-	first_kid = 	mxCallocInt (n+1);
-	next_kid  = 	mxCallocInt (n+1);
-	post	  = 	mxCallocInt (n+1);
-
-	/* Set up structure describing children */
-	for (v = 0; v <= n; first_kid[v++] = -1);
-	for (v = n-1; v >= 0; v--) {
-		dad = parent[v];
-		next_kid[v] = first_kid[dad];
-		first_kid[dad] = v;
-	}
-
-	/* Depth-first search from dummy root vertex #n */
-	postnum = 0;
-	etdfs (n);
-
-	SUPERLU_FREE (first_kid);
-	SUPERLU_FREE (next_kid);
-	return post;
-}
-
-
-/*
- *      p = spsymetree (A);
- *
- *      Find the elimination tree for symmetric matrix A.
- *      This uses Liu's algorithm, and runs in time O(nz*log n).
- *
- *      Input:
- *        Square sparse matrix A.  No check is made for symmetry;
- *        elements below and on the diagonal are ignored.
- *        Numeric values are ignored, so any explicit zeros are 
- *        treated as nonzero.
- *      Output:
- *        Integer array of parents representing the etree, with n
- *        meaning a root of the elimination forest.
- *      Note:  
- *        This routine uses only the upper triangle, while sparse
- *        Cholesky (as in spchol.c) uses only the lower.  Matlab's
- *        dense Cholesky uses only the upper.  This routine could
- *        be modified to use the lower triangle either by transposing
- *        the matrix or by traversing it by rows with auxiliary
- *        pointer and link arrays.
- *
- *      John R. Gilbert, Xerox, 10 Dec 1990
- *      Based on code by JRG dated 1987, 1988, and 1990.
- *      Modified by X.S. Li, November 1999.
- */
-
-/*
- * Symmetric elimination tree
- */
-int
-sp_symetree(
-	    int *acolst, int *acolend, /* column starts and ends past 1 */
-	    int *arow,            /* row indices of A */
-	    int n,                /* dimension of A */
-	    int *parent	    /* parent in elim tree */
-	    )
-{
-	int	*root;		    /* root of subtree of etree 	*/
-	int	rset, cset;             
-	int	row, col;
-	int	rroot;
-	int	p;
-
-	root = mxCallocInt (n);
-	initialize_disjoint_sets (n);
-
-	for (col = 0; col < n; col++) {
-		cset = make_set (col);
-		root[cset] = col;
-		parent[col] = n; /* Matlab */
-		for (p = acolst[col]; p < acolend[col]; p++) {
-			row = arow[p];
-			if (row >= col) continue;
-			rset = find (row);
-			rroot = root[rset];
-			if (rroot != col) {
-				parent[rroot] = col;
-				cset = splink (cset, rset);
-				root[cset] = col;
-			}
-		}
-	}
-	SUPERLU_FREE (root);
-	finalize_disjoint_sets ();
-	return 0;
-} /* SP_SYMETREE */
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sp_ienv.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sp_ienv.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sp_ienv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sp_ienv.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,63 +0,0 @@
-/*
- * File name:		sp_ienv.c
- * History:             Modified from lapack routine ILAENV
- */
-#include "slu_ddefs.h"
-
-int
-sp_ienv(int ispec)
-{
-/*
-    Purpose   
-    =======   
-
-    sp_ienv() is inquired to choose machine-dependent parameters for the
-    local environment. See ISPEC for a description of the parameters.   
-
-    This version provides a set of parameters which should give good,   
-    but not optimal, performance on many of the currently available   
-    computers.  Users are encouraged to modify this subroutine to set   
-    the tuning parameters for their particular machine using the option   
-    and problem size information in the arguments.   
-
-    Arguments   
-    =========   
-
-    ISPEC   (input) int
-            Specifies the parameter to be returned as the value of SP_IENV.   
-            = 1: the panel size w; a panel consists of w consecutive
-	         columns of matrix A in the process of Gaussian elimination.
-		 The best value depends on machine's cache characters.
-            = 2: the relaxation parameter relax; if the number of
-	         nodes (columns) in a subtree of the elimination tree is less
-		 than relax, this subtree is considered as one supernode,
-		 regardless of their row structures.
-            = 3: the maximum size for a supernode;
-	    = 4: the minimum row dimension for 2-D blocking to be used;
-	    = 5: the minimum column dimension for 2-D blocking to be used;
-	    = 6: the estimated fills factor for L and U, compared with A;
-	    
-   (SP_IENV) (output) int
-            >= 0: the value of the parameter specified by ISPEC   
-            < 0:  if SP_IENV = -k, the k-th argument had an illegal value. 
-  
-    ===================================================================== 
-*/
-    int i;
-
-    switch (ispec) {
-	case 1: return (10);
-	case 2: return (5);
-	case 3: return (100);
-	case 4: return (200);
-	case 5: return (40);
-        case 6: return (20);
-    }
-
-    /* Invalid value for ISPEC */
-    i = 1;
-    superlu_xerbla("sp_ienv", &i);
-    return 0;
-
-} /* sp_ienv_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/spivotgrowth.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/spivotgrowth.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/spivotgrowth.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/spivotgrowth.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,107 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-#include <math.h>
-#include "slu_sdefs.h"
-
-float
-sPivotGrowth(int ncols, SuperMatrix *A, int *perm_c, 
-             SuperMatrix *L, SuperMatrix *U)
-{
-/*
- * Purpose
- * =======
- *
- * Compute the reciprocal pivot growth factor of the leading ncols columns
- * of the matrix, using the formula:
- *     min_j ( max_i(abs(A_ij)) / max_i(abs(U_ij)) )
- *
- * Arguments
- * =========
- *
- * ncols    (input) int
- *          The number of columns of matrices A, L and U.
- *
- * A        (input) SuperMatrix*
- *	    Original matrix A, permuted by columns, of dimension
- *          (A->nrow, A->ncol). The type of A can be:
- *          Stype = NC; Dtype = SLU_S; Mtype = GE.
- *
- * L        (output) SuperMatrix*
- *          The factor L from the factorization Pr*A=L*U; use compressed row 
- *          subscripts storage for supernodes, i.e., L has type: 
- *          Stype = SC; Dtype = SLU_S; Mtype = TRLU.
- *
- * U        (output) SuperMatrix*
- *	    The factor U from the factorization Pr*A*Pc=L*U. Use column-wise
- *          storage scheme, i.e., U has types: Stype = NC;
- *          Dtype = SLU_S; Mtype = TRU.
- *
- */
-    NCformat *Astore;
-    SCformat *Lstore;
-    NCformat *Ustore;
-    float  *Aval, *Lval, *Uval;
-    int      fsupc, nsupr, luptr, nz_in_U;
-    int      i, j, k, oldcol;
-    int      *inv_perm_c;
-    float   rpg, maxaj, maxuj;
-    extern   double slamch_(char *);
-    float   smlnum;
-    float   *luval;
-   
-    /* Get machine constants. */
-    smlnum = slamch_("S");
-    rpg = 1. / smlnum;
-
-    Astore = A->Store;
-    Lstore = L->Store;
-    Ustore = U->Store;
-    Aval = Astore->nzval;
-    Lval = Lstore->nzval;
-    Uval = Ustore->nzval;
-    
-    inv_perm_c = (int *) SUPERLU_MALLOC(A->ncol*sizeof(int));
-    for (j = 0; j < A->ncol; ++j) inv_perm_c[perm_c[j]] = j;
-
-    for (k = 0; k <= Lstore->nsuper; ++k) {
-	fsupc = L_FST_SUPC(k);
-	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
-	luptr = L_NZ_START(fsupc);
-	luval = &Lval[luptr];
-	nz_in_U = 1;
-	
-	for (j = fsupc; j < L_FST_SUPC(k+1) && j < ncols; ++j) {
-	    maxaj = 0.;
-            oldcol = inv_perm_c[j];
-	    for (i = Astore->colptr[oldcol]; i < Astore->colptr[oldcol+1]; ++i)
-		maxaj = SUPERLU_MAX( maxaj, fabs(Aval[i]) );
-	
-	    maxuj = 0.;
-	    for (i = Ustore->colptr[j]; i < Ustore->colptr[j+1]; i++)
-		maxuj = SUPERLU_MAX( maxuj, fabs(Uval[i]) );
-	    
-	    /* Supernode */
-	    for (i = 0; i < nz_in_U; ++i)
-		maxuj = SUPERLU_MAX( maxuj, fabs(luval[i]) );
-
-	    ++nz_in_U;
-	    luval += nsupr;
-
-	    if ( maxuj == 0. )
-		rpg = SUPERLU_MIN( rpg, 1.);
-	    else
-		rpg = SUPERLU_MIN( rpg, maxaj / maxuj );
-	}
-	
-	if ( j >= ncols ) break;
-    }
-
-    SUPERLU_FREE(inv_perm_c);
-    return (rpg);
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/spivotL.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/spivotL.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/spivotL.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/spivotL.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,173 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include <math.h>
-#include <stdlib.h>
-#include "slu_sdefs.h"
-
-#undef DEBUG
-
-int
-spivotL(
-        const int  jcol,     /* in */
-        const float u,      /* in - diagonal pivoting threshold */
-        int        *usepr,   /* re-use the pivot sequence given by perm_r/iperm_r */
-        int        *perm_r,  /* may be modified */
-        int        *iperm_r, /* in - inverse of perm_r */
-        int        *iperm_c, /* in - used to find diagonal of Pc*A*Pc' */
-        int        *pivrow,  /* out */
-        GlobalLU_t *Glu,     /* modified - global LU data structures */
-	SuperLUStat_t *stat  /* output */
-       )
-{
-/*
- * Purpose
- * =======
- *   Performs the numerical pivoting on the current column of L,
- *   and the CDIV operation.
- *
- *   Pivot policy:
- *   (1) Compute thresh = u * max_(i>=j) abs(A_ij);
- *   (2) IF user specifies pivot row k and abs(A_kj) >= thresh THEN
- *           pivot row = k;
- *       ELSE IF abs(A_jj) >= thresh THEN
- *           pivot row = j;
- *       ELSE
- *           pivot row = m;
- * 
- *   Note: If you absolutely want to use a given pivot order, then set u=0.0.
- *
- *   Return value: 0      success;
- *                 i > 0  U(i,i) is exactly zero.
- *
- */
-    int          fsupc;	    /* first column in the supernode */
-    int          nsupc;	    /* no of columns in the supernode */
-    int          nsupr;     /* no of rows in the supernode */
-    int          lptr;	    /* points to the starting subscript of the supernode */
-    int          pivptr, old_pivptr, diag, diagind;
-    float       pivmax, rtemp, thresh;
-    float       temp;
-    float       *lu_sup_ptr; 
-    float       *lu_col_ptr;
-    int          *lsub_ptr;
-    int          isub, icol, k, itemp;
-    int          *lsub, *xlsub;
-    float       *lusup;
-    int          *xlusup;
-    flops_t      *ops = stat->ops;
-
-    /* Initialize pointers */
-    lsub       = Glu->lsub;
-    xlsub      = Glu->xlsub;
-    lusup      = Glu->lusup;
-    xlusup     = Glu->xlusup;
-    fsupc      = (Glu->xsup)[(Glu->supno)[jcol]];
-    nsupc      = jcol - fsupc;	        /* excluding jcol; nsupc >= 0 */
-    lptr       = xlsub[fsupc];
-    nsupr      = xlsub[fsupc+1] - lptr;
-    lu_sup_ptr = &lusup[xlusup[fsupc]];	/* start of the current supernode */
-    lu_col_ptr = &lusup[xlusup[jcol]];	/* start of jcol in the supernode */
-    lsub_ptr   = &lsub[lptr];	/* start of row indices of the supernode */
-
-#ifdef DEBUG
-if ( jcol == MIN_COL ) {
-    printf("Before cdiv: col %d\n", jcol);
-    for (k = nsupc; k < nsupr; k++) 
-	printf("  lu[%d] %f\n", lsub_ptr[k], lu_col_ptr[k]);
-}
-#endif
-    
-    /* Determine the largest abs numerical value for partial pivoting;
-       Also search for user-specified pivot, and diagonal element. */
-    if ( *usepr ) *pivrow = iperm_r[jcol];
-    diagind = iperm_c[jcol];
-    pivmax = 0.0;
-    pivptr = nsupc;
-    diag = EMPTY;
-    old_pivptr = nsupc;
-    for (isub = nsupc; isub < nsupr; ++isub) {
-	rtemp = fabs (lu_col_ptr[isub]);
-	if ( rtemp > pivmax ) {
-	    pivmax = rtemp;
-	    pivptr = isub;
-	}
-	if ( *usepr && lsub_ptr[isub] == *pivrow ) old_pivptr = isub;
-	if ( lsub_ptr[isub] == diagind ) diag = isub;
-    }
-
-    /* Test for singularity */
-    if ( pivmax == 0.0 ) {
-	*pivrow = lsub_ptr[pivptr];
-	perm_r[*pivrow] = jcol;
-	*usepr = 0;
-	return (jcol+1);
-    }
-
-    thresh = u * pivmax;
-    
-    /* Choose appropriate pivotal element by our policy. */
-    if ( *usepr ) {
-        rtemp = fabs (lu_col_ptr[old_pivptr]);
-	if ( rtemp != 0.0 && rtemp >= thresh )
-	    pivptr = old_pivptr;
-	else
-	    *usepr = 0;
-    }
-    if ( *usepr == 0 ) {
-	/* Use diagonal pivot? */
-	if ( diag >= 0 ) { /* diagonal exists */
-	    rtemp = fabs (lu_col_ptr[diag]);
-	    if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = diag;
-        }
-	*pivrow = lsub_ptr[pivptr];
-    }
-    
-    /* Record pivot row */
-    perm_r[*pivrow] = jcol;
-    
-    /* Interchange row subscripts */
-    if ( pivptr != nsupc ) {
-	itemp = lsub_ptr[pivptr];
-	lsub_ptr[pivptr] = lsub_ptr[nsupc];
-	lsub_ptr[nsupc] = itemp;
-
-	/* Interchange numerical values as well, for the whole snode, such 
-	 * that L is indexed the same way as A.
- 	 */
-	for (icol = 0; icol <= nsupc; icol++) {
-	    itemp = pivptr + icol * nsupr;
-	    temp = lu_sup_ptr[itemp];
-	    lu_sup_ptr[itemp] = lu_sup_ptr[nsupc + icol*nsupr];
-	    lu_sup_ptr[nsupc + icol*nsupr] = temp;
-	}
-    } /* if */
-
-    /* cdiv operation */
-    ops[FACT] += nsupr - nsupc;
-
-    temp = 1.0 / lu_col_ptr[nsupc];
-    for (k = nsupc+1; k < nsupr; k++) 
-	lu_col_ptr[k] *= temp;
-
-    return 0;
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sp_preorder.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sp_preorder.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sp_preorder.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sp_preorder.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,204 +0,0 @@
-#include "slu_ddefs.h"
-
-void
-sp_preorder(superlu_options_t *options,  SuperMatrix *A, int *perm_c, 
-	    int *etree, SuperMatrix *AC)
-{
-/*
- * Purpose
- * =======
- *
- * sp_preorder() permutes the columns of the original matrix. It performs
- * the following steps:
- *
- *    1. Apply column permutation perm_c[] to A's column pointers to form AC;
- *
- *    2. If options->Fact = DOFACT, then
- *       (1) Compute column elimination tree etree[] of AC'AC;
- *       (2) Post order etree[] to get a postordered elimination tree etree[],
- *           and a postorder permutation post[];
- *       (3) Apply post[] permutation to columns of AC;
- *       (4) Overwrite perm_c[] with the product perm_c * post.
- *
- * Arguments
- * =========
- *
- * options (input) superlu_options_t*
- *         Specifies whether or not the elimination tree will be re-used.
- *         If options->Fact == DOFACT, this means first time factor A, 
- *         etree is computed, postered, and output.
- *         Otherwise, re-factor A, etree is input, unchanged on exit.
- *
- * A       (input) SuperMatrix*
- *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
- *         of the linear equations is A->nrow. Currently, the type of A can be:
- *         Stype = NC or SLU_NCP; Mtype = SLU_GE.
- *         In the future, more general A may be handled.
- *
- * perm_c  (input/output) int*
- *	   Column permutation vector of size A->ncol, which defines the 
- *         permutation matrix Pc; perm_c[i] = j means column i of A is 
- *         in position j in A*Pc.
- *         If options->Fact == DOFACT, perm_c is both input and output.
- *         On output, it is changed according to a postorder of etree.
- *         Otherwise, perm_c is input.
- *
- * etree   (input/output) int*
- *         Elimination tree of Pc'*A'*A*Pc, dimension A->ncol.
- *         If options->Fact == DOFACT, etree is an output argument,
- *         otherwise it is an input argument.
- *         Note: etree is a vector of parent pointers for a forest whose
- *         vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
- *
- * AC      (output) SuperMatrix*
- *         The resulting matrix after applied the column permutation
- *         perm_c[] to matrix A. The type of AC can be:
- *         Stype = SLU_NCP; Dtype = A->Dtype; Mtype = SLU_GE.
- *
- */
-
-    NCformat  *Astore;
-    NCPformat *ACstore;
-    int       *iwork, *post;
-    register  int n, i;
-
-    n = A->ncol;
-    
-    /* Apply column permutation perm_c to A's column pointers so to
-       obtain NCP format in AC = A*Pc.  */
-    AC->Stype       = SLU_NCP;
-    AC->Dtype       = A->Dtype;
-    AC->Mtype       = A->Mtype;
-    AC->nrow        = A->nrow;
-    AC->ncol        = A->ncol;
-    Astore          = (NCformat*) A->Store;
-    AC->Store = (void *) SUPERLU_MALLOC( sizeof(NCPformat) );
-	ACstore = (NCPformat*) AC->Store;
-    if ( !ACstore ) ABORT("SUPERLU_MALLOC fails for ACstore");
-    ACstore->nnz    = Astore->nnz;
-    ACstore->nzval  = Astore->nzval;
-    ACstore->rowind = Astore->rowind;
-    ACstore->colbeg = (int*) SUPERLU_MALLOC(n*sizeof(int));
-    if ( !(ACstore->colbeg) ) ABORT("SUPERLU_MALLOC fails for ACstore->colbeg");
-    ACstore->colend = (int*) SUPERLU_MALLOC(n*sizeof(int));
-    if ( !(ACstore->colend) ) ABORT("SUPERLU_MALLOC fails for ACstore->colend");
-
-#ifdef DEBUG
-    print_int_vec("pre_order:", n, perm_c);
-    check_perm("Initial perm_c", n, perm_c);
-#endif      
-
-    for (i = 0; i < n; i++) {
-	ACstore->colbeg[perm_c[i]] = Astore->colptr[i]; 
-	ACstore->colend[perm_c[i]] = Astore->colptr[i+1];
-    }
-	
-    if ( options->Fact == DOFACT ) {
-#undef ETREE_ATplusA
-#ifdef ETREE_ATplusA
-        /*--------------------------------------------
-	  COMPUTE THE ETREE OF Pc*(A'+A)*Pc'.
-	  --------------------------------------------*/
-        int *b_colptr, *b_rowind, bnz, j;
-	int *c_colbeg, *c_colend;
-
-        /*printf("Use etree(A'+A)\n");*/
-
-	/* Form B = A + A'. */
-	at_plus_a(n, Astore->nnz, Astore->colptr, Astore->rowind,
-		  &bnz, &b_colptr, &b_rowind);
-
-	/* Form C = Pc*B*Pc'. */
-	c_colbeg = (int*) SUPERLU_MALLOC(2*n*sizeof(int));
-	c_colend = c_colbeg + n;
-	if (!c_colbeg ) ABORT("SUPERLU_MALLOC fails for c_colbeg/c_colend");
-	for (i = 0; i < n; i++) {
-	    c_colbeg[perm_c[i]] = b_colptr[i]; 
-  	    c_colend[perm_c[i]] = b_colptr[i+1];
-	}
-	for (j = 0; j < n; ++j) {
-	    for (i = c_colbeg[j]; i < c_colend[j]; ++i) {
-	        b_rowind[i] = perm_c[b_rowind[i]];
-	    }
-	}
-
-	/* Compute etree of C. */
-	sp_symetree(c_colbeg, c_colend, b_rowind, n, etree);
-
-	SUPERLU_FREE(b_colptr);
-	if ( bnz ) SUPERLU_FREE(b_rowind);
-	SUPERLU_FREE(c_colbeg);
-	
-#else
-        /*--------------------------------------------
-	  COMPUTE THE COLUMN ELIMINATION TREE.
-	  --------------------------------------------*/
-	sp_coletree(ACstore->colbeg, ACstore->colend, ACstore->rowind,
-		    A->nrow, A->ncol, etree);
-#endif
-#ifdef DEBUG	
-	print_int_vec("etree:", n, etree);
-#endif	
-	
-	/* In symmetric mode, do not do postorder here. */
-	if ( options->SymmetricMode == NO ) {
-	    /* Post order etree */
-	    post = (int *) TreePostorder(n, etree);
-	    /* for (i = 0; i < n+1; ++i) inv_post[post[i]] = i;
-	       iwork = post; */
-
-#ifdef DEBUG
-	    print_int_vec("post:", n+1, post);
-	    check_perm("post", n, post);	
-#endif	
-	    iwork = (int*) SUPERLU_MALLOC((n+1)*sizeof(int)); 
-	    if ( !iwork ) ABORT("SUPERLU_MALLOC fails for iwork[]");
-
-	    /* Renumber etree in postorder */
-	    for (i = 0; i < n; ++i) iwork[post[i]] = post[etree[i]];
-	    for (i = 0; i < n; ++i) etree[i] = iwork[i];
-
-#ifdef DEBUG	
-	    print_int_vec("postorder etree:", n, etree);
-#endif
-	
-	    /* Postmultiply A*Pc by post[] */
-	    for (i = 0; i < n; ++i) iwork[post[i]] = ACstore->colbeg[i];
-	    for (i = 0; i < n; ++i) ACstore->colbeg[i] = iwork[i];
-	    for (i = 0; i < n; ++i) iwork[post[i]] = ACstore->colend[i];
-	    for (i = 0; i < n; ++i) ACstore->colend[i] = iwork[i];
-
-	    for (i = 0; i < n; ++i)
-	        iwork[i] = post[perm_c[i]];  /* product of perm_c and post */
-	    for (i = 0; i < n; ++i) perm_c[i] = iwork[i];
-
-#ifdef DEBUG
-	    print_int_vec("Pc*post:", n, perm_c);
-	    check_perm("final perm_c", n, perm_c);	
-#endif
-	    SUPERLU_FREE (post);
-	    SUPERLU_FREE (iwork);
-	} /* end postordering */
-
-    } /* if options->Fact == DOFACT ... */
-
-}
-
-int check_perm(char *what, int n, int *perm)
-{
-    register int i;
-    int          *marker;
-    marker = (int *) calloc(n, sizeof(int));
-
-    for (i = 0; i < n; ++i) {
-	if ( marker[perm[i]] == 1 || perm[i] >= n ) {
-	    printf("%s: Not a valid PERM[%d] = %d\n", what, i, perm[i]);
-	    ABORT("check_perm");
-	} else {
-	    marker[perm[i]] = 1;
-	}
-    }
-
-    SUPERLU_FREE(marker);
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/spruneL.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/spruneL.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/spruneL.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/spruneL.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,147 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_sdefs.h"
-
-void
-spruneL(
-       const int  jcol,	     /* in */
-       const int  *perm_r,   /* in */
-       const int  pivrow,    /* in */
-       const int  nseg,	     /* in */
-       const int  *segrep,   /* in */
-       const int  *repfnz,   /* in */
-       int        *xprune,   /* out */
-       GlobalLU_t *Glu       /* modified - global LU data structures */
-       )
-{
-/*
- * Purpose
- * =======
- *   Prunes the L-structure of supernodes whose L-structure
- *   contains the current pivot row "pivrow"
- *
- */
-    float     utemp;
-    int        jsupno, irep, irep1, kmin, kmax, krow, movnum;
-    int        i, ktemp, minloc, maxloc;
-    int        do_prune; /* logical variable */
-    int        *xsup, *supno;
-    int        *lsub, *xlsub;
-    float     *lusup;
-    int        *xlusup;
-
-    xsup       = Glu->xsup;
-    supno      = Glu->supno;
-    lsub       = Glu->lsub;
-    xlsub      = Glu->xlsub;
-    lusup      = Glu->lusup;
-    xlusup     = Glu->xlusup;
-    
-    /*
-     * For each supernode-rep irep in U[*,j]
-     */
-    jsupno = supno[jcol];
-    for (i = 0; i < nseg; i++) {
-
-	irep = segrep[i];
-	irep1 = irep + 1;
-	do_prune = FALSE;
-
-	/* Don't prune with a zero U-segment */
- 	if ( repfnz[irep] == EMPTY )
-		continue;
-
-     	/* If a snode overlaps with the next panel, then the U-segment 
-   	 * is fragmented into two parts -- irep and irep1. We should let
-	 * pruning occur at the rep-column in irep1's snode. 
-	 */
-	if ( supno[irep] == supno[irep1] ) 	/* Don't prune */
-		continue;
-
-	/*
-	 * If it has not been pruned & it has a nonz in row L[pivrow,i]
-	 */
-	if ( supno[irep] != jsupno ) {
-	    if ( xprune[irep] >= xlsub[irep1] ) {
-		kmin = xlsub[irep];
-		kmax = xlsub[irep1] - 1;
-		for (krow = kmin; krow <= kmax; krow++) 
-		    if ( lsub[krow] == pivrow ) {
-			do_prune = TRUE;
-			break;
-		    }
-	    }
-	    
-    	    if ( do_prune ) {
-
-	     	/* Do a quicksort-type partition
-	     	 * movnum=TRUE means that the num values have to be exchanged.
-	     	 */
-	        movnum = FALSE;
-	        if ( irep == xsup[supno[irep]] ) /* Snode of size 1 */
-			movnum = TRUE;
-
-	        while ( kmin <= kmax ) {
-
-	    	    if ( perm_r[lsub[kmax]] == EMPTY ) 
-			kmax--;
-		    else if ( perm_r[lsub[kmin]] != EMPTY )
-			kmin++;
-		    else { /* kmin below pivrow, and kmax above pivrow: 
-		            * 	interchange the two subscripts
-			    */
-		        ktemp = lsub[kmin];
-		        lsub[kmin] = lsub[kmax];
-		        lsub[kmax] = ktemp;
-
-			/* If the supernode has only one column, then we
- 			 * only keep one set of subscripts. For any subscript 
-			 * interchange performed, similar interchange must be 
-			 * done on the numerical values.
- 			 */
-		        if ( movnum ) {
-		    	    minloc = xlusup[irep] + (kmin - xlsub[irep]);
-		    	    maxloc = xlusup[irep] + (kmax - xlsub[irep]);
-			    utemp = lusup[minloc];
-		  	    lusup[minloc] = lusup[maxloc];
-			    lusup[maxloc] = utemp;
-		        }
-
-		        kmin++;
-		        kmax--;
-
-		    }
-
-	        } /* while */
-
-	        xprune[irep] = kmin;	/* Pruning */
-
-#ifdef CHK_PRUNE
-	printf("    After spruneL(),using col %d:  xprune[%d] = %d\n", 
-			jcol, irep, kmin);
-#endif
-	    } /* if do_prune */
-
-	} /* if */
-
-    } /* for each U-segment... */
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sreadhb.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sreadhb.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sreadhb.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sreadhb.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,255 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include "slu_sdefs.h"
-
-
-/* Eat up the rest of the current line */
-int sDumpLine(FILE *fp)
-{
-    register int c;
-    while ((c = fgetc(fp)) != '\n') ;
-    return 0;
-}
-
-int sParseIntFormat(char *buf, int *num, int *size)
-{
-    char *tmp;
-
-    tmp = buf;
-    while (*tmp++ != '(') ;
-    sscanf(tmp, "%d", num);
-    while (*tmp != 'I' && *tmp != 'i') ++tmp;
-    ++tmp;
-    sscanf(tmp, "%d", size);
-    return 0;
-}
-
-int sParseFloatFormat(char *buf, int *num, int *size)
-{
-    char *tmp, *period;
-    
-    tmp = buf;
-    while (*tmp++ != '(') ;
-    *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/
-    while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd'
-	   && *tmp != 'F' && *tmp != 'f') {
-        /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the
-           num picked up refers to P, which should be skipped. */
-        if (*tmp=='p' || *tmp=='P') {
-           ++tmp;
-           *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/
-        } else {
-           ++tmp;
-        }
-    }
-    ++tmp;
-    period = tmp;
-    while (*period != '.' && *period != ')') ++period ;
-    *period = '\0';
-    *size = atoi(tmp); /*sscanf(tmp, "%2d", size);*/
-
-    return 0;
-}
-
-int sReadVector(FILE *fp, int n, int *where, int perline, int persize)
-{
-    register int i, j, item;
-    char tmp, buf[100];
-    
-    i = 0;
-    while (i < n) {
-	fgets(buf, 100, fp);    /* read a line at a time */
-	for (j=0; j<perline && i<n; j++) {
-	    tmp = buf[(j+1)*persize];     /* save the char at that place */
-	    buf[(j+1)*persize] = 0;       /* null terminate */
-	    item = atoi(&buf[j*persize]); 
-	    buf[(j+1)*persize] = tmp;     /* recover the char at that place */
-	    where[i++] = item - 1;
-	}
-    }
-
-    return 0;
-}
-
-int sReadValues(FILE *fp, int n, float *destination, int perline, int persize)
-{
-    register int i, j, k, s;
-    char tmp, buf[100];
-    
-    i = 0;
-    while (i < n) {
-	fgets(buf, 100, fp);    /* read a line at a time */
-	for (j=0; j<perline && i<n; j++) {
-	    tmp = buf[(j+1)*persize];     /* save the char at that place */
-	    buf[(j+1)*persize] = 0;       /* null terminate */
-	    s = j*persize;
-	    for (k = 0; k < persize; ++k) /* No D_ format in C */
-		if ( buf[s+k] == 'D' || buf[s+k] == 'd' ) buf[s+k] = 'E';
-	    destination[i++] = atof(&buf[s]);
-	    buf[(j+1)*persize] = tmp;     /* recover the char at that place */
-	}
-    }
-
-    return 0;
-}
-
-
-
-void
-sreadhb(int *nrow, int *ncol, int *nonz,
-	float **nzval, int **rowind, int **colptr)
-{
-/* 
- * Purpose
- * =======
- * 
- * Read a FLOAT PRECISION matrix stored in Harwell-Boeing format 
- * as described below.
- * 
- * Line 1 (A72,A8) 
- *  	Col. 1 - 72   Title (TITLE) 
- *	Col. 73 - 80  Key (KEY) 
- * 
- * Line 2 (5I14) 
- * 	Col. 1 - 14   Total number of lines excluding header (TOTCRD) 
- * 	Col. 15 - 28  Number of lines for pointers (PTRCRD) 
- * 	Col. 29 - 42  Number of lines for row (or variable) indices (INDCRD) 
- * 	Col. 43 - 56  Number of lines for numerical values (VALCRD) 
- *	Col. 57 - 70  Number of lines for right-hand sides (RHSCRD) 
- *                    (including starting guesses and solution vectors 
- *		       if present) 
- *           	      (zero indicates no right-hand side data is present) 
- *
- * Line 3 (A3, 11X, 4I14) 
- *   	Col. 1 - 3    Matrix type (see below) (MXTYPE) 
- * 	Col. 15 - 28  Number of rows (or variables) (NROW) 
- * 	Col. 29 - 42  Number of columns (or elements) (NCOL) 
- *	Col. 43 - 56  Number of row (or variable) indices (NNZERO) 
- *	              (equal to number of entries for assembled matrices) 
- * 	Col. 57 - 70  Number of elemental matrix entries (NELTVL) 
- *	              (zero in the case of assembled matrices) 
- * Line 4 (2A16, 2A20) 
- * 	Col. 1 - 16   Format for pointers (PTRFMT) 
- *	Col. 17 - 32  Format for row (or variable) indices (INDFMT) 
- *	Col. 33 - 52  Format for numerical values of coefficient matrix (VALFMT) 
- * 	Col. 53 - 72 Format for numerical values of right-hand sides (RHSFMT) 
- *
- * Line 5 (A3, 11X, 2I14) Only present if there are right-hand sides present 
- *    	Col. 1 	      Right-hand side type: 
- *	         	  F for full storage or M for same format as matrix 
- *    	Col. 2        G if a starting vector(s) (Guess) is supplied. (RHSTYP) 
- *    	Col. 3        X if an exact solution vector(s) is supplied. 
- *	Col. 15 - 28  Number of right-hand sides (NRHS) 
- *	Col. 29 - 42  Number of row indices (NRHSIX) 
- *          	      (ignored in case of unassembled matrices) 
- *
- * The three character type field on line 3 describes the matrix type. 
- * The following table lists the permitted values for each of the three 
- * characters. As an example of the type field, RSA denotes that the matrix 
- * is real, symmetric, and assembled. 
- *
- * First Character: 
- *	R Real matrix 
- *	C Complex matrix 
- *	P Pattern only (no numerical values supplied) 
- *
- * Second Character: 
- *	S Symmetric 
- *	U Unsymmetric 
- *	H Hermitian 
- *	Z Skew symmetric 
- *	R Rectangular 
- *
- * Third Character: 
- *	A Assembled 
- *	E Elemental matrices (unassembled) 
- *
- */
-
-    register int i, numer_lines = 0, rhscrd = 0;
-    int tmp, colnum, colsize, rownum, rowsize, valnum, valsize;
-    char buf[100], type[4], key[10];
-    FILE *fp;
-
-    fp = stdin;
-
-    /* Line 1 */
-    fgets(buf, 100, fp);
-    fputs(buf, stdout);
-#if 0
-    fscanf(fp, "%72c", buf); buf[72] = 0;
-    printf("Title: %s", buf);
-    fscanf(fp, "%8c", key);  key[8] = 0;
-    printf("Key: %s\n", key);
-    sDumpLine(fp);
-#endif
-
-    /* Line 2 */
-    for (i=0; i<5; i++) {
-	fscanf(fp, "%14c", buf); buf[14] = 0;
-	sscanf(buf, "%d", &tmp);
-	if (i == 3) numer_lines = tmp;
-	if (i == 4 && tmp) rhscrd = tmp;
-    }
-    sDumpLine(fp);
-
-    /* Line 3 */
-    fscanf(fp, "%3c", type);
-    fscanf(fp, "%11c", buf); /* pad */
-    type[3] = 0;
-#ifdef DEBUG
-    printf("Matrix type %s\n", type);
-#endif
-    
-    fscanf(fp, "%14c", buf); sscanf(buf, "%d", nrow);
-    fscanf(fp, "%14c", buf); sscanf(buf, "%d", ncol);
-    fscanf(fp, "%14c", buf); sscanf(buf, "%d", nonz);
-    fscanf(fp, "%14c", buf); sscanf(buf, "%d", &tmp);
-    
-    if (tmp != 0)
-	  printf("This is not an assembled matrix!\n");
-    if (*nrow != *ncol)
-	printf("Matrix is not square.\n");
-    sDumpLine(fp);
-
-    /* Allocate storage for the three arrays ( nzval, rowind, colptr ) */
-    sallocateA(*ncol, *nonz, nzval, rowind, colptr);
-
-    /* Line 4: format statement */
-    fscanf(fp, "%16c", buf);
-    sParseIntFormat(buf, &colnum, &colsize);
-    fscanf(fp, "%16c", buf);
-    sParseIntFormat(buf, &rownum, &rowsize);
-    fscanf(fp, "%20c", buf);
-    sParseFloatFormat(buf, &valnum, &valsize);
-    fscanf(fp, "%20c", buf);
-    sDumpLine(fp);
-
-    /* Line 5: right-hand side */    
-    if ( rhscrd ) sDumpLine(fp); /* skip RHSFMT */
-    
-#ifdef DEBUG
-    printf("%d rows, %d nonzeros\n", *nrow, *nonz);
-    printf("colnum %d, colsize %d\n", colnum, colsize);
-    printf("rownum %d, rowsize %d\n", rownum, rowsize);
-    printf("valnum %d, valsize %d\n", valnum, valsize);
-#endif
-    
-    sReadVector(fp, *ncol+1, *colptr, colnum, colsize);
-    sReadVector(fp, *nonz, *rowind, rownum, rowsize);
-    if ( numer_lines ) {
-        sReadValues(fp, *nonz, *nzval, valnum, valsize);
-    }
-    
-    fclose(fp);
-
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ssnode_bmod.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ssnode_bmod.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ssnode_bmod.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ssnode_bmod.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,116 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_sdefs.h"
-
-
-/*
- * Performs numeric block updates within the relaxed snode. 
- */
-int
-ssnode_bmod (
-	    const int  jcol,	  /* in */
-	    const int  jsupno,    /* in */
-	    const int  fsupc,     /* in */
-	    float     *dense,    /* in */
-	    float     *tempv,    /* working array */
-	    GlobalLU_t *Glu,      /* modified */
-	    SuperLUStat_t *stat   /* output */
-	    )
-{
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-    _fcd ftcs1 = _cptofcd("L", strlen("L")),
-	 ftcs2 = _cptofcd("N", strlen("N")),
-	 ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-    int            incx = 1, incy = 1;
-    float         alpha = -1.0, beta = 1.0;
-#endif
-
-    int            luptr, nsupc, nsupr, nrow;
-    int            isub, irow, i, iptr; 
-    register int   ufirst, nextlu;
-    int            *lsub, *xlsub;
-    float         *lusup;
-    int            *xlusup;
-    flops_t *ops = stat->ops;
-
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    lusup   = Glu->lusup;
-    xlusup  = Glu->xlusup;
-
-    nextlu = xlusup[jcol];
-    
-    /*
-     *	Process the supernodal portion of L\U[*,j]
-     */
-    for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) {
-  	irow = lsub[isub];
-	lusup[nextlu] = dense[irow];
-	dense[irow] = 0;
-	++nextlu;
-    }
-
-    xlusup[jcol + 1] = nextlu;	/* Initialize xlusup for next column */
-    
-    if ( fsupc < jcol ) {
-
-	luptr = xlusup[fsupc];
-	nsupr = xlsub[fsupc+1] - xlsub[fsupc];
-	nsupc = jcol - fsupc;	/* Excluding jcol */
-	ufirst = xlusup[jcol];	/* Points to the beginning of column
-				   jcol in supernode L\U(jsupno). */
-	nrow = nsupr - nsupc;
-
-	ops[TRSV] += nsupc * (nsupc - 1);
-	ops[GEMV] += 2 * nrow * nsupc;
-
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-	STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, 
-	      &lusup[ufirst], &incx );
-	SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, 
-		&lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
-#else
-	strsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, 
-	      &lusup[ufirst], &incx );
-	sgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, 
-		&lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
-#endif
-#else
-	slsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );
-	smatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], 
-			&lusup[ufirst], &tempv[0] );
-
-        /* Scatter tempv[*] into lusup[*] */
-	iptr = ufirst + nsupc;
-	for (i = 0; i < nrow; i++) {
-	    lusup[iptr++] -= tempv[i];
-	    tempv[i] = 0.0;
-	}
-#endif
-
-    }
-
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ssnode_dfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ssnode_dfs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ssnode_dfs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ssnode_dfs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,104 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_sdefs.h"
-
-int
-ssnode_dfs (
-	   const int  jcol,	    /* in - start of the supernode */
-	   const int  kcol, 	    /* in - end of the supernode */
-	   const int  *asub,        /* in */
-	   const int  *xa_begin,    /* in */
-	   const int  *xa_end,      /* in */
-	   int        *xprune,      /* out */
-	   int        *marker,      /* modified */
-	   GlobalLU_t *Glu          /* modified */
-	   )
-{
-/* Purpose
- * =======
- *    ssnode_dfs() - Determine the union of the row structures of those 
- *    columns within the relaxed snode.
- *    Note: The relaxed snodes are leaves of the supernodal etree, therefore, 
- *    the portion outside the rectangular supernode must be zero.
- *
- * Return value
- * ============
- *     0   success;
- *    >0   number of bytes allocated when run out of memory.
- *
- */
-    register int i, k, ifrom, ito, nextl, new_next;
-    int          nsuper, krow, kmark, mem_error;
-    int          *xsup, *supno;
-    int          *lsub, *xlsub;
-    int          nzlmax;
-    
-    xsup    = Glu->xsup;
-    supno   = Glu->supno;
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    nzlmax  = Glu->nzlmax;
-
-    nsuper = ++supno[jcol];	/* Next available supernode number */
-    nextl = xlsub[jcol];
-
-    for (i = jcol; i <= kcol; i++) {
-	/* For each nonzero in A[*,i] */
-	for (k = xa_begin[i]; k < xa_end[i]; k++) {	
-	    krow = asub[k];
-	    kmark = marker[krow];
-	    if ( kmark != kcol ) { /* First time visit krow */
-		marker[krow] = kcol;
-		lsub[nextl++] = krow;
-		if ( nextl >= nzlmax ) {
-		    if ( mem_error = sLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) )
-			return (mem_error);
-		    lsub = Glu->lsub;
-		}
-	    }
-    	}
-	supno[i] = nsuper;
-    }
-
-    /* Supernode > 1, then make a copy of the subscripts for pruning */
-    if ( jcol < kcol ) {
-	new_next = nextl + (nextl - xlsub[jcol]);
-	while ( new_next > nzlmax ) {
-	    if ( mem_error = sLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) )
-		return (mem_error);
-	    lsub = Glu->lsub;
-	}
-	ito = nextl;
-	for (ifrom = xlsub[jcol]; ifrom < nextl; )
-	    lsub[ito++] = lsub[ifrom++];	
-        for (i = jcol+1; i <= kcol; i++) xlsub[i] = nextl;
-	nextl = ito;
-    }
-
-    xsup[nsuper+1] = kcol + 1;
-    supno[kcol+1]  = nsuper;
-    xprune[kcol]   = nextl;
-    xlsub[kcol+1]  = nextl;
-
-    return 0;
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ssp_blas2.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ssp_blas2.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ssp_blas2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ssp_blas2.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,470 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
- * File name:		ssp_blas2.c
- * Purpose:		Sparse BLAS 2, using some dense BLAS 2 operations.
- */
-
-#include "slu_sdefs.h"
-
-/* 
- * Function prototypes 
- */
-void susolve(int, int, float*, float*);
-void slsolve(int, int, float*, float*);
-void smatvec(int, int, int, float*, float*, float*);
-
-
-int
-sp_strsv(char *uplo, char *trans, char *diag, SuperMatrix *L, 
-         SuperMatrix *U, float *x, SuperLUStat_t *stat, int *info)
-{
-/*
- *   Purpose
- *   =======
- *
- *   sp_strsv() solves one of the systems of equations   
- *       A*x = b,   or   A'*x = b,
- *   where b and x are n element vectors and A is a sparse unit , or   
- *   non-unit, upper or lower triangular matrix.   
- *   No test for singularity or near-singularity is included in this   
- *   routine. Such tests must be performed before calling this routine.   
- *
- *   Parameters   
- *   ==========   
- *
- *   uplo   - (input) char*
- *            On entry, uplo specifies whether the matrix is an upper or   
- *             lower triangular matrix as follows:   
- *                uplo = 'U' or 'u'   A is an upper triangular matrix.   
- *                uplo = 'L' or 'l'   A is a lower triangular matrix.   
- *
- *   trans  - (input) char*
- *             On entry, trans specifies the equations to be solved as   
- *             follows:   
- *                trans = 'N' or 'n'   A*x = b.   
- *                trans = 'T' or 't'   A'*x = b.
- *                trans = 'C' or 'c'   A'*x = b.   
- *
- *   diag   - (input) char*
- *             On entry, diag specifies whether or not A is unit   
- *             triangular as follows:   
- *                diag = 'U' or 'u'   A is assumed to be unit triangular.   
- *                diag = 'N' or 'n'   A is not assumed to be unit   
- *                                    triangular.   
- *	     
- *   L       - (input) SuperMatrix*
- *	       The factor L from the factorization Pr*A*Pc=L*U. Use
- *             compressed row subscripts storage for supernodes,
- *             i.e., L has types: Stype = SC, Dtype = SLU_S, Mtype = TRLU.
- *
- *   U       - (input) SuperMatrix*
- *	        The factor U from the factorization Pr*A*Pc=L*U.
- *	        U has types: Stype = NC, Dtype = SLU_S, Mtype = TRU.
- *    
- *   x       - (input/output) float*
- *             Before entry, the incremented array X must contain the n   
- *             element right-hand side vector b. On exit, X is overwritten 
- *             with the solution vector x.
- *
- *   info    - (output) int*
- *             If *info = -i, the i-th argument had an illegal value.
- *
- */
-#ifdef _CRAY
-    _fcd ftcs1 = _cptofcd("L", strlen("L")),
-	 ftcs2 = _cptofcd("N", strlen("N")),
-	 ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-    SCformat *Lstore;
-    NCformat *Ustore;
-    float   *Lval, *Uval;
-    int incx = 1, incy = 1;
-    float alpha = 1.0, beta = 1.0;
-    int nrow;
-    int fsupc, nsupr, nsupc, luptr, istart, irow;
-    int i, k, iptr, jcol;
-    float *work;
-    flops_t solve_ops;
-
-    /* Test the input parameters */
-    *info = 0;
-    if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1;
-    else if ( !lsame_(trans, "N") && !lsame_(trans, "T") && 
-              !lsame_(trans, "C")) *info = -2;
-    else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3;
-    else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4;
-    else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5;
-    if ( *info ) {
-	i = -(*info);
-	xerbla_("sp_strsv", &i);
-	return 0;
-    }
-
-    Lstore = L->Store;
-    Lval = Lstore->nzval;
-    Ustore = U->Store;
-    Uval = Ustore->nzval;
-    solve_ops = 0;
-
-    if ( !(work = floatCalloc(L->nrow)) )
-	ABORT("Malloc fails for work in sp_strsv().");
-    
-    if ( lsame_(trans, "N") ) {	/* Form x := inv(A)*x. */
-	
-	if ( lsame_(uplo, "L") ) {
-	    /* Form x := inv(L)*x */
-    	    if ( L->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = 0; k <= Lstore->nsuper; k++) {
-		fsupc = L_FST_SUPC(k);
-		istart = L_SUB_START(fsupc);
-		nsupr = L_SUB_START(fsupc+1) - istart;
-		nsupc = L_FST_SUPC(k+1) - fsupc;
-		luptr = L_NZ_START(fsupc);
-		nrow = nsupr - nsupc;
-
-	        solve_ops += nsupc * (nsupc - 1);
-	        solve_ops += 2 * nrow * nsupc;
-
-		if ( nsupc == 1 ) {
-		    for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) {
-			irow = L_SUB(iptr);
-			++luptr;
-			x[irow] -= x[fsupc] * Lval[luptr];
-		    }
-		} else {
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
-		       	&x[fsupc], &incx);
-		
-		    SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
-		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
-#else
-		    strsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr,
-		       	&x[fsupc], &incx);
-		
-		    sgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
-		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
-#endif
-#else
-		    slsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]);
-		
-		    smatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
-                             &x[fsupc], &work[0] );
-#endif		
-		
-		    iptr = istart + nsupc;
-		    for (i = 0; i < nrow; ++i, ++iptr) {
-			irow = L_SUB(iptr);
-			x[irow] -= work[i];	/* Scatter */
-			work[i] = 0.0;
-
-		    }
-	 	}
-	    } /* for k ... */
-	    
-	} else {
-	    /* Form x := inv(U)*x */
-	    
-	    if ( U->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = Lstore->nsuper; k >= 0; k--) {
-	    	fsupc = L_FST_SUPC(k);
-	    	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
-	    	nsupc = L_FST_SUPC(k+1) - fsupc;
-	    	luptr = L_NZ_START(fsupc);
-		
-    	        solve_ops += nsupc * (nsupc + 1);
-
-		if ( nsupc == 1 ) {
-		    x[fsupc] /= Lval[luptr];
-		    for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) {
-			irow = U_SUB(i);
-			x[irow] -= x[fsupc] * Uval[i];
-		    }
-		} else {
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		    STRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr,
-		       &x[fsupc], &incx);
-#else
-		    strsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr,
-                           &x[fsupc], &incx);
-#endif
-#else		
-		    susolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] );
-#endif		
-
-		    for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
-		        solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
-		    	for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); 
-				i++) {
-			    irow = U_SUB(i);
-			    x[irow] -= x[jcol] * Uval[i];
-		    	}
-                    }
-		}
-	    } /* for k ... */
-	    
-	}
-    } else { /* Form x := inv(A')*x */
-	
-	if ( lsame_(uplo, "L") ) {
-	    /* Form x := inv(L')*x */
-    	    if ( L->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = Lstore->nsuper; k >= 0; --k) {
-	    	fsupc = L_FST_SUPC(k);
-	    	istart = L_SUB_START(fsupc);
-	    	nsupr = L_SUB_START(fsupc+1) - istart;
-	    	nsupc = L_FST_SUPC(k+1) - fsupc;
-	    	luptr = L_NZ_START(fsupc);
-
-		solve_ops += 2 * (nsupr - nsupc) * nsupc;
-
-		for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
-		    iptr = istart + nsupc;
-		    for (i = L_NZ_START(jcol) + nsupc; 
-				i < L_NZ_START(jcol+1); i++) {
-			irow = L_SUB(iptr);
-			x[jcol] -= x[irow] * Lval[i];
-			iptr++;
-		    }
-		}
-		
-		if ( nsupc > 1 ) {
-		    solve_ops += nsupc * (nsupc - 1);
-#ifdef _CRAY
-                    ftcs1 = _cptofcd("L", strlen("L"));
-                    ftcs2 = _cptofcd("T", strlen("T"));
-                    ftcs3 = _cptofcd("U", strlen("U"));
-		    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
-			&x[fsupc], &incx);
-#else
-		    strsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
-			&x[fsupc], &incx);
-#endif
-		}
-	    }
-	} else {
-	    /* Form x := inv(U')*x */
-	    if ( U->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = 0; k <= Lstore->nsuper; k++) {
-	    	fsupc = L_FST_SUPC(k);
-	    	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
-	    	nsupc = L_FST_SUPC(k+1) - fsupc;
-	    	luptr = L_NZ_START(fsupc);
-
-		for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
-		    solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
-		    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
-			irow = U_SUB(i);
-			x[jcol] -= x[irow] * Uval[i];
-		    }
-		}
-
-		solve_ops += nsupc * (nsupc + 1);
-
-		if ( nsupc == 1 ) {
-		    x[fsupc] /= Lval[luptr];
-		} else {
-#ifdef _CRAY
-                    ftcs1 = _cptofcd("U", strlen("U"));
-                    ftcs2 = _cptofcd("T", strlen("T"));
-                    ftcs3 = _cptofcd("N", strlen("N"));
-		    STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
-			    &x[fsupc], &incx);
-#else
-		    strsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
-			    &x[fsupc], &incx);
-#endif
-		}
-	    } /* for k ... */
-	}
-    }
-
-    stat->ops[SOLVE] += solve_ops;
-    SUPERLU_FREE(work);
-    return 0;
-}
-
-
-
-
-int
-sp_sgemv(char *trans, float alpha, SuperMatrix *A, float *x, 
-	 int incx, float beta, float *y, int incy)
-{
-/*  Purpose   
-    =======   
-
-    sp_sgemv()  performs one of the matrix-vector operations   
-       y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   
-    where alpha and beta are scalars, x and y are vectors and A is a
-    sparse A->nrow by A->ncol matrix.   
-
-    Parameters   
-    ==========   
-
-    TRANS  - (input) char*
-             On entry, TRANS specifies the operation to be performed as   
-             follows:   
-                TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.   
-                TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.   
-                TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.   
-
-    ALPHA  - (input) float
-             On entry, ALPHA specifies the scalar alpha.   
-
-    A      - (input) SuperMatrix*
-             Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
-             Currently, the type of A can be:
-                 Stype = NC or NCP; Dtype = SLU_S; Mtype = GE. 
-             In the future, more general A can be handled.
-
-    X      - (input) float*, array of DIMENSION at least   
-             ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'   
-             and at least   
-             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.   
-             Before entry, the incremented array X must contain the   
-             vector x.   
-
-    INCX   - (input) int
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-
-    BETA   - (input) float
-             On entry, BETA specifies the scalar beta. When BETA is   
-             supplied as zero then Y need not be set on input.   
-
-    Y      - (output) float*,  array of DIMENSION at least   
-             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'   
-             and at least   
-             ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.   
-             Before entry with BETA non-zero, the incremented array Y   
-             must contain the vector y. On exit, Y is overwritten by the 
-             updated vector y.
-	     
-    INCY   - (input) int
-             On entry, INCY specifies the increment for the elements of   
-             Y. INCY must not be zero.   
-
-    ==== Sparse Level 2 Blas routine.   
-*/
-
-    /* Local variables */
-    NCformat *Astore;
-    float   *Aval;
-    int info;
-    float temp;
-    int lenx, leny, i, j, irow;
-    int iy, jx, jy, kx, ky;
-    int notran;
-
-    notran = lsame_(trans, "N");
-    Astore = A->Store;
-    Aval = Astore->nzval;
-    
-    /* Test the input parameters */
-    info = 0;
-    if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1;
-    else if ( A->nrow < 0 || A->ncol < 0 ) info = 3;
-    else if (incx == 0) info = 5;
-    else if (incy == 0)	info = 8;
-    if (info != 0) {
-	xerbla_("sp_sgemv ", &info);
-	return 0;
-    }
-
-    /* Quick return if possible. */
-    if (A->nrow == 0 || A->ncol == 0 || (alpha == 0. && beta == 1.))
-	return 0;
-
-    /* Set  LENX  and  LENY, the lengths of the vectors x and y, and set 
-       up the start points in  X  and  Y. */
-    if (lsame_(trans, "N")) {
-	lenx = A->ncol;
-	leny = A->nrow;
-    } else {
-	lenx = A->nrow;
-	leny = A->ncol;
-    }
-    if (incx > 0) kx = 0;
-    else kx =  - (lenx - 1) * incx;
-    if (incy > 0) ky = 0;
-    else ky =  - (leny - 1) * incy;
-
-    /* Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through A. */
-    /* First form  y := beta*y. */
-    if (beta != 1.) {
-	if (incy == 1) {
-	    if (beta == 0.)
-		for (i = 0; i < leny; ++i) y[i] = 0.;
-	    else
-		for (i = 0; i < leny; ++i) y[i] = beta * y[i];
-	} else {
-	    iy = ky;
-	    if (beta == 0.)
-		for (i = 0; i < leny; ++i) {
-		    y[iy] = 0.;
-		    iy += incy;
-		}
-	    else
-		for (i = 0; i < leny; ++i) {
-		    y[iy] = beta * y[iy];
-		    iy += incy;
-		}
-	}
-    }
-    
-    if (alpha == 0.) return 0;
-
-    if ( notran ) {
-	/* Form  y := alpha*A*x + y. */
-	jx = kx;
-	if (incy == 1) {
-	    for (j = 0; j < A->ncol; ++j) {
-		if (x[jx] != 0.) {
-		    temp = alpha * x[jx];
-		    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-			irow = Astore->rowind[i];
-			y[irow] += temp * Aval[i];
-		    }
-		}
-		jx += incx;
-	    }
-	} else {
-	    ABORT("Not implemented.");
-	}
-    } else {
-	/* Form  y := alpha*A'*x + y. */
-	jy = ky;
-	if (incx == 1) {
-	    for (j = 0; j < A->ncol; ++j) {
-		temp = 0.;
-		for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-		    irow = Astore->rowind[i];
-		    temp += Aval[i] * x[irow];
-		}
-		y[jy] += alpha * temp;
-		jy += incy;
-	    }
-	} else {
-	    ABORT("Not implemented.");
-	}
-    }
-    return 0;
-} /* sp_sgemv */
-
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ssp_blas3.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ssp_blas3.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/ssp_blas3.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/ssp_blas3.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,119 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * File name:		sp_blas3.c
- * Purpose:		Sparse BLAS3, using some dense BLAS3 operations.
- */
-
-#include "slu_sdefs.h"
-
-int
-sp_sgemm(char *transa, char *transb, int m, int n, int k, 
-         float alpha, SuperMatrix *A, float *b, int ldb, 
-         float beta, float *c, int ldc)
-{
-/*  Purpose   
-    =======   
-
-    sp_s performs one of the matrix-matrix operations   
-
-       C := alpha*op( A )*op( B ) + beta*C,   
-
-    where  op( X ) is one of 
-
-       op( X ) = X   or   op( X ) = X'   or   op( X ) = conjg( X' ),
-
-    alpha and beta are scalars, and A, B and C are matrices, with op( A ) 
-    an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix. 
-  
-
-    Parameters   
-    ==========   
-
-    TRANSA - (input) char*
-             On entry, TRANSA specifies the form of op( A ) to be used in 
-             the matrix multiplication as follows:   
-                TRANSA = 'N' or 'n',  op( A ) = A.   
-                TRANSA = 'T' or 't',  op( A ) = A'.   
-                TRANSA = 'C' or 'c',  op( A ) = conjg( A' ).   
-             Unchanged on exit.   
-
-    TRANSB - (input) char*
-             On entry, TRANSB specifies the form of op( B ) to be used in 
-             the matrix multiplication as follows:   
-                TRANSB = 'N' or 'n',  op( B ) = B.   
-                TRANSB = 'T' or 't',  op( B ) = B'.   
-                TRANSB = 'C' or 'c',  op( B ) = conjg( B' ).   
-             Unchanged on exit.   
-
-    M      - (input) int   
-             On entry,  M  specifies  the number of rows of the matrix 
-	     op( A ) and of the matrix C.  M must be at least zero. 
-	     Unchanged on exit.   
-
-    N      - (input) int
-             On entry,  N specifies the number of columns of the matrix 
-	     op( B ) and the number of columns of the matrix C. N must be 
-	     at least zero.
-	     Unchanged on exit.   
-
-    K      - (input) int
-             On entry, K specifies the number of columns of the matrix 
-	     op( A ) and the number of rows of the matrix op( B ). K must 
-	     be at least  zero.   
-             Unchanged on exit.
-	     
-    ALPHA  - (input) float
-             On entry, ALPHA specifies the scalar alpha.   
-
-    A      - (input) SuperMatrix*
-             Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
-             Currently, the type of A can be:
-                 Stype = NC or NCP; Dtype = SLU_S; Mtype = GE. 
-             In the future, more general A can be handled.
-
-    B      - FLOAT PRECISION array of DIMENSION ( LDB, kb ), where kb is 
-             n when TRANSB = 'N' or 'n',  and is  k otherwise.   
-             Before entry with  TRANSB = 'N' or 'n',  the leading k by n 
-             part of the array B must contain the matrix B, otherwise 
-             the leading n by k part of the array B must contain the 
-             matrix B.   
-             Unchanged on exit.   
-
-    LDB    - (input) int
-             On entry, LDB specifies the first dimension of B as declared 
-             in the calling (sub) program. LDB must be at least max( 1, n ).  
-             Unchanged on exit.   
-
-    BETA   - (input) float
-             On entry, BETA specifies the scalar beta. When BETA is   
-             supplied as zero then C need not be set on input.   
-
-    C      - FLOAT PRECISION array of DIMENSION ( LDC, n ).   
-             Before entry, the leading m by n part of the array C must 
-             contain the matrix C,  except when beta is zero, in which 
-             case C need not be set on entry.   
-             On exit, the array C is overwritten by the m by n matrix 
-	     ( alpha*op( A )*B + beta*C ).   
-
-    LDC    - (input) int
-             On entry, LDC specifies the first dimension of C as declared 
-             in the calling (sub)program. LDC must be at least max(1,m).   
-             Unchanged on exit.   
-
-    ==== Sparse Level 3 Blas routine.   
-*/
-    int    incx = 1, incy = 1;
-    int    j;
-
-    for (j = 0; j < n; ++j) {
-	sp_sgemv(transa, alpha, A, &b[ldb*j], incx, beta, &c[ldc*j], incy);
-    }
-    return 0;    
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/superlu_timer.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/superlu_timer.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/superlu_timer.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/superlu_timer.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,56 +0,0 @@
-/* 
- * Purpose
- * ======= 
- *	Returns the time in seconds used by the process.
- *
- * Note: the timer function call is machine dependent. Use conditional
- *       compilation to choose the appropriate function.
- *
- */
-
-#include "slu_ddefs.h"
-
-#ifdef SUN
-/*
- * 	It uses the system call gethrtime(3C), which is accurate to 
- *	nanoseconds. 
-*/
-#include <sys/time.h>
-
-double SuperLU_timer_() {
-    return ( (double)gethrtime() / 1e9 );
-}
-
-#else
-
-#ifndef NO_TIMER
-#include <time.h>
-#include <sys/types.h>
-#ifndef WIN32
-#include <sys/times.h>
-#include <sys/time.h>
-#endif
-#endif
-
-#ifndef CLK_TCK
-#define CLK_TCK 60
-#endif
-
-double SuperLU_timer_()
-{
-#ifdef NO_TIMER
-    /* no sys/times.h on WIN32 */
-    double tmp;
-    tmp = 0.0;
-#else
-    struct tms use;
-    double tmp;
-    times(&use);
-    tmp = use.tms_utime;
-    tmp += use.tms_stime;
-#endif
-    return (double)(tmp) / CLK_TCK;
-}
-
-#endif
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/supermatrix.h hypre-2.13.0/src/FEI_mv/SuperLU/SRC/supermatrix.h
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/supermatrix.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/supermatrix.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,140 +0,0 @@
-#ifndef __SUPERLU_SUPERMATRIX /* allow multiple inclusions */
-#define __SUPERLU_SUPERMATRIX
-
-/********************************************
- * The matrix types are defined as follows. *
- ********************************************/
-typedef enum {
-    SLU_NC,    /* column-wise, no supernode */
-    SLU_NR,    /* row-wize, no supernode */
-    SLU_SC,    /* column-wise, supernode */
-    SLU_SR,    /* row-wise, supernode */
-    SLU_NCP,   /* column-wise, column-permuted, no supernode 
-                  (The consecutive columns of nonzeros, after permutation,
-		   may not be stored  contiguously.) */
-    SLU_DN     /* Fortran style column-wise storage for dense matrix */
-} Stype_t;
-
-typedef enum {
-    SLU_S,     /* single */
-    SLU_D,     /* double */
-    SLU_C,     /* single complex */
-    SLU_Z      /* double complex */
-} Dtype_t;
-
-typedef enum {
-    SLU_GE,    /* general */
-    SLU_TRLU,  /* lower triangular, unit diagonal */
-    SLU_TRUU,  /* upper triangular, unit diagonal */
-    SLU_TRL,   /* lower triangular */
-    SLU_TRU,   /* upper triangular */
-    SLU_SYL,   /* symmetric, store lower half */
-    SLU_SYU,   /* symmetric, store upper half */
-    SLU_HEL,   /* Hermitian, store lower half */
-    SLU_HEU    /* Hermitian, store upper half */
-} Mtype_t;
-
-typedef struct {
-	Stype_t Stype; /* Storage type: interprets the storage structure 
-		   	  pointed to by *Store. */
-	Dtype_t Dtype; /* Data type. */
-	Mtype_t Mtype; /* Matrix type: describes the mathematical property of 
-			  the matrix. */
-	int_t  nrow;   /* number of rows */
-	int_t  ncol;   /* number of columns */
-	void *Store;   /* pointer to the actual storage of the matrix */
-} SuperMatrix;
-
-/***********************************************
- * The storage schemes are defined as follows. *
- ***********************************************/
-
-/* Stype == NC (Also known as Harwell-Boeing sparse matrix format) */
-typedef struct {
-    int_t  nnz;	    /* number of nonzeros in the matrix */
-    void   *nzval;  /* pointer to array of nonzero values, packed by column */
-    int_t  *rowind; /* pointer to array of row indices of the nonzeros */
-    int_t  *colptr; /* pointer to array of beginning of columns in nzval[] 
-		       and rowind[]  */
-                    /* Note:
-		       Zero-based indexing is used;
-		       colptr[] has ncol+1 entries, the last one pointing
-		       beyond the last column, so that colptr[ncol] = nnz. */
-} NCformat;
-
-/* Stype == NR (Also known as row compressed storage (RCS). */
-typedef struct {
-    int_t nnz;	   /* number of nonzeros in the matrix */
-    void  *nzval;  /* pointer to array of nonzero values, packed by row */
-    int_t *colind; /* pointer to array of column indices of the nonzeros */
-    int_t *rowptr; /* pointer to array of beginning of rows in nzval[] 
-                      and colind[]  */
-                   /* Note:
-		      Zero-based indexing is used;
-		      nzval[] and colind[] are of the same length, nnz;
-		      rowptr[] has nrow+1 entries, the last one pointing
-		      beyond the last column, so that rowptr[nrow] = nnz. */
-} NRformat;
-
-/* Stype == SC */
-typedef struct {
-  int_t  nnz;	     /* number of nonzeros in the matrix */
-  int_t  nsuper;     /* number of supernodes, minus 1 */
-  void *nzval;       /* pointer to array of nonzero values, packed by column */
-  int_t *nzval_colptr;/* pointer to array of beginning of columns in nzval[] */
-  int_t *rowind;     /* pointer to array of compressed row indices of 
-			rectangular supernodes */
-  int_t *rowind_colptr;/* pointer to array of beginning of columns in rowind[] */
-  int_t *col_to_sup; /* col_to_sup[j] is the supernode number to which column 
-			j belongs; mapping from column to supernode number. */
-  int_t *sup_to_col; /* sup_to_col[s] points to the start of the s-th 
-			supernode; mapping from supernode number to column.
-		        e.g.: col_to_sup: 0 1 2 2 3 3 3 4 4 4 4 4 4 (ncol=12)
-		              sup_to_col: 0 1 2 4 7 12           (nsuper=4) */
-                     /* Note:
-		        Zero-based indexing is used;
-		        nzval_colptr[], rowind_colptr[], col_to_sup and
-		        sup_to_col[] have ncol+1 entries, the last one
-		        pointing beyond the last column.
-		        For col_to_sup[], only the first ncol entries are
-		        defined. For sup_to_col[], only the first nsuper+2
-		        entries are defined. */
-} SCformat;
-
-/* Stype == NCP */
-typedef struct {
-    int_t nnz;	  /* number of nonzeros in the matrix */
-    void *nzval;  /* pointer to array of nonzero values, packed by column */
-    int_t *rowind;/* pointer to array of row indices of the nonzeros */
-		  /* Note: nzval[]/rowind[] always have the same length */
-    int_t *colbeg;/* colbeg[j] points to the beginning of column j in nzval[] 
-                     and rowind[]  */
-    int_t *colend;/* colend[j] points to one past the last element of column
-		     j in nzval[] and rowind[]  */
-		  /* Note:
-		     Zero-based indexing is used;
-		     The consecutive columns of the nonzeros may not be 
-		     contiguous in storage, because the matrix has been 
-		     postmultiplied by a column permutation matrix. */
-} NCPformat;
-
-/* Stype == DN */
-typedef struct {
-    int_t lda;    /* leading dimension */
-    void *nzval;  /* array of size lda*ncol to represent a dense matrix */
-} DNformat;
-
-
-
-/*********************************************************
- * Macros used for easy access of sparse matrix entries. *
- *********************************************************/
-#define L_SUB_START(col)     ( Lstore->rowind_colptr[col] )
-#define L_SUB(ptr)           ( Lstore->rowind[ptr] )
-#define L_NZ_START(col)      ( Lstore->nzval_colptr[col] )
-#define L_FST_SUPC(superno)  ( Lstore->sup_to_col[superno] )
-#define U_NZ_START(col)      ( Ustore->colptr[col] )
-#define U_SUB(ptr)           ( Ustore->rowind[ptr] )
-
-
-#endif  /* __SUPERLU_SUPERMATRIX */
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sutil.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sutil.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/sutil.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/sutil.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,481 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include <math.h>
-#include "slu_sdefs.h"
-
-void
-sCreate_CompCol_Matrix(SuperMatrix *A, int m, int n, int nnz, 
-		       float *nzval, int *rowind, int *colptr,
-		       Stype_t stype, Dtype_t dtype, Mtype_t mtype)
-{
-    NCformat *Astore;
-
-    A->Stype = stype;
-    A->Dtype = dtype;
-    A->Mtype = mtype;
-    A->nrow = m;
-    A->ncol = n;
-    A->Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) );
-    if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store");
-    Astore = A->Store;
-    Astore->nnz = nnz;
-    Astore->nzval = nzval;
-    Astore->rowind = rowind;
-    Astore->colptr = colptr;
-}
-
-void
-sCreate_CompRow_Matrix(SuperMatrix *A, int m, int n, int nnz, 
-		       float *nzval, int *colind, int *rowptr,
-		       Stype_t stype, Dtype_t dtype, Mtype_t mtype)
-{
-    NRformat *Astore;
-
-    A->Stype = stype;
-    A->Dtype = dtype;
-    A->Mtype = mtype;
-    A->nrow = m;
-    A->ncol = n;
-    A->Store = (void *) SUPERLU_MALLOC( sizeof(NRformat) );
-    if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store");
-    Astore = A->Store;
-    Astore->nnz = nnz;
-    Astore->nzval = nzval;
-    Astore->colind = colind;
-    Astore->rowptr = rowptr;
-}
-
-/* Copy matrix A into matrix B. */
-void
-sCopy_CompCol_Matrix(SuperMatrix *A, SuperMatrix *B)
-{
-    NCformat *Astore, *Bstore;
-    int      ncol, nnz, i;
-
-    B->Stype = A->Stype;
-    B->Dtype = A->Dtype;
-    B->Mtype = A->Mtype;
-    B->nrow  = A->nrow;;
-    B->ncol  = ncol = A->ncol;
-    Astore   = (NCformat *) A->Store;
-    Bstore   = (NCformat *) B->Store;
-    Bstore->nnz = nnz = Astore->nnz;
-    for (i = 0; i < nnz; ++i)
-	((float *)Bstore->nzval)[i] = ((float *)Astore->nzval)[i];
-    for (i = 0; i < nnz; ++i) Bstore->rowind[i] = Astore->rowind[i];
-    for (i = 0; i <= ncol; ++i) Bstore->colptr[i] = Astore->colptr[i];
-}
-
-
-void
-sCreate_Dense_Matrix(SuperMatrix *X, int m, int n, float *x, int ldx,
-		    Stype_t stype, Dtype_t dtype, Mtype_t mtype)
-{
-    DNformat    *Xstore;
-    
-    X->Stype = stype;
-    X->Dtype = dtype;
-    X->Mtype = mtype;
-    X->nrow = m;
-    X->ncol = n;
-    X->Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) );
-    if ( !(X->Store) ) ABORT("SUPERLU_MALLOC fails for X->Store");
-    Xstore = (DNformat *) X->Store;
-    Xstore->lda = ldx;
-    Xstore->nzval = (float *) x;
-}
-
-void
-sCopy_Dense_Matrix(int M, int N, float *X, int ldx,
-			float *Y, int ldy)
-{
-/*
- *
- *  Purpose
- *  =======
- *
- *  Copies a two-dimensional matrix X to another matrix Y.
- */
-    int    i, j;
-    
-    for (j = 0; j < N; ++j)
-        for (i = 0; i < M; ++i)
-            Y[i + j*ldy] = X[i + j*ldx];
-}
-
-void
-sCreate_SuperNode_Matrix(SuperMatrix *L, int m, int n, int nnz, 
-			float *nzval, int *nzval_colptr, int *rowind,
-			int *rowind_colptr, int *col_to_sup, int *sup_to_col,
-			Stype_t stype, Dtype_t dtype, Mtype_t mtype)
-{
-    SCformat *Lstore;
-
-    L->Stype = stype;
-    L->Dtype = dtype;
-    L->Mtype = mtype;
-    L->nrow = m;
-    L->ncol = n;
-    L->Store = (void *) SUPERLU_MALLOC( sizeof(SCformat) );
-    if ( !(L->Store) ) ABORT("SUPERLU_MALLOC fails for L->Store");
-    Lstore = L->Store;
-    Lstore->nnz = nnz;
-    Lstore->nsuper = col_to_sup[n];
-    Lstore->nzval = nzval;
-    Lstore->nzval_colptr = nzval_colptr;
-    Lstore->rowind = rowind;
-    Lstore->rowind_colptr = rowind_colptr;
-    Lstore->col_to_sup = col_to_sup;
-    Lstore->sup_to_col = sup_to_col;
-
-}
-
-
-/*
- * Convert a row compressed storage into a column compressed storage.
- */
-void
-sCompRow_to_CompCol(int m, int n, int nnz, 
-		    float *a, int *colind, int *rowptr,
-		    float **at, int **rowind, int **colptr)
-{
-    register int i, j, col, relpos;
-    int *marker;
-
-    /* Allocate storage for another copy of the matrix. */
-    *at = (float *) floatMalloc(nnz);
-    *rowind = (int *) intMalloc(nnz);
-    *colptr = (int *) intMalloc(n+1);
-    marker = (int *) intCalloc(n);
-    
-    /* Get counts of each column of A, and set up column pointers */
-    for (i = 0; i < m; ++i)
-	for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]];
-    (*colptr)[0] = 0;
-    for (j = 0; j < n; ++j) {
-	(*colptr)[j+1] = (*colptr)[j] + marker[j];
-	marker[j] = (*colptr)[j];
-    }
-
-    /* Transfer the matrix into the compressed column storage. */
-    for (i = 0; i < m; ++i) {
-	for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
-	    col = colind[j];
-	    relpos = marker[col];
-	    (*rowind)[relpos] = i;
-	    (*at)[relpos] = a[j];
-	    ++marker[col];
-	}
-    }
-
-    SUPERLU_FREE(marker);
-}
-
-
-void
-sPrint_CompCol_Matrix(char *what, SuperMatrix *A)
-{
-    NCformat     *Astore;
-    register int i,n;
-    float       *dp;
-    
-    printf("\nCompCol matrix %s:\n", what);
-    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
-    n = A->ncol;
-    Astore = (NCformat *) A->Store;
-    dp = (float *) Astore->nzval;
-    printf("nrow %d, ncol %d, nnz %d\n", A->nrow,A->ncol,Astore->nnz);
-    printf("nzval: ");
-    for (i = 0; i < Astore->colptr[n]; ++i) printf("%f  ", dp[i]);
-    printf("\nrowind: ");
-    for (i = 0; i < Astore->colptr[n]; ++i) printf("%d  ", Astore->rowind[i]);
-    printf("\ncolptr: ");
-    for (i = 0; i <= n; ++i) printf("%d  ", Astore->colptr[i]);
-    printf("\n");
-    fflush(stdout);
-}
-
-void
-sPrint_SuperNode_Matrix(char *what, SuperMatrix *A)
-{
-    SCformat     *Astore;
-    register int i, j, k, c, d, n, nsup;
-    float       *dp;
-    int *col_to_sup, *sup_to_col, *rowind, *rowind_colptr;
-    
-    printf("\nSuperNode matrix %s:\n", what);
-    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
-    n = A->ncol;
-    Astore = (SCformat *) A->Store;
-    dp = (float *) Astore->nzval;
-    col_to_sup = Astore->col_to_sup;
-    sup_to_col = Astore->sup_to_col;
-    rowind_colptr = Astore->rowind_colptr;
-    rowind = Astore->rowind;
-    printf("nrow %d, ncol %d, nnz %d, nsuper %d\n", 
-	   A->nrow,A->ncol,Astore->nnz,Astore->nsuper);
-    printf("nzval:\n");
-    for (k = 0; k <= Astore->nsuper; ++k) {
-      c = sup_to_col[k];
-      nsup = sup_to_col[k+1] - c;
-      for (j = c; j < c + nsup; ++j) {
-	d = Astore->nzval_colptr[j];
-	for (i = rowind_colptr[c]; i < rowind_colptr[c+1]; ++i) {
-	  printf("%d\t%d\t%e\n", rowind[i], j, dp[d++]);
-	}
-      }
-    }
-#if 0
-    for (i = 0; i < Astore->nzval_colptr[n]; ++i) printf("%f  ", dp[i]);
-#endif
-    printf("\nnzval_colptr: ");
-    for (i = 0; i <= n; ++i) printf("%d  ", Astore->nzval_colptr[i]);
-    printf("\nrowind: ");
-    for (i = 0; i < Astore->rowind_colptr[n]; ++i) 
-        printf("%d  ", Astore->rowind[i]);
-    printf("\nrowind_colptr: ");
-    for (i = 0; i <= n; ++i) printf("%d  ", Astore->rowind_colptr[i]);
-    printf("\ncol_to_sup: ");
-    for (i = 0; i < n; ++i) printf("%d  ", col_to_sup[i]);
-    printf("\nsup_to_col: ");
-    for (i = 0; i <= Astore->nsuper+1; ++i) 
-        printf("%d  ", sup_to_col[i]);
-    printf("\n");
-    fflush(stdout);
-}
-
-void
-sPrint_Dense_Matrix(char *what, SuperMatrix *A)
-{
-    DNformat     *Astore;
-    register int i, j, lda = Astore->lda;
-    float       *dp;
-    
-    printf("\nDense matrix %s:\n", what);
-    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
-    Astore = (DNformat *) A->Store;
-    dp = (float *) Astore->nzval;
-    printf("nrow %d, ncol %d, lda %d\n", A->nrow,A->ncol,lda);
-    printf("\nnzval: ");
-    for (j = 0; j < A->ncol; ++j) {
-        for (i = 0; i < A->nrow; ++i) printf("%f  ", dp[i + j*lda]);
-        printf("\n");
-    }
-    printf("\n");
-    fflush(stdout);
-}
-
-/*
- * Diagnostic print of column "jcol" in the U/L factor.
- */
-void
-sprint_lu_col(char *msg, int jcol, int pivrow, int *xprune, GlobalLU_t *Glu)
-{
-    int     i, k, fsupc;
-    int     *xsup, *supno;
-    int     *xlsub, *lsub;
-    float  *lusup;
-    int     *xlusup;
-    float  *ucol;
-    int     *usub, *xusub;
-
-    xsup    = Glu->xsup;
-    supno   = Glu->supno;
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    lusup   = Glu->lusup;
-    xlusup  = Glu->xlusup;
-    ucol    = Glu->ucol;
-    usub    = Glu->usub;
-    xusub   = Glu->xusub;
-    
-    printf("%s", msg);
-    printf("col %d: pivrow %d, supno %d, xprune %d\n", 
-	   jcol, pivrow, supno[jcol], xprune[jcol]);
-    
-    printf("\tU-col:\n");
-    for (i = xusub[jcol]; i < xusub[jcol+1]; i++)
-	printf("\t%d%10.4f\n", usub[i], ucol[i]);
-    printf("\tL-col in rectangular snode:\n");
-    fsupc = xsup[supno[jcol]];	/* first col of the snode */
-    i = xlsub[fsupc];
-    k = xlusup[jcol];
-    while ( i < xlsub[fsupc+1] && k < xlusup[jcol+1] ) {
-	printf("\t%d\t%10.4f\n", lsub[i], lusup[k]);
-	i++; k++;
-    }
-    fflush(stdout);
-}
-
-
-/*
- * Check whether tempv[] == 0. This should be true before and after 
- * calling any numeric routines, i.e., "panel_bmod" and "column_bmod". 
- */
-void scheck_tempv(int n, float *tempv)
-{
-    int i;
-	
-    for (i = 0; i < n; i++) {
-	if (tempv[i] != 0.0) 
-	{
-	    fprintf(stderr,"tempv[%d] = %f\n", i,tempv[i]);
-	    ABORT("scheck_tempv");
-	}
-    }
-}
-
-
-void
-sGenXtrue(int n, int nrhs, float *x, int ldx)
-{
-    int  i, j;
-    for (j = 0; j < nrhs; ++j)
-	for (i = 0; i < n; ++i) {
-	    x[i + j*ldx] = 1.0;/* + (float)(i+1.)/n;*/
-	}
-}
-
-/*
- * Let rhs[i] = sum of i-th row of A, so the solution vector is all 1's
- */
-void
-sFillRHS(trans_t trans, int nrhs, float *x, int ldx,
-         SuperMatrix *A, SuperMatrix *B)
-{
-    NCformat *Astore;
-    float   *Aval;
-    DNformat *Bstore;
-    float   *rhs;
-    float one = 1.0;
-    float zero = 0.0;
-    int      ldc;
-    char transc[1];
-
-    Astore = A->Store;
-    Aval   = (float *) Astore->nzval;
-    Bstore = B->Store;
-    rhs    = Bstore->nzval;
-    ldc    = Bstore->lda;
-    
-    if ( trans == NOTRANS ) *(unsigned char *)transc = 'N';
-    else *(unsigned char *)transc = 'T';
-
-    sp_sgemm(transc, "N", A->nrow, nrhs, A->ncol, one, A,
-	     x, ldx, zero, rhs, ldc);
-
-}
-
-/* 
- * Fills a float precision array with a given value.
- */
-void 
-sfill(float *a, int alen, float dval)
-{
-    register int i;
-    for (i = 0; i < alen; i++) a[i] = dval;
-}
-
-
-
-/* 
- * Check the inf-norm of the error vector 
- */
-void sinf_norm_error(int nrhs, SuperMatrix *X, float *xtrue)
-{
-    DNformat *Xstore;
-    float err, xnorm;
-    float *Xmat, *soln_work;
-    int i, j;
-
-    Xstore = X->Store;
-    Xmat = Xstore->nzval;
-
-    for (j = 0; j < nrhs; j++) {
-      soln_work = &Xmat[j*Xstore->lda];
-      err = xnorm = 0.0;
-      for (i = 0; i < X->nrow; i++) {
-	err = SUPERLU_MAX(err, fabs(soln_work[i] - xtrue[i]));
-	xnorm = SUPERLU_MAX(xnorm, fabs(soln_work[i]));
-      }
-      err = err / xnorm;
-      printf("||X - Xtrue||/||X|| = %e\n", err);
-    }
-}
-
-
-
-/* Print performance of the code. */
-void
-sPrintPerf(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage,
-           float rpg, float rcond, float *ferr,
-           float *berr, char *equed, SuperLUStat_t *stat)
-{
-    SCformat *Lstore;
-    NCformat *Ustore;
-    double   *utime;
-    flops_t  *ops;
-    
-    utime = stat->utime;
-    ops   = stat->ops;
-    
-    if ( utime[FACT] != 0. )
-	printf("Factor flops = %e\tMflops = %8.2f\n", ops[FACT],
-	       ops[FACT]*1e-6/utime[FACT]);
-    printf("Identify relaxed snodes	= %8.2f\n", utime[RELAX]);
-    if ( utime[SOLVE] != 0. )
-	printf("Solve flops = %.0f, Mflops = %8.2f\n", ops[SOLVE],
-	       ops[SOLVE]*1e-6/utime[SOLVE]);
-    
-    Lstore = (SCformat *) L->Store;
-    Ustore = (NCformat *) U->Store;
-    printf("\tNo of nonzeros in factor L = %d\n", Lstore->nnz);
-    printf("\tNo of nonzeros in factor U = %d\n", Ustore->nnz);
-    printf("\tNo of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz);
-	
-    printf("L\\U MB %.3f\ttotal MB needed %.3f\texpansions %d\n",
-	   mem_usage->for_lu/1e6, mem_usage->total_needed/1e6,
-	   mem_usage->expansions);
-	
-    printf("\tFactor\tMflops\tSolve\tMflops\tEtree\tEquil\tRcond\tRefine\n");
-    printf("PERF:%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f\n",
-	   utime[FACT], ops[FACT]*1e-6/utime[FACT],
-	   utime[SOLVE], ops[SOLVE]*1e-6/utime[SOLVE],
-	   utime[ETREE], utime[EQUIL], utime[RCOND], utime[REFINE]);
-    
-    printf("\tRpg\t\tRcond\t\tFerr\t\tBerr\t\tEquil?\n");
-    printf("NUM:\t%e\t%e\t%e\t%e\t%s\n",
-	   rpg, rcond, ferr[0], berr[0], equed);
-    
-}
-
-
-
-
-print_float_vec(char *what, int n, float *vec)
-{
-    int i;
-    printf("%s: n %d\n", what, n);
-    for (i = 0; i < n; ++i) printf("%d\t%f\n", i, vec[i]);
-    return 0;
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/xerbla.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/xerbla.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/xerbla.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/xerbla.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,43 +0,0 @@
-#include <stdio.h>
-#include "slu_Cnames.h"
-
-/* Subroutine */ int xerbla_(char *srname, int *info)
-{
-/*  -- LAPACK auxiliary routine (version 2.0) --   
-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
-       Courant Institute, Argonne National Lab, and Rice University   
-       September 30, 1994   
-
-
-    Purpose   
-    =======   
-
-    XERBLA  is an error handler for the LAPACK routines.   
-    It is called by an LAPACK routine if an input parameter has an   
-    invalid value.  A message is printed and execution stops.   
-
-    Installers may consider modifying the STOP statement in order to   
-    call system-specific exception-handling facilities.   
-
-    Arguments   
-    =========   
-
-    SRNAME  (input) CHARACTER*6   
-            The name of the routine which called XERBLA.   
-
-    INFO    (input) INT   
-            The position of the invalid parameter in the parameter list   
-
-            of the calling routine.   
-
-   ===================================================================== 
-*/
-
-    printf("** On entry to %6s, parameter number %2d had an illegal value\n",
-		srname, *info);
-
-/*     End of XERBLA */
-
-    return 0;
-} /* xerbla_ */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zcolumn_bmod.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zcolumn_bmod.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zcolumn_bmod.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zcolumn_bmod.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,363 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "slu_zdefs.h"
-
-/* 
- * Function prototypes 
- */
-void zusolve(int, int, doublecomplex*, doublecomplex*);
-void zlsolve(int, int, doublecomplex*, doublecomplex*);
-void zmatvec(int, int, int, doublecomplex*, doublecomplex*, doublecomplex*);
-
-
-
-/* Return value:   0 - successful return
- *               > 0 - number of bytes allocated when run out of space
- */
-int
-zcolumn_bmod (
-	     const int  jcol,	  /* in */
-	     const int  nseg,	  /* in */
-	     doublecomplex     *dense,	  /* in */
-	     doublecomplex     *tempv,	  /* working array */
-	     int        *segrep,  /* in */
-	     int        *repfnz,  /* in */
-	     int        fpanelc,  /* in -- first column in the current panel */
-	     GlobalLU_t *Glu,     /* modified */
-	     SuperLUStat_t *stat  /* output */
-	     )
-{
-/*
- * Purpose:
- * ========
- *    Performs numeric block updates (sup-col) in topological order.
- *    It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
- *    Special processing on the supernodal portion of L\U[*,j]
- *
- */
-#ifdef _CRAY
-    _fcd ftcs1 = _cptofcd("L", strlen("L")),
-         ftcs2 = _cptofcd("N", strlen("N")),
-         ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-    int         incx = 1, incy = 1;
-    doublecomplex      alpha, beta;
-    
-    /* krep = representative of current k-th supernode
-     * fsupc = first supernodal column
-     * nsupc = no of columns in supernode
-     * nsupr = no of rows in supernode (used as leading dimension)
-     * luptr = location of supernodal LU-block in storage
-     * kfnz = first nonz in the k-th supernodal segment
-     * no_zeros = no of leading zeros in a supernodal U-segment
-     */
-    doublecomplex       ukj, ukj1, ukj2;
-    int          luptr, luptr1, luptr2;
-    int          fsupc, nsupc, nsupr, segsze;
-    int          nrow;	  /* No of rows in the matrix of matrix-vector */
-    int          jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno;
-    register int lptr, kfnz, isub, irow, i;
-    register int no_zeros, new_next; 
-    int          ufirst, nextlu;
-    int          fst_col; /* First column within small LU update */
-    int          d_fsupc; /* Distance between the first column of the current
-			     panel and the first column of the current snode. */
-    int          *xsup, *supno;
-    int          *lsub, *xlsub;
-    doublecomplex       *lusup;
-    int          *xlusup;
-    int          nzlumax;
-    doublecomplex       *tempv1;
-    doublecomplex      zero = {0.0, 0.0};
-    doublecomplex      one = {1.0, 0.0};
-    doublecomplex      none = {-1.0, 0.0};
-    doublecomplex	 comp_temp, comp_temp1;
-    int          mem_error;
-    flops_t      *ops = stat->ops;
-
-    xsup    = Glu->xsup;
-    supno   = Glu->supno;
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    lusup   = Glu->lusup;
-    xlusup  = Glu->xlusup;
-    nzlumax = Glu->nzlumax;
-    jcolp1 = jcol + 1;
-    jsupno = supno[jcol];
-    
-    /* 
-     * For each nonz supernode segment of U[*,j] in topological order 
-     */
-    k = nseg - 1;
-    for (ksub = 0; ksub < nseg; ksub++) {
-
-	krep = segrep[k];
-	k--;
-	ksupno = supno[krep];
-	if ( jsupno != ksupno ) { /* Outside the rectangular supernode */
-
-	    fsupc = xsup[ksupno];
-	    fst_col = SUPERLU_MAX ( fsupc, fpanelc );
-
-  	    /* Distance from the current supernode to the current panel; 
-	       d_fsupc=0 if fsupc > fpanelc. */
-  	    d_fsupc = fst_col - fsupc; 
-
-	    luptr = xlusup[fst_col] + d_fsupc;
-	    lptr = xlsub[fsupc] + d_fsupc;
-
-	    kfnz = repfnz[krep];
-	    kfnz = SUPERLU_MAX ( kfnz, fpanelc );
-
-	    segsze = krep - kfnz + 1;
-	    nsupc = krep - fst_col + 1;
-	    nsupr = xlsub[fsupc+1] - xlsub[fsupc];	/* Leading dimension */
-	    nrow = nsupr - d_fsupc - nsupc;
-	    krep_ind = lptr + nsupc - 1;
-
-	    ops[TRSV] += 4 * segsze * (segsze - 1);
-	    ops[GEMV] += 8 * nrow * segsze;
-
-
-
-	    /* 
-	     * Case 1: Update U-segment of size 1 -- col-col update 
-	     */
-	    if ( segsze == 1 ) {
-	  	ukj = dense[lsub[krep_ind]];
-		luptr += nsupr*(nsupc-1) + nsupc;
-
-		for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-		    irow = lsub[i];
-		    zz_mult(&comp_temp, &ukj, &lusup[luptr]);
-		    z_sub(&dense[irow], &dense[irow], &comp_temp);
-		    luptr++;
-		}
-
-	    } else if ( segsze <= 3 ) {
-		ukj = dense[lsub[krep_ind]];
-		luptr += nsupr*(nsupc-1) + nsupc-1;
-		ukj1 = dense[lsub[krep_ind - 1]];
-		luptr1 = luptr - nsupr;
-
-		if ( segsze == 2 ) { /* Case 2: 2cols-col update */
-		    zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
-		    z_sub(&ukj, &ukj, &comp_temp);
-		    dense[lsub[krep_ind]] = ukj;
-		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-		    	irow = lsub[i];
-		    	luptr++;
-		    	luptr1++;
-			zz_mult(&comp_temp, &ukj, &lusup[luptr]);
-			zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
-			z_add(&comp_temp, &comp_temp, &comp_temp1);
-			z_sub(&dense[irow], &dense[irow], &comp_temp);
-		    }
-		} else { /* Case 3: 3cols-col update */
-		    ukj2 = dense[lsub[krep_ind - 2]];
-		    luptr2 = luptr1 - nsupr;
-  		    zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
-		    z_sub(&ukj1, &ukj1, &comp_temp);
-
-		    zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
-		    zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
-		    z_add(&comp_temp, &comp_temp, &comp_temp1);
-		    z_sub(&ukj, &ukj, &comp_temp);
-
-		    dense[lsub[krep_ind]] = ukj;
-		    dense[lsub[krep_ind-1]] = ukj1;
-		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-		    	irow = lsub[i];
-		    	luptr++;
-		    	luptr1++;
-			luptr2++;
-			zz_mult(&comp_temp, &ukj, &lusup[luptr]);
-			zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
-			z_add(&comp_temp, &comp_temp, &comp_temp1);
-			zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
-			z_add(&comp_temp, &comp_temp, &comp_temp1);
-			z_sub(&dense[irow], &dense[irow], &comp_temp);
-		    }
-		}
-
-
-	    } else {
-	  	/*
-		 * Case: sup-col update
-		 * Perform a triangular solve and block update,
-		 * then scatter the result of sup-col update to dense
-		 */
-
-		no_zeros = kfnz - fst_col;
-
-	        /* Copy U[*,j] segment from dense[*] to tempv[*] */
-	        isub = lptr + no_zeros;
-	        for (i = 0; i < segsze; i++) {
-	  	    irow = lsub[isub];
-		    tempv[i] = dense[irow];
-		    ++isub; 
-	        }
-
-	        /* Dense triangular solve -- start effective triangle */
-		luptr += nsupr * no_zeros + no_zeros; 
-		
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
-		       &nsupr, tempv, &incx );
-#else		
-		ztrsv_( "L", "N", "U", &segsze, &lusup[luptr], 
-		       &nsupr, tempv, &incx );
-#endif		
- 		luptr += segsze;  /* Dense matrix-vector */
-		tempv1 = &tempv[segsze];
-                alpha = one;
-                beta = zero;
-#ifdef _CRAY
-		CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
-		       &nsupr, tempv, &incx, &beta, tempv1, &incy );
-#else
-		zgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], 
-		       &nsupr, tempv, &incx, &beta, tempv1, &incy );
-#endif
-#else
-		zlsolve ( nsupr, segsze, &lusup[luptr], tempv );
-
- 		luptr += segsze;  /* Dense matrix-vector */
-		tempv1 = &tempv[segsze];
-		zmatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1);
-#endif
-		
-		
-                /* Scatter tempv[] into SPA dense[] as a temporary storage */
-                isub = lptr + no_zeros;
-                for (i = 0; i < segsze; i++) {
-                    irow = lsub[isub];
-                    dense[irow] = tempv[i];
-                    tempv[i] = zero;
-                    ++isub;
-                }
-
-		/* Scatter tempv1[] into SPA dense[] */
-		for (i = 0; i < nrow; i++) {
-		    irow = lsub[isub];
-		    z_sub(&dense[irow], &dense[irow], &tempv1[i]);
-		    tempv1[i] = zero;
-		    ++isub;
-		}
-	    }
-	    
-	} /* if jsupno ... */
-
-    } /* for each segment... */
-
-    /*
-     *	Process the supernodal portion of L\U[*,j]
-     */
-    nextlu = xlusup[jcol];
-    fsupc = xsup[jsupno];
-
-    /* Copy the SPA dense into L\U[*,j] */
-    new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc];
-    while ( new_next > nzlumax ) {
-	if (mem_error = zLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu))
-	    return (mem_error);
-	lusup = Glu->lusup;
-	lsub = Glu->lsub;
-    }
-
-    for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) {
-  	irow = lsub[isub];
-	lusup[nextlu] = dense[irow];
-        dense[irow] = zero;
-	++nextlu;
-    }
-
-    xlusup[jcolp1] = nextlu;	/* Close L\U[*,jcol] */
-
-    /* For more updates within the panel (also within the current supernode), 
-     * should start from the first column of the panel, or the first column 
-     * of the supernode, whichever is bigger. There are 2 cases:
-     *    1) fsupc < fpanelc, then fst_col := fpanelc
-     *    2) fsupc >= fpanelc, then fst_col := fsupc
-     */
-    fst_col = SUPERLU_MAX ( fsupc, fpanelc );
-
-    if ( fst_col < jcol ) {
-
-  	/* Distance between the current supernode and the current panel.
-	   d_fsupc=0 if fsupc >= fpanelc. */
-  	d_fsupc = fst_col - fsupc;
-
-	lptr = xlsub[fsupc] + d_fsupc;
-	luptr = xlusup[fst_col] + d_fsupc;
-	nsupr = xlsub[fsupc+1] - xlsub[fsupc];	/* Leading dimension */
-	nsupc = jcol - fst_col;	/* Excluding jcol */
-	nrow = nsupr - d_fsupc - nsupc;
-
-	/* Points to the beginning of jcol in snode L\U(jsupno) */
-	ufirst = xlusup[jcol] + d_fsupc;	
-
-	ops[TRSV] += 4 * nsupc * (nsupc - 1);
-	ops[GEMV] += 8 * nrow * nsupc;
-	
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-	CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], 
-	       &nsupr, &lusup[ufirst], &incx );
-#else
-	ztrsv_( "L", "N", "U", &nsupc, &lusup[luptr], 
-	       &nsupr, &lusup[ufirst], &incx );
-#endif
-	
-	alpha = none; beta = one; /* y := beta*y + alpha*A*x */
-
-#ifdef _CRAY
-	CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
-	       &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
-#else
-	zgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
-	       &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
-#endif
-#else
-	zlsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );
-
-	zmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc],
-		&lusup[ufirst], tempv );
-	
-        /* Copy updates from tempv[*] into lusup[*] */
-	isub = ufirst + nsupc;
-	for (i = 0; i < nrow; i++) {
-	    z_sub(&lusup[isub], &lusup[isub], &tempv[i]);
-	    tempv[i] = zero;
-	    ++isub;
-	}
-
-#endif
-	
-	
-    } /* if fst_col < jcol ... */ 
-
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zcolumn_dfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zcolumn_dfs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zcolumn_dfs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zcolumn_dfs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,269 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_zdefs.h"
-
-/* What type of supernodes we want */
-#define T2_SUPER
-
-int
-zcolumn_dfs(
-	   const int  m,         /* in - number of rows in the matrix */
-	   const int  jcol,      /* in */
-	   int        *perm_r,   /* in */
-	   int        *nseg,     /* modified - with new segments appended */
-	   int        *lsub_col, /* in - defines the RHS vector to start the dfs */
-	   int        *segrep,   /* modified - with new segments appended */
-	   int        *repfnz,   /* modified */
-	   int        *xprune,   /* modified */
-	   int        *marker,   /* modified */
-	   int        *parent,	 /* working array */
-	   int        *xplore,   /* working array */
-	   GlobalLU_t *Glu       /* modified */
-	   )
-{
-/* 
- * Purpose
- * =======
- *   "column_dfs" performs a symbolic factorization on column jcol, and
- *   decide the supernode boundary.
- *
- *   This routine does not use numeric values, but only use the RHS 
- *   row indices to start the dfs.
- *
- *   A supernode representative is the last column of a supernode.
- *   The nonzeros in U[*,j] are segments that end at supernodal
- *   representatives. The routine returns a list of such supernodal 
- *   representatives in topological order of the dfs that generates them.
- *   The location of the first nonzero in each such supernodal segment
- *   (supernodal entry location) is also returned.
- *
- * Local parameters
- * ================
- *   nseg: no of segments in current U[*,j]
- *   jsuper: jsuper=EMPTY if column j does not belong to the same
- *	supernode as j-1. Otherwise, jsuper=nsuper.
- *
- *   marker2: A-row --> A-row/col (0/1)
- *   repfnz: SuperA-col --> PA-row
- *   parent: SuperA-col --> SuperA-col
- *   xplore: SuperA-col --> index to L-structure
- *
- * Return value
- * ============
- *     0  success;
- *   > 0  number of bytes allocated when run out of space.
- *
- */
-    int     jcolp1, jcolm1, jsuper, nsuper, nextl;
-    int     k, krep, krow, kmark, kperm;
-    int     *marker2;           /* Used for small panel LU */
-    int	    fsupc;		/* First column of a snode */
-    int     myfnz;		/* First nonz column of a U-segment */
-    int	    chperm, chmark, chrep, kchild;
-    int     xdfs, maxdfs, kpar, oldrep;
-    int     jptr, jm1ptr;
-    int     ito, ifrom, istop;	/* Used to compress row subscripts */
-    int     mem_error;
-    int     *xsup, *supno, *lsub, *xlsub;
-    int     nzlmax;
-    static  int  first = 1, maxsuper;
-    
-    xsup    = Glu->xsup;
-    supno   = Glu->supno;
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    nzlmax  = Glu->nzlmax;
-
-    if ( first ) {
-	maxsuper = sp_ienv(3);
-	first = 0;
-    }
-    jcolp1  = jcol + 1;
-    jcolm1  = jcol - 1;
-    nsuper  = supno[jcol];
-    jsuper  = nsuper;
-    nextl   = xlsub[jcol];
-    marker2 = &marker[2*m];
-
-
-    /* For each nonzero in A[*,jcol] do dfs */
-    for (k = 0; lsub_col[k] != EMPTY; k++) {
-
-	krow = lsub_col[k];
-    	lsub_col[k] = EMPTY;
-	kmark = marker2[krow];    	
-
-	/* krow was visited before, go to the next nonz */
-        if ( kmark == jcol ) continue; 
-
-	/* For each unmarked nbr krow of jcol
-	 *	krow is in L: place it in structure of L[*,jcol]
-	 */
-	marker2[krow] = jcol;
-	kperm = perm_r[krow];
-
-   	if ( kperm == EMPTY ) {
-	    lsub[nextl++] = krow; 	/* krow is indexed into A */
-	    if ( nextl >= nzlmax ) {
-		if ( mem_error = zLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) )
-		    return (mem_error);
-		lsub = Glu->lsub;
-	    }
-            if ( kmark != jcolm1 ) jsuper = EMPTY;/* Row index subset testing */
-  	} else {
-	    /*	krow is in U: if its supernode-rep krep
-	     *	has been explored, update repfnz[*]
-	     */
-	    krep = xsup[supno[kperm]+1] - 1;
-	    myfnz = repfnz[krep];
-
-	    if ( myfnz != EMPTY ) {	/* Visited before */
-	    	if ( myfnz > kperm ) repfnz[krep] = kperm;
-		/* continue; */
-	    }
-	    else {
-		/* Otherwise, perform dfs starting at krep */
-		oldrep = EMPTY;
-	 	parent[krep] = oldrep;
-	  	repfnz[krep] = kperm;
-		xdfs = xlsub[krep];
-	  	maxdfs = xprune[krep];
-
-		do {
-		    /* 
-		     * For each unmarked kchild of krep 
-		     */
-		    while ( xdfs < maxdfs ) {
-
-		   	kchild = lsub[xdfs];
-			xdfs++;
-		  	chmark = marker2[kchild];
-
-		   	if ( chmark != jcol ) { /* Not reached yet */
-		   	    marker2[kchild] = jcol;
-		   	    chperm = perm_r[kchild];
-
-		   	    /* Case kchild is in L: place it in L[*,k] */
-		   	    if ( chperm == EMPTY ) {
-			    	lsub[nextl++] = kchild;
-				if ( nextl >= nzlmax ) {
-				    if ( mem_error =
-					 zLUMemXpand(jcol,nextl,LSUB,&nzlmax,Glu) )
-					return (mem_error);
-				    lsub = Glu->lsub;
-				}
-				if ( chmark != jcolm1 ) jsuper = EMPTY;
-			    } else {
-		    	    	/* Case kchild is in U: 
-				 *   chrep = its supernode-rep. If its rep has 
-			         *   been explored, update its repfnz[*]
-			         */
-		   	    	chrep = xsup[supno[chperm]+1] - 1;
-		   		myfnz = repfnz[chrep];
-		   		if ( myfnz != EMPTY ) { /* Visited before */
-				    if ( myfnz > chperm )
-     				  	repfnz[chrep] = chperm;
-				} else {
-		        	    /* Continue dfs at super-rep of kchild */
-		   		    xplore[krep] = xdfs;	
-		   		    oldrep = krep;
-		   		    krep = chrep; /* Go deeper down G(L^t) */
-				    parent[krep] = oldrep;
-		    		    repfnz[krep] = chperm;
-		   		    xdfs = xlsub[krep];     
-				    maxdfs = xprune[krep];
-				} /* else */
-
-			   } /* else */
-
-			} /* if */
-
-		    } /* while */
-
-		    /* krow has no more unexplored nbrs;
-	   	     *    place supernode-rep krep in postorder DFS.
-	   	     *    backtrack dfs to its parent
-		     */
-		    segrep[*nseg] = krep;
-		    ++(*nseg);
-		    kpar = parent[krep]; /* Pop from stack, mimic recursion */
-		    if ( kpar == EMPTY ) break; /* dfs done */
-		    krep = kpar;
-		    xdfs = xplore[krep];
-		    maxdfs = xprune[krep];
-
-		} while ( kpar != EMPTY ); 	/* Until empty stack */
-
-	    } /* else */
-
-	} /* else */
-
-    } /* for each nonzero ... */
-
-    /* Check to see if j belongs in the same supernode as j-1 */
-    if ( jcol == 0 ) { /* Do nothing for column 0 */
-	nsuper = supno[0] = 0;
-    } else {
-   	fsupc = xsup[nsuper];
-	jptr = xlsub[jcol];	/* Not compressed yet */
-	jm1ptr = xlsub[jcolm1];
-
-#ifdef T2_SUPER
-	if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = EMPTY;
-#endif
-	/* Make sure the number of columns in a supernode doesn't
-	   exceed threshold. */
-	if ( jcol - fsupc >= maxsuper ) jsuper = EMPTY;
-
-	/* If jcol starts a new supernode, reclaim storage space in
-	 * lsub from the previous supernode. Note we only store
-	 * the subscript set of the first and last columns of
-   	 * a supernode. (first for num values, last for pruning)
-	 */
-	if ( jsuper == EMPTY ) {	/* starts a new supernode */
-	    if ( (fsupc < jcolm1-1) ) {	/* >= 3 columns in nsuper */
-#ifdef CHK_COMPRESS
-		printf("  Compress lsub[] at super %d-%d\n", fsupc, jcolm1);
-#endif
-	        ito = xlsub[fsupc+1];
-		xlsub[jcolm1] = ito;
-		istop = ito + jptr - jm1ptr;
-		xprune[jcolm1] = istop; /* Initialize xprune[jcol-1] */
-		xlsub[jcol] = istop;
-		for (ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito)
-		    lsub[ito] = lsub[ifrom];
-		nextl = ito;            /* = istop + length(jcol) */
-	    }
-	    nsuper++;
-	    supno[jcol] = nsuper;
-	} /* if a new supernode */
-
-    }	/* else: jcol > 0 */ 
-    
-    /* Tidy up the pointers before exit */
-    xsup[nsuper+1] = jcolp1;
-    supno[jcolp1]  = nsuper;
-    xprune[jcol]   = nextl;	/* Initialize upper bound for pruning */
-    xlsub[jcolp1]  = nextl;
-
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zcopy_to_ucol.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zcopy_to_ucol.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zcopy_to_ucol.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zcopy_to_ucol.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,103 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_zdefs.h"
-
-int
-zcopy_to_ucol(
-	      int        jcol,	  /* in */
-	      int        nseg,	  /* in */
-	      int        *segrep,  /* in */
-	      int        *repfnz,  /* in */
-	      int        *perm_r,  /* in */
-	      doublecomplex     *dense,   /* modified - reset to zero on return */
-	      GlobalLU_t *Glu      /* modified */
-	      )
-{
-/* 
- * Gather from SPA dense[*] to global ucol[*].
- */
-    int ksub, krep, ksupno;
-    int i, k, kfnz, segsze;
-    int fsupc, isub, irow;
-    int jsupno, nextu;
-    int new_next, mem_error;
-    int       *xsup, *supno;
-    int       *lsub, *xlsub;
-    doublecomplex    *ucol;
-    int       *usub, *xusub;
-    int       nzumax;
-
-    doublecomplex zero = {0.0, 0.0};
-
-    xsup    = Glu->xsup;
-    supno   = Glu->supno;
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    ucol    = Glu->ucol;
-    usub    = Glu->usub;
-    xusub   = Glu->xusub;
-    nzumax  = Glu->nzumax;
-    
-    jsupno = supno[jcol];
-    nextu  = xusub[jcol];
-    k = nseg - 1;
-    for (ksub = 0; ksub < nseg; ksub++) {
-	krep = segrep[k--];
-	ksupno = supno[krep];
-
-	if ( ksupno != jsupno ) { /* Should go into ucol[] */
-	    kfnz = repfnz[krep];
-	    if ( kfnz != EMPTY ) {	/* Nonzero U-segment */
-
-	    	fsupc = xsup[ksupno];
-	        isub = xlsub[fsupc] + kfnz - fsupc;
-	        segsze = krep - kfnz + 1;
-
-		new_next = nextu + segsze;
-		while ( new_next > nzumax ) {
-		    if (mem_error = zLUMemXpand(jcol, nextu, UCOL, &nzumax, Glu))
-			return (mem_error);
-		    ucol = Glu->ucol;
-		    if (mem_error = zLUMemXpand(jcol, nextu, USUB, &nzumax, Glu))
-			return (mem_error);
-		    usub = Glu->usub;
-		    lsub = Glu->lsub;
-		}
-		
-		for (i = 0; i < segsze; i++) {
-		    irow = lsub[isub];
-		    usub[nextu] = perm_r[irow];
-		    ucol[nextu] = dense[irow];
-		    dense[irow] = zero;
-		    nextu++;
-		    isub++;
-		} 
-
-	    }
-
-	}
-
-    } /* for each segment... */
-
-    xusub[jcol + 1] = nextu;      /* Close U[*,jcol] */
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgscon.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgscon.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgscon.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgscon.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,143 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
- * File name:	zgscon.c
- * History:     Modified from lapack routines ZGECON.
- */
-#include <math.h>
-#include "slu_zdefs.h"
-
-void
-zgscon(char *norm, SuperMatrix *L, SuperMatrix *U,
-       double anorm, double *rcond, SuperLUStat_t *stat, int *info)
-{
-/*
-    Purpose   
-    =======   
-
-    ZGSCON estimates the reciprocal of the condition number of a general 
-    real matrix A, in either the 1-norm or the infinity-norm, using   
-    the LU factorization computed by ZGETRF.   
-
-    An estimate is obtained for norm(inv(A)), and the reciprocal of the   
-    condition number is computed as   
-       RCOND = 1 / ( norm(A) * norm(inv(A)) ).   
-
-    See supermatrix.h for the definition of 'SuperMatrix' structure.
- 
-    Arguments   
-    =========   
-
-    NORM    (input) char*
-            Specifies whether the 1-norm condition number or the   
-            infinity-norm condition number is required:   
-            = '1' or 'O':  1-norm;   
-            = 'I':         Infinity-norm.
-	    
-    L       (input) SuperMatrix*
-            The factor L from the factorization Pr*A*Pc=L*U as computed by
-            zgstrf(). Use compressed row subscripts storage for supernodes,
-            i.e., L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU.
- 
-    U       (input) SuperMatrix*
-            The factor U from the factorization Pr*A*Pc=L*U as computed by
-            zgstrf(). Use column-wise storage scheme, i.e., U has types:
-            Stype = SLU_NC, Dtype = SLU_Z, Mtype = TRU.
-	    
-    ANORM   (input) double
-            If NORM = '1' or 'O', the 1-norm of the original matrix A.   
-            If NORM = 'I', the infinity-norm of the original matrix A.
-	    
-    RCOND   (output) double*
-            The reciprocal of the condition number of the matrix A,   
-            computed as RCOND = 1/(norm(A) * norm(inv(A))).
-	    
-    INFO    (output) int*
-            = 0:  successful exit   
-            < 0:  if INFO = -i, the i-th argument had an illegal value   
-
-    ===================================================================== 
-*/
-
-    /* Local variables */
-    int    kase, kase1, onenrm, i;
-    double ainvnm;
-    doublecomplex *work;
-    extern int zrscl_(int *, doublecomplex *, doublecomplex *, int *);
-
-    extern int zlacon_(int *, doublecomplex *, doublecomplex *, double *, int *);
-
-    
-    /* Test the input parameters. */
-    *info = 0;
-    onenrm = *(unsigned char *)norm == '1' || lsame_(norm, "O");
-    if (! onenrm && ! lsame_(norm, "I")) *info = -1;
-    else if (L->nrow < 0 || L->nrow != L->ncol ||
-             L->Stype != SLU_SC || L->Dtype != SLU_Z || L->Mtype != SLU_TRLU)
-	 *info = -2;
-    else if (U->nrow < 0 || U->nrow != U->ncol ||
-             U->Stype != SLU_NC || U->Dtype != SLU_Z || U->Mtype != SLU_TRU) 
-	*info = -3;
-    if (*info != 0) {
-	i = -(*info);
-	xerbla_("zgscon", &i);
-	return;
-    }
-
-    /* Quick return if possible */
-    *rcond = 0.;
-    if ( L->nrow == 0 || U->nrow == 0) {
-	*rcond = 1.;
-	return;
-    }
-
-    work = doublecomplexCalloc( 3*L->nrow );
-
-
-    if ( !work )
-	ABORT("Malloc fails for work arrays in zgscon.");
-    
-    /* Estimate the norm of inv(A). */
-    ainvnm = 0.;
-    if ( onenrm ) kase1 = 1;
-    else kase1 = 2;
-    kase = 0;
-
-    do {
-	zlacon_(&L->nrow, &work[L->nrow], &work[0], &ainvnm, &kase);
-
-	if (kase == 0) break;
-
-	if (kase == kase1) {
-	    /* Multiply by inv(L). */
-	    sp_ztrsv("L", "No trans", "Unit", L, U, &work[0], stat, info);
-
-	    /* Multiply by inv(U). */
-	    sp_ztrsv("U", "No trans", "Non-unit", L, U, &work[0], stat, info);
-	    
-	} else {
-
-	    /* Multiply by inv(U'). */
-	    sp_ztrsv("U", "Transpose", "Non-unit", L, U, &work[0], stat, info);
-
-	    /* Multiply by inv(L'). */
-	    sp_ztrsv("L", "Transpose", "Unit", L, U, &work[0], stat, info);
-	    
-	}
-
-    } while ( kase != 0 );
-
-    /* Compute the estimate of the reciprocal condition number. */
-    if (ainvnm != 0.) *rcond = (1. / ainvnm) / anorm;
-
-    SUPERLU_FREE (work);
-    return;
-
-} /* zgscon */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgsequ.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgsequ.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgsequ.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgsequ.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,184 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * File name:	zgsequ.c
- * History:     Modified from LAPACK routine ZGEEQU
- */
-#include <math.h>
-#include "slu_zdefs.h"
-
-void
-zgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd,
-	double *colcnd, double *amax, int *info)
-{
-/*    
-    Purpose   
-    =======   
-
-    ZGSEQU computes row and column scalings intended to equilibrate an   
-    M-by-N sparse matrix A and reduce its condition number. R returns the row
-    scale factors and C the column scale factors, chosen to try to make   
-    the largest element in each row and column of the matrix B with   
-    elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.   
-
-    R(i) and C(j) are restricted to be between SMLNUM = smallest safe   
-    number and BIGNUM = largest safe number.  Use of these scaling   
-    factors is not guaranteed to reduce the condition number of A but   
-    works well in practice.   
-
-    See supermatrix.h for the definition of 'SuperMatrix' structure.
- 
-    Arguments   
-    =========   
-
-    A       (input) SuperMatrix*
-            The matrix of dimension (A->nrow, A->ncol) whose equilibration
-            factors are to be computed. The type of A can be:
-            Stype = SLU_NC; Dtype = SLU_Z; Mtype = SLU_GE.
-	    
-    R       (output) double*, size A->nrow
-            If INFO = 0 or INFO > M, R contains the row scale factors   
-            for A.
-	    
-    C       (output) double*, size A->ncol
-            If INFO = 0,  C contains the column scale factors for A.
-	    
-    ROWCND  (output) double*
-            If INFO = 0 or INFO > M, ROWCND contains the ratio of the   
-            smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and   
-            AMAX is neither too large nor too small, it is not worth   
-            scaling by R.
-	    
-    COLCND  (output) double*
-            If INFO = 0, COLCND contains the ratio of the smallest   
-            C(i) to the largest C(i).  If COLCND >= 0.1, it is not   
-            worth scaling by C.
-	    
-    AMAX    (output) double*
-            Absolute value of largest matrix element.  If AMAX is very   
-            close to overflow or very close to underflow, the matrix   
-            should be scaled.
-	    
-    INFO    (output) int*
-            = 0:  successful exit   
-            < 0:  if INFO = -i, the i-th argument had an illegal value   
-            > 0:  if INFO = i,  and i is   
-                  <= A->nrow:  the i-th row of A is exactly zero   
-                  >  A->ncol:  the (i-M)-th column of A is exactly zero   
-
-    ===================================================================== 
-*/
-
-    /* Local variables */
-    NCformat *Astore;
-    doublecomplex   *Aval;
-    int i, j, irow;
-    double rcmin, rcmax;
-    double bignum, smlnum;
-    extern double dlamch_(char *);
-    
-    /* Test the input parameters. */
-    *info = 0;
-    if ( A->nrow < 0 || A->ncol < 0 ||
-	 A->Stype != SLU_NC || A->Dtype != SLU_Z || A->Mtype != SLU_GE )
-	*info = -1;
-    if (*info != 0) {
-	i = -(*info);
-	xerbla_("zgsequ", &i);
-	return;
-    }
-
-    /* Quick return if possible */
-    if ( A->nrow == 0 || A->ncol == 0 ) {
-	*rowcnd = 1.;
-	*colcnd = 1.;
-	*amax = 0.;
-	return;
-    }
-
-    Astore = A->Store;
-    Aval = Astore->nzval;
-    
-    /* Get machine constants. */
-    smlnum = dlamch_("S");
-    bignum = 1. / smlnum;
-
-    /* Compute row scale factors. */
-    for (i = 0; i < A->nrow; ++i) r[i] = 0.;
-
-    /* Find the maximum element in each row. */
-    for (j = 0; j < A->ncol; ++j)
-	for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-	    irow = Astore->rowind[i];
-	    r[irow] = SUPERLU_MAX( r[irow], z_abs1(&Aval[i]) );
-	}
-
-    /* Find the maximum and minimum scale factors. */
-    rcmin = bignum;
-    rcmax = 0.;
-    for (i = 0; i < A->nrow; ++i) {
-	rcmax = SUPERLU_MAX(rcmax, r[i]);
-	rcmin = SUPERLU_MIN(rcmin, r[i]);
-    }
-    *amax = rcmax;
-
-    if (rcmin == 0.) {
-	/* Find the first zero scale factor and return an error code. */
-	for (i = 0; i < A->nrow; ++i)
-	    if (r[i] == 0.) {
-		*info = i + 1;
-		return;
-	    }
-    } else {
-	/* Invert the scale factors. */
-	for (i = 0; i < A->nrow; ++i)
-	    r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum );
-	/* Compute ROWCND = min(R(I)) / max(R(I)) */
-	*rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
-    }
-
-    /* Compute column scale factors */
-    for (j = 0; j < A->ncol; ++j) c[j] = 0.;
-
-    /* Find the maximum element in each column, assuming the row
-       scalings computed above. */
-    for (j = 0; j < A->ncol; ++j)
-	for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-	    irow = Astore->rowind[i];
-	    c[j] = SUPERLU_MAX( c[j], z_abs1(&Aval[i]) * r[irow] );
-	}
-
-    /* Find the maximum and minimum scale factors. */
-    rcmin = bignum;
-    rcmax = 0.;
-    for (j = 0; j < A->ncol; ++j) {
-	rcmax = SUPERLU_MAX(rcmax, c[j]);
-	rcmin = SUPERLU_MIN(rcmin, c[j]);
-    }
-
-    if (rcmin == 0.) {
-	/* Find the first zero scale factor and return an error code. */
-	for (j = 0; j < A->ncol; ++j)
-	    if ( c[j] == 0. ) {
-		*info = A->nrow + j + 1;
-		return;
-	    }
-    } else {
-	/* Invert the scale factors. */
-	for (j = 0; j < A->ncol; ++j)
-	    c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum);
-	/* Compute COLCND = min(C(J)) / max(C(J)) */
-	*colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
-    }
-
-    return;
-
-} /* zgsequ */
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgsrfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgsrfs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgsrfs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgsrfs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,447 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
- * File name:	zgsrfs.c
- * History:     Modified from lapack routine ZGERFS
- */
-#include <math.h>
-#include "slu_zdefs.h"
-
-void
-zgsrfs(trans_t trans, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U,
-       int *perm_c, int *perm_r, char *equed, double *R, double *C,
-       SuperMatrix *B, SuperMatrix *X, double *ferr, double *berr,
-       SuperLUStat_t *stat, int *info)
-{
-/*
- *   Purpose   
- *   =======   
- *
- *   ZGSRFS improves the computed solution to a system of linear   
- *   equations and provides error bounds and backward error estimates for 
- *   the solution.   
- *
- *   If equilibration was performed, the system becomes:
- *           (diag(R)*A_original*diag(C)) * X = diag(R)*B_original.
- *
- *   See supermatrix.h for the definition of 'SuperMatrix' structure.
- *
- *   Arguments   
- *   =========   
- *
- * trans   (input) trans_t
- *          Specifies the form of the system of equations:
- *          = NOTRANS: A * X = B  (No transpose)
- *          = TRANS:   A'* X = B  (Transpose)
- *          = CONJ:    A**H * X = B  (Conjugate transpose)
- *   
- *   A       (input) SuperMatrix*
- *           The original matrix A in the system, or the scaled A if
- *           equilibration was done. The type of A can be:
- *           Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_GE.
- *    
- *   L       (input) SuperMatrix*
- *	     The factor L from the factorization Pr*A*Pc=L*U. Use
- *           compressed row subscripts storage for supernodes, 
- *           i.e., L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU.
- * 
- *   U       (input) SuperMatrix*
- *           The factor U from the factorization Pr*A*Pc=L*U as computed by
- *           zgstrf(). Use column-wise storage scheme, 
- *           i.e., U has types: Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_TRU.
- *
- *   perm_c  (input) int*, dimension (A->ncol)
- *	     Column permutation vector, which defines the 
- *           permutation matrix Pc; perm_c[i] = j means column i of A is 
- *           in position j in A*Pc.
- *
- *   perm_r  (input) int*, dimension (A->nrow)
- *           Row permutation vector, which defines the permutation matrix Pr;
- *           perm_r[i] = j means row i of A is in position j in Pr*A.
- *
- *   equed   (input) Specifies the form of equilibration that was done.
- *           = 'N': No equilibration.
- *           = 'R': Row equilibration, i.e., A was premultiplied by diag(R).
- *           = 'C': Column equilibration, i.e., A was postmultiplied by
- *                  diag(C).
- *           = 'B': Both row and column equilibration, i.e., A was replaced 
- *                  by diag(R)*A*diag(C).
- *
- *   R       (input) double*, dimension (A->nrow)
- *           The row scale factors for A.
- *           If equed = 'R' or 'B', A is premultiplied by diag(R).
- *           If equed = 'N' or 'C', R is not accessed.
- * 
- *   C       (input) double*, dimension (A->ncol)
- *           The column scale factors for A.
- *           If equed = 'C' or 'B', A is postmultiplied by diag(C).
- *           If equed = 'N' or 'R', C is not accessed.
- *
- *   B       (input) SuperMatrix*
- *           B has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE.
- *           The right hand side matrix B.
- *           if equed = 'R' or 'B', B is premultiplied by diag(R).
- *
- *   X       (input/output) SuperMatrix*
- *           X has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE.
- *           On entry, the solution matrix X, as computed by zgstrs().
- *           On exit, the improved solution matrix X.
- *           if *equed = 'C' or 'B', X should be premultiplied by diag(C)
- *               in order to obtain the solution to the original system.
- *
- *   FERR    (output) double*, dimension (B->ncol)   
- *           The estimated forward error bound for each solution vector   
- *           X(j) (the j-th column of the solution matrix X).   
- *           If XTRUE is the true solution corresponding to X(j), FERR(j) 
- *           is an estimated upper bound for the magnitude of the largest 
- *           element in (X(j) - XTRUE) divided by the magnitude of the   
- *           largest element in X(j).  The estimate is as reliable as   
- *           the estimate for RCOND, and is almost always a slight   
- *           overestimate of the true error.
- *
- *   BERR    (output) double*, dimension (B->ncol)   
- *           The componentwise relative backward error of each solution   
- *           vector X(j) (i.e., the smallest relative change in   
- *           any element of A or B that makes X(j) an exact solution).
- *
- *   stat     (output) SuperLUStat_t*
- *            Record the statistics on runtime and floating-point operation count.
- *            See util.h for the definition of 'SuperLUStat_t'.
- *
- *   info    (output) int*   
- *           = 0:  successful exit   
- *            < 0:  if INFO = -i, the i-th argument had an illegal value   
- *
- *    Internal Parameters   
- *    ===================   
- *
- *    ITMAX is the maximum number of steps of iterative refinement.   
- *
- */  
-
-#define ITMAX 5
-    
-    /* Table of constant values */
-    int    ione = 1;
-    doublecomplex ndone = {-1., 0.};
-    doublecomplex done = {1., 0.};
-    
-    /* Local variables */
-    NCformat *Astore;
-    doublecomplex   *Aval;
-    SuperMatrix Bjcol;
-    DNformat *Bstore, *Xstore, *Bjcol_store;
-    doublecomplex   *Bmat, *Xmat, *Bptr, *Xptr;
-    int      kase;
-    double   safe1, safe2;
-    int      i, j, k, irow, nz, count, notran, rowequ, colequ;
-    int      ldb, ldx, nrhs;
-    double   s, xk, lstres, eps, safmin;
-    char     transc[1];
-    trans_t  transt;
-    doublecomplex   *work;
-    double   *rwork;
-    int      *iwork;
-    extern double dlamch_(char *);
-    extern int zlacon_(int *, doublecomplex *, doublecomplex *, double *, int *);
-#ifdef _CRAY
-    extern int CCOPY(int *, doublecomplex *, int *, doublecomplex *, int *);
-    extern int CSAXPY(int *, doublecomplex *, doublecomplex *, int *, doublecomplex *, int *);
-#else
-    extern int zcopy_(int *, doublecomplex *, int *, doublecomplex *, int *);
-    extern int zaxpy_(int *, doublecomplex *, doublecomplex *, int *, doublecomplex *, int *);
-#endif
-
-    Astore = A->Store;
-    Aval   = Astore->nzval;
-    Bstore = B->Store;
-    Xstore = X->Store;
-    Bmat   = Bstore->nzval;
-    Xmat   = Xstore->nzval;
-    ldb    = Bstore->lda;
-    ldx    = Xstore->lda;
-    nrhs   = B->ncol;
-    
-    /* Test the input parameters */
-    *info = 0;
-    notran = (trans == NOTRANS);
-    if ( !notran && trans != TRANS && trans != CONJ ) *info = -1;
-    else if ( A->nrow != A->ncol || A->nrow < 0 ||
-	      A->Stype != SLU_NC || A->Dtype != SLU_Z || A->Mtype != SLU_GE )
-	*info = -2;
-    else if ( L->nrow != L->ncol || L->nrow < 0 ||
- 	      L->Stype != SLU_SC || L->Dtype != SLU_Z || L->Mtype != SLU_TRLU )
-	*info = -3;
-    else if ( U->nrow != U->ncol || U->nrow < 0 ||
- 	      U->Stype != SLU_NC || U->Dtype != SLU_Z || U->Mtype != SLU_TRU )
-	*info = -4;
-    else if ( ldb < SUPERLU_MAX(0, A->nrow) ||
- 	      B->Stype != SLU_DN || B->Dtype != SLU_Z || B->Mtype != SLU_GE )
-        *info = -10;
-    else if ( ldx < SUPERLU_MAX(0, A->nrow) ||
- 	      X->Stype != SLU_DN || X->Dtype != SLU_Z || X->Mtype != SLU_GE )
-	*info = -11;
-    if (*info != 0) {
-	i = -(*info);
-	xerbla_("zgsrfs", &i);
-	return;
-    }
-
-    /* Quick return if possible */
-    if ( A->nrow == 0 || nrhs == 0) {
-	for (j = 0; j < nrhs; ++j) {
-	    ferr[j] = 0.;
-	    berr[j] = 0.;
-	}
-	return;
-    }
-
-    rowequ = lsame_(equed, "R") || lsame_(equed, "B");
-    colequ = lsame_(equed, "C") || lsame_(equed, "B");
-    
-    /* Allocate working space */
-    work = doublecomplexMalloc(2*A->nrow);
-    rwork = (double *) SUPERLU_MALLOC( A->nrow * sizeof(double) );
-    iwork = intMalloc(A->nrow);
-    if ( !work || !rwork || !iwork ) 
-        ABORT("Malloc fails for work/rwork/iwork.");
-    
-    if ( notran ) {
-	*(unsigned char *)transc = 'N';
-        transt = TRANS;
-    } else {
-	*(unsigned char *)transc = 'T';
-	transt = NOTRANS;
-    }
-
-    /* NZ = maximum number of nonzero elements in each row of A, plus 1 */
-    nz     = A->ncol + 1;
-    eps    = dlamch_("Epsilon");
-    safmin = dlamch_("Safe minimum");
-    safe1  = nz * safmin;
-    safe2  = safe1 / eps;
-
-    /* Compute the number of nonzeros in each row (or column) of A */
-    for (i = 0; i < A->nrow; ++i) iwork[i] = 0;
-    if ( notran ) {
-	for (k = 0; k < A->ncol; ++k)
-	    for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) 
-		++iwork[Astore->rowind[i]];
-    } else {
-	for (k = 0; k < A->ncol; ++k)
-	    iwork[k] = Astore->colptr[k+1] - Astore->colptr[k];
-    }	
-
-    /* Copy one column of RHS B into Bjcol. */
-    Bjcol.Stype = B->Stype;
-    Bjcol.Dtype = B->Dtype;
-    Bjcol.Mtype = B->Mtype;
-    Bjcol.nrow  = B->nrow;
-    Bjcol.ncol  = 1;
-    Bjcol.Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) );
-    if ( !Bjcol.Store ) ABORT("SUPERLU_MALLOC fails for Bjcol.Store");
-    Bjcol_store = Bjcol.Store;
-    Bjcol_store->lda = ldb;
-    Bjcol_store->nzval = work; /* address aliasing */
-	
-    /* Do for each right hand side ... */
-    for (j = 0; j < nrhs; ++j) {
-	count = 0;
-	lstres = 3.;
-	Bptr = &Bmat[j*ldb];
-	Xptr = &Xmat[j*ldx];
-
-	while (1) { /* Loop until stopping criterion is satisfied. */
-
-	    /* Compute residual R = B - op(A) * X,   
-	       where op(A) = A, A**T, or A**H, depending on TRANS. */
-	    
-#ifdef _CRAY
-	    CCOPY(&A->nrow, Bptr, &ione, work, &ione);
-#else
-	    zcopy_(&A->nrow, Bptr, &ione, work, &ione);
-#endif
-	    sp_zgemv(transc, ndone, A, Xptr, ione, done, work, ione);
-
-	    /* Compute componentwise relative backward error from formula 
-	       max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) )   
-	       where abs(Z) is the componentwise absolute value of the matrix
-	       or vector Z.  If the i-th component of the denominator is less
-	       than SAFE2, then SAFE1 is added to the i-th component of the   
-	       numerator and denominator before dividing. */
-
-	    for (i = 0; i < A->nrow; ++i) rwork[i] = z_abs1( &Bptr[i] );
-	    
-	    /* Compute abs(op(A))*abs(X) + abs(B). */
-	    if (notran) {
-		for (k = 0; k < A->ncol; ++k) {
-		    xk = z_abs1( &Xptr[k] );
-		    for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i)
-			rwork[Astore->rowind[i]] += z_abs1(&Aval[i]) * xk;
-		}
-	    } else {
-		for (k = 0; k < A->ncol; ++k) {
-		    s = 0.;
-		    for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) {
-			irow = Astore->rowind[i];
-			s += z_abs1(&Aval[i]) * z_abs1(&Xptr[irow]);
-		    }
-		    rwork[k] += s;
-		}
-	    }
-	    s = 0.;
-	    for (i = 0; i < A->nrow; ++i) {
-		if (rwork[i] > safe2)
-		    s = SUPERLU_MAX( s, z_abs1(&work[i]) / rwork[i] );
-		else
-		    s = SUPERLU_MAX( s, (z_abs1(&work[i]) + safe1) / 
-				(rwork[i] + safe1) );
-	    }
-	    berr[j] = s;
-
-	    /* Test stopping criterion. Continue iterating if   
-	       1) The residual BERR(J) is larger than machine epsilon, and   
-	       2) BERR(J) decreased by at least a factor of 2 during the   
-	          last iteration, and   
-	       3) At most ITMAX iterations tried. */
-
-	    if (berr[j] > eps && berr[j] * 2. <= lstres && count < ITMAX) {
-		/* Update solution and try again. */
-		zgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info);
-		
-#ifdef _CRAY
-		CAXPY(&A->nrow, &done, work, &ione,
-		       &Xmat[j*ldx], &ione);
-#else
-		zaxpy_(&A->nrow, &done, work, &ione,
-		       &Xmat[j*ldx], &ione);
-#endif
-		lstres = berr[j];
-		++count;
-	    } else {
-		break;
-	    }
-        
-	} /* end while */
-
-	stat->RefineSteps = count;
-
-	/* Bound error from formula:
-	   norm(X - XTRUE) / norm(X) .le. FERR = norm( abs(inv(op(A)))*   
-	   ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X)   
-          where   
-            norm(Z) is the magnitude of the largest component of Z   
-            inv(op(A)) is the inverse of op(A)   
-            abs(Z) is the componentwise absolute value of the matrix or
-	       vector Z   
-            NZ is the maximum number of nonzeros in any row of A, plus 1   
-            EPS is machine epsilon   
-
-          The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B))   
-          is incremented by SAFE1 if the i-th component of   
-          abs(op(A))*abs(X) + abs(B) is less than SAFE2.   
-
-          Use ZLACON to estimate the infinity-norm of the matrix   
-             inv(op(A)) * diag(W),   
-          where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */
-	
-	for (i = 0; i < A->nrow; ++i) rwork[i] = z_abs1( &Bptr[i] );
-	
-	/* Compute abs(op(A))*abs(X) + abs(B). */
-	if ( notran ) {
-	    for (k = 0; k < A->ncol; ++k) {
-		xk = z_abs1( &Xptr[k] );
-		for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i)
-		    rwork[Astore->rowind[i]] += z_abs1(&Aval[i]) * xk;
-	    }
-	} else {
-	    for (k = 0; k < A->ncol; ++k) {
-		s = 0.;
-		for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) {
-		    irow = Astore->rowind[i];
-		    xk = z_abs1( &Xptr[irow] );
-		    s += z_abs1(&Aval[i]) * xk;
-		}
-		rwork[k] += s;
-	    }
-	}
-	
-	for (i = 0; i < A->nrow; ++i)
-	    if (rwork[i] > safe2)
-		rwork[i] = z_abs(&work[i]) + (iwork[i]+1)*eps*rwork[i];
-	    else
-		rwork[i] = z_abs(&work[i])+(iwork[i]+1)*eps*rwork[i]+safe1;
-	kase = 0;
-
-	do {
-	    zlacon_(&A->nrow, &work[A->nrow], work,
-		    &ferr[j], &kase);
-	    if (kase == 0) break;
-
-	    if (kase == 1) {
-		/* Multiply by diag(W)*inv(op(A)**T)*(diag(C) or diag(R)). */
-		if ( notran && colequ )
-		    for (i = 0; i < A->ncol; ++i) {
-		        zd_mult(&work[i], &work[i], C[i]);
-	            }
-		else if ( !notran && rowequ )
-		    for (i = 0; i < A->nrow; ++i) {
-		        zd_mult(&work[i], &work[i], R[i]);
-                    }
-
-		zgstrs (transt, L, U, perm_c, perm_r, &Bjcol, stat, info);
-		
-		for (i = 0; i < A->nrow; ++i) {
-		    zd_mult(&work[i], &work[i], rwork[i]);
-	 	}
-	    } else {
-		/* Multiply by (diag(C) or diag(R))*inv(op(A))*diag(W). */
-		for (i = 0; i < A->nrow; ++i) {
-		    zd_mult(&work[i], &work[i], rwork[i]);
-		}
-		
-		zgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info);
-		
-		if ( notran && colequ )
-		    for (i = 0; i < A->ncol; ++i) {
-		        zd_mult(&work[i], &work[i], C[i]);
-		    }
-		else if ( !notran && rowequ )
-		    for (i = 0; i < A->ncol; ++i) {
-		        zd_mult(&work[i], &work[i], R[i]);  
-		    }
-	    }
-	    
-	} while ( kase != 0 );
-
-	/* Normalize error. */
-	lstres = 0.;
- 	if ( notran && colequ ) {
-	    for (i = 0; i < A->nrow; ++i)
-	    	lstres = SUPERLU_MAX( lstres, C[i] * z_abs1( &Xptr[i]) );
-  	} else if ( !notran && rowequ ) {
-	    for (i = 0; i < A->nrow; ++i)
-	    	lstres = SUPERLU_MAX( lstres, R[i] * z_abs1( &Xptr[i]) );
-	} else {
-	    for (i = 0; i < A->nrow; ++i)
-	    	lstres = SUPERLU_MAX( lstres, z_abs1( &Xptr[i]) );
-	}
-	if ( lstres != 0. )
-	    ferr[j] /= lstres;
-
-    } /* for each RHS j ... */
-    
-    SUPERLU_FREE(work);
-    SUPERLU_FREE(rwork);
-    SUPERLU_FREE(iwork);
-    SUPERLU_FREE(Bjcol.Store);
-
-    return;
-
-} /* zgsrfs */
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgssv.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgssv.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgssv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgssv.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,221 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-#include "slu_zdefs.h"
-
-void
-zgssv(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r,
-      SuperMatrix *L, SuperMatrix *U, SuperMatrix *B,
-      SuperLUStat_t *stat, int *info )
-{
-/*
- * Purpose
- * =======
- *
- * ZGSSV solves the system of linear equations A*X=B, using the
- * LU factorization from ZGSTRF. It performs the following steps:
- *
- *   1. If A is stored column-wise (A->Stype = SLU_NC):
- *
- *      1.1. Permute the columns of A, forming A*Pc, where Pc
- *           is a permutation matrix. For more details of this step, 
- *           see sp_preorder.c.
- *
- *      1.2. Factor A as Pr*A*Pc=L*U with the permutation Pr determined
- *           by Gaussian elimination with partial pivoting.
- *           L is unit lower triangular with offdiagonal entries
- *           bounded by 1 in magnitude, and U is upper triangular.
- *
- *      1.3. Solve the system of equations A*X=B using the factored
- *           form of A.
- *
- *   2. If A is stored row-wise (A->Stype = SLU_NR), apply the
- *      above algorithm to the transpose of A:
- *
- *      2.1. Permute columns of transpose(A) (rows of A),
- *           forming transpose(A)*Pc, where Pc is a permutation matrix. 
- *           For more details of this step, see sp_preorder.c.
- *
- *      2.2. Factor A as Pr*transpose(A)*Pc=L*U with the permutation Pr
- *           determined by Gaussian elimination with partial pivoting.
- *           L is unit lower triangular with offdiagonal entries
- *           bounded by 1 in magnitude, and U is upper triangular.
- *
- *      2.3. Solve the system of equations A*X=B using the factored
- *           form of A.
- *
- *   See supermatrix.h for the definition of 'SuperMatrix' structure.
- * 
- * Arguments
- * =========
- *
- * options (input) superlu_options_t*
- *         The structure defines the input parameters to control
- *         how the LU decomposition will be performed and how the
- *         system will be solved.
- *
- * A       (input) SuperMatrix*
- *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
- *         of linear equations is A->nrow. Currently, the type of A can be:
- *         Stype = SLU_NC or SLU_NR; Dtype = SLU_Z; Mtype = SLU_GE.
- *         In the future, more general A may be handled.
- *
- * perm_c  (input/output) int*
- *         If A->Stype = SLU_NC, column permutation vector of size A->ncol
- *         which defines the permutation matrix Pc; perm_c[i] = j means 
- *         column i of A is in position j in A*Pc.
- *         If A->Stype = SLU_NR, column permutation vector of size A->nrow
- *         which describes permutation of columns of transpose(A) 
- *         (rows of A) as described above.
- * 
- *         If options->ColPerm = MY_PERMC or options->Fact = SamePattern or
- *            options->Fact = SamePattern_SameRowPerm, it is an input argument.
- *            On exit, perm_c may be overwritten by the product of the input
- *            perm_c and a permutation that postorders the elimination tree
- *            of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree
- *            is already in postorder.
- *         Otherwise, it is an output argument.
- * 
- * perm_r  (input/output) int*
- *         If A->Stype = SLU_NC, row permutation vector of size A->nrow, 
- *         which defines the permutation matrix Pr, and is determined 
- *         by partial pivoting.  perm_r[i] = j means row i of A is in 
- *         position j in Pr*A.
- *         If A->Stype = SLU_NR, permutation vector of size A->ncol, which
- *         determines permutation of rows of transpose(A)
- *         (columns of A) as described above.
- *
- *         If options->RowPerm = MY_PERMR or
- *            options->Fact = SamePattern_SameRowPerm, perm_r is an
- *            input argument.
- *         otherwise it is an output argument.
- *
- * L       (output) SuperMatrix*
- *         The factor L from the factorization 
- *             Pr*A*Pc=L*U              (if A->Stype = SLU_NC) or
- *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
- *         Uses compressed row subscripts storage for supernodes, i.e.,
- *         L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU.
- *         
- * U       (output) SuperMatrix*
- *	   The factor U from the factorization 
- *             Pr*A*Pc=L*U              (if A->Stype = SLU_NC) or
- *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
- *         Uses column-wise storage scheme, i.e., U has types:
- *         Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_TRU.
- *
- * B       (input/output) SuperMatrix*
- *         B has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE.
- *         On entry, the right hand side matrix.
- *         On exit, the solution matrix if info = 0;
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics on runtime and floating-point operation count.
- *        See util.h for the definition of 'SuperLUStat_t'.
- *
- * info    (output) int*
- *	   = 0: successful exit
- *         > 0: if info = i, and i is
- *             <= A->ncol: U(i,i) is exactly zero. The factorization has
- *                been completed, but the factor U is exactly singular,
- *                so the solution could not be computed.
- *             > A->ncol: number of bytes allocated when memory allocation
- *                failure occurred, plus A->ncol.
- *   
- */
-    DNformat *Bstore;
-    SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/
-    SuperMatrix AC; /* Matrix postmultiplied by Pc */
-    int      lwork = 0, *etree, i;
-    
-    /* Set default values for some parameters */
-    double   drop_tol = 0.;
-    int      panel_size;     /* panel size */
-    int      relax;          /* no of columns in a relaxed snodes */
-    int      permc_spec;
-    trans_t  trans = NOTRANS;
-    double   *utime;
-    double   t;	/* Temporary time */
-
-    /* Test the input parameters ... */
-    *info = 0;
-    Bstore = B->Store;
-    if ( options->Fact != DOFACT ) *info = -1;
-    else if ( A->nrow != A->ncol || A->nrow < 0 ||
-	 (A->Stype != SLU_NC && A->Stype != SLU_NR) ||
-	 A->Dtype != SLU_Z || A->Mtype != SLU_GE )
-	*info = -2;
-    else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) ||
-	B->Stype != SLU_DN || B->Dtype != SLU_Z || B->Mtype != SLU_GE )
-	*info = -7;
-    if ( *info != 0 ) {
-	i = -(*info);
-	xerbla_("zgssv", &i);
-	return;
-    }
-
-    utime = stat->utime;
-
-    /* Convert A to SLU_NC format when necessary. */
-    if ( A->Stype == SLU_NR ) {
-	NRformat *Astore = A->Store;
-	AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
-	zCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, 
-			       Astore->nzval, Astore->colind, Astore->rowptr,
-			       SLU_NC, A->Dtype, A->Mtype);
-	trans = TRANS;
-    } else {
-        if ( A->Stype == SLU_NC ) AA = A;
-    }
-
-    t = SuperLU_timer_();
-    /*
-     * Get column permutation vector perm_c[], according to permc_spec:
-     *   permc_spec = NATURAL:  natural ordering 
-     *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
-     *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
-     *   permc_spec = COLAMD:   approximate minimum degree column ordering
-     *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
-     */
-    permc_spec = options->ColPerm;
-    if ( permc_spec != MY_PERMC && options->Fact == DOFACT )
-      get_perm_c(permc_spec, AA, perm_c);
-    utime[COLPERM] = SuperLU_timer_() - t;
-
-    etree = intMalloc(A->ncol);
-
-    t = SuperLU_timer_();
-    sp_preorder(options, AA, perm_c, etree, &AC);
-    utime[ETREE] = SuperLU_timer_() - t;
-
-    panel_size = sp_ienv(1);
-    relax = sp_ienv(2);
-
-    /*printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", 
-	  relax, panel_size, sp_ienv(3), sp_ienv(4));*/
-    t = SuperLU_timer_(); 
-    /* Compute the LU factorization of A. */
-    zgstrf(options, &AC, drop_tol, relax, panel_size,
-	   etree, NULL, lwork, perm_c, perm_r, L, U, stat, info);
-    utime[FACT] = SuperLU_timer_() - t;
-
-    t = SuperLU_timer_();
-    if ( *info == 0 ) {
-        /* Solve the system A*X=B, overwriting B with X. */
-        zgstrs (trans, L, U, perm_c, perm_r, B, stat, info);
-    }
-    utime[SOLVE] = SuperLU_timer_() - t;
-
-    SUPERLU_FREE (etree);
-    Destroy_CompCol_Permuted(&AC);
-    if ( A->Stype == SLU_NR ) {
-	Destroy_SuperMatrix_Store(AA);
-	SUPERLU_FREE(AA);
-    }
-
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgssvx.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgssvx.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgssvx.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgssvx.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,614 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-#include "slu_zdefs.h"
-
-void
-zgssvx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r,
-       int *etree, char *equed, double *R, double *C,
-       SuperMatrix *L, SuperMatrix *U, void *work, int lwork,
-       SuperMatrix *B, SuperMatrix *X, double *recip_pivot_growth, 
-       double *rcond, double *ferr, double *berr, 
-       mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info )
-{
-/*
- * Purpose
- * =======
- *
- * ZGSSVX solves the system of linear equations A*X=B or A'*X=B, using
- * the LU factorization from zgstrf(). Error bounds on the solution and
- * a condition estimate are also provided. It performs the following steps:
- *
- *   1. If A is stored column-wise (A->Stype = SLU_NC):
- *  
- *      1.1. If options->Equil = YES, scaling factors are computed to
- *           equilibrate the system:
- *           options->Trans = NOTRANS:
- *               diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B
- *           options->Trans = TRANS:
- *               (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
- *           options->Trans = CONJ:
- *               (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
- *           Whether or not the system will be equilibrated depends on the
- *           scaling of the matrix A, but if equilibration is used, A is
- *           overwritten by diag(R)*A*diag(C) and B by diag(R)*B
- *           (if options->Trans=NOTRANS) or diag(C)*B (if options->Trans
- *           = TRANS or CONJ).
- *
- *      1.2. Permute columns of A, forming A*Pc, where Pc is a permutation
- *           matrix that usually preserves sparsity.
- *           For more details of this step, see sp_preorder.c.
- *
- *      1.3. If options->Fact != FACTORED, the LU decomposition is used to
- *           factor the matrix A (after equilibration if options->Equil = YES)
- *           as Pr*A*Pc = L*U, with Pr determined by partial pivoting.
- *
- *      1.4. Compute the reciprocal pivot growth factor.
- *
- *      1.5. If some U(i,i) = 0, so that U is exactly singular, then the
- *           routine returns with info = i. Otherwise, the factored form of 
- *           A is used to estimate the condition number of the matrix A. If
- *           the reciprocal of the condition number is less than machine
- *           precision, info = A->ncol+1 is returned as a warning, but the
- *           routine still goes on to solve for X and computes error bounds
- *           as described below.
- *
- *      1.6. The system of equations is solved for X using the factored form
- *           of A.
- *
- *      1.7. If options->IterRefine != NOREFINE, iterative refinement is
- *           applied to improve the computed solution matrix and calculate
- *           error bounds and backward error estimates for it.
- *
- *      1.8. If equilibration was used, the matrix X is premultiplied by
- *           diag(C) (if options->Trans = NOTRANS) or diag(R)
- *           (if options->Trans = TRANS or CONJ) so that it solves the
- *           original system before equilibration.
- *
- *   2. If A is stored row-wise (A->Stype = SLU_NR), apply the above algorithm
- *      to the transpose of A:
- *
- *      2.1. If options->Equil = YES, scaling factors are computed to
- *           equilibrate the system:
- *           options->Trans = NOTRANS:
- *               diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B
- *           options->Trans = TRANS:
- *               (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
- *           options->Trans = CONJ:
- *               (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
- *           Whether or not the system will be equilibrated depends on the
- *           scaling of the matrix A, but if equilibration is used, A' is
- *           overwritten by diag(R)*A'*diag(C) and B by diag(R)*B 
- *           (if trans='N') or diag(C)*B (if trans = 'T' or 'C').
- *
- *      2.2. Permute columns of transpose(A) (rows of A), 
- *           forming transpose(A)*Pc, where Pc is a permutation matrix that 
- *           usually preserves sparsity.
- *           For more details of this step, see sp_preorder.c.
- *
- *      2.3. If options->Fact != FACTORED, the LU decomposition is used to
- *           factor the transpose(A) (after equilibration if 
- *           options->Fact = YES) as Pr*transpose(A)*Pc = L*U with the
- *           permutation Pr determined by partial pivoting.
- *
- *      2.4. Compute the reciprocal pivot growth factor.
- *
- *      2.5. If some U(i,i) = 0, so that U is exactly singular, then the
- *           routine returns with info = i. Otherwise, the factored form 
- *           of transpose(A) is used to estimate the condition number of the
- *           matrix A. If the reciprocal of the condition number
- *           is less than machine precision, info = A->nrow+1 is returned as
- *           a warning, but the routine still goes on to solve for X and
- *           computes error bounds as described below.
- *
- *      2.6. The system of equations is solved for X using the factored form
- *           of transpose(A).
- *
- *      2.7. If options->IterRefine != NOREFINE, iterative refinement is
- *           applied to improve the computed solution matrix and calculate
- *           error bounds and backward error estimates for it.
- *
- *      2.8. If equilibration was used, the matrix X is premultiplied by
- *           diag(C) (if options->Trans = NOTRANS) or diag(R) 
- *           (if options->Trans = TRANS or CONJ) so that it solves the
- *           original system before equilibration.
- *
- *   See supermatrix.h for the definition of 'SuperMatrix' structure.
- *
- * Arguments
- * =========
- *
- * options (input) superlu_options_t*
- *         The structure defines the input parameters to control
- *         how the LU decomposition will be performed and how the
- *         system will be solved.
- *
- * A       (input/output) SuperMatrix*
- *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
- *         of the linear equations is A->nrow. Currently, the type of A can be:
- *         Stype = SLU_NC or SLU_NR, Dtype = SLU_D, Mtype = SLU_GE.
- *         In the future, more general A may be handled.
- *
- *         On entry, If options->Fact = FACTORED and equed is not 'N', 
- *         then A must have been equilibrated by the scaling factors in
- *         R and/or C.  
- *         On exit, A is not modified if options->Equil = NO, or if 
- *         options->Equil = YES but equed = 'N' on exit.
- *         Otherwise, if options->Equil = YES and equed is not 'N',
- *         A is scaled as follows:
- *         If A->Stype = SLU_NC:
- *           equed = 'R':  A := diag(R) * A
- *           equed = 'C':  A := A * diag(C)
- *           equed = 'B':  A := diag(R) * A * diag(C).
- *         If A->Stype = SLU_NR:
- *           equed = 'R':  transpose(A) := diag(R) * transpose(A)
- *           equed = 'C':  transpose(A) := transpose(A) * diag(C)
- *           equed = 'B':  transpose(A) := diag(R) * transpose(A) * diag(C).
- *
- * perm_c  (input/output) int*
- *	   If A->Stype = SLU_NC, Column permutation vector of size A->ncol,
- *         which defines the permutation matrix Pc; perm_c[i] = j means
- *         column i of A is in position j in A*Pc.
- *         On exit, perm_c may be overwritten by the product of the input
- *         perm_c and a permutation that postorders the elimination tree
- *         of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree
- *         is already in postorder.
- *
- *         If A->Stype = SLU_NR, column permutation vector of size A->nrow,
- *         which describes permutation of columns of transpose(A) 
- *         (rows of A) as described above.
- * 
- * perm_r  (input/output) int*
- *         If A->Stype = SLU_NC, row permutation vector of size A->nrow, 
- *         which defines the permutation matrix Pr, and is determined
- *         by partial pivoting.  perm_r[i] = j means row i of A is in 
- *         position j in Pr*A.
- *
- *         If A->Stype = SLU_NR, permutation vector of size A->ncol, which
- *         determines permutation of rows of transpose(A)
- *         (columns of A) as described above.
- *
- *         If options->Fact = SamePattern_SameRowPerm, the pivoting routine
- *         will try to use the input perm_r, unless a certain threshold
- *         criterion is violated. In that case, perm_r is overwritten by a
- *         new permutation determined by partial pivoting or diagonal
- *         threshold pivoting.
- *         Otherwise, perm_r is output argument.
- * 
- * etree   (input/output) int*,  dimension (A->ncol)
- *         Elimination tree of Pc'*A'*A*Pc.
- *         If options->Fact != FACTORED and options->Fact != DOFACT,
- *         etree is an input argument, otherwise it is an output argument.
- *         Note: etree is a vector of parent pointers for a forest whose
- *         vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
- *
- * equed   (input/output) char*
- *         Specifies the form of equilibration that was done.
- *         = 'N': No equilibration.
- *         = 'R': Row equilibration, i.e., A was premultiplied by diag(R).
- *         = 'C': Column equilibration, i.e., A was postmultiplied by diag(C).
- *         = 'B': Both row and column equilibration, i.e., A was replaced 
- *                by diag(R)*A*diag(C).
- *         If options->Fact = FACTORED, equed is an input argument,
- *         otherwise it is an output argument.
- *
- * R       (input/output) double*, dimension (A->nrow)
- *         The row scale factors for A or transpose(A).
- *         If equed = 'R' or 'B', A (if A->Stype = SLU_NC) or transpose(A)
- *             (if A->Stype = SLU_NR) is multiplied on the left by diag(R).
- *         If equed = 'N' or 'C', R is not accessed.
- *         If options->Fact = FACTORED, R is an input argument,
- *             otherwise, R is output.
- *         If options->zFact = FACTORED and equed = 'R' or 'B', each element
- *             of R must be positive.
- * 
- * C       (input/output) double*, dimension (A->ncol)
- *         The column scale factors for A or transpose(A).
- *         If equed = 'C' or 'B', A (if A->Stype = SLU_NC) or transpose(A)
- *             (if A->Stype = SLU_NR) is multiplied on the right by diag(C).
- *         If equed = 'N' or 'R', C is not accessed.
- *         If options->Fact = FACTORED, C is an input argument,
- *             otherwise, C is output.
- *         If options->Fact = FACTORED and equed = 'C' or 'B', each element
- *             of C must be positive.
- *         
- * L       (output) SuperMatrix*
- *	   The factor L from the factorization
- *             Pr*A*Pc=L*U              (if A->Stype SLU_= NC) or
- *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
- *         Uses compressed row subscripts storage for supernodes, i.e.,
- *         L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU.
- *
- * U       (output) SuperMatrix*
- *	   The factor U from the factorization
- *             Pr*A*Pc=L*U              (if A->Stype = SLU_NC) or
- *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
- *         Uses column-wise storage scheme, i.e., U has types:
- *         Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_TRU.
- *
- * work    (workspace/output) void*, size (lwork) (in bytes)
- *         User supplied workspace, should be large enough
- *         to hold data structures for factors L and U.
- *         On exit, if fact is not 'F', L and U point to this array.
- *
- * lwork   (input) int
- *         Specifies the size of work array in bytes.
- *         = 0:  allocate space internally by system malloc;
- *         > 0:  use user-supplied work array of length lwork in bytes,
- *               returns error if space runs out.
- *         = -1: the routine guesses the amount of space needed without
- *               performing the factorization, and returns it in
- *               mem_usage->total_needed; no other side effects.
- *
- *         See argument 'mem_usage' for memory usage statistics.
- *
- * B       (input/output) SuperMatrix*
- *         B has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE.
- *         On entry, the right hand side matrix.
- *         If B->ncol = 0, only LU decomposition is performed, the triangular
- *                         solve is skipped.
- *         On exit,
- *            if equed = 'N', B is not modified; otherwise
- *            if A->Stype = SLU_NC:
- *               if options->Trans = NOTRANS and equed = 'R' or 'B',
- *                  B is overwritten by diag(R)*B;
- *               if options->Trans = TRANS or CONJ and equed = 'C' of 'B',
- *                  B is overwritten by diag(C)*B;
- *            if A->Stype = SLU_NR:
- *               if options->Trans = NOTRANS and equed = 'C' or 'B',
- *                  B is overwritten by diag(C)*B;
- *               if options->Trans = TRANS or CONJ and equed = 'R' of 'B',
- *                  B is overwritten by diag(R)*B.
- *
- * X       (output) SuperMatrix*
- *         X has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE. 
- *         If info = 0 or info = A->ncol+1, X contains the solution matrix
- *         to the original system of equations. Note that A and B are modified
- *         on exit if equed is not 'N', and the solution to the equilibrated
- *         system is inv(diag(C))*X if options->Trans = NOTRANS and
- *         equed = 'C' or 'B', or inv(diag(R))*X if options->Trans = 'T' or 'C'
- *         and equed = 'R' or 'B'.
- *
- * recip_pivot_growth (output) double*
- *         The reciprocal pivot growth factor max_j( norm(A_j)/norm(U_j) ).
- *         The infinity norm is used. If recip_pivot_growth is much less
- *         than 1, the stability of the LU factorization could be poor.
- *
- * rcond   (output) double*
- *         The estimate of the reciprocal condition number of the matrix A
- *         after equilibration (if done). If rcond is less than the machine
- *         precision (in particular, if rcond = 0), the matrix is singular
- *         to working precision. This condition is indicated by a return
- *         code of info > 0.
- *
- * FERR    (output) double*, dimension (B->ncol)   
- *         The estimated forward error bound for each solution vector   
- *         X(j) (the j-th column of the solution matrix X).   
- *         If XTRUE is the true solution corresponding to X(j), FERR(j) 
- *         is an estimated upper bound for the magnitude of the largest 
- *         element in (X(j) - XTRUE) divided by the magnitude of the   
- *         largest element in X(j).  The estimate is as reliable as   
- *         the estimate for RCOND, and is almost always a slight   
- *         overestimate of the true error.
- *         If options->IterRefine = NOREFINE, ferr = 1.0.
- *
- * BERR    (output) double*, dimension (B->ncol)
- *         The componentwise relative backward error of each solution   
- *         vector X(j) (i.e., the smallest relative change in   
- *         any element of A or B that makes X(j) an exact solution).
- *         If options->IterRefine = NOREFINE, berr = 1.0.
- *
- * mem_usage (output) mem_usage_t*
- *         Record the memory usage statistics, consisting of following fields:
- *         - for_lu (float)
- *           The amount of space used in bytes for L\U data structures.
- *         - total_needed (float)
- *           The amount of space needed in bytes to perform factorization.
- *         - expansions (int)
- *           The number of memory expansions during the LU factorization.
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics on runtime and floating-point operation count.
- *        See util.h for the definition of 'SuperLUStat_t'.
- *
- * info    (output) int*
- *         = 0: successful exit   
- *         < 0: if info = -i, the i-th argument had an illegal value   
- *         > 0: if info = i, and i is   
- *              <= A->ncol: U(i,i) is exactly zero. The factorization has   
- *                    been completed, but the factor U is exactly   
- *                    singular, so the solution and error bounds   
- *                    could not be computed.   
- *              = A->ncol+1: U is nonsingular, but RCOND is less than machine
- *                    precision, meaning that the matrix is singular to
- *                    working precision. Nevertheless, the solution and
- *                    error bounds are computed because there are a number
- *                    of situations where the computed solution can be more
- *                    accurate than the value of RCOND would suggest.   
- *              > A->ncol+1: number of bytes allocated when memory allocation
- *                    failure occurred, plus A->ncol.
- *
- */
-
-    DNformat  *Bstore, *Xstore;
-    doublecomplex    *Bmat, *Xmat;
-    int       ldb, ldx, nrhs;
-    SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/
-    SuperMatrix AC; /* Matrix postmultiplied by Pc */
-    int       colequ, equil, nofact, notran, rowequ, permc_spec;
-    trans_t   trant;
-    char      norm[1];
-    int       i, j, info1;
-    double    amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin;
-    int       relax, panel_size;
-    double    diag_pivot_thresh, drop_tol;
-    double    t0;      /* temporary time */
-    double    *utime;
-
-    /* External functions */
-    extern double zlangs(char *, SuperMatrix *);
-    extern double dlamch_(char *);
-
-    Bstore = B->Store;
-    Xstore = X->Store;
-    Bmat   = Bstore->nzval;
-    Xmat   = Xstore->nzval;
-    ldb    = Bstore->lda;
-    ldx    = Xstore->lda;
-    nrhs   = B->ncol;
-
-    *info = 0;
-    nofact = (options->Fact != FACTORED);
-    equil = (options->Equil == YES);
-    notran = (options->Trans == NOTRANS);
-    if ( nofact ) {
-	*(unsigned char *)equed = 'N';
-	rowequ = FALSE;
-	colequ = FALSE;
-    } else {
-	rowequ = lsame_(equed, "R") || lsame_(equed, "B");
-	colequ = lsame_(equed, "C") || lsame_(equed, "B");
-	smlnum = dlamch_("Safe minimum");
-	bignum = 1. / smlnum;
-    }
-
-#if 0
-printf("dgssvx: Fact=%4d, Trans=%4d, equed=%c\n",
-       options->Fact, options->Trans, *equed);
-#endif
-
-    /* Test the input parameters */
-    if (!nofact && options->Fact != DOFACT && options->Fact != SamePattern &&
-	options->Fact != SamePattern_SameRowPerm &&
-	!notran && options->Trans != TRANS && options->Trans != CONJ &&
-	!equil && options->Equil != NO)
-	*info = -1;
-    else if ( A->nrow != A->ncol || A->nrow < 0 ||
-	      (A->Stype != SLU_NC && A->Stype != SLU_NR) ||
-	      A->Dtype != SLU_Z || A->Mtype != SLU_GE )
-	*info = -2;
-    else if (options->Fact == FACTORED &&
-	     !(rowequ || colequ || lsame_(equed, "N")))
-	*info = -6;
-    else {
-	if (rowequ) {
-	    rcmin = bignum;
-	    rcmax = 0.;
-	    for (j = 0; j < A->nrow; ++j) {
-		rcmin = SUPERLU_MIN(rcmin, R[j]);
-		rcmax = SUPERLU_MAX(rcmax, R[j]);
-	    }
-	    if (rcmin <= 0.) *info = -7;
-	    else if ( A->nrow > 0)
-		rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum);
-	    else rowcnd = 1.;
-	}
-	if (colequ && *info == 0) {
-	    rcmin = bignum;
-	    rcmax = 0.;
-	    for (j = 0; j < A->nrow; ++j) {
-		rcmin = SUPERLU_MIN(rcmin, C[j]);
-		rcmax = SUPERLU_MAX(rcmax, C[j]);
-	    }
-	    if (rcmin <= 0.) *info = -8;
-	    else if (A->nrow > 0)
-		colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum);
-	    else colcnd = 1.;
-	}
-	if (*info == 0) {
-	    if ( lwork < -1 ) *info = -12;
-	    else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) ||
-		      B->Stype != SLU_DN || B->Dtype != SLU_Z || 
-		      B->Mtype != SLU_GE )
-		*info = -13;
-	    else if ( X->ncol < 0 || Xstore->lda < SUPERLU_MAX(0, A->nrow) ||
-		      (B->ncol != 0 && B->ncol != X->ncol) ||
-                      X->Stype != SLU_DN ||
-		      X->Dtype != SLU_Z || X->Mtype != SLU_GE )
-		*info = -14;
-	}
-    }
-    if (*info != 0) {
-	i = -(*info);
-	xerbla_("zgssvx", &i);
-	return;
-    }
-    
-    /* Initialization for factor parameters */
-    panel_size = sp_ienv(1);
-    relax      = sp_ienv(2);
-    diag_pivot_thresh = options->DiagPivotThresh;
-    drop_tol   = 0.0;
-
-    utime = stat->utime;
-    
-    /* Convert A to SLU_NC format when necessary. */
-    if ( A->Stype == SLU_NR ) {
-	NRformat *Astore = A->Store;
-	AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
-	zCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, 
-			       Astore->nzval, Astore->colind, Astore->rowptr,
-			       SLU_NC, A->Dtype, A->Mtype);
-	if ( notran ) { /* Reverse the transpose argument. */
-	    trant = TRANS;
-	    notran = 0;
-	} else {
-	    trant = NOTRANS;
-	    notran = 1;
-	}
-    } else { /* A->Stype == SLU_NC */
-	trant = options->Trans;
-	AA = A;
-    }
-
-    if ( nofact && equil ) {
-	t0 = SuperLU_timer_();
-	/* Compute row and column scalings to equilibrate the matrix A. */
-	zgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1);
-	
-	if ( info1 == 0 ) {
-	    /* Equilibrate matrix A. */
-	    zlaqgs(AA, R, C, rowcnd, colcnd, amax, equed);
-	    rowequ = lsame_(equed, "R") || lsame_(equed, "B");
-	    colequ = lsame_(equed, "C") || lsame_(equed, "B");
-	}
-	utime[EQUIL] = SuperLU_timer_() - t0;
-    }
-
-    if ( nrhs > 0 ) {
-        /* Scale the right hand side if equilibration was performed. */
-        if ( notran ) {
-	    if ( rowequ ) {
-	        for (j = 0; j < nrhs; ++j)
-		    for (i = 0; i < A->nrow; ++i) {
-                        zd_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], R[i]);
-	            }
-	    }
-        } else if ( colequ ) {
-	    for (j = 0; j < nrhs; ++j)
-	        for (i = 0; i < A->nrow; ++i) {
-                    zd_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], C[i]);
-	        }
-        }
-    }
-
-    if ( nofact ) {
-	
-        t0 = SuperLU_timer_();
-	/*
-	 * Gnet column permutation vector perm_c[], according to permc_spec:
-	 *   permc_spec = NATURAL:  natural ordering 
-	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
-	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
-	 *   permc_spec = COLAMD:   approximate minimum degree column ordering
-	 *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
-	 */
-	permc_spec = options->ColPerm;
-	if ( permc_spec != MY_PERMC && options->Fact == DOFACT )
-            get_perm_c(permc_spec, AA, perm_c);
-	utime[COLPERM] = SuperLU_timer_() - t0;
-
-	t0 = SuperLU_timer_();
-	sp_preorder(options, AA, perm_c, etree, &AC);
-	utime[ETREE] = SuperLU_timer_() - t0;
-    
-/*	printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", 
-	       relax, panel_size, sp_ienv(3), sp_ienv(4));
-	fflush(stdout); */
-	
-	/* Compute the LU factorization of A*Pc. */
-	t0 = SuperLU_timer_();
-	zgstrf(options, &AC, drop_tol, relax, panel_size,
-	       etree, work, lwork, perm_c, perm_r, L, U, stat, info);
-	utime[FACT] = SuperLU_timer_() - t0;
-	
-	if ( lwork == -1 ) {
-	    mem_usage->total_needed = *info - A->ncol;
-	    return;
-	}
-    }
-
-    if ( options->PivotGrowth ) {
-        if ( *info > 0 ) {
-	    if ( *info <= A->ncol ) {
-	        /* Compute the reciprocal pivot growth factor of the leading
-	           rank-deficient *info columns of A. */
-	        *recip_pivot_growth = zPivotGrowth(*info, AA, perm_c, L, U);
-	    }
-	    return;
-        }
-
-        /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */
-        *recip_pivot_growth = zPivotGrowth(A->ncol, AA, perm_c, L, U);
-    }
-
-    if ( options->ConditionNumber ) {
-        /* Estimate the reciprocal of the condition number of A. */
-        t0 = SuperLU_timer_();
-        if ( notran ) {
-	    *(unsigned char *)norm = '1';
-        } else {
-	    *(unsigned char *)norm = 'I';
-        }
-        anorm = zlangs(norm, AA);
-        zgscon(norm, L, U, anorm, rcond, stat, info);
-        utime[RCOND] = SuperLU_timer_() - t0;
-    }
-    
-    if ( nrhs > 0 ) {
-        /* Compute the solution matrix X. */
-        for (j = 0; j < nrhs; j++)  /* Save a copy of the right hand sides */
-            for (i = 0; i < B->nrow; i++)
-	        Xmat[i + j*ldx] = Bmat[i + j*ldb];
-    
-        t0 = SuperLU_timer_();
-        zgstrs (trant, L, U, perm_c, perm_r, X, stat, info);
-        utime[SOLVE] = SuperLU_timer_() - t0;
-    
-        /* Use iterative refinement to improve the computed solution and compute
-           error bounds and backward error estimates for it. */
-        t0 = SuperLU_timer_();
-        if ( options->IterRefine != NOREFINE ) {
-            zgsrfs(trant, AA, L, U, perm_c, perm_r, equed, R, C, B,
-                   X, ferr, berr, stat, info);
-        } else {
-            for (j = 0; j < nrhs; ++j) ferr[j] = berr[j] = 1.0;
-        }
-        utime[REFINE] = SuperLU_timer_() - t0;
-
-        /* Transform the solution matrix X to a solution of the original system. */
-        if ( notran ) {
-	    if ( colequ ) {
-	        for (j = 0; j < nrhs; ++j)
-		    for (i = 0; i < A->nrow; ++i) {
-                        zd_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], C[i]);
-	            }
-	    }
-        } else if ( rowequ ) {
-	    for (j = 0; j < nrhs; ++j)
-	        for (i = 0; i < A->nrow; ++i) {
-                    zd_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], R[i]);
-                }
-        }
-    } /* end if nrhs > 0 */
-
-    if ( options->ConditionNumber ) {
-        /* Set INFO = A->ncol+1 if the matrix is singular to working precision. */
-        if ( *rcond < dlamch_("E") ) *info = A->ncol + 1;
-    }
-
-    if ( nofact ) {
-        zQuerySpace(L, U, mem_usage);
-        Destroy_CompCol_Permuted(&AC);
-    }
-    if ( A->Stype == SLU_NR ) {
-	Destroy_SuperMatrix_Store(AA);
-	SUPERLU_FREE(AA);
-    }
-
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgstrf.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgstrf.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgstrf.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgstrf.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,433 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_zdefs.h"
-
-void
-zgstrf (superlu_options_t *options, SuperMatrix *A, double drop_tol,
-        int relax, int panel_size, int *etree, void *work, int lwork,
-        int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U,
-        SuperLUStat_t *stat, int *info)
-{
-/*
- * Purpose
- * =======
- *
- * ZGSTRF computes an LU factorization of a general sparse m-by-n
- * matrix A using partial pivoting with row interchanges.
- * The factorization has the form
- *     Pr * A = L * U
- * where Pr is a row permutation matrix, L is lower triangular with unit
- * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is upper 
- * triangular (upper trapezoidal if A->nrow < A->ncol).
- *
- * See supermatrix.h for the definition of 'SuperMatrix' structure.
- *
- * Arguments
- * =========
- *
- * options (input) superlu_options_t*
- *         The structure defines the input parameters to control
- *         how the LU decomposition will be performed.
- *
- * A        (input) SuperMatrix*
- *	    Original matrix A, permuted by columns, of dimension
- *          (A->nrow, A->ncol). The type of A can be:
- *          Stype = SLU_NCP; Dtype = SLU_Z; Mtype = SLU_GE.
- *
- * drop_tol (input) double (NOT IMPLEMENTED)
- *	    Drop tolerance parameter. At step j of the Gaussian elimination,
- *          if abs(A_ij)/(max_i abs(A_ij)) < drop_tol, drop entry A_ij.
- *          0 <= drop_tol <= 1. The default value of drop_tol is 0.
- *
- * relax    (input) int
- *          To control degree of relaxing supernodes. If the number
- *          of nodes (columns) in a subtree of the elimination tree is less
- *          than relax, this subtree is considered as one supernode,
- *          regardless of the row structures of those columns.
- *
- * panel_size (input) int
- *          A panel consists of at most panel_size consecutive columns.
- *
- * etree    (input) int*, dimension (A->ncol)
- *          Elimination tree of A'*A.
- *          Note: etree is a vector of parent pointers for a forest whose
- *          vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
- *          On input, the columns of A should be permuted so that the
- *          etree is in a certain postorder.
- *
- * work     (input/output) void*, size (lwork) (in bytes)
- *          User-supplied work space and space for the output data structures.
- *          Not referenced if lwork = 0;
- *
- * lwork   (input) int
- *         Specifies the size of work array in bytes.
- *         = 0:  allocate space internally by system malloc;
- *         > 0:  use user-supplied work array of length lwork in bytes,
- *               returns error if space runs out.
- *         = -1: the routine guesses the amount of space needed without
- *               performing the factorization, and returns it in
- *               *info; no other side effects.
- *
- * perm_c   (input) int*, dimension (A->ncol)
- *	    Column permutation vector, which defines the 
- *          permutation matrix Pc; perm_c[i] = j means column i of A is 
- *          in position j in A*Pc.
- *          When searching for diagonal, perm_c[*] is applied to the
- *          row subscripts of A, so that diagonal threshold pivoting
- *          can find the diagonal of A, rather than that of A*Pc.
- *
- * perm_r   (input/output) int*, dimension (A->nrow)
- *          Row permutation vector which defines the permutation matrix Pr,
- *          perm_r[i] = j means row i of A is in position j in Pr*A.
- *          If options->Fact = SamePattern_SameRowPerm, the pivoting routine
- *             will try to use the input perm_r, unless a certain threshold
- *             criterion is violated. In that case, perm_r is overwritten by
- *             a new permutation determined by partial pivoting or diagonal
- *             threshold pivoting.
- *          Otherwise, perm_r is output argument;
- *
- * L        (output) SuperMatrix*
- *          The factor L from the factorization Pr*A=L*U; use compressed row 
- *          subscripts storage for supernodes, i.e., L has type: 
- *          Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU.
- *
- * U        (output) SuperMatrix*
- *	    The factor U from the factorization Pr*A*Pc=L*U. Use column-wise
- *          storage scheme, i.e., U has types: Stype = SLU_NC, 
- *          Dtype = SLU_Z, Mtype = SLU_TRU.
- *
- * stat     (output) SuperLUStat_t*
- *          Record the statistics on runtime and floating-point operation count.
- *          See util.h for the definition of 'SuperLUStat_t'.
- *
- * info     (output) int*
- *          = 0: successful exit
- *          < 0: if info = -i, the i-th argument had an illegal value
- *          > 0: if info = i, and i is
- *             <= A->ncol: U(i,i) is exactly zero. The factorization has
- *                been completed, but the factor U is exactly singular,
- *                and division by zero will occur if it is used to solve a
- *                system of equations.
- *             > A->ncol: number of bytes allocated when memory allocation
- *                failure occurred, plus A->ncol. If lwork = -1, it is
- *                the estimated amount of space needed, plus A->ncol.
- *
- * ======================================================================
- *
- * Local Working Arrays: 
- * ======================
- *   m = number of rows in the matrix
- *   n = number of columns in the matrix
- *
- *   xprune[0:n-1]: xprune[*] points to locations in subscript 
- *	vector lsub[*]. For column i, xprune[i] denotes the point where 
- *	structural pruning begins. I.e. only xlsub[i],..,xprune[i]-1 need 
- *	to be traversed for symbolic factorization.
- *
- *   marker[0:3*m-1]: marker[i] = j means that node i has been 
- *	reached when working on column j.
- *	Storage: relative to original row subscripts
- *	NOTE: There are 3 of them: marker/marker1 are used for panel dfs, 
- *	      see zpanel_dfs.c; marker2 is used for inner-factorization,
- *            see zcolumn_dfs.c.
- *
- *   parent[0:m-1]: parent vector used during dfs
- *      Storage: relative to new row subscripts
- *
- *   xplore[0:m-1]: xplore[i] gives the location of the next (dfs) 
- *	unexplored neighbor of i in lsub[*]
- *
- *   segrep[0:nseg-1]: contains the list of supernodal representatives
- *	in topological order of the dfs. A supernode representative is the 
- *	last column of a supernode.
- *      The maximum size of segrep[] is n.
- *
- *   repfnz[0:W*m-1]: for a nonzero segment U[*,j] that ends at a 
- *	supernodal representative r, repfnz[r] is the location of the first 
- *	nonzero in this segment.  It is also used during the dfs: repfnz[r]>0
- *	indicates the supernode r has been explored.
- *	NOTE: There are W of them, each used for one column of a panel. 
- *
- *   panel_lsub[0:W*m-1]: temporary for the nonzeros row indices below 
- *      the panel diagonal. These are filled in during zpanel_dfs(), and are
- *      used later in the inner LU factorization within the panel.
- *	panel_lsub[]/dense[] pair forms the SPA data structure.
- *	NOTE: There are W of them.
- *
- *   dense[0:W*m-1]: sparse accumulating (SPA) vector for intermediate values;
- *	    	   NOTE: there are W of them.
- *
- *   tempv[0:*]: real temporary used for dense numeric kernels;
- *	The size of this array is defined by NUM_TEMPV() in zsp_defs.h.
- *
- */
-    /* Local working arrays */
-    NCPformat *Astore;
-    int       *iperm_r = NULL; /* inverse of perm_r; used when 
-                                  options->Fact == SamePattern_SameRowPerm */
-    int       *iperm_c; /* inverse of perm_c */
-    int       *iwork;
-    doublecomplex    *zwork;
-    int	      *segrep, *repfnz, *parent, *xplore;
-    int	      *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */
-    int	      *xprune;
-    int	      *marker;
-    doublecomplex    *dense, *tempv;
-    int       *relax_end;
-    doublecomplex    *a;
-    int       *asub;
-    int       *xa_begin, *xa_end;
-    int       *xsup, *supno;
-    int       *xlsub, *xlusup, *xusub;
-    int       nzlumax;
-    static GlobalLU_t Glu; /* persistent to facilitate multiple factors. */
-
-    /* Local scalars */
-    fact_t    fact = options->Fact;
-    double    diag_pivot_thresh = options->DiagPivotThresh;
-    int       pivrow;   /* pivotal row number in the original matrix A */
-    int       nseg1;	/* no of segments in U-column above panel row jcol */
-    int       nseg;	/* no of segments in each U-column */
-    register int jcol;	
-    register int kcol;	/* end column of a relaxed snode */
-    register int icol;
-    register int i, k, jj, new_next, iinfo;
-    int       m, n, min_mn, jsupno, fsupc, nextlu, nextu;
-    int       w_def;	/* upper bound on panel width */
-    int       usepr, iperm_r_allocated = 0;
-    int       nnzL, nnzU;
-    int       *panel_histo = stat->panel_histo;
-    flops_t   *ops = stat->ops;
-
-    iinfo    = 0;
-    m        = A->nrow;
-    n        = A->ncol;
-    min_mn   = SUPERLU_MIN(m, n);
-    Astore   = A->Store;
-    a        = Astore->nzval;
-    asub     = Astore->rowind;
-    xa_begin = Astore->colbeg;
-    xa_end   = Astore->colend;
-
-    /* Allocate storage common to the factor routines */
-    *info = zLUMemInit(fact, work, lwork, m, n, Astore->nnz,
-                       panel_size, L, U, &Glu, &iwork, &zwork);
-    if ( *info ) return;
-    
-    xsup    = Glu.xsup;
-    supno   = Glu.supno;
-    xlsub   = Glu.xlsub;
-    xlusup  = Glu.xlusup;
-    xusub   = Glu.xusub;
-    
-    SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore,
-	     &repfnz, &panel_lsub, &xprune, &marker);
-    zSetRWork(m, panel_size, zwork, &dense, &tempv);
-    
-    usepr = (fact == SamePattern_SameRowPerm);
-    if ( usepr ) {
-	/* Compute the inverse of perm_r */
-	iperm_r = (int *) intMalloc(m);
-	for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k;
-	iperm_r_allocated = 1;
-    }
-    iperm_c = (int *) intMalloc(n);
-    for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k;
-
-    /* Identify relaxed snodes */
-    relax_end = (int *) intMalloc(n);
-    if ( options->SymmetricMode == YES ) {
-        heap_relax_snode(n, etree, relax, marker, relax_end); 
-    } else {
-        relax_snode(n, etree, relax, marker, relax_end); 
-    }
-    
-    ifill (perm_r, m, EMPTY);
-    ifill (marker, m * NO_MARKER, EMPTY);
-    supno[0] = -1;
-    xsup[0]  = xlsub[0] = xusub[0] = xlusup[0] = 0;
-    w_def    = panel_size;
-
-    /* 
-     * Work on one "panel" at a time. A panel is one of the following: 
-     *	   (a) a relaxed supernode at the bottom of the etree, or
-     *	   (b) panel_size contiguous columns, defined by the user
-     */
-    for (jcol = 0; jcol < min_mn; ) {
-
-	if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */
-   	    kcol = relax_end[jcol];	  /* end of the relaxed snode */
-	    panel_histo[kcol-jcol+1]++;
-
-	    /* --------------------------------------
-	     * Factorize the relaxed supernode(jcol:kcol) 
-	     * -------------------------------------- */
-	    /* Determine the union of the row structure of the snode */
-	    if ( (*info = zsnode_dfs(jcol, kcol, asub, xa_begin, xa_end,
-				    xprune, marker, &Glu)) != 0 )
-		return;
-
-            nextu    = xusub[jcol];
-	    nextlu   = xlusup[jcol];
-	    jsupno   = supno[jcol];
-	    fsupc    = xsup[jsupno];
-	    new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1);
-	    nzlumax = Glu.nzlumax;
-	    while ( new_next > nzlumax ) {
-		if ( (*info = zLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, &Glu)) )
-		    return;
-	    }
-    
-	    for (icol = jcol; icol<= kcol; icol++) {
-		xusub[icol+1] = nextu;
-		
-    		/* Scatter into SPA dense[*] */
-    		for (k = xa_begin[icol]; k < xa_end[icol]; k++)
-        	    dense[asub[k]] = a[k];
-
-	       	/* Numeric update within the snode */
-	        zsnode_bmod(icol, jsupno, fsupc, dense, tempv, &Glu, stat);
-
-		if ( (*info = zpivotL(icol, diag_pivot_thresh, &usepr, perm_r,
-				      iperm_r, iperm_c, &pivrow, &Glu, stat)) )
-		    if ( iinfo == 0 ) iinfo = *info;
-		
-#ifdef DEBUG
-		zprint_lu_col("[1]: ", icol, pivrow, xprune, &Glu);
-#endif
-
-	    }
-
-	    jcol = icol;
-
-	} else { /* Work on one panel of panel_size columns */
-	    
-	    /* Adjust panel_size so that a panel won't overlap with the next 
-	     * relaxed snode.
-	     */
-	    panel_size = w_def;
-	    for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++) 
-		if ( relax_end[k] != EMPTY ) {
-		    panel_size = k - jcol;
-		    break;
-		}
-	    if ( k == min_mn ) panel_size = min_mn - jcol;
-	    panel_histo[panel_size]++;
-
-	    /* symbolic factor on a panel of columns */
-	    zpanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1,
-		      dense, panel_lsub, segrep, repfnz, xprune,
-		      marker, parent, xplore, &Glu);
-	    
-	    /* numeric sup-panel updates in topological order */
-	    zpanel_bmod(m, panel_size, jcol, nseg1, dense,
-		        tempv, segrep, repfnz, &Glu, stat);
-	    
-	    /* Sparse LU within the panel, and below panel diagonal */
-    	    for ( jj = jcol; jj < jcol + panel_size; jj++) {
- 		k = (jj - jcol) * m; /* column index for w-wide arrays */
-
-		nseg = nseg1;	/* Begin after all the panel segments */
-
-	    	if ((*info = zcolumn_dfs(m, jj, perm_r, &nseg, &panel_lsub[k],
-					segrep, &repfnz[k], xprune, marker,
-					parent, xplore, &Glu)) != 0) return;
-
-	      	/* Numeric updates */
-	    	if ((*info = zcolumn_bmod(jj, (nseg - nseg1), &dense[k],
-					 tempv, &segrep[nseg1], &repfnz[k],
-					 jcol, &Glu, stat)) != 0) return;
-		
-	        /* Copy the U-segments to ucol[*] */
-		if ((*info = zcopy_to_ucol(jj, nseg, segrep, &repfnz[k],
-					  perm_r, &dense[k], &Glu)) != 0)
-		    return;
-
-	    	if ( (*info = zpivotL(jj, diag_pivot_thresh, &usepr, perm_r,
-				      iperm_r, iperm_c, &pivrow, &Glu, stat)) )
-		    if ( iinfo == 0 ) iinfo = *info;
-
-		/* Prune columns (0:jj-1) using column jj */
-	    	zpruneL(jj, perm_r, pivrow, nseg, segrep,
-                        &repfnz[k], xprune, &Glu);
-
-		/* Reset repfnz[] for this column */
-	    	resetrep_col (nseg, segrep, &repfnz[k]);
-		
-#ifdef DEBUG
-		zprint_lu_col("[2]: ", jj, pivrow, xprune, &Glu);
-#endif
-
-	    }
-
-   	    jcol += panel_size;	/* Move to the next panel */
-
-	} /* else */
-
-    } /* for */
-
-    *info = iinfo;
-    
-    if ( m > n ) {
-	k = 0;
-        for (i = 0; i < m; ++i) 
-            if ( perm_r[i] == EMPTY ) {
-    		perm_r[i] = n + k;
-		++k;
-	    }
-    }
-
-    countnz(min_mn, xprune, &nnzL, &nnzU, &Glu);
-    fixupL(min_mn, perm_r, &Glu);
-
-    zLUWorkFree(iwork, zwork, &Glu); /* Free work space and compress storage */
-
-    if ( fact == SamePattern_SameRowPerm ) {
-        /* L and U structures may have changed due to possibly different
-	   pivoting, even though the storage is available.
-	   There could also be memory expansions, so the array locations
-           may have changed, */
-        ((SCformat *)L->Store)->nnz = nnzL;
-	((SCformat *)L->Store)->nsuper = Glu.supno[n];
-	((SCformat *)L->Store)->nzval = Glu.lusup;
-	((SCformat *)L->Store)->nzval_colptr = Glu.xlusup;
-	((SCformat *)L->Store)->rowind = Glu.lsub;
-	((SCformat *)L->Store)->rowind_colptr = Glu.xlsub;
-	((NCformat *)U->Store)->nnz = nnzU;
-	((NCformat *)U->Store)->nzval = Glu.ucol;
-	((NCformat *)U->Store)->rowind = Glu.usub;
-	((NCformat *)U->Store)->colptr = Glu.xusub;
-    } else {
-        zCreate_SuperNode_Matrix(L, A->nrow, min_mn, nnzL, Glu.lusup, 
-	                         Glu.xlusup, Glu.lsub, Glu.xlsub, Glu.supno,
-			         Glu.xsup, SLU_SC, SLU_Z, SLU_TRLU);
-    	zCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu.ucol, 
-			       Glu.usub, Glu.xusub, SLU_NC, SLU_Z, SLU_TRU);
-    }
-    
-    ops[FACT] += ops[TRSV] + ops[GEMV];	
-    
-    if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r);
-    SUPERLU_FREE (iperm_c);
-    SUPERLU_FREE (relax_end);
-
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgstrs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgstrs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zgstrs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zgstrs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,345 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_zdefs.h"
-
-
-/* 
- * Function prototypes 
- */
-void zusolve(int, int, doublecomplex*, doublecomplex*);
-void zlsolve(int, int, doublecomplex*, doublecomplex*);
-void zmatvec(int, int, int, doublecomplex*, doublecomplex*, doublecomplex*);
-
-
-void
-zgstrs (trans_t trans, SuperMatrix *L, SuperMatrix *U,
-        int *perm_c, int *perm_r, SuperMatrix *B,
-        SuperLUStat_t *stat, int *info)
-{
-/*
- * Purpose
- * =======
- *
- * ZGSTRS solves a system of linear equations A*X=B or A'*X=B
- * with A sparse and B dense, using the LU factorization computed by
- * ZGSTRF.
- *
- * See supermatrix.h for the definition of 'SuperMatrix' structure.
- *
- * Arguments
- * =========
- *
- * trans   (input) trans_t
- *          Specifies the form of the system of equations:
- *          = NOTRANS: A * X = B  (No transpose)
- *          = TRANS:   A'* X = B  (Transpose)
- *          = CONJ:    A**H * X = B  (Conjugate transpose)
- *
- * L       (input) SuperMatrix*
- *         The factor L from the factorization Pr*A*Pc=L*U as computed by
- *         zgstrf(). Use compressed row subscripts storage for supernodes,
- *         i.e., L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU.
- *
- * U       (input) SuperMatrix*
- *         The factor U from the factorization Pr*A*Pc=L*U as computed by
- *         zgstrf(). Use column-wise storage scheme, i.e., U has types:
- *         Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_TRU.
- *
- * perm_c  (input) int*, dimension (L->ncol)
- *	   Column permutation vector, which defines the 
- *         permutation matrix Pc; perm_c[i] = j means column i of A is 
- *         in position j in A*Pc.
- *
- * perm_r  (input) int*, dimension (L->nrow)
- *         Row permutation vector, which defines the permutation matrix Pr; 
- *         perm_r[i] = j means row i of A is in position j in Pr*A.
- *
- * B       (input/output) SuperMatrix*
- *         B has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE.
- *         On entry, the right hand side matrix.
- *         On exit, the solution matrix if info = 0;
- *
- * stat     (output) SuperLUStat_t*
- *          Record the statistics on runtime and floating-point operation count.
- *          See util.h for the definition of 'SuperLUStat_t'.
- *
- * info    (output) int*
- * 	   = 0: successful exit
- *	   < 0: if info = -i, the i-th argument had an illegal value
- *
- */
-#ifdef _CRAY
-    _fcd ftcs1, ftcs2, ftcs3, ftcs4;
-#endif
-    int      incx = 1, incy = 1;
-#ifdef USE_VENDOR_BLAS
-    doublecomplex   alpha = {1.0, 0.0}, beta = {1.0, 0.0};
-    doublecomplex   *work_col;
-#endif
-    doublecomplex   temp_comp;
-    DNformat *Bstore;
-    doublecomplex   *Bmat;
-    SCformat *Lstore;
-    NCformat *Ustore;
-    doublecomplex   *Lval, *Uval;
-    int      fsupc, nrow, nsupr, nsupc, luptr, istart, irow;
-    int      i, j, k, iptr, jcol, n, ldb, nrhs;
-    doublecomplex   *work, *rhs_work, *soln;
-    flops_t  solve_ops;
-    void zprint_soln();
-
-    /* Test input parameters ... */
-    *info = 0;
-    Bstore = B->Store;
-    ldb = Bstore->lda;
-    nrhs = B->ncol;
-    if ( trans != NOTRANS && trans != TRANS && trans != CONJ ) *info = -1;
-    else if ( L->nrow != L->ncol || L->nrow < 0 ||
-	      L->Stype != SLU_SC || L->Dtype != SLU_Z || L->Mtype != SLU_TRLU )
-	*info = -2;
-    else if ( U->nrow != U->ncol || U->nrow < 0 ||
-	      U->Stype != SLU_NC || U->Dtype != SLU_Z || U->Mtype != SLU_TRU )
-	*info = -3;
-    else if ( ldb < SUPERLU_MAX(0, L->nrow) ||
-	      B->Stype != SLU_DN || B->Dtype != SLU_Z || B->Mtype != SLU_GE )
-	*info = -6;
-    if ( *info ) {
-	i = -(*info);
-	xerbla_("zgstrs", &i);
-	return;
-    }
-
-    n = L->nrow;
-    work = doublecomplexCalloc(n * nrhs);
-    if ( !work ) ABORT("Malloc fails for local work[].");
-    soln = doublecomplexMalloc(n);
-    if ( !soln ) ABORT("Malloc fails for local soln[].");
-
-    Bmat = Bstore->nzval;
-    Lstore = L->Store;
-    Lval = Lstore->nzval;
-    Ustore = U->Store;
-    Uval = Ustore->nzval;
-    solve_ops = 0;
-    
-    if ( trans == NOTRANS ) {
-	/* Permute right hand sides to form Pr*B */
-	for (i = 0; i < nrhs; i++) {
-	    rhs_work = &Bmat[i*ldb];
-	    for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k];
-	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
-	}
-	
-	/* Forward solve PLy=Pb. */
-	for (k = 0; k <= Lstore->nsuper; k++) {
-	    fsupc = L_FST_SUPC(k);
-	    istart = L_SUB_START(fsupc);
-	    nsupr = L_SUB_START(fsupc+1) - istart;
-	    nsupc = L_FST_SUPC(k+1) - fsupc;
-	    nrow = nsupr - nsupc;
-
-	    solve_ops += 4 * nsupc * (nsupc - 1) * nrhs;
-	    solve_ops += 8 * nrow * nsupc * nrhs;
-	    
-	    if ( nsupc == 1 ) {
-		for (j = 0; j < nrhs; j++) {
-		    rhs_work = &Bmat[j*ldb];
-	    	    luptr = L_NZ_START(fsupc);
-		    for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); iptr++){
-			irow = L_SUB(iptr);
-			++luptr;
-			zz_mult(&temp_comp, &rhs_work[fsupc], &Lval[luptr]);
-			z_sub(&rhs_work[irow], &rhs_work[irow], &temp_comp);
-		    }
-		}
-	    } else {
-	    	luptr = L_NZ_START(fsupc);
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		ftcs1 = _cptofcd("L", strlen("L"));
-		ftcs2 = _cptofcd("N", strlen("N"));
-		ftcs3 = _cptofcd("U", strlen("U"));
-		CTRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha,
-		       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
-		
-		CGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha, 
-			&Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, 
-			&beta, &work[0], &n );
-#else
-		ztrsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha,
-		       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
-		
-		zgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha, 
-			&Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, 
-			&beta, &work[0], &n );
-#endif
-		for (j = 0; j < nrhs; j++) {
-		    rhs_work = &Bmat[j*ldb];
-		    work_col = &work[j*n];
-		    iptr = istart + nsupc;
-		    for (i = 0; i < nrow; i++) {
-			irow = L_SUB(iptr);
-			z_sub(&rhs_work[irow], &rhs_work[irow], &work_col[i]);
-			work_col[i].r = 0.0;
-	                work_col[i].i = 0.0;
-			iptr++;
-		    }
-		}
-#else		
-		for (j = 0; j < nrhs; j++) {
-		    rhs_work = &Bmat[j*ldb];
-		    zlsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]);
-		    zmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc],
-			    &rhs_work[fsupc], &work[0] );
-
-		    iptr = istart + nsupc;
-		    for (i = 0; i < nrow; i++) {
-			irow = L_SUB(iptr);
-			z_sub(&rhs_work[irow], &rhs_work[irow], &work[i]);
-			work[i].r = 0.;
-	                work[i].i = 0.;
-			iptr++;
-		    }
-		}
-#endif		    
-	    } /* else ... */
-	} /* for L-solve */
-
-#ifdef DEBUG
-  	printf("After L-solve: y=\n");
-	zprint_soln(n, nrhs, Bmat);
-#endif
-
-	/*
-	 * Back solve Ux=y.
-	 */
-	for (k = Lstore->nsuper; k >= 0; k--) {
-	    fsupc = L_FST_SUPC(k);
-	    istart = L_SUB_START(fsupc);
-	    nsupr = L_SUB_START(fsupc+1) - istart;
-	    nsupc = L_FST_SUPC(k+1) - fsupc;
-	    luptr = L_NZ_START(fsupc);
-
-	    solve_ops += 4 * nsupc * (nsupc + 1) * nrhs;
-
-	    if ( nsupc == 1 ) {
-		rhs_work = &Bmat[0];
-		for (j = 0; j < nrhs; j++) {
-		    z_div(&rhs_work[fsupc], &rhs_work[fsupc], &Lval[luptr]);
-		    rhs_work += ldb;
-		}
-	    } else {
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		ftcs1 = _cptofcd("L", strlen("L"));
-		ftcs2 = _cptofcd("U", strlen("U"));
-		ftcs3 = _cptofcd("N", strlen("N"));
-		CTRSM( ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha,
-		       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
-#else
-		ztrsm_("L", "U", "N", "N", &nsupc, &nrhs, &alpha,
-		       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
-#endif
-#else		
-		for (j = 0; j < nrhs; j++)
-		    zusolve ( nsupr, nsupc, &Lval[luptr], &Bmat[fsupc+j*ldb] );
-#endif		
-	    }
-
-	    for (j = 0; j < nrhs; ++j) {
-		rhs_work = &Bmat[j*ldb];
-		for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) {
-		    solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
-		    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++ ){
-			irow = U_SUB(i);
-			zz_mult(&temp_comp, &rhs_work[jcol], &Uval[i]);
-			z_sub(&rhs_work[irow], &rhs_work[irow], &temp_comp);
-		    }
-		}
-	    }
-	    
-	} /* for U-solve */
-
-#ifdef DEBUG
-  	printf("After U-solve: x=\n");
-	zprint_soln(n, nrhs, Bmat);
-#endif
-
-	/* Compute the final solution X := Pc*X. */
-	for (i = 0; i < nrhs; i++) {
-	    rhs_work = &Bmat[i*ldb];
-	    for (k = 0; k < n; k++) soln[k] = rhs_work[perm_c[k]];
-	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
-	}
-	
-        stat->ops[SOLVE] = solve_ops;
-
-    } else { /* Solve A'*X=B or CONJ(A)*X=B */
-	/* Permute right hand sides to form Pc'*B. */
-	for (i = 0; i < nrhs; i++) {
-	    rhs_work = &Bmat[i*ldb];
-	    for (k = 0; k < n; k++) soln[perm_c[k]] = rhs_work[k];
-	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
-	}
-
-	stat->ops[SOLVE] = 0;
-        if (trans == TRANS) {
-	    for (k = 0; k < nrhs; ++k) {
-	        /* Multiply by inv(U'). */
-	        sp_ztrsv("U", "T", "N", L, U, &Bmat[k*ldb], stat, info);
-	    
-	        /* Multiply by inv(L'). */
-	        sp_ztrsv("L", "T", "U", L, U, &Bmat[k*ldb], stat, info);
-	    }
-         } else { /* trans == CONJ */
-            for (k = 0; k < nrhs; ++k) {                
-                /* Multiply by conj(inv(U')). */
-                sp_ztrsv("U", "C", "N", L, U, &Bmat[k*ldb], stat, info);
-                
-                /* Multiply by conj(inv(L')). */
-                sp_ztrsv("L", "C", "U", L, U, &Bmat[k*ldb], stat, info);
-	    }
-         }
-	/* Compute the final solution X := Pr'*X (=inv(Pr)*X) */
-	for (i = 0; i < nrhs; i++) {
-	    rhs_work = &Bmat[i*ldb];
-	    for (k = 0; k < n; k++) soln[k] = rhs_work[perm_r[k]];
-	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
-	}
-
-    }
-
-    SUPERLU_FREE(work);
-    SUPERLU_FREE(soln);
-}
-
-/*
- * Diagnostic print of the solution vector 
- */
-void
-zprint_soln(int n, int nrhs, doublecomplex *soln)
-{
-    int i;
-
-    for (i = 0; i < n; i++) 
-  	printf("\t%d: %.4f\n", i, soln[i]);
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zlacon.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zlacon.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zlacon.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zlacon.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,213 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-#include <math.h>
-#include "slu_Cnames.h"
-#include "slu_dcomplex.h"
-
-int
-zlacon_(int *n, doublecomplex *v, doublecomplex *x, double *est, int *kase)
-
-{
-/*
-    Purpose   
-    =======   
-
-    ZLACON estimates the 1-norm of a square matrix A.   
-    Reverse communication is used for evaluating matrix-vector products. 
-  
-
-    Arguments   
-    =========   
-
-    N      (input) INT
-           The order of the matrix.  N >= 1.   
-
-    V      (workspace) DOUBLE COMPLEX PRECISION array, dimension (N)   
-           On the final return, V = A*W,  where  EST = norm(V)/norm(W)   
-           (W is not returned).   
-
-    X      (input/output) DOUBLE COMPLEX PRECISION array, dimension (N)   
-           On an intermediate return, X should be overwritten by   
-                 A * X,   if KASE=1,   
-                 A' * X,  if KASE=2,
-           where A' is the conjugate transpose of A,
-           and ZLACON must be re-called with all the other parameters   
-           unchanged.   
-
-
-    EST    (output) DOUBLE PRECISION   
-           An estimate (a lower bound) for norm(A).   
-
-    KASE   (input/output) INT
-           On the initial call to ZLACON, KASE should be 0.   
-           On an intermediate return, KASE will be 1 or 2, indicating   
-           whether X should be overwritten by A * X  or A' * X.   
-           On the final return from ZLACON, KASE will again be 0.   
-
-    Further Details   
-    ======= =======   
-
-    Contributed by Nick Higham, University of Manchester.   
-    Originally named CONEST, dated March 16, 1988.   
-
-    Reference: N.J. Higham, "FORTRAN codes for estimating the one-norm of 
-    a real or complex matrix, with applications to condition estimation", 
-    ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988.   
-    ===================================================================== 
-*/
-
-    /* Table of constant values */
-    int c__1 = 1;
-    doublecomplex      zero = {0.0, 0.0};
-    doublecomplex      one = {1.0, 0.0};
-
-    /* System generated locals */
-    double d__1;
-    
-    /* Local variables */
-    static int iter;
-    static int jump, jlast;
-    static double altsgn, estold;
-    static int i, j;
-    double temp;
-    double safmin;
-    extern double dlamch_(char *);
-    extern int izmax1_(int *, doublecomplex *, int *);
-    extern double dzsum1_(int *, doublecomplex *, int *);
-
-    safmin = dlamch_("Safe minimum");
-    if ( *kase == 0 ) {
-	for (i = 0; i < *n; ++i) {
-	    x[i].r = 1. / (double) (*n);
-	    x[i].i = 0.;
-	}
-	*kase = 1;
-	jump = 1;
-	return 0;
-    }
-
-    switch (jump) {
-	case 1:  goto L20;
-	case 2:  goto L40;
-	case 3:  goto L70;
-	case 4:  goto L110;
-	case 5:  goto L140;
-    }
-
-    /*     ................ ENTRY   (JUMP = 1)   
-	   FIRST ITERATION.  X HAS BEEN OVERWRITTEN BY A*X. */
-  L20:
-    if (*n == 1) {
-	v[0] = x[0];
-	*est = z_abs(&v[0]);
-	/*        ... QUIT */
-	goto L150;
-    }
-    *est = dzsum1_(n, x, &c__1);
-
-    for (i = 0; i < *n; ++i) {
-	d__1 = z_abs(&x[i]);
-	if (d__1 > safmin) {
-	    d__1 = 1 / d__1;
-	    x[i].r *= d__1;
-	    x[i].i *= d__1;
-	} else {
-	    x[i] = one;
-	}
-    }
-    *kase = 2;
-    jump = 2;
-    return 0;
-
-    /*     ................ ENTRY   (JUMP = 2)   
-	   FIRST ITERATION.  X HAS BEEN OVERWRITTEN BY TRANSPOSE(A)*X. */
-L40:
-    j = izmax1_(n, &x[0], &c__1);
-    --j;
-    iter = 2;
-
-    /*     MAIN LOOP - ITERATIONS 2,3,...,ITMAX. */
-L50:
-    for (i = 0; i < *n; ++i) x[i] = zero;
-    x[j] = one;
-    *kase = 1;
-    jump = 3;
-    return 0;
-
-    /*     ................ ENTRY   (JUMP = 3)   
-	   X HAS BEEN OVERWRITTEN BY A*X. */
-L70:
-#ifdef _CRAY
-    CCOPY(n, x, &c__1, v, &c__1);
-#else
-    zcopy_(n, x, &c__1, v, &c__1);
-#endif
-    estold = *est;
-    *est = dzsum1_(n, v, &c__1);
-
-
-L90:
-    /*     TEST FOR CYCLING. */
-    if (*est <= estold) goto L120;
-
-    for (i = 0; i < *n; ++i) {
-	d__1 = z_abs(&x[i]);
-	if (d__1 > safmin) {
-	    d__1 = 1 / d__1;
-	    x[i].r *= d__1;
-	    x[i].i *= d__1;
-	} else {
-	    x[i] = one;
-	}
-    }
-    *kase = 2;
-    jump = 4;
-    return 0;
-
-    /*     ................ ENTRY   (JUMP = 4)   
-	   X HAS BEEN OVERWRITTEN BY TRANDPOSE(A)*X. */
-L110:
-    jlast = j;
-    j = izmax1_(n, &x[0], &c__1);
-    --j;
-    if (x[jlast].r != (d__1 = x[j].r, fabs(d__1)) && iter < 5) {
-	++iter;
-	goto L50;
-    }
-
-    /*     ITERATION COMPLETE.  FINAL STAGE. */
-L120:
-    altsgn = 1.;
-    for (i = 1; i <= *n; ++i) {
-	x[i-1].r = altsgn * ((double)(i - 1) / (double)(*n - 1) + 1.);
-	x[i-1].i = 0.;
-	altsgn = -altsgn;
-    }
-    *kase = 1;
-    jump = 5;
-    return 0;
-    
-    /*     ................ ENTRY   (JUMP = 5)   
-	   X HAS BEEN OVERWRITTEN BY A*X. */
-L140:
-    temp = dzsum1_(n, x, &c__1) / (double)(*n * 3) * 2.;
-    if (temp > *est) {
-#ifdef _CRAY
-	CCOPY(n, &x[0], &c__1, &v[0], &c__1);
-#else
-	zcopy_(n, &x[0], &c__1, &v[0], &c__1);
-#endif
-	*est = temp;
-    }
-
-L150:
-    *kase = 0;
-    return 0;
-
-} /* zlacon_ */
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zlangs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zlangs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zlangs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zlangs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,110 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * File name:	zlangs.c
- * History:     Modified from lapack routine ZLANGE
- */
-#include <math.h>
-#include "slu_zdefs.h"
-
-double zlangs(char *norm, SuperMatrix *A)
-{
-/* 
-    Purpose   
-    =======   
-
-    ZLANGS returns the value of the one norm, or the Frobenius norm, or 
-    the infinity norm, or the element of largest absolute value of a 
-    real matrix A.   
-
-    Description   
-    ===========   
-
-    ZLANGE returns the value   
-
-       ZLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'   
-                (   
-                ( norm1(A),         NORM = '1', 'O' or 'o'   
-                (   
-                ( normI(A),         NORM = 'I' or 'i'   
-                (   
-                ( normF(A),         NORM = 'F', 'f', 'E' or 'e'   
-
-    where  norm1  denotes the  one norm of a matrix (maximum column sum), 
-    normI  denotes the  infinity norm  of a matrix  (maximum row sum) and 
-    normF  denotes the  Frobenius norm of a matrix (square root of sum of 
-    squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.   
-
-    Arguments   
-    =========   
-
-    NORM    (input) CHARACTER*1   
-            Specifies the value to be returned in ZLANGE as described above.   
-    A       (input) SuperMatrix*
-            The M by N sparse matrix A. 
-
-   ===================================================================== 
-*/
-    
-    /* Local variables */
-    NCformat *Astore;
-    doublecomplex   *Aval;
-    int      i, j, irow;
-    double   value, sum;
-    double   *rwork;
-
-    Astore = A->Store;
-    Aval   = Astore->nzval;
-    
-    if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) {
-	value = 0.;
-	
-    } else if (lsame_(norm, "M")) {
-	/* Find max(abs(A(i,j))). */
-	value = 0.;
-	for (j = 0; j < A->ncol; ++j)
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++)
-		value = SUPERLU_MAX( value, z_abs( &Aval[i]) );
-	
-    } else if (lsame_(norm, "O") || *(unsigned char *)norm == '1') {
-	/* Find norm1(A). */
-	value = 0.;
-	for (j = 0; j < A->ncol; ++j) {
-	    sum = 0.;
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) 
-		sum += z_abs( &Aval[i] );
-	    value = SUPERLU_MAX(value,sum);
-	}
-	
-    } else if (lsame_(norm, "I")) {
-	/* Find normI(A). */
-	if ( !(rwork = (double *) SUPERLU_MALLOC(A->nrow * sizeof(double))) )
-	    ABORT("SUPERLU_MALLOC fails for rwork.");
-	for (i = 0; i < A->nrow; ++i) rwork[i] = 0.;
-	for (j = 0; j < A->ncol; ++j)
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) {
-		irow = Astore->rowind[i];
-		rwork[irow] += z_abs( &Aval[i] );
-	    }
-	value = 0.;
-	for (i = 0; i < A->nrow; ++i)
-	    value = SUPERLU_MAX(value, rwork[i]);
-	
-	SUPERLU_FREE (rwork);
-	
-    } else if (lsame_(norm, "F") || lsame_(norm, "E")) {
-	/* Find normF(A). */
-	ABORT("Not implemented.");
-    } else
-	ABORT("Illegal norm specified.");
-
-    return (value);
-
-} /* zlangs */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zlaqgs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zlaqgs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zlaqgs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zlaqgs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,138 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * File name:	zlaqgs.c
- * History:     Modified from LAPACK routine ZLAQGE
- */
-#include <math.h>
-#include "slu_zdefs.h"
-
-void
-zlaqgs(SuperMatrix *A, double *r, double *c, 
-	double rowcnd, double colcnd, double amax, char *equed)
-{
-/*
-    Purpose   
-    =======   
-
-    ZLAQGS equilibrates a general sparse M by N matrix A using the row and   
-    scaling factors in the vectors R and C.   
-
-    See supermatrix.h for the definition of 'SuperMatrix' structure.
-
-    Arguments   
-    =========   
-
-    A       (input/output) SuperMatrix*
-            On exit, the equilibrated matrix.  See EQUED for the form of 
-            the equilibrated matrix. The type of A can be:
-	    Stype = NC; Dtype = SLU_Z; Mtype = GE.
-	    
-    R       (input) double*, dimension (A->nrow)
-            The row scale factors for A.
-	    
-    C       (input) double*, dimension (A->ncol)
-            The column scale factors for A.
-	    
-    ROWCND  (input) double
-            Ratio of the smallest R(i) to the largest R(i).
-	    
-    COLCND  (input) double
-            Ratio of the smallest C(i) to the largest C(i).
-	    
-    AMAX    (input) double
-            Absolute value of largest matrix entry.
-	    
-    EQUED   (output) char*
-            Specifies the form of equilibration that was done.   
-            = 'N':  No equilibration   
-            = 'R':  Row equilibration, i.e., A has been premultiplied by  
-                    diag(R).   
-            = 'C':  Column equilibration, i.e., A has been postmultiplied  
-                    by diag(C).   
-            = 'B':  Both row and column equilibration, i.e., A has been
-                    replaced by diag(R) * A * diag(C).   
-
-    Internal Parameters   
-    ===================   
-
-    THRESH is a threshold value used to decide if row or column scaling   
-    should be done based on the ratio of the row or column scaling   
-    factors.  If ROWCND < THRESH, row scaling is done, and if   
-    COLCND < THRESH, column scaling is done.   
-
-    LARGE and SMALL are threshold values used to decide if row scaling   
-    should be done based on the absolute size of the largest matrix   
-    element.  If AMAX > LARGE or AMAX < SMALL, row scaling is done.   
-
-    ===================================================================== 
-*/
-
-#define THRESH    (0.1)
-    
-    /* Local variables */
-    NCformat *Astore;
-    doublecomplex   *Aval;
-    int i, j, irow;
-    double large, small, cj;
-    extern double dlamch_(char *);
-    double temp;
-
-
-    /* Quick return if possible */
-    if (A->nrow <= 0 || A->ncol <= 0) {
-	*(unsigned char *)equed = 'N';
-	return;
-    }
-
-    Astore = A->Store;
-    Aval = Astore->nzval;
-    
-    /* Initialize LARGE and SMALL. */
-    small = dlamch_("Safe minimum") / dlamch_("Precision");
-    large = 1. / small;
-
-    if (rowcnd >= THRESH && amax >= small && amax <= large) {
-	if (colcnd >= THRESH)
-	    *(unsigned char *)equed = 'N';
-	else {
-	    /* Column scaling */
-	    for (j = 0; j < A->ncol; ++j) {
-		cj = c[j];
-		for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-		    zd_mult(&Aval[i], &Aval[i], cj);
-                }
-	    }
-	    *(unsigned char *)equed = 'C';
-	}
-    } else if (colcnd >= THRESH) {
-	/* Row scaling, no column scaling */
-	for (j = 0; j < A->ncol; ++j)
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-		irow = Astore->rowind[i];
-		zd_mult(&Aval[i], &Aval[i], r[irow]);
-	    }
-	*(unsigned char *)equed = 'R';
-    } else {
-	/* Row and column scaling */
-	for (j = 0; j < A->ncol; ++j) {
-	    cj = c[j];
-	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-		irow = Astore->rowind[i];
-		temp = cj * r[irow];
-		zd_mult(&Aval[i], &Aval[i], temp);
-	    }
-	}
-	*(unsigned char *)equed = 'B';
-    }
-
-    return;
-
-} /* zlaqgs */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zmemory.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zmemory.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zmemory.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zmemory.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,680 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-#include "slu_zdefs.h"
-
-/* Constants */
-#define NO_MEMTYPE  4      /* 0: lusup;
-			      1: ucol;
-			      2: lsub;
-			      3: usub */
-#define GluIntArray(n)   (5 * (n) + 5)
-
-/* Internal prototypes */
-void  *zexpand (int *, MemType,int, int, GlobalLU_t *);
-int   zLUWorkInit (int, int, int, int **, doublecomplex **, LU_space_t);
-void  copy_mem_doublecomplex (int, void *, void *);
-void  zStackCompress (GlobalLU_t *);
-void  zSetupSpace (void *, int, LU_space_t *);
-void  *zuser_malloc (int, int);
-void  zuser_free (int, int);
-
-/* External prototypes (in memory.c - prec-indep) */
-extern void    copy_mem_int    (int, void *, void *);
-extern void    user_bcopy      (char *, char *, int);
-
-/* Headers for 4 types of dynamatically managed memory */
-typedef struct e_node {
-    int size;      /* length of the memory that has been used */
-    void *mem;     /* pointer to the new malloc'd store */
-} ExpHeader;
-
-typedef struct {
-    int  size;
-    int  used;
-    int  top1;  /* grow upward, relative to &array[0] */
-    int  top2;  /* grow downward */
-    void *array;
-} LU_stack_t;
-
-/* Variables local to this file */
-static ExpHeader *expanders = 0; /* Array of pointers to 4 types of memory */
-static LU_stack_t stack;
-static int no_expand;
-
-/* Macros to manipulate stack */
-#define StackFull(x)         ( x + stack.used >= stack.size )
-#define NotDoubleAlign(addr) ( (long int)addr & 7 )
-#define DoubleAlign(addr)    ( ((long int)addr + 7) & ~7L )
-#define TempSpace(m, w)      ( (2*w + 4 + NO_MARKER) * m * sizeof(int) + \
-			      (w + 1) * m * sizeof(doublecomplex) )
-#define Reduce(alpha)        ((alpha + 1) / 2)  /* i.e. (alpha-1)/2 + 1 */
-
-
-
-
-/*
- * Setup the memory model to be used for factorization.
- *    lwork = 0: use system malloc;
- *    lwork > 0: use user-supplied work[] space.
- */
-void zSetupSpace(void *work, int lwork, LU_space_t *MemModel)
-{
-    if ( lwork == 0 ) {
-	*MemModel = SYSTEM; /* malloc/free */
-    } else if ( lwork > 0 ) {
-	*MemModel = USER;   /* user provided space */
-	stack.used = 0;
-	stack.top1 = 0;
-	stack.top2 = (lwork/4)*4; /* must be word addressable */
-	stack.size = stack.top2;
-	stack.array = (void *) work;
-    }
-}
-
-
-
-void *zuser_malloc(int bytes, int which_end)
-{
-    void *buf;
-    
-    if ( StackFull(bytes) ) return (NULL);
-
-    if ( which_end == HEAD ) {
-	buf = (char*) stack.array + stack.top1;
-	stack.top1 += bytes;
-    } else {
-	stack.top2 -= bytes;
-	buf = (char*) stack.array + stack.top2;
-    }
-    
-    stack.used += bytes;
-    return buf;
-}
-
-
-void zuser_free(int bytes, int which_end)
-{
-    if ( which_end == HEAD ) {
-	stack.top1 -= bytes;
-    } else {
-	stack.top2 += bytes;
-    }
-    stack.used -= bytes;
-}
-
-
-
-/*
- * mem_usage consists of the following fields:
- *    - for_lu (float)
- *      The amount of space used in bytes for the L\U data structures.
- *    - total_needed (float)
- *      The amount of space needed in bytes to perform factorization.
- *    - expansions (int)
- *      Number of memory expansions during the LU factorization.
- */
-int zQuerySpace(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage)
-{
-    SCformat *Lstore;
-    NCformat *Ustore;
-    register int n, iword, dword, panel_size = sp_ienv(1);
-
-    Lstore = L->Store;
-    Ustore = U->Store;
-    n = L->ncol;
-    iword = sizeof(int);
-    dword = sizeof(doublecomplex);
-
-    /* For LU factors */
-    mem_usage->for_lu = (float)( (4*n + 3) * iword + Lstore->nzval_colptr[n] *
-				 dword + Lstore->rowind_colptr[n] * iword );
-    mem_usage->for_lu += (float)( (n + 1) * iword +
-				 Ustore->colptr[n] * (dword + iword) );
-
-    /* Working storage to support factorization */
-    mem_usage->total_needed = mem_usage->for_lu +
-	(float)( (2 * panel_size + 4 + NO_MARKER) * n * iword +
-		(panel_size + 1) * n * dword );
-
-    mem_usage->expansions = --no_expand;
-
-    return 0;
-} /* zQuerySpace */
-
-/*
- * Allocate storage for the data structures common to all factor routines.
- * For those unpredictable size, make a guess as FILL * nnz(A).
- * Return value:
- *     If lwork = -1, return the estimated amount of space required, plus n;
- *     otherwise, return the amount of space actually allocated when
- *     memory allocation failure occurred.
- */
-int
-zLUMemInit(fact_t fact, void *work, int lwork, int m, int n, int annz,
-	  int panel_size, SuperMatrix *L, SuperMatrix *U, GlobalLU_t *Glu,
-	  int **iwork, doublecomplex **dwork)
-{
-    int      info, iword, dword;
-    SCformat *Lstore;
-    NCformat *Ustore;
-    int      *xsup, *supno;
-    int      *lsub, *xlsub;
-    doublecomplex   *lusup;
-    int      *xlusup;
-    doublecomplex   *ucol;
-    int      *usub, *xusub;
-    int      nzlmax, nzumax, nzlumax;
-    int      FILL = sp_ienv(6);
-    
-    Glu->n    = n;
-    no_expand = 0;
-    iword     = sizeof(int);
-    dword     = sizeof(doublecomplex);
-
-    if ( !expanders )	
-        expanders = (ExpHeader*)SUPERLU_MALLOC(NO_MEMTYPE * sizeof(ExpHeader));
-    if ( !expanders ) ABORT("SUPERLU_MALLOC fails for expanders");
-    
-    if ( fact != SamePattern_SameRowPerm ) {
-	/* Guess for L\U factors */
-	nzumax = nzlumax = FILL * annz;
-	nzlmax = SUPERLU_MAX(1, FILL/4.) * annz;
-
-	if ( lwork == -1 ) {
-	    return ( GluIntArray(n) * iword + TempSpace(m, panel_size)
-		    + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n );
-        } else {
-	    zSetupSpace(work, lwork, &Glu->MemModel);
-	}
-	
-#if ( PRNTlevel >= 1 )
-	printf("zLUMemInit() called: FILL %ld, nzlmax %ld, nzumax %ld\n", 
-	       FILL, nzlmax, nzumax);
-	fflush(stdout);
-#endif	
-	
-	/* Integer pointers for L\U factors */
-	if ( Glu->MemModel == SYSTEM ) {
-	    xsup   = intMalloc(n+1);
-	    supno  = intMalloc(n+1);
-	    xlsub  = intMalloc(n+1);
-	    xlusup = intMalloc(n+1);
-	    xusub  = intMalloc(n+1);
-	} else {
-	    xsup   = (int *)zuser_malloc((n+1) * iword, HEAD);
-	    supno  = (int *)zuser_malloc((n+1) * iword, HEAD);
-	    xlsub  = (int *)zuser_malloc((n+1) * iword, HEAD);
-	    xlusup = (int *)zuser_malloc((n+1) * iword, HEAD);
-	    xusub  = (int *)zuser_malloc((n+1) * iword, HEAD);
-	}
-
-	lusup = (doublecomplex *) zexpand( &nzlumax, LUSUP, 0, 0, Glu );
-	ucol  = (doublecomplex *) zexpand( &nzumax, UCOL, 0, 0, Glu );
-	lsub  = (int *)    zexpand( &nzlmax, LSUB, 0, 0, Glu );
-	usub  = (int *)    zexpand( &nzumax, USUB, 0, 1, Glu );
-
-	while ( !lusup || !ucol || !lsub || !usub ) {
-	    if ( Glu->MemModel == SYSTEM ) {
-		SUPERLU_FREE(lusup); 
-		SUPERLU_FREE(ucol); 
-		SUPERLU_FREE(lsub); 
-		SUPERLU_FREE(usub);
-	    } else {
-		zuser_free((nzlumax+nzumax)*dword+(nzlmax+nzumax)*iword, HEAD);
-	    }
-	    nzlumax /= 2;
-	    nzumax /= 2;
-	    nzlmax /= 2;
-	    if ( nzlumax < annz ) {
-		printf("Not enough memory to perform factorization.\n");
-		return (zmemory_usage(nzlmax, nzumax, nzlumax, n) + n);
-	    }
-#if ( PRNTlevel >= 1)
-	    printf("zLUMemInit() reduce size: nzlmax %ld, nzumax %ld\n", 
-		   nzlmax, nzumax);
-	    fflush(stdout);
-#endif
-	    lusup = (doublecomplex *) zexpand( &nzlumax, LUSUP, 0, 0, Glu );
-	    ucol  = (doublecomplex *) zexpand( &nzumax, UCOL, 0, 0, Glu );
-	    lsub  = (int *)    zexpand( &nzlmax, LSUB, 0, 0, Glu );
-	    usub  = (int *)    zexpand( &nzumax, USUB, 0, 1, Glu );
-	}
-	
-    } else {
-	/* fact == SamePattern_SameRowPerm */
-	Lstore   = L->Store;
-	Ustore   = U->Store;
-	xsup     = Lstore->sup_to_col;
-	supno    = Lstore->col_to_sup;
-	xlsub    = Lstore->rowind_colptr;
-	xlusup   = Lstore->nzval_colptr;
-	xusub    = Ustore->colptr;
-	nzlmax   = Glu->nzlmax;    /* max from previous factorization */
-	nzumax   = Glu->nzumax;
-	nzlumax  = Glu->nzlumax;
-	
-	if ( lwork == -1 ) {
-	    return ( GluIntArray(n) * iword + TempSpace(m, panel_size)
-		    + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n );
-        } else if ( lwork == 0 ) {
-	    Glu->MemModel = SYSTEM;
-	} else {
-	    Glu->MemModel = USER;
-	    stack.top2 = (lwork/4)*4; /* must be word-addressable */
-	    stack.size = stack.top2;
-	}
-	
-	lsub  = expanders[LSUB].mem  = Lstore->rowind;
-	lusup = expanders[LUSUP].mem = Lstore->nzval;
-	usub  = expanders[USUB].mem  = Ustore->rowind;
-	ucol  = expanders[UCOL].mem  = Ustore->nzval;;
-	expanders[LSUB].size         = nzlmax;
-	expanders[LUSUP].size        = nzlumax;
-	expanders[USUB].size         = nzumax;
-	expanders[UCOL].size         = nzumax;	
-    }
-
-    Glu->xsup    = xsup;
-    Glu->supno   = supno;
-    Glu->lsub    = lsub;
-    Glu->xlsub   = xlsub;
-    Glu->lusup   = lusup;
-    Glu->xlusup  = xlusup;
-    Glu->ucol    = ucol;
-    Glu->usub    = usub;
-    Glu->xusub   = xusub;
-    Glu->nzlmax  = nzlmax;
-    Glu->nzumax  = nzumax;
-    Glu->nzlumax = nzlumax;
-    
-    info = zLUWorkInit(m, n, panel_size, iwork, dwork, Glu->MemModel);
-    if ( info )
-	return ( info + zmemory_usage(nzlmax, nzumax, nzlumax, n) + n);
-    
-    ++no_expand;
-    return 0;
-    
-} /* zLUMemInit */
-
-/* Allocate known working storage. Returns 0 if success, otherwise
-   returns the number of bytes allocated so far when failure occurred. */
-int
-zLUWorkInit(int m, int n, int panel_size, int **iworkptr, 
-            doublecomplex **dworkptr, LU_space_t MemModel)
-{
-    int    isize, dsize, extra;
-    doublecomplex *old_ptr;
-    int    maxsuper = sp_ienv(3),
-           rowblk   = sp_ienv(4);
-
-    isize = ( (2 * panel_size + 3 + NO_MARKER ) * m + n ) * sizeof(int);
-    dsize = (m * panel_size +
-	     NUM_TEMPV(m,panel_size,maxsuper,rowblk)) * sizeof(doublecomplex);
-    
-    if ( MemModel == SYSTEM ) 
-	*iworkptr = (int *) intCalloc(isize/sizeof(int));
-    else
-	*iworkptr = (int *) zuser_malloc(isize, TAIL);
-    if ( ! *iworkptr ) {
-	fprintf(stderr, "zLUWorkInit: malloc fails for local iworkptr[]\n");
-	return (isize + n);
-    }
-
-    if ( MemModel == SYSTEM )
-	*dworkptr = (doublecomplex *) SUPERLU_MALLOC(dsize);
-    else {
-	*dworkptr = (doublecomplex *) zuser_malloc(dsize, TAIL);
-	if ( NotDoubleAlign(*dworkptr) ) {
-	    old_ptr = *dworkptr;
-	    *dworkptr = (doublecomplex*) DoubleAlign(*dworkptr);
-	    *dworkptr = (doublecomplex*) ((double*)*dworkptr - 1);
-	    extra = (char*)old_ptr - (char*)*dworkptr;
-#ifdef DEBUG	    
-	    printf("zLUWorkInit: not aligned, extra %d\n", extra);
-#endif	    
-	    stack.top2 -= extra;
-	    stack.used += extra;
-	}
-    }
-    if ( ! *dworkptr ) {
-	fprintf(stderr, "malloc fails for local dworkptr[].");
-	return (isize + dsize + n);
-    }
-	
-    return 0;
-}
-
-
-/*
- * Set up pointers for real working arrays.
- */
-void
-zSetRWork(int m, int panel_size, doublecomplex *dworkptr,
-	 doublecomplex **dense, doublecomplex **tempv)
-{
-    doublecomplex zero = {0.0, 0.0};
-
-    int maxsuper = sp_ienv(3),
-        rowblk   = sp_ienv(4);
-    *dense = dworkptr;
-    *tempv = *dense + panel_size*m;
-    zfill (*dense, m * panel_size, zero);
-    zfill (*tempv, NUM_TEMPV(m,panel_size,maxsuper,rowblk), zero);     
-}
-	
-/*
- * Free the working storage used by factor routines.
- */
-void zLUWorkFree(int *iwork, doublecomplex *dwork, GlobalLU_t *Glu)
-{
-    if ( Glu->MemModel == SYSTEM ) {
-	SUPERLU_FREE (iwork);
-	SUPERLU_FREE (dwork);
-    } else {
-	stack.used -= (stack.size - stack.top2);
-	stack.top2 = stack.size;
-/*	zStackCompress(Glu);  */
-    }
-    
-    SUPERLU_FREE (expanders);	
-    expanders = 0;
-}
-
-/* Expand the data structures for L and U during the factorization.
- * Return value:   0 - successful return
- *               > 0 - number of bytes allocated when run out of space
- */
-int
-zLUMemXpand(int jcol,
-	   int next,          /* number of elements currently in the factors */
-	   MemType mem_type,  /* which type of memory to expand  */
-	   int *maxlen,       /* modified - maximum length of a data structure */
-	   GlobalLU_t *Glu    /* modified - global LU data structures */
-	   )
-{
-    void   *new_mem;
-    
-#ifdef DEBUG    
-    printf("zLUMemXpand(): jcol %d, next %d, maxlen %d, MemType %d\n",
-	   jcol, next, *maxlen, mem_type);
-#endif    
-
-    if (mem_type == USUB) 
-    	new_mem = zexpand(maxlen, mem_type, next, 1, Glu);
-    else
-	new_mem = zexpand(maxlen, mem_type, next, 0, Glu);
-    
-    if ( !new_mem ) {
-	int    nzlmax  = Glu->nzlmax;
-	int    nzumax  = Glu->nzumax;
-	int    nzlumax = Glu->nzlumax;
-    	fprintf(stderr, "Can't expand MemType %d: jcol %d\n", mem_type, jcol);
-    	return (zmemory_usage(nzlmax, nzumax, nzlumax, Glu->n) + Glu->n);
-    }
-
-    switch ( mem_type ) {
-      case LUSUP:
-	Glu->lusup   = (doublecomplex *) new_mem;
-	Glu->nzlumax = *maxlen;
-	break;
-      case UCOL:
-	Glu->ucol   = (doublecomplex *) new_mem;
-	Glu->nzumax = *maxlen;
-	break;
-      case LSUB:
-	Glu->lsub   = (int *) new_mem;
-	Glu->nzlmax = *maxlen;
-	break;
-      case USUB:
-	Glu->usub   = (int *) new_mem;
-	Glu->nzumax = *maxlen;
-	break;
-    }
-    
-    return 0;
-    
-}
-
-
-
-void
-copy_mem_doublecomplex(int howmany, void *old, void *new)
-{
-    register int i;
-    doublecomplex *dold = old;
-    doublecomplex *dnew = new;
-    for (i = 0; i < howmany; i++) dnew[i] = dold[i];
-}
-
-/*
- * Expand the existing storage to accommodate more fill-ins.
- */
-void
-*zexpand (
-	 int *prev_len,   /* length used from previous call */
-	 MemType type,    /* which part of the memory to expand */
-	 int len_to_copy, /* size of the memory to be copied to new store */
-	 int keep_prev,   /* = 1: use prev_len;
-			     = 0: compute new_len to expand */
-	 GlobalLU_t *Glu  /* modified - global LU data structures */
-	)
-{
-    float    EXPAND = 1.5;
-    float    alpha;
-    void     *new_mem, *old_mem;
-    int      new_len, tries, lword, extra, bytes_to_copy;
-
-    alpha = EXPAND;
-
-    if ( no_expand == 0 || keep_prev ) /* First time allocate requested */
-        new_len = *prev_len;
-    else {
-	new_len = alpha * *prev_len;
-    }
-    
-    if ( type == LSUB || type == USUB ) lword = sizeof(int);
-    else lword = sizeof(doublecomplex);
-
-    if ( Glu->MemModel == SYSTEM ) {
-	new_mem = (void *) SUPERLU_MALLOC((size_t)new_len * lword);
-	if ( no_expand != 0 ) {
-	    tries = 0;
-	    if ( keep_prev ) {
-		if ( !new_mem ) return (NULL);
-	    } else {
-		while ( !new_mem ) {
-		    if ( ++tries > 10 ) return (NULL);
-		    alpha = Reduce(alpha);
-		    new_len = alpha * *prev_len;
-		    new_mem = (void *) SUPERLU_MALLOC((size_t)new_len * lword);
-		}
-	    }
-	    if ( type == LSUB || type == USUB ) {
-		copy_mem_int(len_to_copy, expanders[type].mem, new_mem);
-	    } else {
-		copy_mem_doublecomplex(len_to_copy, expanders[type].mem, new_mem);
-	    }
-	    SUPERLU_FREE (expanders[type].mem);
-	}
-	expanders[type].mem = (void *) new_mem;
-	
-    } else { /* MemModel == USER */
-	if ( no_expand == 0 ) {
-	    new_mem = zuser_malloc(new_len * lword, HEAD);
-	    if ( NotDoubleAlign(new_mem) &&
-		(type == LUSUP || type == UCOL) ) {
-		old_mem = new_mem;
-		new_mem = (void *)DoubleAlign(new_mem);
-		extra = (char*)new_mem - (char*)old_mem;
-#ifdef DEBUG		
-		printf("expand(): not aligned, extra %d\n", extra);
-#endif		
-		stack.top1 += extra;
-		stack.used += extra;
-	    }
-	    expanders[type].mem = (void *) new_mem;
-	}
-	else {
-	    tries = 0;
-	    extra = (new_len - *prev_len) * lword;
-	    if ( keep_prev ) {
-		if ( StackFull(extra) ) return (NULL);
-	    } else {
-		while ( StackFull(extra) ) {
-		    if ( ++tries > 10 ) return (NULL);
-		    alpha = Reduce(alpha);
-		    new_len = alpha * *prev_len;
-		    extra = (new_len - *prev_len) * lword;	    
-		}
-	    }
-
-	    if ( type != USUB ) {
-		new_mem = (void*)((char*)expanders[type + 1].mem + extra);
-		bytes_to_copy = (char*)stack.array + stack.top1
-		    - (char*)expanders[type + 1].mem;
-		user_bcopy(expanders[type+1].mem, new_mem, bytes_to_copy);
-
-		if ( type < USUB ) {
-		    Glu->usub = expanders[USUB].mem =
-			(void*)((char*)expanders[USUB].mem + extra);
-		}
-		if ( type < LSUB ) {
-		    Glu->lsub = expanders[LSUB].mem =
-			(void*)((char*)expanders[LSUB].mem + extra);
-		}
-		if ( type < UCOL ) {
-		    Glu->ucol = expanders[UCOL].mem =
-			(void*)((char*)expanders[UCOL].mem + extra);
-		}
-		stack.top1 += extra;
-		stack.used += extra;
-		if ( type == UCOL ) {
-		    stack.top1 += extra;   /* Add same amount for USUB */
-		    stack.used += extra;
-		}
-		
-	    } /* if ... */
-
-	} /* else ... */
-    }
-
-    expanders[type].size = new_len;
-    *prev_len = new_len;
-    if ( no_expand ) ++no_expand;
-    
-    return (void *) expanders[type].mem;
-    
-} /* zexpand */
-
-
-/*
- * Compress the work[] array to remove fragmentation.
- */
-void
-zStackCompress(GlobalLU_t *Glu)
-{
-    register int iword, dword, ndim;
-    char    *last, *fragment;
-    int      *ifrom, *ito;
-    doublecomplex   *dfrom, *dto;
-    int      *xlsub, *lsub, *xusub, *usub, *xlusup;
-    doublecomplex   *ucol, *lusup;
-    
-    iword = sizeof(int);
-    dword = sizeof(doublecomplex);
-    ndim = Glu->n;
-
-    xlsub  = Glu->xlsub;
-    lsub   = Glu->lsub;
-    xusub  = Glu->xusub;
-    usub   = Glu->usub;
-    xlusup = Glu->xlusup;
-    ucol   = Glu->ucol;
-    lusup  = Glu->lusup;
-    
-    dfrom = ucol;
-    dto = (doublecomplex *)((char*)lusup + xlusup[ndim] * dword);
-    copy_mem_doublecomplex(xusub[ndim], dfrom, dto);
-    ucol = dto;
-
-    ifrom = lsub;
-    ito = (int *) ((char*)ucol + xusub[ndim] * iword);
-    copy_mem_int(xlsub[ndim], ifrom, ito);
-    lsub = ito;
-    
-    ifrom = usub;
-    ito = (int *) ((char*)lsub + xlsub[ndim] * iword);
-    copy_mem_int(xusub[ndim], ifrom, ito);
-    usub = ito;
-    
-    last = (char*)usub + xusub[ndim] * iword;
-    fragment = (char*) (((char*)stack.array + stack.top1) - last);
-    stack.used -= (long int) fragment;
-    stack.top1 -= (long int) fragment;
-
-    Glu->ucol = ucol;
-    Glu->lsub = lsub;
-    Glu->usub = usub;
-    
-#ifdef DEBUG
-    printf("zStackCompress: fragment %d\n", fragment);
-    /* for (last = 0; last < ndim; ++last)
-	print_lu_col("After compress:", last, 0);*/
-#endif    
-    
-}
-
-/*
- * Allocate storage for original matrix A
- */
-void
-zallocateA(int n, int nnz, doublecomplex **a, int **asub, int **xa)
-{
-    *a    = (doublecomplex *) doublecomplexMalloc(nnz);
-    *asub = (int *) intMalloc(nnz);
-    *xa   = (int *) intMalloc(n+1);
-}
-
-
-doublecomplex *doublecomplexMalloc(int n)
-{
-    doublecomplex *buf;
-    buf = (doublecomplex *) SUPERLU_MALLOC((size_t)n * sizeof(doublecomplex)); 
-    if ( !buf ) {
-	ABORT("SUPERLU_MALLOC failed for buf in doublecomplexMalloc()\n");
-    }
-    return (buf);
-}
-
-doublecomplex *doublecomplexCalloc(int n)
-{
-    doublecomplex *buf;
-    register int i;
-    doublecomplex zero = {0.0, 0.0};
-    buf = (doublecomplex *) SUPERLU_MALLOC((size_t)n * sizeof(doublecomplex));
-    if ( !buf ) {
-	ABORT("SUPERLU_MALLOC failed for buf in doublecomplexCalloc()\n");
-    }
-    for (i = 0; i < n; ++i) buf[i] = zero;
-    return (buf);
-}
-
-
-int zmemory_usage(const int nzlmax, const int nzumax, 
-		  const int nzlumax, const int n)
-{
-    register int iword, dword;
-
-    iword   = sizeof(int);
-    dword   = sizeof(doublecomplex);
-    
-    return (10 * n * iword +
-	    nzlmax * iword + nzumax * (iword + dword) + nzlumax * dword);
-
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zmyblas2.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zmyblas2.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zmyblas2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zmyblas2.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,182 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * File name:		zmyblas2.c
- * Purpose:
- *     Level 2 BLAS operations: solves and matvec, written in C.
- * Note:
- *     This is only used when the system lacks an efficient BLAS library.
- */
-#include "slu_dcomplex.h"
-
-/*
- * Solves a dense UNIT lower triangular system. The unit lower 
- * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). 
- * The solution will be returned in the rhs vector.
- */
-void zlsolve ( int ldm, int ncol, doublecomplex *M, doublecomplex *rhs )
-{
-    int k;
-    doublecomplex x0, x1, x2, x3, temp;
-    doublecomplex *M0;
-    doublecomplex *Mki0, *Mki1, *Mki2, *Mki3;
-    register int firstcol = 0;
-
-    M0 = &M[0];
-
-
-    while ( firstcol < ncol - 3 ) { /* Do 4 columns */
-      	Mki0 = M0 + 1;
-      	Mki1 = Mki0 + ldm + 1;
-      	Mki2 = Mki1 + ldm + 1;
-      	Mki3 = Mki2 + ldm + 1;
-
-      	x0 = rhs[firstcol];
-      	zz_mult(&temp, &x0, Mki0); Mki0++;
-      	z_sub(&x1, &rhs[firstcol+1], &temp);
-      	zz_mult(&temp, &x0, Mki0); Mki0++;
-	z_sub(&x2, &rhs[firstcol+2], &temp);
-	zz_mult(&temp, &x1, Mki1); Mki1++;
-	z_sub(&x2, &x2, &temp);
-      	zz_mult(&temp, &x0, Mki0); Mki0++;
-	z_sub(&x3, &rhs[firstcol+3], &temp);
-	zz_mult(&temp, &x1, Mki1); Mki1++;
-	z_sub(&x3, &x3, &temp);
-	zz_mult(&temp, &x2, Mki2); Mki2++;
-	z_sub(&x3, &x3, &temp);
-
- 	rhs[++firstcol] = x1;
-      	rhs[++firstcol] = x2;
-      	rhs[++firstcol] = x3;
-      	++firstcol;
-    
-      	for (k = firstcol; k < ncol; k++) {
-	    zz_mult(&temp, &x0, Mki0); Mki0++;
-	    z_sub(&rhs[k], &rhs[k], &temp);
-	    zz_mult(&temp, &x1, Mki1); Mki1++;
-	    z_sub(&rhs[k], &rhs[k], &temp);
-	    zz_mult(&temp, &x2, Mki2); Mki2++;
-	    z_sub(&rhs[k], &rhs[k], &temp);
-	    zz_mult(&temp, &x3, Mki3); Mki3++;
-	    z_sub(&rhs[k], &rhs[k], &temp);
-	}
-
-        M0 += 4 * ldm + 4;
-    }
-
-    if ( firstcol < ncol - 1 ) { /* Do 2 columns */
-        Mki0 = M0 + 1;
-        Mki1 = Mki0 + ldm + 1;
-
-        x0 = rhs[firstcol];
-	zz_mult(&temp, &x0, Mki0); Mki0++;
-	z_sub(&x1, &rhs[firstcol+1], &temp);
-
-      	rhs[++firstcol] = x1;
-      	++firstcol;
-    
-      	for (k = firstcol; k < ncol; k++) {
-	    zz_mult(&temp, &x0, Mki0); Mki0++;
-	    z_sub(&rhs[k], &rhs[k], &temp);
-	    zz_mult(&temp, &x1, Mki1); Mki1++;
-	    z_sub(&rhs[k], &rhs[k], &temp);
-	} 
-    }
-    
-}
-
-/*
- * Solves a dense upper triangular system. The upper triangular matrix is
- * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned
- * in the rhs vector.
- */
-void
-zusolve ( ldm, ncol, M, rhs )
-int ldm;	/* in */
-int ncol;	/* in */
-doublecomplex *M;	/* in */
-doublecomplex *rhs;	/* modified */
-{
-    doublecomplex xj, temp;
-    int jcol, j, irow;
-
-    jcol = ncol - 1;
-
-    for (j = 0; j < ncol; j++) {
-
-	z_div(&xj, &rhs[jcol], &M[jcol + jcol*ldm]); /* M(jcol, jcol) */
-	rhs[jcol] = xj;
-	
-	for (irow = 0; irow < jcol; irow++) {
-	    zz_mult(&temp, &xj, &M[irow+jcol*ldm]); /* M(irow, jcol) */
-	    z_sub(&rhs[irow], &rhs[irow], &temp);
-	}
-
-	jcol--;
-
-    }
-}
-
-
-/*
- * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec.
- * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[].
- */
-void zmatvec ( ldm, nrow, ncol, M, vec, Mxvec )
-int ldm;	/* in -- leading dimension of M */
-int nrow;	/* in */ 
-int ncol;	/* in */
-doublecomplex *M;	/* in */
-doublecomplex *vec;	/* in */
-doublecomplex *Mxvec;	/* in/out */
-{
-    doublecomplex vi0, vi1, vi2, vi3;
-    doublecomplex *M0, temp;
-    doublecomplex *Mki0, *Mki1, *Mki2, *Mki3;
-    register int firstcol = 0;
-    int k;
-
-    M0 = &M[0];
-
-    while ( firstcol < ncol - 3 ) {	/* Do 4 columns */
-	Mki0 = M0;
-	Mki1 = Mki0 + ldm;
-	Mki2 = Mki1 + ldm;
-	Mki3 = Mki2 + ldm;
-
-	vi0 = vec[firstcol++];
-	vi1 = vec[firstcol++];
-	vi2 = vec[firstcol++];
-	vi3 = vec[firstcol++];	
-	for (k = 0; k < nrow; k++) {
-	    zz_mult(&temp, &vi0, Mki0); Mki0++;
-	    z_add(&Mxvec[k], &Mxvec[k], &temp);
-	    zz_mult(&temp, &vi1, Mki1); Mki1++;
-	    z_add(&Mxvec[k], &Mxvec[k], &temp);
-	    zz_mult(&temp, &vi2, Mki2); Mki2++;
-	    z_add(&Mxvec[k], &Mxvec[k], &temp);
-	    zz_mult(&temp, &vi3, Mki3); Mki3++;
-	    z_add(&Mxvec[k], &Mxvec[k], &temp);
-	}
-
-	M0 += 4 * ldm;
-    }
-
-    while ( firstcol < ncol ) {		/* Do 1 column */
- 	Mki0 = M0;
-	vi0 = vec[firstcol++];
-	for (k = 0; k < nrow; k++) {
-	    zz_mult(&temp, &vi0, Mki0); Mki0++;
-	    z_add(&Mxvec[k], &Mxvec[k], &temp);
-	}
-	M0 += ldm;
-    }
-	
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zpanel_bmod.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zpanel_bmod.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zpanel_bmod.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zpanel_bmod.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,478 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "slu_zdefs.h"
-
-/* 
- * Function prototypes 
- */
-void zlsolve(int, int, doublecomplex *, doublecomplex *);
-void zmatvec(int, int, int, doublecomplex *, doublecomplex *, doublecomplex *);
-extern void zcheck_tempv();
-
-void
-zpanel_bmod (
-	    const int  m,          /* in - number of rows in the matrix */
-	    const int  w,          /* in */
-	    const int  jcol,       /* in */
-	    const int  nseg,       /* in */
-	    doublecomplex     *dense,     /* out, of size n by w */
-	    doublecomplex     *tempv,     /* working array */
-	    int        *segrep,    /* in */
-	    int        *repfnz,    /* in, of size n by w */
-	    GlobalLU_t *Glu,       /* modified */
-	    SuperLUStat_t *stat    /* output */
-	    )
-{
-/* 
- * Purpose
- * =======
- *
- *    Performs numeric block updates (sup-panel) in topological order.
- *    It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
- *    Special processing on the supernodal portion of L\U[*,j]
- *
- *    Before entering this routine, the original nonzeros in the panel 
- *    were already copied into the spa[m,w].
- *
- *    Updated/Output parameters-
- *	dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned 
- *      collectively in the m-by-w vector dense[*]. 
- *
- */
-
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-    _fcd ftcs1 = _cptofcd("L", strlen("L")),
-         ftcs2 = _cptofcd("N", strlen("N")),
-         ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-    int          incx = 1, incy = 1;
-    doublecomplex       alpha, beta;
-#endif
-
-    register int k, ksub;
-    int          fsupc, nsupc, nsupr, nrow;
-    int          krep, krep_ind;
-    doublecomplex       ukj, ukj1, ukj2;
-    int          luptr, luptr1, luptr2;
-    int          segsze;
-    int          block_nrow;  /* no of rows in a block row */
-    register int lptr;	      /* Points to the row subscripts of a supernode */
-    int          kfnz, irow, no_zeros; 
-    register int isub, isub1, i;
-    register int jj;	      /* Index through each column in the panel */
-    int          *xsup, *supno;
-    int          *lsub, *xlsub;
-    doublecomplex       *lusup;
-    int          *xlusup;
-    int          *repfnz_col; /* repfnz[] for a column in the panel */
-    doublecomplex       *dense_col;  /* dense[] for a column in the panel */
-    doublecomplex       *tempv1;             /* Used in 1-D update */
-    doublecomplex       *TriTmp, *MatvecTmp; /* used in 2-D update */
-    doublecomplex      zero = {0.0, 0.0};
-    doublecomplex      one = {1.0, 0.0};
-    doublecomplex      comp_temp, comp_temp1;
-    register int ldaTmp;
-    register int r_ind, r_hi;
-    static   int first = 1, maxsuper, rowblk, colblk;
-    flops_t  *ops = stat->ops;
-    
-    xsup    = Glu->xsup;
-    supno   = Glu->supno;
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    lusup   = Glu->lusup;
-    xlusup  = Glu->xlusup;
-    
-    if ( first ) {
-	maxsuper = sp_ienv(3);
-	rowblk   = sp_ienv(4);
-	colblk   = sp_ienv(5);
-	first = 0;
-    }
-    ldaTmp = maxsuper + rowblk;
-
-    /* 
-     * For each nonz supernode segment of U[*,j] in topological order 
-     */
-    k = nseg - 1;
-    for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */
-
-	/* krep = representative of current k-th supernode
-	 * fsupc = first supernodal column
-	 * nsupc = no of columns in a supernode
-	 * nsupr = no of rows in a supernode
-	 */
-        krep = segrep[k--];
-	fsupc = xsup[supno[krep]];
-	nsupc = krep - fsupc + 1;
-	nsupr = xlsub[fsupc+1] - xlsub[fsupc];
-	nrow = nsupr - nsupc;
-	lptr = xlsub[fsupc];
-	krep_ind = lptr + nsupc - 1;
-
-	repfnz_col = repfnz;
-	dense_col = dense;
-	
-	if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */
-
-	    TriTmp = tempv;
-	
-	    /* Sequence through each column in panel -- triangular solves */
-	    for (jj = jcol; jj < jcol + w; jj++,
-		 repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) {
-
-		kfnz = repfnz_col[krep];
-		if ( kfnz == EMPTY ) continue;	/* Skip any zero segment */
-	    
-		segsze = krep - kfnz + 1;
-		luptr = xlusup[fsupc];
-
-		ops[TRSV] += 4 * segsze * (segsze - 1);
-		ops[GEMV] += 8 * nrow * segsze;
-	
-		/* Case 1: Update U-segment of size 1 -- col-col update */
-		if ( segsze == 1 ) {
-		    ukj = dense_col[lsub[krep_ind]];
-		    luptr += nsupr*(nsupc-1) + nsupc;
-
-		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
-			irow = lsub[i];
-	    	        zz_mult(&comp_temp, &ukj, &lusup[luptr]);
-		        z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
-			++luptr;
-		    }
-
-		} else if ( segsze <= 3 ) {
-		    ukj = dense_col[lsub[krep_ind]];
-		    ukj1 = dense_col[lsub[krep_ind - 1]];
-		    luptr += nsupr*(nsupc-1) + nsupc-1;
-		    luptr1 = luptr - nsupr;
-
-		    if ( segsze == 2 ) {
-		        zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
-		        z_sub(&ukj, &ukj, &comp_temp);
-			dense_col[lsub[krep_ind]] = ukj;
-			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-			    irow = lsub[i];
-			    luptr++; luptr1++;
-			    zz_mult(&comp_temp, &ukj, &lusup[luptr]);
-			    zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
-			    z_add(&comp_temp, &comp_temp, &comp_temp1);
-			    z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
-			}
-		    } else {
-			ukj2 = dense_col[lsub[krep_ind - 2]];
-			luptr2 = luptr1 - nsupr;
-  		        zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
-		        z_sub(&ukj1, &ukj1, &comp_temp);
-
-		        zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
-        		zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
-		        z_add(&comp_temp, &comp_temp, &comp_temp1);
-		        z_sub(&ukj, &ukj, &comp_temp);
-			dense_col[lsub[krep_ind]] = ukj;
-			dense_col[lsub[krep_ind-1]] = ukj1;
-			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-			    irow = lsub[i];
-			    luptr++; luptr1++; luptr2++;
-			    zz_mult(&comp_temp, &ukj, &lusup[luptr]);
-			    zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
-			    z_add(&comp_temp, &comp_temp, &comp_temp1);
-			    zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
-			    z_add(&comp_temp, &comp_temp, &comp_temp1);
-			    z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
-			}
-		    }
-
-		} else  {	/* segsze >= 4 */
-		    
-		    /* Copy U[*,j] segment from dense[*] to TriTmp[*], which
-		       holds the result of triangular solves.    */
-		    no_zeros = kfnz - fsupc;
-		    isub = lptr + no_zeros;
-		    for (i = 0; i < segsze; ++i) {
-			irow = lsub[isub];
-			TriTmp[i] = dense_col[irow]; /* Gather */
-			++isub;
-		    }
-		    
-		    /* start effective triangle */
-		    luptr += nsupr * no_zeros + no_zeros;
-
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		    CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
-			   &nsupr, TriTmp, &incx );
-#else
-		    ztrsv_( "L", "N", "U", &segsze, &lusup[luptr], 
-			   &nsupr, TriTmp, &incx );
-#endif
-#else		
-		    zlsolve ( nsupr, segsze, &lusup[luptr], TriTmp );
-#endif
-		    
-
-		} /* else ... */
-	    
-	    }  /* for jj ... end tri-solves */
-
-	    /* Block row updates; push all the way into dense[*] block */
-	    for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) {
-		
-		r_hi = SUPERLU_MIN(nrow, r_ind + rowblk);
-		block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind);
-		luptr = xlusup[fsupc] + nsupc + r_ind;
-		isub1 = lptr + nsupc + r_ind;
-		
-		repfnz_col = repfnz;
-		TriTmp = tempv;
-		dense_col = dense;
-		
-		/* Sequence through each column in panel -- matrix-vector */
-		for (jj = jcol; jj < jcol + w; jj++,
-		     repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
-		    
-		    kfnz = repfnz_col[krep];
-		    if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
-		    
-		    segsze = krep - kfnz + 1;
-		    if ( segsze <= 3 ) continue;   /* skip unrolled cases */
-		    
-		    /* Perform a block update, and scatter the result of
-		       matrix-vector to dense[].		 */
-		    no_zeros = kfnz - fsupc;
-		    luptr1 = luptr + nsupr * no_zeros;
-		    MatvecTmp = &TriTmp[maxsuper];
-		    
-#ifdef USE_VENDOR_BLAS
-		    alpha = one; 
-                    beta = zero;
-#ifdef _CRAY
-		    CGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1], 
-			   &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
-#else
-		    zgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1], 
-			   &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
-#endif
-#else
-		    zmatvec(nsupr, block_nrow, segsze, &lusup[luptr1],
-			   TriTmp, MatvecTmp);
-#endif
-		    
-		    /* Scatter MatvecTmp[*] into SPA dense[*] temporarily
-		     * such that MatvecTmp[*] can be re-used for the
-		     * the next blok row update. dense[] will be copied into 
-		     * global store after the whole panel has been finished.
-		     */
-		    isub = isub1;
-		    for (i = 0; i < block_nrow; i++) {
-			irow = lsub[isub];
-		        z_sub(&dense_col[irow], &dense_col[irow], 
-                              &MatvecTmp[i]);
-			MatvecTmp[i] = zero;
-			++isub;
-		    }
-		    
-		} /* for jj ... */
-		
-	    } /* for each block row ... */
-	    
-	    /* Scatter the triangular solves into SPA dense[*] */
-	    repfnz_col = repfnz;
-	    TriTmp = tempv;
-	    dense_col = dense;
-	    
-	    for (jj = jcol; jj < jcol + w; jj++,
-		 repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
-		kfnz = repfnz_col[krep];
-		if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
-		
-		segsze = krep - kfnz + 1;
-		if ( segsze <= 3 ) continue; /* skip unrolled cases */
-		
-		no_zeros = kfnz - fsupc;		
-		isub = lptr + no_zeros;
-		for (i = 0; i < segsze; i++) {
-		    irow = lsub[isub];
-		    dense_col[irow] = TriTmp[i];
-		    TriTmp[i] = zero;
-		    ++isub;
-		}
-		
-	    } /* for jj ... */
-	    
-	} else { /* 1-D block modification */
-	    
-	    
-	    /* Sequence through each column in the panel */
-	    for (jj = jcol; jj < jcol + w; jj++,
-		 repfnz_col += m, dense_col += m) {
-		
-		kfnz = repfnz_col[krep];
-		if ( kfnz == EMPTY ) continue;	/* Skip any zero segment */
-		
-		segsze = krep - kfnz + 1;
-		luptr = xlusup[fsupc];
-
-		ops[TRSV] += 4 * segsze * (segsze - 1);
-		ops[GEMV] += 8 * nrow * segsze;
-		
-		/* Case 1: Update U-segment of size 1 -- col-col update */
-		if ( segsze == 1 ) {
-		    ukj = dense_col[lsub[krep_ind]];
-		    luptr += nsupr*(nsupc-1) + nsupc;
-
-		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
-			irow = lsub[i];
-	    	        zz_mult(&comp_temp, &ukj, &lusup[luptr]);
-		        z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
-			++luptr;
-		    }
-
-		} else if ( segsze <= 3 ) {
-		    ukj = dense_col[lsub[krep_ind]];
-		    luptr += nsupr*(nsupc-1) + nsupc-1;
-		    ukj1 = dense_col[lsub[krep_ind - 1]];
-		    luptr1 = luptr - nsupr;
-
-		    if ( segsze == 2 ) {
-		        zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
-		        z_sub(&ukj, &ukj, &comp_temp);
-			dense_col[lsub[krep_ind]] = ukj;
-			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-			    irow = lsub[i];
-			    ++luptr;  ++luptr1;
-			    zz_mult(&comp_temp, &ukj, &lusup[luptr]);
-			    zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
-			    z_add(&comp_temp, &comp_temp, &comp_temp1);
-			    z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
-			}
-		    } else {
-			ukj2 = dense_col[lsub[krep_ind - 2]];
-			luptr2 = luptr1 - nsupr;
-  		        zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
-		        z_sub(&ukj1, &ukj1, &comp_temp);
-
-		        zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
-        		zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
-		        z_add(&comp_temp, &comp_temp, &comp_temp1);
-		        z_sub(&ukj, &ukj, &comp_temp);
-			dense_col[lsub[krep_ind]] = ukj;
-			dense_col[lsub[krep_ind-1]] = ukj1;
-			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
-			    irow = lsub[i];
-			    ++luptr; ++luptr1; ++luptr2;
-			    zz_mult(&comp_temp, &ukj, &lusup[luptr]);
-			    zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
-			    z_add(&comp_temp, &comp_temp, &comp_temp1);
-			    zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
-			    z_add(&comp_temp, &comp_temp, &comp_temp1);
-			    z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
-			}
-		    }
-
-		} else  { /* segsze >= 4 */
-		    /* 
-		     * Perform a triangular solve and block update,
-		     * then scatter the result of sup-col update to dense[].
-		     */
-		    no_zeros = kfnz - fsupc;
-		    
-		    /* Copy U[*,j] segment from dense[*] to tempv[*]: 
-		     *    The result of triangular solve is in tempv[*];
-		     *    The result of matrix vector update is in dense_col[*]
-		     */
-		    isub = lptr + no_zeros;
-		    for (i = 0; i < segsze; ++i) {
-			irow = lsub[isub];
-			tempv[i] = dense_col[irow]; /* Gather */
-			++isub;
-		    }
-		    
-		    /* start effective triangle */
-		    luptr += nsupr * no_zeros + no_zeros;
-		    
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		    CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
-			   &nsupr, tempv, &incx );
-#else
-		    ztrsv_( "L", "N", "U", &segsze, &lusup[luptr], 
-			   &nsupr, tempv, &incx );
-#endif
-		    
-		    luptr += segsze;	/* Dense matrix-vector */
-		    tempv1 = &tempv[segsze];
-                    alpha = one;
-                    beta = zero;
-#ifdef _CRAY
-		    CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
-			   &nsupr, tempv, &incx, &beta, tempv1, &incy );
-#else
-		    zgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], 
-			   &nsupr, tempv, &incx, &beta, tempv1, &incy );
-#endif
-#else
-		    zlsolve ( nsupr, segsze, &lusup[luptr], tempv );
-		    
-		    luptr += segsze;        /* Dense matrix-vector */
-		    tempv1 = &tempv[segsze];
-		    zmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1);
-#endif
-		    
-		    /* Scatter tempv[*] into SPA dense[*] temporarily, such
-		     * that tempv[*] can be used for the triangular solve of
-		     * the next column of the panel. They will be copied into 
-		     * ucol[*] after the whole panel has been finished.
-		     */
-		    isub = lptr + no_zeros;
-		    for (i = 0; i < segsze; i++) {
-			irow = lsub[isub];
-			dense_col[irow] = tempv[i];
-			tempv[i] = zero;
-			isub++;
-		    }
-		    
-		    /* Scatter the update from tempv1[*] into SPA dense[*] */
-		    /* Start dense rectangular L */
-		    for (i = 0; i < nrow; i++) {
-			irow = lsub[isub];
-		        z_sub(&dense_col[irow], &dense_col[irow], &tempv1[i]);
-			tempv1[i] = zero;
-			++isub;	
-		    }
-		    
-		} /* else segsze>=4 ... */
-		
-	    } /* for each column in the panel... */
-	    
-	} /* else 1-D update ... */
-
-    } /* for each updating supernode ... */
-
-}
-
-
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zpanel_dfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zpanel_dfs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zpanel_dfs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zpanel_dfs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,247 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_zdefs.h"
-
-void
-zpanel_dfs (
-	   const int  m,           /* in - number of rows in the matrix */
-	   const int  w,           /* in */
-	   const int  jcol,        /* in */
-	   SuperMatrix *A,       /* in - original matrix */
-	   int        *perm_r,     /* in */
-	   int        *nseg,	   /* out */
-	   doublecomplex     *dense,      /* out */
-	   int        *panel_lsub, /* out */
-	   int        *segrep,     /* out */
-	   int        *repfnz,     /* out */
-	   int        *xprune,     /* out */
-	   int        *marker,     /* out */     
-	   int        *parent,     /* working array */
-	   int        *xplore,     /* working array */
-	   GlobalLU_t *Glu         /* modified */
-	   )
-{
-/*
- * Purpose
- * =======
- *
- *   Performs a symbolic factorization on a panel of columns [jcol, jcol+w).
- *
- *   A supernode representative is the last column of a supernode.
- *   The nonzeros in U[*,j] are segments that end at supernodal
- *   representatives.
- *
- *   The routine returns one list of the supernodal representatives
- *   in topological order of the dfs that generates them. This list is
- *   a superset of the topological order of each individual column within
- *   the panel. 
- *   The location of the first nonzero in each supernodal segment
- *   (supernodal entry location) is also returned. Each column has a 
- *   separate list for this purpose.
- *
- *   Two marker arrays are used for dfs:
- *     marker[i] == jj, if i was visited during dfs of current column jj;
- *     marker1[i] >= jcol, if i was visited by earlier columns in this panel;
- *
- *   marker: A-row --> A-row/col (0/1)
- *   repfnz: SuperA-col --> PA-row
- *   parent: SuperA-col --> SuperA-col
- *   xplore: SuperA-col --> index to L-structure
- *
- */
-    NCPformat *Astore;
-    doublecomplex    *a;
-    int       *asub;
-    int       *xa_begin, *xa_end;
-    int	      krep, chperm, chmark, chrep, oldrep, kchild, myfnz;
-    int       k, krow, kmark, kperm;
-    int       xdfs, maxdfs, kpar;
-    int       jj;	   /* index through each column in the panel */
-    int       *marker1;	   /* marker1[jj] >= jcol if vertex jj was visited 
-			      by a previous column within this panel.   */
-    int       *repfnz_col; /* start of each column in the panel */
-    doublecomplex    *dense_col;  /* start of each column in the panel */
-    int       nextl_col;   /* next available position in panel_lsub[*,jj] */
-    int       *xsup, *supno;
-    int       *lsub, *xlsub;
-
-    /* Initialize pointers */
-    Astore     = A->Store;
-    a          = Astore->nzval;
-    asub       = Astore->rowind;
-    xa_begin   = Astore->colbeg;
-    xa_end     = Astore->colend;
-    marker1    = marker + m;
-    repfnz_col = repfnz;
-    dense_col  = dense;
-    *nseg      = 0;
-    xsup       = Glu->xsup;
-    supno      = Glu->supno;
-    lsub       = Glu->lsub;
-    xlsub      = Glu->xlsub;
-
-    /* For each column in the panel */
-    for (jj = jcol; jj < jcol + w; jj++) {
-	nextl_col = (jj - jcol) * m;
-
-#ifdef CHK_DFS
-	printf("\npanel col %d: ", jj);
-#endif
-
-	/* For each nonz in A[*,jj] do dfs */
-	for (k = xa_begin[jj]; k < xa_end[jj]; k++) {
-	    krow = asub[k];
-            dense_col[krow] = a[k];
-	    kmark = marker[krow];    	
-	    if ( kmark == jj ) 
-		continue;     /* krow visited before, go to the next nonzero */
-
-	    /* For each unmarked nbr krow of jj
-	     * krow is in L: place it in structure of L[*,jj]
-	     */
-	    marker[krow] = jj;
-	    kperm = perm_r[krow];
-	    
-	    if ( kperm == EMPTY ) {
-		panel_lsub[nextl_col++] = krow; /* krow is indexed into A */
-	    }
-	    /* 
-	     * krow is in U: if its supernode-rep krep
-	     * has been explored, update repfnz[*]
-	     */
-	    else {
-		
-		krep = xsup[supno[kperm]+1] - 1;
-		myfnz = repfnz_col[krep];
-		
-#ifdef CHK_DFS
-		printf("krep %d, myfnz %d, perm_r[%d] %d\n", krep, myfnz, krow, kperm);
-#endif
-		if ( myfnz != EMPTY ) {	/* Representative visited before */
-		    if ( myfnz > kperm ) repfnz_col[krep] = kperm;
-		    /* continue; */
-		}
-		else {
-		    /* Otherwise, perform dfs starting at krep */
-		    oldrep = EMPTY;
-		    parent[krep] = oldrep;
-		    repfnz_col[krep] = kperm;
-		    xdfs = xlsub[krep];
-		    maxdfs = xprune[krep];
-		    
-#ifdef CHK_DFS 
-		    printf("  xdfs %d, maxdfs %d: ", xdfs, maxdfs);
-		    for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]);
-		    printf("\n");
-#endif
-		    do {
-			/* 
-			 * For each unmarked kchild of krep 
-			 */
-			while ( xdfs < maxdfs ) {
-			    
-			    kchild = lsub[xdfs];
-			    xdfs++;
-			    chmark = marker[kchild];
-			    
-			    if ( chmark != jj ) { /* Not reached yet */
-				marker[kchild] = jj;
-				chperm = perm_r[kchild];
-			      
-				/* Case kchild is in L: place it in L[*,j] */
-				if ( chperm == EMPTY ) {
-				    panel_lsub[nextl_col++] = kchild;
-				} 
-				/* Case kchild is in U: 
-				 *   chrep = its supernode-rep. If its rep has 
-				 *   been explored, update its repfnz[*]
-				 */
-				else {
-				    
-				    chrep = xsup[supno[chperm]+1] - 1;
-				    myfnz = repfnz_col[chrep];
-#ifdef CHK_DFS
-				    printf("chrep %d,myfnz %d,perm_r[%d] %d\n",chrep,myfnz,kchild,chperm);
-#endif
-				    if ( myfnz != EMPTY ) { /* Visited before */
-					if ( myfnz > chperm )
-					    repfnz_col[chrep] = chperm;
-				    }
-				    else {
-					/* Cont. dfs at snode-rep of kchild */
-					xplore[krep] = xdfs;	
-					oldrep = krep;
-					krep = chrep; /* Go deeper down G(L) */
-					parent[krep] = oldrep;
-					repfnz_col[krep] = chperm;
-					xdfs = xlsub[krep];     
-					maxdfs = xprune[krep];
-#ifdef CHK_DFS 
-					printf("  xdfs %d, maxdfs %d: ", xdfs, maxdfs);
-					for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]);	
-					printf("\n");
-#endif
-				    } /* else */
-				  
-				} /* else */
-			      
-			    } /* if... */
-			    
-			} /* while xdfs < maxdfs */
-			
-			/* krow has no more unexplored nbrs:
-			 *    Place snode-rep krep in postorder DFS, if this 
-			 *    segment is seen for the first time. (Note that
-			 *    "repfnz[krep]" may change later.)
-			 *    Backtrack dfs to its parent.
-			 */
-			if ( marker1[krep] < jcol ) {
-			    segrep[*nseg] = krep;
-			    ++(*nseg);
-			    marker1[krep] = jj;
-			}
-			
-			kpar = parent[krep]; /* Pop stack, mimic recursion */
-			if ( kpar == EMPTY ) break; /* dfs done */
-			krep = kpar;
-			xdfs = xplore[krep];
-			maxdfs = xprune[krep];
-			
-#ifdef CHK_DFS 
-			printf("  pop stack: krep %d,xdfs %d,maxdfs %d: ", krep,xdfs,maxdfs);
-			for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]);
-			printf("\n");
-#endif
-		    } while ( kpar != EMPTY ); /* do-while - until empty stack */
-		    
-		} /* else */
-		
-	    } /* else */
-	    
-	} /* for each nonz in A[*,jj] */
-	
-	repfnz_col += m;    /* Move to next column */
-        dense_col += m;
-	
-    } /* for jj ... */
-    
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zpivotgrowth.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zpivotgrowth.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zpivotgrowth.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zpivotgrowth.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,108 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-#include <math.h>
-#include "slu_zdefs.h"
-
-double
-zPivotGrowth(int ncols, SuperMatrix *A, int *perm_c, 
-             SuperMatrix *L, SuperMatrix *U)
-{
-/*
- * Purpose
- * =======
- *
- * Compute the reciprocal pivot growth factor of the leading ncols columns
- * of the matrix, using the formula:
- *     min_j ( max_i(abs(A_ij)) / max_i(abs(U_ij)) )
- *
- * Arguments
- * =========
- *
- * ncols    (input) int
- *          The number of columns of matrices A, L and U.
- *
- * A        (input) SuperMatrix*
- *	    Original matrix A, permuted by columns, of dimension
- *          (A->nrow, A->ncol). The type of A can be:
- *          Stype = NC; Dtype = SLU_Z; Mtype = GE.
- *
- * L        (output) SuperMatrix*
- *          The factor L from the factorization Pr*A=L*U; use compressed row 
- *          subscripts storage for supernodes, i.e., L has type: 
- *          Stype = SC; Dtype = SLU_Z; Mtype = TRLU.
- *
- * U        (output) SuperMatrix*
- *	    The factor U from the factorization Pr*A*Pc=L*U. Use column-wise
- *          storage scheme, i.e., U has types: Stype = NC;
- *          Dtype = SLU_Z; Mtype = TRU.
- *
- */
-    NCformat *Astore;
-    SCformat *Lstore;
-    NCformat *Ustore;
-    doublecomplex  *Aval, *Lval, *Uval;
-    int      fsupc, nsupr, luptr, nz_in_U;
-    int      i, j, k, oldcol;
-    int      *inv_perm_c;
-    double   rpg, maxaj, maxuj;
-    extern   double dlamch_(char *);
-    double   smlnum;
-    doublecomplex   *luval;
-    doublecomplex   temp_comp;
-   
-    /* Get machine constants. */
-    smlnum = dlamch_("S");
-    rpg = 1. / smlnum;
-
-    Astore = A->Store;
-    Lstore = L->Store;
-    Ustore = U->Store;
-    Aval = Astore->nzval;
-    Lval = Lstore->nzval;
-    Uval = Ustore->nzval;
-    
-    inv_perm_c = (int *) SUPERLU_MALLOC(A->ncol*sizeof(int));
-    for (j = 0; j < A->ncol; ++j) inv_perm_c[perm_c[j]] = j;
-
-    for (k = 0; k <= Lstore->nsuper; ++k) {
-	fsupc = L_FST_SUPC(k);
-	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
-	luptr = L_NZ_START(fsupc);
-	luval = &Lval[luptr];
-	nz_in_U = 1;
-	
-	for (j = fsupc; j < L_FST_SUPC(k+1) && j < ncols; ++j) {
-	    maxaj = 0.;
-            oldcol = inv_perm_c[j];
-	    for (i = Astore->colptr[oldcol]; i < Astore->colptr[oldcol+1]; ++i)
-		maxaj = SUPERLU_MAX( maxaj, z_abs1( &Aval[i]) );
-	
-	    maxuj = 0.;
-	    for (i = Ustore->colptr[j]; i < Ustore->colptr[j+1]; i++)
-		maxuj = SUPERLU_MAX( maxuj, z_abs1( &Uval[i]) );
-	    
-	    /* Supernode */
-	    for (i = 0; i < nz_in_U; ++i)
-		maxuj = SUPERLU_MAX( maxuj, z_abs1( &luval[i]) );
-
-	    ++nz_in_U;
-	    luval += nsupr;
-
-	    if ( maxuj == 0. )
-		rpg = SUPERLU_MIN( rpg, 1.);
-	    else
-		rpg = SUPERLU_MIN( rpg, maxaj / maxuj );
-	}
-	
-	if ( j >= ncols ) break;
-    }
-
-    SUPERLU_FREE(inv_perm_c);
-    return (rpg);
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zpivotL.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zpivotL.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zpivotL.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zpivotL.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,174 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include <math.h>
-#include <stdlib.h>
-#include "slu_zdefs.h"
-
-#undef DEBUG
-
-int
-zpivotL(
-        const int  jcol,     /* in */
-        const double u,      /* in - diagonal pivoting threshold */
-        int        *usepr,   /* re-use the pivot sequence given by perm_r/iperm_r */
-        int        *perm_r,  /* may be modified */
-        int        *iperm_r, /* in - inverse of perm_r */
-        int        *iperm_c, /* in - used to find diagonal of Pc*A*Pc' */
-        int        *pivrow,  /* out */
-        GlobalLU_t *Glu,     /* modified - global LU data structures */
-	SuperLUStat_t *stat  /* output */
-       )
-{
-/*
- * Purpose
- * =======
- *   Performs the numerical pivoting on the current column of L,
- *   and the CDIV operation.
- *
- *   Pivot policy:
- *   (1) Compute thresh = u * max_(i>=j) abs(A_ij);
- *   (2) IF user specifies pivot row k and abs(A_kj) >= thresh THEN
- *           pivot row = k;
- *       ELSE IF abs(A_jj) >= thresh THEN
- *           pivot row = j;
- *       ELSE
- *           pivot row = m;
- * 
- *   Note: If you absolutely want to use a given pivot order, then set u=0.0.
- *
- *   Return value: 0      success;
- *                 i > 0  U(i,i) is exactly zero.
- *
- */
-    doublecomplex one = {1.0, 0.0};
-    int          fsupc;	    /* first column in the supernode */
-    int          nsupc;	    /* no of columns in the supernode */
-    int          nsupr;     /* no of rows in the supernode */
-    int          lptr;	    /* points to the starting subscript of the supernode */
-    int          pivptr, old_pivptr, diag, diagind;
-    double       pivmax, rtemp, thresh;
-    doublecomplex       temp;
-    doublecomplex       *lu_sup_ptr; 
-    doublecomplex       *lu_col_ptr;
-    int          *lsub_ptr;
-    int          isub, icol, k, itemp;
-    int          *lsub, *xlsub;
-    doublecomplex       *lusup;
-    int          *xlusup;
-    flops_t      *ops = stat->ops;
-
-    /* Initialize pointers */
-    lsub       = Glu->lsub;
-    xlsub      = Glu->xlsub;
-    lusup      = Glu->lusup;
-    xlusup     = Glu->xlusup;
-    fsupc      = (Glu->xsup)[(Glu->supno)[jcol]];
-    nsupc      = jcol - fsupc;	        /* excluding jcol; nsupc >= 0 */
-    lptr       = xlsub[fsupc];
-    nsupr      = xlsub[fsupc+1] - lptr;
-    lu_sup_ptr = &lusup[xlusup[fsupc]];	/* start of the current supernode */
-    lu_col_ptr = &lusup[xlusup[jcol]];	/* start of jcol in the supernode */
-    lsub_ptr   = &lsub[lptr];	/* start of row indices of the supernode */
-
-#ifdef DEBUG
-if ( jcol == MIN_COL ) {
-    printf("Before cdiv: col %d\n", jcol);
-    for (k = nsupc; k < nsupr; k++) 
-	printf("  lu[%d] %f\n", lsub_ptr[k], lu_col_ptr[k]);
-}
-#endif
-    
-    /* Determine the largest abs numerical value for partial pivoting;
-       Also search for user-specified pivot, and diagonal element. */
-    if ( *usepr ) *pivrow = iperm_r[jcol];
-    diagind = iperm_c[jcol];
-    pivmax = 0.0;
-    pivptr = nsupc;
-    diag = EMPTY;
-    old_pivptr = nsupc;
-    for (isub = nsupc; isub < nsupr; ++isub) {
-        rtemp = z_abs1 (&lu_col_ptr[isub]);
-	if ( rtemp > pivmax ) {
-	    pivmax = rtemp;
-	    pivptr = isub;
-	}
-	if ( *usepr && lsub_ptr[isub] == *pivrow ) old_pivptr = isub;
-	if ( lsub_ptr[isub] == diagind ) diag = isub;
-    }
-
-    /* Test for singularity */
-    if ( pivmax == 0.0 ) {
-	*pivrow = lsub_ptr[pivptr];
-	perm_r[*pivrow] = jcol;
-	*usepr = 0;
-	return (jcol+1);
-    }
-
-    thresh = u * pivmax;
-    
-    /* Choose appropriate pivotal element by our policy. */
-    if ( *usepr ) {
-        rtemp = z_abs1 (&lu_col_ptr[old_pivptr]);
-	if ( rtemp != 0.0 && rtemp >= thresh )
-	    pivptr = old_pivptr;
-	else
-	    *usepr = 0;
-    }
-    if ( *usepr == 0 ) {
-	/* Use diagonal pivot? */
-	if ( diag >= 0 ) { /* diagonal exists */
-            rtemp = z_abs1 (&lu_col_ptr[diag]);
-	    if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = diag;
-        }
-	*pivrow = lsub_ptr[pivptr];
-    }
-    
-    /* Record pivot row */
-    perm_r[*pivrow] = jcol;
-    
-    /* Interchange row subscripts */
-    if ( pivptr != nsupc ) {
-	itemp = lsub_ptr[pivptr];
-	lsub_ptr[pivptr] = lsub_ptr[nsupc];
-	lsub_ptr[nsupc] = itemp;
-
-	/* Interchange numerical values as well, for the whole snode, such 
-	 * that L is indexed the same way as A.
- 	 */
-	for (icol = 0; icol <= nsupc; icol++) {
-	    itemp = pivptr + icol * nsupr;
-	    temp = lu_sup_ptr[itemp];
-	    lu_sup_ptr[itemp] = lu_sup_ptr[nsupc + icol*nsupr];
-	    lu_sup_ptr[nsupc + icol*nsupr] = temp;
-	}
-    } /* if */
-
-    /* cdiv operation */
-    ops[FACT] += 10 * (nsupr - nsupc);
-
-    z_div(&temp, &one, &lu_col_ptr[nsupc]);
-    for (k = nsupc+1; k < nsupr; k++) 
-	zz_mult(&lu_col_ptr[k], &lu_col_ptr[k], &temp);
-
-    return 0;
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zpruneL.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zpruneL.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zpruneL.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zpruneL.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,147 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_zdefs.h"
-
-void
-zpruneL(
-       const int  jcol,	     /* in */
-       const int  *perm_r,   /* in */
-       const int  pivrow,    /* in */
-       const int  nseg,	     /* in */
-       const int  *segrep,   /* in */
-       const int  *repfnz,   /* in */
-       int        *xprune,   /* out */
-       GlobalLU_t *Glu       /* modified - global LU data structures */
-       )
-{
-/*
- * Purpose
- * =======
- *   Prunes the L-structure of supernodes whose L-structure
- *   contains the current pivot row "pivrow"
- *
- */
-    doublecomplex     utemp;
-    int        jsupno, irep, irep1, kmin, kmax, krow, movnum;
-    int        i, ktemp, minloc, maxloc;
-    int        do_prune; /* logical variable */
-    int        *xsup, *supno;
-    int        *lsub, *xlsub;
-    doublecomplex     *lusup;
-    int        *xlusup;
-
-    xsup       = Glu->xsup;
-    supno      = Glu->supno;
-    lsub       = Glu->lsub;
-    xlsub      = Glu->xlsub;
-    lusup      = Glu->lusup;
-    xlusup     = Glu->xlusup;
-    
-    /*
-     * For each supernode-rep irep in U[*,j]
-     */
-    jsupno = supno[jcol];
-    for (i = 0; i < nseg; i++) {
-
-	irep = segrep[i];
-	irep1 = irep + 1;
-	do_prune = FALSE;
-
-	/* Don't prune with a zero U-segment */
- 	if ( repfnz[irep] == EMPTY )
-		continue;
-
-     	/* If a snode overlaps with the next panel, then the U-segment 
-   	 * is fragmented into two parts -- irep and irep1. We should let
-	 * pruning occur at the rep-column in irep1's snode. 
-	 */
-	if ( supno[irep] == supno[irep1] ) 	/* Don't prune */
-		continue;
-
-	/*
-	 * If it has not been pruned & it has a nonz in row L[pivrow,i]
-	 */
-	if ( supno[irep] != jsupno ) {
-	    if ( xprune[irep] >= xlsub[irep1] ) {
-		kmin = xlsub[irep];
-		kmax = xlsub[irep1] - 1;
-		for (krow = kmin; krow <= kmax; krow++) 
-		    if ( lsub[krow] == pivrow ) {
-			do_prune = TRUE;
-			break;
-		    }
-	    }
-	    
-    	    if ( do_prune ) {
-
-	     	/* Do a quicksort-type partition
-	     	 * movnum=TRUE means that the num values have to be exchanged.
-	     	 */
-	        movnum = FALSE;
-	        if ( irep == xsup[supno[irep]] ) /* Snode of size 1 */
-			movnum = TRUE;
-
-	        while ( kmin <= kmax ) {
-
-	    	    if ( perm_r[lsub[kmax]] == EMPTY ) 
-			kmax--;
-		    else if ( perm_r[lsub[kmin]] != EMPTY )
-			kmin++;
-		    else { /* kmin below pivrow, and kmax above pivrow: 
-		            * 	interchange the two subscripts
-			    */
-		        ktemp = lsub[kmin];
-		        lsub[kmin] = lsub[kmax];
-		        lsub[kmax] = ktemp;
-
-			/* If the supernode has only one column, then we
- 			 * only keep one set of subscripts. For any subscript 
-			 * interchange performed, similar interchange must be 
-			 * done on the numerical values.
- 			 */
-		        if ( movnum ) {
-		    	    minloc = xlusup[irep] + (kmin - xlsub[irep]);
-		    	    maxloc = xlusup[irep] + (kmax - xlsub[irep]);
-			    utemp = lusup[minloc];
-		  	    lusup[minloc] = lusup[maxloc];
-			    lusup[maxloc] = utemp;
-		        }
-
-		        kmin++;
-		        kmax--;
-
-		    }
-
-	        } /* while */
-
-	        xprune[irep] = kmin;	/* Pruning */
-
-#ifdef CHK_PRUNE
-	printf("    After zpruneL(),using col %d:  xprune[%d] = %d\n", 
-			jcol, irep, kmin);
-#endif
-	    } /* if do_prune */
-
-	} /* if */
-
-    } /* for each U-segment... */
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zreadhb.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zreadhb.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zreadhb.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zreadhb.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,265 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include "slu_zdefs.h"
-
-
-/* Eat up the rest of the current line */
-int zDumpLine(FILE *fp)
-{
-    register int c;
-    while ((c = fgetc(fp)) != '\n') ;
-    return 0;
-}
-
-int zParseIntFormat(char *buf, int *num, int *size)
-{
-    char *tmp;
-
-    tmp = buf;
-    while (*tmp++ != '(') ;
-    sscanf(tmp, "%d", num);
-    while (*tmp != 'I' && *tmp != 'i') ++tmp;
-    ++tmp;
-    sscanf(tmp, "%d", size);
-    return 0;
-}
-
-int zParseFloatFormat(char *buf, int *num, int *size)
-{
-    char *tmp, *period;
-    
-    tmp = buf;
-    while (*tmp++ != '(') ;
-    *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/
-    while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd'
-	   && *tmp != 'F' && *tmp != 'f') {
-        /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the
-           num picked up refers to P, which should be skipped. */
-        if (*tmp=='p' || *tmp=='P') {
-           ++tmp;
-           *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/
-        } else {
-           ++tmp;
-        }
-    }
-    ++tmp;
-    period = tmp;
-    while (*period != '.' && *period != ')') ++period ;
-    *period = '\0';
-    *size = atoi(tmp); /*sscanf(tmp, "%2d", size);*/
-
-    return 0;
-}
-
-int zReadVector(FILE *fp, int n, int *where, int perline, int persize)
-{
-    register int i, j, item;
-    char tmp, buf[100];
-    
-    i = 0;
-    while (i < n) {
-	fgets(buf, 100, fp);    /* read a line at a time */
-	for (j=0; j<perline && i<n; j++) {
-	    tmp = buf[(j+1)*persize];     /* save the char at that place */
-	    buf[(j+1)*persize] = 0;       /* null terminate */
-	    item = atoi(&buf[j*persize]); 
-	    buf[(j+1)*persize] = tmp;     /* recover the char at that place */
-	    where[i++] = item - 1;
-	}
-    }
-
-    return 0;
-}
-
-/* Read complex numbers as pairs of (real, imaginary) */
-int zReadValues(FILE *fp, int n, doublecomplex *destination, int perline, int persize)
-{
-    register int i, j, k, s, pair;
-    register double realpart;
-    char tmp, buf[100];
-    
-    i = pair = 0;
-    while (i < n) {
-	fgets(buf, 100, fp);    /* read a line at a time */
-	for (j=0; j<perline && i<n; j++) {
-	    tmp = buf[(j+1)*persize];     /* save the char at that place */
-	    buf[(j+1)*persize] = 0;       /* null terminate */
-	    s = j*persize;
-	    for (k = 0; k < persize; ++k) /* No D_ format in C */
-		if ( buf[s+k] == 'D' || buf[s+k] == 'd' ) buf[s+k] = 'E';
-	    if ( pair == 0 ) {
-	  	/* The value is real part */
-		realpart = atof(&buf[s]);
-		pair = 1;
-	    } else {
-		/* The value is imaginary part */
-	        destination[i].r = realpart;
-		destination[i++].i = atof(&buf[s]);
-		pair = 0;
-	    }
-	    buf[(j+1)*persize] = tmp;     /* recover the char at that place */
-	}
-    }
-
-    return 0;
-}
-
-
-void
-zreadhb(int *nrow, int *ncol, int *nonz,
-	doublecomplex **nzval, int **rowind, int **colptr)
-{
-/* 
- * Purpose
- * =======
- * 
- * Read a DOUBLE COMPLEX PRECISION matrix stored in Harwell-Boeing format 
- * as described below.
- * 
- * Line 1 (A72,A8) 
- *  	Col. 1 - 72   Title (TITLE) 
- *	Col. 73 - 80  Key (KEY) 
- * 
- * Line 2 (5I14) 
- * 	Col. 1 - 14   Total number of lines excluding header (TOTCRD) 
- * 	Col. 15 - 28  Number of lines for pointers (PTRCRD) 
- * 	Col. 29 - 42  Number of lines for row (or variable) indices (INDCRD) 
- * 	Col. 43 - 56  Number of lines for numerical values (VALCRD) 
- *	Col. 57 - 70  Number of lines for right-hand sides (RHSCRD) 
- *                    (including starting guesses and solution vectors 
- *		       if present) 
- *           	      (zero indicates no right-hand side data is present) 
- *
- * Line 3 (A3, 11X, 4I14) 
- *   	Col. 1 - 3    Matrix type (see below) (MXTYPE) 
- * 	Col. 15 - 28  Number of rows (or variables) (NROW) 
- * 	Col. 29 - 42  Number of columns (or elements) (NCOL) 
- *	Col. 43 - 56  Number of row (or variable) indices (NNZERO) 
- *	              (equal to number of entries for assembled matrices) 
- * 	Col. 57 - 70  Number of elemental matrix entries (NELTVL) 
- *	              (zero in the case of assembled matrices) 
- * Line 4 (2A16, 2A20) 
- * 	Col. 1 - 16   Format for pointers (PTRFMT) 
- *	Col. 17 - 32  Format for row (or variable) indices (INDFMT) 
- *	Col. 33 - 52  Format for numerical values of coefficient matrix (VALFMT) 
- * 	Col. 53 - 72 Format for numerical values of right-hand sides (RHSFMT) 
- *
- * Line 5 (A3, 11X, 2I14) Only present if there are right-hand sides present 
- *    	Col. 1 	      Right-hand side type: 
- *	         	  F for full storage or M for same format as matrix 
- *    	Col. 2        G if a starting vector(s) (Guess) is supplied. (RHSTYP) 
- *    	Col. 3        X if an exact solution vector(s) is supplied. 
- *	Col. 15 - 28  Number of right-hand sides (NRHS) 
- *	Col. 29 - 42  Number of row indices (NRHSIX) 
- *          	      (ignored in case of unassembled matrices) 
- *
- * The three character type field on line 3 describes the matrix type. 
- * The following table lists the permitted values for each of the three 
- * characters. As an example of the type field, RSA denotes that the matrix 
- * is real, symmetric, and assembled. 
- *
- * First Character: 
- *	R Real matrix 
- *	C Complex matrix 
- *	P Pattern only (no numerical values supplied) 
- *
- * Second Character: 
- *	S Symmetric 
- *	U Unsymmetric 
- *	H Hermitian 
- *	Z Skew symmetric 
- *	R Rectangular 
- *
- * Third Character: 
- *	A Assembled 
- *	E Elemental matrices (unassembled) 
- *
- */
-
-    register int i, numer_lines = 0, rhscrd = 0;
-    int tmp, colnum, colsize, rownum, rowsize, valnum, valsize;
-    char buf[100], type[4], key[10];
-    FILE *fp;
-
-    fp = stdin;
-
-    /* Line 1 */
-    fgets(buf, 100, fp);
-    fputs(buf, stdout);
-#if 0
-    fscanf(fp, "%72c", buf); buf[72] = 0;
-    printf("Title: %s", buf);
-    fscanf(fp, "%8c", key);  key[8] = 0;
-    printf("Key: %s\n", key);
-    zDumpLine(fp);
-#endif
-
-    /* Line 2 */
-    for (i=0; i<5; i++) {
-	fscanf(fp, "%14c", buf); buf[14] = 0;
-	sscanf(buf, "%d", &tmp);
-	if (i == 3) numer_lines = tmp;
-	if (i == 4 && tmp) rhscrd = tmp;
-    }
-    zDumpLine(fp);
-
-    /* Line 3 */
-    fscanf(fp, "%3c", type);
-    fscanf(fp, "%11c", buf); /* pad */
-    type[3] = 0;
-#ifdef DEBUG
-    printf("Matrix type %s\n", type);
-#endif
-    
-    fscanf(fp, "%14c", buf); sscanf(buf, "%d", nrow);
-    fscanf(fp, "%14c", buf); sscanf(buf, "%d", ncol);
-    fscanf(fp, "%14c", buf); sscanf(buf, "%d", nonz);
-    fscanf(fp, "%14c", buf); sscanf(buf, "%d", &tmp);
-    
-    if (tmp != 0)
-	  printf("This is not an assembled matrix!\n");
-    if (*nrow != *ncol)
-	printf("Matrix is not square.\n");
-    zDumpLine(fp);
-
-    /* Allocate storage for the three arrays ( nzval, rowind, colptr ) */
-    zallocateA(*ncol, *nonz, nzval, rowind, colptr);
-
-    /* Line 4: format statement */
-    fscanf(fp, "%16c", buf);
-    zParseIntFormat(buf, &colnum, &colsize);
-    fscanf(fp, "%16c", buf);
-    zParseIntFormat(buf, &rownum, &rowsize);
-    fscanf(fp, "%20c", buf);
-    zParseFloatFormat(buf, &valnum, &valsize);
-    fscanf(fp, "%20c", buf);
-    zDumpLine(fp);
-
-    /* Line 5: right-hand side */    
-    if ( rhscrd ) zDumpLine(fp); /* skip RHSFMT */
-    
-#ifdef DEBUG
-    printf("%d rows, %d nonzeros\n", *nrow, *nonz);
-    printf("colnum %d, colsize %d\n", colnum, colsize);
-    printf("rownum %d, rowsize %d\n", rownum, rowsize);
-    printf("valnum %d, valsize %d\n", valnum, valsize);
-#endif
-    
-    zReadVector(fp, *ncol+1, *colptr, colnum, colsize);
-    zReadVector(fp, *nonz, *rowind, rownum, rowsize);
-    if ( numer_lines ) {
-        zReadValues(fp, *nonz, *nzval, valnum, valsize);
-    }
-    
-    fclose(fp);
-
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zsnode_bmod.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zsnode_bmod.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zsnode_bmod.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zsnode_bmod.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,118 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_zdefs.h"
-
-
-/*
- * Performs numeric block updates within the relaxed snode. 
- */
-int
-zsnode_bmod (
-	    const int  jcol,	  /* in */
-	    const int  jsupno,    /* in */
-	    const int  fsupc,     /* in */
-	    doublecomplex     *dense,    /* in */
-	    doublecomplex     *tempv,    /* working array */
-	    GlobalLU_t *Glu,      /* modified */
-	    SuperLUStat_t *stat   /* output */
-	    )
-{
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-    _fcd ftcs1 = _cptofcd("L", strlen("L")),
-	 ftcs2 = _cptofcd("N", strlen("N")),
-	 ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-    int            incx = 1, incy = 1;
-    doublecomplex         alpha = {-1.0, 0.0},  beta = {1.0, 0.0};
-#endif
-
-    doublecomplex   comp_zero = {0.0, 0.0};
-    int            luptr, nsupc, nsupr, nrow;
-    int            isub, irow, i, iptr; 
-    register int   ufirst, nextlu;
-    int            *lsub, *xlsub;
-    doublecomplex         *lusup;
-    int            *xlusup;
-    flops_t *ops = stat->ops;
-
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    lusup   = Glu->lusup;
-    xlusup  = Glu->xlusup;
-
-    nextlu = xlusup[jcol];
-    
-    /*
-     *	Process the supernodal portion of L\U[*,j]
-     */
-    for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) {
-  	irow = lsub[isub];
-	lusup[nextlu] = dense[irow];
-        dense[irow] = comp_zero;
-	++nextlu;
-    }
-
-    xlusup[jcol + 1] = nextlu;	/* Initialize xlusup for next column */
-    
-    if ( fsupc < jcol ) {
-
-	luptr = xlusup[fsupc];
-	nsupr = xlsub[fsupc+1] - xlsub[fsupc];
-	nsupc = jcol - fsupc;	/* Excluding jcol */
-	ufirst = xlusup[jcol];	/* Points to the beginning of column
-				   jcol in supernode L\U(jsupno). */
-	nrow = nsupr - nsupc;
-
-	ops[TRSV] += 4 * nsupc * (nsupc - 1);
-	ops[GEMV] += 8 * nrow * nsupc;
-
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-	CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, 
-	      &lusup[ufirst], &incx );
-	CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, 
-		&lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
-#else
-	ztrsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, 
-	      &lusup[ufirst], &incx );
-	zgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, 
-		&lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
-#endif
-#else
-	zlsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );
-	zmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], 
-			&lusup[ufirst], &tempv[0] );
-
-        /* Scatter tempv[*] into lusup[*] */
-	iptr = ufirst + nsupc;
-	for (i = 0; i < nrow; i++) {
-	    z_sub(&lusup[iptr], &lusup[iptr], &tempv[i]);
-            ++iptr;
-	    tempv[i] = comp_zero;
-	}
-#endif
-
-    }
-
-    return 0;
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zsnode_dfs.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zsnode_dfs.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zsnode_dfs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zsnode_dfs.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,104 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include "slu_zdefs.h"
-
-int
-zsnode_dfs (
-	   const int  jcol,	    /* in - start of the supernode */
-	   const int  kcol, 	    /* in - end of the supernode */
-	   const int  *asub,        /* in */
-	   const int  *xa_begin,    /* in */
-	   const int  *xa_end,      /* in */
-	   int        *xprune,      /* out */
-	   int        *marker,      /* modified */
-	   GlobalLU_t *Glu          /* modified */
-	   )
-{
-/* Purpose
- * =======
- *    zsnode_dfs() - Determine the union of the row structures of those 
- *    columns within the relaxed snode.
- *    Note: The relaxed snodes are leaves of the supernodal etree, therefore, 
- *    the portion outside the rectangular supernode must be zero.
- *
- * Return value
- * ============
- *     0   success;
- *    >0   number of bytes allocated when run out of memory.
- *
- */
-    register int i, k, ifrom, ito, nextl, new_next;
-    int          nsuper, krow, kmark, mem_error;
-    int          *xsup, *supno;
-    int          *lsub, *xlsub;
-    int          nzlmax;
-    
-    xsup    = Glu->xsup;
-    supno   = Glu->supno;
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    nzlmax  = Glu->nzlmax;
-
-    nsuper = ++supno[jcol];	/* Next available supernode number */
-    nextl = xlsub[jcol];
-
-    for (i = jcol; i <= kcol; i++) {
-	/* For each nonzero in A[*,i] */
-	for (k = xa_begin[i]; k < xa_end[i]; k++) {	
-	    krow = asub[k];
-	    kmark = marker[krow];
-	    if ( kmark != kcol ) { /* First time visit krow */
-		marker[krow] = kcol;
-		lsub[nextl++] = krow;
-		if ( nextl >= nzlmax ) {
-		    if ( mem_error = zLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) )
-			return (mem_error);
-		    lsub = Glu->lsub;
-		}
-	    }
-    	}
-	supno[i] = nsuper;
-    }
-
-    /* Supernode > 1, then make a copy of the subscripts for pruning */
-    if ( jcol < kcol ) {
-	new_next = nextl + (nextl - xlsub[jcol]);
-	while ( new_next > nzlmax ) {
-	    if ( mem_error = zLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu) )
-		return (mem_error);
-	    lsub = Glu->lsub;
-	}
-	ito = nextl;
-	for (ifrom = xlsub[jcol]; ifrom < nextl; )
-	    lsub[ito++] = lsub[ifrom++];	
-        for (i = jcol+1; i <= kcol; i++) xlsub[i] = nextl;
-	nextl = ito;
-    }
-
-    xsup[nsuper+1] = kcol + 1;
-    supno[kcol+1]  = nsuper;
-    xprune[kcol]   = nextl;
-    xlsub[kcol+1]  = nextl;
-
-    return 0;
-}
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zsp_blas2.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zsp_blas2.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zsp_blas2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zsp_blas2.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,565 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
- * File name:		zsp_blas2.c
- * Purpose:		Sparse BLAS 2, using some dense BLAS 2 operations.
- */
-
-#include "slu_zdefs.h"
-
-/* 
- * Function prototypes 
- */
-void zusolve(int, int, doublecomplex*, doublecomplex*);
-void zlsolve(int, int, doublecomplex*, doublecomplex*);
-void zmatvec(int, int, int, doublecomplex*, doublecomplex*, doublecomplex*);
-
-
-int
-sp_ztrsv(char *uplo, char *trans, char *diag, SuperMatrix *L, 
-         SuperMatrix *U, doublecomplex *x, SuperLUStat_t *stat, int *info)
-{
-/*
- *   Purpose
- *   =======
- *
- *   sp_ztrsv() solves one of the systems of equations   
- *       A*x = b,   or   A'*x = b,
- *   where b and x are n element vectors and A is a sparse unit , or   
- *   non-unit, upper or lower triangular matrix.   
- *   No test for singularity or near-singularity is included in this   
- *   routine. Such tests must be performed before calling this routine.   
- *
- *   Parameters   
- *   ==========   
- *
- *   uplo   - (input) char*
- *            On entry, uplo specifies whether the matrix is an upper or   
- *             lower triangular matrix as follows:   
- *                uplo = 'U' or 'u'   A is an upper triangular matrix.   
- *                uplo = 'L' or 'l'   A is a lower triangular matrix.   
- *
- *   trans  - (input) char*
- *             On entry, trans specifies the equations to be solved as   
- *             follows:   
- *                trans = 'N' or 'n'   A*x = b.   
- *                trans = 'T' or 't'   A'*x = b.
- *                trans = 'C' or 'c'   A^H*x = b.   
- *
- *   diag   - (input) char*
- *             On entry, diag specifies whether or not A is unit   
- *             triangular as follows:   
- *                diag = 'U' or 'u'   A is assumed to be unit triangular.   
- *                diag = 'N' or 'n'   A is not assumed to be unit   
- *                                    triangular.   
- *	     
- *   L       - (input) SuperMatrix*
- *	       The factor L from the factorization Pr*A*Pc=L*U. Use
- *             compressed row subscripts storage for supernodes,
- *             i.e., L has types: Stype = SC, Dtype = SLU_Z, Mtype = TRLU.
- *
- *   U       - (input) SuperMatrix*
- *	        The factor U from the factorization Pr*A*Pc=L*U.
- *	        U has types: Stype = NC, Dtype = SLU_Z, Mtype = TRU.
- *    
- *   x       - (input/output) doublecomplex*
- *             Before entry, the incremented array X must contain the n   
- *             element right-hand side vector b. On exit, X is overwritten 
- *             with the solution vector x.
- *
- *   info    - (output) int*
- *             If *info = -i, the i-th argument had an illegal value.
- *
- */
-#ifdef _CRAY
-    _fcd ftcs1 = _cptofcd("L", strlen("L")),
-	 ftcs2 = _cptofcd("N", strlen("N")),
-	 ftcs3 = _cptofcd("U", strlen("U"));
-#endif
-    SCformat *Lstore;
-    NCformat *Ustore;
-    doublecomplex   *Lval, *Uval;
-    int incx = 1, incy = 1;
-    doublecomplex temp;
-    doublecomplex alpha = {1.0, 0.0}, beta = {1.0, 0.0};
-    doublecomplex comp_zero = {0.0, 0.0};
-    int nrow;
-    int fsupc, nsupr, nsupc, luptr, istart, irow;
-    int i, k, iptr, jcol;
-    doublecomplex *work;
-    flops_t solve_ops;
-
-    /* Test the input parameters */
-    *info = 0;
-    if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1;
-    else if ( !lsame_(trans, "N") && !lsame_(trans, "T") && 
-              !lsame_(trans, "C")) *info = -2;
-    else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3;
-    else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4;
-    else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5;
-    if ( *info ) {
-	i = -(*info);
-	xerbla_("sp_ztrsv", &i);
-	return 0;
-    }
-
-    Lstore = L->Store;
-    Lval = Lstore->nzval;
-    Ustore = U->Store;
-    Uval = Ustore->nzval;
-    solve_ops = 0;
-
-    if ( !(work = doublecomplexCalloc(L->nrow)) )
-	ABORT("Malloc fails for work in sp_ztrsv().");
-    
-    if ( lsame_(trans, "N") ) {	/* Form x := inv(A)*x. */
-	
-	if ( lsame_(uplo, "L") ) {
-	    /* Form x := inv(L)*x */
-    	    if ( L->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = 0; k <= Lstore->nsuper; k++) {
-		fsupc = L_FST_SUPC(k);
-		istart = L_SUB_START(fsupc);
-		nsupr = L_SUB_START(fsupc+1) - istart;
-		nsupc = L_FST_SUPC(k+1) - fsupc;
-		luptr = L_NZ_START(fsupc);
-		nrow = nsupr - nsupc;
-
-                /* 1 z_div costs 10 flops */
-	        solve_ops += 4 * nsupc * (nsupc - 1) + 10 * nsupc;
-	        solve_ops += 8 * nrow * nsupc;
-
-		if ( nsupc == 1 ) {
-		    for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) {
-			irow = L_SUB(iptr);
-			++luptr;
-			zz_mult(&comp_zero, &x[fsupc], &Lval[luptr]);
-			z_sub(&x[irow], &x[irow], &comp_zero);
-		    }
-		} else {
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		    CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
-		       	&x[fsupc], &incx);
-		
-		    CGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
-		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
-#else
-		    ztrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr,
-		       	&x[fsupc], &incx);
-		
-		    zgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
-		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
-#endif
-#else
-		    zlsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]);
-		
-		    zmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
-                             &x[fsupc], &work[0] );
-#endif		
-		
-		    iptr = istart + nsupc;
-		    for (i = 0; i < nrow; ++i, ++iptr) {
-			irow = L_SUB(iptr);
-			z_sub(&x[irow], &x[irow], &work[i]); /* Scatter */
-			work[i] = comp_zero;
-
-		    }
-	 	}
-	    } /* for k ... */
-	    
-	} else {
-	    /* Form x := inv(U)*x */
-	    
-	    if ( U->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = Lstore->nsuper; k >= 0; k--) {
-	    	fsupc = L_FST_SUPC(k);
-	    	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
-	    	nsupc = L_FST_SUPC(k+1) - fsupc;
-	    	luptr = L_NZ_START(fsupc);
-		
-                /* 1 z_div costs 10 flops */
-    	        solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc;
-
-		if ( nsupc == 1 ) {
-		    z_div(&x[fsupc], &x[fsupc], &Lval[luptr]);
-		    for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) {
-			irow = U_SUB(i);
-			zz_mult(&comp_zero, &x[fsupc], &Uval[i]);
-			z_sub(&x[irow], &x[irow], &comp_zero);
-		    }
-		} else {
-#ifdef USE_VENDOR_BLAS
-#ifdef _CRAY
-		    CTRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr,
-		       &x[fsupc], &incx);
-#else
-		    ztrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr,
-                           &x[fsupc], &incx);
-#endif
-#else		
-		    zusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] );
-#endif		
-
-		    for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
-		        solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
-		    	for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); 
-				i++) {
-			    irow = U_SUB(i);
-			zz_mult(&comp_zero, &x[jcol], &Uval[i]);
-			z_sub(&x[irow], &x[irow], &comp_zero);
-		    	}
-                    }
-		}
-	    } /* for k ... */
-	    
-	}
-    } else if ( lsame_(trans, "T") ) { /* Form x := inv(A')*x */
-	
-	if ( lsame_(uplo, "L") ) {
-	    /* Form x := inv(L')*x */
-    	    if ( L->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = Lstore->nsuper; k >= 0; --k) {
-	    	fsupc = L_FST_SUPC(k);
-	    	istart = L_SUB_START(fsupc);
-	    	nsupr = L_SUB_START(fsupc+1) - istart;
-	    	nsupc = L_FST_SUPC(k+1) - fsupc;
-	    	luptr = L_NZ_START(fsupc);
-
-		solve_ops += 8 * (nsupr - nsupc) * nsupc;
-
-		for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
-		    iptr = istart + nsupc;
-		    for (i = L_NZ_START(jcol) + nsupc; 
-				i < L_NZ_START(jcol+1); i++) {
-			irow = L_SUB(iptr);
-			zz_mult(&comp_zero, &x[irow], &Lval[i]);
-		    	z_sub(&x[jcol], &x[jcol], &comp_zero);
-			iptr++;
-		    }
-		}
-		
-		if ( nsupc > 1 ) {
-		    solve_ops += 4 * nsupc * (nsupc - 1);
-#ifdef _CRAY
-                    ftcs1 = _cptofcd("L", strlen("L"));
-                    ftcs2 = _cptofcd("T", strlen("T"));
-                    ftcs3 = _cptofcd("U", strlen("U"));
-		    CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
-			&x[fsupc], &incx);
-#else
-		    ztrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
-			&x[fsupc], &incx);
-#endif
-		}
-	    }
-	} else {
-	    /* Form x := inv(U')*x */
-	    if ( U->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = 0; k <= Lstore->nsuper; k++) {
-	    	fsupc = L_FST_SUPC(k);
-	    	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
-	    	nsupc = L_FST_SUPC(k+1) - fsupc;
-	    	luptr = L_NZ_START(fsupc);
-
-		for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
-		    solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
-		    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
-			irow = U_SUB(i);
-			zz_mult(&comp_zero, &x[irow], &Uval[i]);
-		    	z_sub(&x[jcol], &x[jcol], &comp_zero);
-		    }
-		}
-
-                /* 1 z_div costs 10 flops */
-		solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc;
-
-		if ( nsupc == 1 ) {
-		    z_div(&x[fsupc], &x[fsupc], &Lval[luptr]);
-		} else {
-#ifdef _CRAY
-                    ftcs1 = _cptofcd("U", strlen("U"));
-                    ftcs2 = _cptofcd("T", strlen("T"));
-                    ftcs3 = _cptofcd("N", strlen("N"));
-		    CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
-			    &x[fsupc], &incx);
-#else
-		    ztrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
-			    &x[fsupc], &incx);
-#endif
-		}
-	    } /* for k ... */
-	}
-    } else { /* Form x := conj(inv(A'))*x */
-	
-	if ( lsame_(uplo, "L") ) {
-	    /* Form x := conj(inv(L'))*x */
-    	    if ( L->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = Lstore->nsuper; k >= 0; --k) {
-	    	fsupc = L_FST_SUPC(k);
-	    	istart = L_SUB_START(fsupc);
-	    	nsupr = L_SUB_START(fsupc+1) - istart;
-	    	nsupc = L_FST_SUPC(k+1) - fsupc;
-	    	luptr = L_NZ_START(fsupc);
-
-		solve_ops += 8 * (nsupr - nsupc) * nsupc;
-
-		for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
-		    iptr = istart + nsupc;
-		    for (i = L_NZ_START(jcol) + nsupc; 
-				i < L_NZ_START(jcol+1); i++) {
-			irow = L_SUB(iptr);
-                        zz_conj(&temp, &Lval[i]);
-			zz_mult(&comp_zero, &x[irow], &temp);
-		    	z_sub(&x[jcol], &x[jcol], &comp_zero);
-			iptr++;
-		    }
- 		}
- 		
- 		if ( nsupc > 1 ) {
-		    solve_ops += 4 * nsupc * (nsupc - 1);
-#ifdef _CRAY
-                    ftcs1 = _cptofcd("L", strlen("L"));
-                    ftcs2 = _cptofcd(trans, strlen("T"));
-                    ftcs3 = _cptofcd("U", strlen("U"));
-		    ZTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
-			&x[fsupc], &incx);
-#else
-                    ztrsv_("L", trans, "U", &nsupc, &Lval[luptr], &nsupr,
-                           &x[fsupc], &incx);
-#endif
-		}
-	    }
-	} else {
-	    /* Form x := conj(inv(U'))*x */
-	    if ( U->nrow == 0 ) return 0; /* Quick return */
-	    
-	    for (k = 0; k <= Lstore->nsuper; k++) {
-	    	fsupc = L_FST_SUPC(k);
-	    	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
-	    	nsupc = L_FST_SUPC(k+1) - fsupc;
-	    	luptr = L_NZ_START(fsupc);
-
-		for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
-		    solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
-		    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
-			irow = U_SUB(i);
-                        zz_conj(&temp, &Uval[i]);
-			zz_mult(&comp_zero, &x[irow], &temp);
-		    	z_sub(&x[jcol], &x[jcol], &comp_zero);
-		    }
-		}
-
-                /* 1 z_div costs 10 flops */
-		solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc;
- 
-		if ( nsupc == 1 ) {
-                    zz_conj(&temp, &Lval[luptr]);
-		    z_div(&x[fsupc], &x[fsupc], &temp);
-		} else {
-#ifdef _CRAY
-                    ftcs1 = _cptofcd("U", strlen("U"));
-                    ftcs2 = _cptofcd(trans, strlen("T"));
-                    ftcs3 = _cptofcd("N", strlen("N"));
-		    ZTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
-			    &x[fsupc], &incx);
-#else
-                    ztrsv_("U", trans, "N", &nsupc, &Lval[luptr], &nsupr,
-                               &x[fsupc], &incx);
-#endif
-  		}
-  	    } /* for k ... */
-  	}
-    }
-
-    stat->ops[SOLVE] += solve_ops;
-    SUPERLU_FREE(work);
-    return 0;
-}
-
-
-
-int
-sp_zgemv(char *trans, doublecomplex alpha, SuperMatrix *A, doublecomplex *x, 
-	 int incx, doublecomplex beta, doublecomplex *y, int incy)
-{
-/*  Purpose   
-    =======   
-
-    sp_zgemv()  performs one of the matrix-vector operations   
-       y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   
-    where alpha and beta are scalars, x and y are vectors and A is a
-    sparse A->nrow by A->ncol matrix.   
-
-    Parameters   
-    ==========   
-
-    TRANS  - (input) char*
-             On entry, TRANS specifies the operation to be performed as   
-             follows:   
-                TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.   
-                TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.   
-                TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.   
-
-    ALPHA  - (input) doublecomplex
-             On entry, ALPHA specifies the scalar alpha.   
-
-    A      - (input) SuperMatrix*
-             Before entry, the leading m by n part of the array A must   
-             contain the matrix of coefficients.   
-
-    X      - (input) doublecomplex*, array of DIMENSION at least   
-             ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'   
-             and at least   
-             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.   
-             Before entry, the incremented array X must contain the   
-             vector x.   
-
-    INCX   - (input) int
-             On entry, INCX specifies the increment for the elements of   
-             X. INCX must not be zero.   
-
-    BETA   - (input) doublecomplex
-             On entry, BETA specifies the scalar beta. When BETA is   
-             supplied as zero then Y need not be set on input.   
-
-    Y      - (output) doublecomplex*,  array of DIMENSION at least   
-             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'   
-             and at least   
-             ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.   
-             Before entry with BETA non-zero, the incremented array Y   
-             must contain the vector y. On exit, Y is overwritten by the 
-             updated vector y.
-	     
-    INCY   - (input) int
-             On entry, INCY specifies the increment for the elements of   
-             Y. INCY must not be zero.   
-
-    ==== Sparse Level 2 Blas routine.   
-*/
-
-    /* Local variables */
-    NCformat *Astore;
-    doublecomplex   *Aval;
-    int info;
-    doublecomplex temp, temp1;
-    int lenx, leny, i, j, irow;
-    int iy, jx, jy, kx, ky;
-    int notran;
-    doublecomplex comp_zero = {0.0, 0.0};
-    doublecomplex comp_one = {1.0, 0.0};
-
-    notran = lsame_(trans, "N");
-    Astore = A->Store;
-    Aval = Astore->nzval;
-    
-    /* Test the input parameters */
-    info = 0;
-    if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1;
-    else if ( A->nrow < 0 || A->ncol < 0 ) info = 3;
-    else if (incx == 0) info = 5;
-    else if (incy == 0)	info = 8;
-    if (info != 0) {
-	xerbla_("sp_zgemv ", &info);
-	return 0;
-    }
-
-    /* Quick return if possible. */
-    if (A->nrow == 0 || A->ncol == 0 || 
-	z_eq(&alpha, &comp_zero) && 
-	z_eq(&beta, &comp_one))
-	return 0;
-
-
-    /* Set  LENX  and  LENY, the lengths of the vectors x and y, and set 
-       up the start points in  X  and  Y. */
-    if (lsame_(trans, "N")) {
-	lenx = A->ncol;
-	leny = A->nrow;
-    } else {
-	lenx = A->nrow;
-	leny = A->ncol;
-    }
-    if (incx > 0) kx = 0;
-    else kx =  - (lenx - 1) * incx;
-    if (incy > 0) ky = 0;
-    else ky =  - (leny - 1) * incy;
-
-    /* Start the operations. In this version the elements of A are   
-       accessed sequentially with one pass through A. */
-    /* First form  y := beta*y. */
-    if ( !z_eq(&beta, &comp_one) ) {
-	if (incy == 1) {
-	    if ( z_eq(&beta, &comp_zero) )
-		for (i = 0; i < leny; ++i) y[i] = comp_zero;
-	    else
-		for (i = 0; i < leny; ++i) 
-		  zz_mult(&y[i], &beta, &y[i]);
-	} else {
-	    iy = ky;
-	    if ( z_eq(&beta, &comp_zero) )
-		for (i = 0; i < leny; ++i) {
-		    y[iy] = comp_zero;
-		    iy += incy;
-		}
-	    else
-		for (i = 0; i < leny; ++i) {
-		    zz_mult(&y[iy], &beta, &y[iy]);
-		    iy += incy;
-		}
-	}
-    }
-    
-    if ( z_eq(&alpha, &comp_zero) ) return 0;
-
-    if ( notran ) {
-	/* Form  y := alpha*A*x + y. */
-	jx = kx;
-	if (incy == 1) {
-	    for (j = 0; j < A->ncol; ++j) {
-		if ( !z_eq(&x[jx], &comp_zero) ) {
-		    zz_mult(&temp, &alpha, &x[jx]);
-		    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-			irow = Astore->rowind[i];
-			zz_mult(&temp1, &temp,  &Aval[i]);
-			z_add(&y[irow], &y[irow], &temp1);
-		    }
-		}
-		jx += incx;
-	    }
-	} else {
-	    ABORT("Not implemented.");
-	}
-    } else {
-	/* Form  y := alpha*A'*x + y. */
-	jy = ky;
-	if (incx == 1) {
-	    for (j = 0; j < A->ncol; ++j) {
-		temp = comp_zero;
-		for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
-		    irow = Astore->rowind[i];
-		    zz_mult(&temp1, &Aval[i], &x[irow]);
-		    z_add(&temp, &temp, &temp1);
-		}
-		zz_mult(&temp1, &alpha, &temp);
-		z_add(&y[jy], &y[jy], &temp1);
-		jy += incy;
-	    }
-	} else {
-	    ABORT("Not implemented.");
-	}
-    }
-    return 0;    
-} /* sp_zgemv */
-
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zsp_blas3.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zsp_blas3.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zsp_blas3.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zsp_blas3.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,119 +0,0 @@
-
-/*
- * -- SuperLU routine (version 2.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * November 15, 1997
- *
- */
-/*
- * File name:		sp_blas3.c
- * Purpose:		Sparse BLAS3, using some dense BLAS3 operations.
- */
-
-#include "slu_zdefs.h"
-
-int
-sp_zgemm(char *transa, char *transb, int m, int n, int k, 
-         doublecomplex alpha, SuperMatrix *A, doublecomplex *b, int ldb, 
-         doublecomplex beta, doublecomplex *c, int ldc)
-{
-/*  Purpose   
-    =======   
-
-    sp_z performs one of the matrix-matrix operations   
-
-       C := alpha*op( A )*op( B ) + beta*C,   
-
-    where  op( X ) is one of 
-
-       op( X ) = X   or   op( X ) = X'   or   op( X ) = conjg( X' ),
-
-    alpha and beta are scalars, and A, B and C are matrices, with op( A ) 
-    an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix. 
-  
-
-    Parameters   
-    ==========   
-
-    TRANSA - (input) char*
-             On entry, TRANSA specifies the form of op( A ) to be used in 
-             the matrix multiplication as follows:   
-                TRANSA = 'N' or 'n',  op( A ) = A.   
-                TRANSA = 'T' or 't',  op( A ) = A'.   
-                TRANSA = 'C' or 'c',  op( A ) = conjg( A' ).   
-             Unchanged on exit.   
-
-    TRANSB - (input) char*
-             On entry, TRANSB specifies the form of op( B ) to be used in 
-             the matrix multiplication as follows:   
-                TRANSB = 'N' or 'n',  op( B ) = B.   
-                TRANSB = 'T' or 't',  op( B ) = B'.   
-                TRANSB = 'C' or 'c',  op( B ) = conjg( B' ).   
-             Unchanged on exit.   
-
-    M      - (input) int   
-             On entry,  M  specifies  the number of rows of the matrix 
-	     op( A ) and of the matrix C.  M must be at least zero. 
-	     Unchanged on exit.   
-
-    N      - (input) int
-             On entry,  N specifies the number of columns of the matrix 
-	     op( B ) and the number of columns of the matrix C. N must be 
-	     at least zero.
-	     Unchanged on exit.   
-
-    K      - (input) int
-             On entry, K specifies the number of columns of the matrix 
-	     op( A ) and the number of rows of the matrix op( B ). K must 
-	     be at least  zero.   
-             Unchanged on exit.
-	     
-    ALPHA  - (input) doublecomplex
-             On entry, ALPHA specifies the scalar alpha.   
-
-    A      - (input) SuperMatrix*
-             Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
-             Currently, the type of A can be:
-                 Stype = NC or NCP; Dtype = SLU_Z; Mtype = GE. 
-             In the future, more general A can be handled.
-
-    B      - DOUBLE COMPLEX PRECISION array of DIMENSION ( LDB, kb ), where kb is 
-             n when TRANSB = 'N' or 'n',  and is  k otherwise.   
-             Before entry with  TRANSB = 'N' or 'n',  the leading k by n 
-             part of the array B must contain the matrix B, otherwise 
-             the leading n by k part of the array B must contain the 
-             matrix B.   
-             Unchanged on exit.   
-
-    LDB    - (input) int
-             On entry, LDB specifies the first dimension of B as declared 
-             in the calling (sub) program. LDB must be at least max( 1, n ).  
-             Unchanged on exit.   
-
-    BETA   - (input) doublecomplex
-             On entry, BETA specifies the scalar beta. When BETA is   
-             supplied as zero then C need not be set on input.   
-
-    C      - DOUBLE COMPLEX PRECISION array of DIMENSION ( LDC, n ).   
-             Before entry, the leading m by n part of the array C must 
-             contain the matrix C,  except when beta is zero, in which 
-             case C need not be set on entry.   
-             On exit, the array C is overwritten by the m by n matrix 
-	     ( alpha*op( A )*B + beta*C ).   
-
-    LDC    - (input) int
-             On entry, LDC specifies the first dimension of C as declared 
-             in the calling (sub)program. LDC must be at least max(1,m).   
-             Unchanged on exit.   
-
-    ==== Sparse Level 3 Blas routine.   
-*/
-    int    incx = 1, incy = 1;
-    int    j;
-
-    for (j = 0; j < n; ++j) {
-	sp_zgemv(transa, alpha, A, &b[ldb*j], incx, beta, &c[ldc*j], incy);
-    }
-    return 0;    
-}
diff -Nru hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zutil.c hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zutil.c
--- hypre-2.11.2/src/FEI_mv/SuperLU/SRC/zutil.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/FEI_mv/SuperLU/SRC/zutil.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,485 +0,0 @@
-
-/*
- * -- SuperLU routine (version 3.0) --
- * Univ. of California Berkeley, Xerox Palo Alto Research Center,
- * and Lawrence Berkeley National Lab.
- * October 15, 2003
- *
- */
-/*
-  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
- 
-  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- 
-  Permission is hereby granted to use or copy this program for any
-  purpose, provided the above notices are retained on all copies.
-  Permission to modify the code and to distribute modified code is
-  granted, provided the above notices are retained, and a notice that
-  the code was modified is included with the above copyright notice.
-*/
-
-#include <math.h>
-#include "slu_zdefs.h"
-
-void
-zCreate_CompCol_Matrix(SuperMatrix *A, int m, int n, int nnz, 
-		       doublecomplex *nzval, int *rowind, int *colptr,
-		       Stype_t stype, Dtype_t dtype, Mtype_t mtype)
-{
-    NCformat *Astore;
-
-    A->Stype = stype;
-    A->Dtype = dtype;
-    A->Mtype = mtype;
-    A->nrow = m;
-    A->ncol = n;
-    A->Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) );
-    if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store");
-    Astore = A->Store;
-    Astore->nnz = nnz;
-    Astore->nzval = nzval;
-    Astore->rowind = rowind;
-    Astore->colptr = colptr;
-}
-
-void
-zCreate_CompRow_Matrix(SuperMatrix *A, int m, int n, int nnz, 
-		       doublecomplex *nzval, int *colind, int *rowptr,
-		       Stype_t stype, Dtype_t dtype, Mtype_t mtype)
-{
-    NRformat *Astore;
-
-    A->Stype = stype;
-    A->Dtype = dtype;
-    A->Mtype = mtype;
-    A->nrow = m;
-    A->ncol = n;
-    A->Store = (void *) SUPERLU_MALLOC( sizeof(NRformat) );
-    if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store");
-    Astore = A->Store;
-    Astore->nnz = nnz;
-    Astore->nzval = nzval;
-    Astore->colind = colind;
-    Astore->rowptr = rowptr;
-}
-
-/* Copy matrix A into matrix B. */
-void
-zCopy_CompCol_Matrix(SuperMatrix *A, SuperMatrix *B)
-{
-    NCformat *Astore, *Bstore;
-    int      ncol, nnz, i;
-
-    B->Stype = A->Stype;
-    B->Dtype = A->Dtype;
-    B->Mtype = A->Mtype;
-    B->nrow  = A->nrow;;
-    B->ncol  = ncol = A->ncol;
-    Astore   = (NCformat *) A->Store;
-    Bstore   = (NCformat *) B->Store;
-    Bstore->nnz = nnz = Astore->nnz;
-    for (i = 0; i < nnz; ++i)
-	((doublecomplex *)Bstore->nzval)[i] = ((doublecomplex *)Astore->nzval)[i];
-    for (i = 0; i < nnz; ++i) Bstore->rowind[i] = Astore->rowind[i];
-    for (i = 0; i <= ncol; ++i) Bstore->colptr[i] = Astore->colptr[i];
-}
-
-
-void
-zCreate_Dense_Matrix(SuperMatrix *X, int m, int n, doublecomplex *x, int ldx,
-		    Stype_t stype, Dtype_t dtype, Mtype_t mtype)
-{
-    DNformat    *Xstore;
-    
-    X->Stype = stype;
-    X->Dtype = dtype;
-    X->Mtype = mtype;
-    X->nrow = m;
-    X->ncol = n;
-    X->Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) );
-    if ( !(X->Store) ) ABORT("SUPERLU_MALLOC fails for X->Store");
-    Xstore = (DNformat *) X->Store;
-    Xstore->lda = ldx;
-    Xstore->nzval = (doublecomplex *) x;
-}
-
-void
-zCopy_Dense_Matrix(int M, int N, doublecomplex *X, int ldx,
-			doublecomplex *Y, int ldy)
-{
-/*
- *
- *  Purpose
- *  =======
- *
- *  Copies a two-dimensional matrix X to another matrix Y.
- */
-    int    i, j;
-    
-    for (j = 0; j < N; ++j)
-        for (i = 0; i < M; ++i)
-            Y[i + j*ldy] = X[i + j*ldx];
-}
-
-void
-zCreate_SuperNode_Matrix(SuperMatrix *L, int m, int n, int nnz, 
-			doublecomplex *nzval, int *nzval_colptr, int *rowind,
-			int *rowind_colptr, int *col_to_sup, int *sup_to_col,
-			Stype_t stype, Dtype_t dtype, Mtype_t mtype)
-{
-    SCformat *Lstore;
-
-    L->Stype = stype;
-    L->Dtype = dtype;
-    L->Mtype = mtype;
-    L->nrow = m;
-    L->ncol = n;
-    L->Store = (void *) SUPERLU_MALLOC( sizeof(SCformat) );
-    if ( !(L->Store) ) ABORT("SUPERLU_MALLOC fails for L->Store");
-    Lstore = L->Store;
-    Lstore->nnz = nnz;
-    Lstore->nsuper = col_to_sup[n];
-    Lstore->nzval = nzval;
-    Lstore->nzval_colptr = nzval_colptr;
-    Lstore->rowind = rowind;
-    Lstore->rowind_colptr = rowind_colptr;
-    Lstore->col_to_sup = col_to_sup;
-    Lstore->sup_to_col = sup_to_col;
-
-}
-
-
-/*
- * Convert a row compressed storage into a column compressed storage.
- */
-void
-zCompRow_to_CompCol(int m, int n, int nnz, 
-		    doublecomplex *a, int *colind, int *rowptr,
-		    doublecomplex **at, int **rowind, int **colptr)
-{
-    register int i, j, col, relpos;
-    int *marker;
-
-    /* Allocate storage for another copy of the matrix. */
-    *at = (doublecomplex *) doublecomplexMalloc(nnz);
-    *rowind = (int *) intMalloc(nnz);
-    *colptr = (int *) intMalloc(n+1);
-    marker = (int *) intCalloc(n);
-    
-    /* Get counts of each column of A, and set up column pointers */
-    for (i = 0; i < m; ++i)
-	for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]];
-    (*colptr)[0] = 0;
-    for (j = 0; j < n; ++j) {
-	(*colptr)[j+1] = (*colptr)[j] + marker[j];
-	marker[j] = (*colptr)[j];
-    }
-
-    /* Transfer the matrix into the compressed column storage. */
-    for (i = 0; i < m; ++i) {
-	for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
-	    col = colind[j];
-	    relpos = marker[col];
-	    (*rowind)[relpos] = i;
-	    (*at)[relpos] = a[j];
-	    ++marker[col];
-	}
-    }
-
-    SUPERLU_FREE(marker);
-}
-
-
-void
-zPrint_CompCol_Matrix(char *what, SuperMatrix *A)
-{
-    NCformat     *Astore;
-    register int i,n;
-    double       *dp;
-    
-    printf("\nCompCol matrix %s:\n", what);
-    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
-    n = A->ncol;
-    Astore = (NCformat *) A->Store;
-    dp = (double *) Astore->nzval;
-    printf("nrow %d, ncol %d, nnz %d\n", A->nrow,A->ncol,Astore->nnz);
-    printf("nzval: ");
-    for (i = 0; i < 2*Astore->colptr[n]; ++i) printf("%f  ", dp[i]);
-    printf("\nrowind: ");
-    for (i = 0; i < Astore->colptr[n]; ++i) printf("%d  ", Astore->rowind[i]);
-    printf("\ncolptr: ");
-    for (i = 0; i <= n; ++i) printf("%d  ", Astore->colptr[i]);
-    printf("\n");
-    fflush(stdout);
-}
-
-void
-zPrint_SuperNode_Matrix(char *what, SuperMatrix *A)
-{
-    SCformat     *Astore;
-    register int i, j, k, c, d, n, nsup;
-    double       *dp;
-    int *col_to_sup, *sup_to_col, *rowind, *rowind_colptr;
-    
-    printf("\nSuperNode matrix %s:\n", what);
-    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
-    n = A->ncol;
-    Astore = (SCformat *) A->Store;
-    dp = (double *) Astore->nzval;
-    col_to_sup = Astore->col_to_sup;
-    sup_to_col = Astore->sup_to_col;
-    rowind_colptr = Astore->rowind_colptr;
-    rowind = Astore->rowind;
-    printf("nrow %d, ncol %d, nnz %d, nsuper %d\n", 
-	   A->nrow,A->ncol,Astore->nnz,Astore->nsuper);
-    printf("nzval:\n");
-    for (k = 0; k <= Astore->nsuper; ++k) {
-      c = sup_to_col[k];
-      nsup = sup_to_col[k+1] - c;
-      for (j = c; j < c + nsup; ++j) {
-	d = Astore->nzval_colptr[j];
-	for (i = rowind_colptr[c]; i < rowind_colptr[c+1]; ++i) {
-	  printf("%d\t%d\t%e\t%e\n", rowind[i], j, dp[d], dp[d+1]);
-          d += 2;	
-	}
-      }
-    }
-#if 0
-    for (i = 0; i < 2*Astore->nzval_colptr[n]; ++i) printf("%f  ", dp[i]);
-#endif
-    printf("\nnzval_colptr: ");
-    for (i = 0; i <= n; ++i) printf("%d  ", Astore->nzval_colptr[i]);
-    printf("\nrowind: ");
-    for (i = 0; i < Astore->rowind_colptr[n]; ++i) 
-        printf("%d  ", Astore->rowind[i]);
-    printf("\nrowind_colptr: ");
-    for (i = 0; i <= n; ++i) printf("%d  ", Astore->rowind_colptr[i]);
-    printf("\ncol_to_sup: ");
-    for (i = 0; i < n; ++i) printf("%d  ", col_to_sup[i]);
-    printf("\nsup_to_col: ");
-    for (i = 0; i <= Astore->nsuper+1; ++i) 
-        printf("%d  ", sup_to_col[i]);
-    printf("\n");
-    fflush(stdout);
-}
-
-void
-zPrint_Dense_Matrix(char *what, SuperMatrix *A)
-{
-    DNformat     *Astore;
-    register int i, j, lda = Astore->lda;
-    double       *dp;
-    
-    printf("\nDense matrix %s:\n", what);
-    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
-    Astore = (DNformat *) A->Store;
-    dp = (double *) Astore->nzval;
-    printf("nrow %d, ncol %d, lda %d\n", A->nrow,A->ncol,lda);
-    printf("\nnzval: ");
-    for (j = 0; j < A->ncol; ++j) {
-        for (i = 0; i < 2*A->nrow; ++i) printf("%f  ", dp[i + j*2*lda]);
-        printf("\n");
-    }
-    printf("\n");
-    fflush(stdout);
-}
-
-/*
- * Diagnostic print of column "jcol" in the U/L factor.
- */
-void
-zprint_lu_col(char *msg, int jcol, int pivrow, int *xprune, GlobalLU_t *Glu)
-{
-    int     i, k, fsupc;
-    int     *xsup, *supno;
-    int     *xlsub, *lsub;
-    doublecomplex  *lusup;
-    int     *xlusup;
-    doublecomplex  *ucol;
-    int     *usub, *xusub;
-
-    xsup    = Glu->xsup;
-    supno   = Glu->supno;
-    lsub    = Glu->lsub;
-    xlsub   = Glu->xlsub;
-    lusup   = Glu->lusup;
-    xlusup  = Glu->xlusup;
-    ucol    = Glu->ucol;
-    usub    = Glu->usub;
-    xusub   = Glu->xusub;
-    
-    printf("%s", msg);
-    printf("col %d: pivrow %d, supno %d, xprune %d\n", 
-	   jcol, pivrow, supno[jcol], xprune[jcol]);
-    
-    printf("\tU-col:\n");
-    for (i = xusub[jcol]; i < xusub[jcol+1]; i++)
-	printf("\t%d%10.4f, %10.4f\n", usub[i], ucol[i].r, ucol[i].i);
-    printf("\tL-col in rectangular snode:\n");
-    fsupc = xsup[supno[jcol]];	/* first col of the snode */
-    i = xlsub[fsupc];
-    k = xlusup[jcol];
-    while ( i < xlsub[fsupc+1] && k < xlusup[jcol+1] ) {
-	printf("\t%d\t%10.4f, %10.4f\n", lsub[i], lusup[k].r, lusup[k].i);
-	i++; k++;
-    }
-    fflush(stdout);
-}
-
-
-/*
- * Check whether tempv[] == 0. This should be true before and after 
- * calling any numeric routines, i.e., "panel_bmod" and "column_bmod". 
- */
-void zcheck_tempv(int n, doublecomplex *tempv)
-{
-    int i;
-	
-    for (i = 0; i < n; i++) {
-	if ((tempv[i].r != 0.0) || (tempv[i].i != 0.0))
-	{
-	    fprintf(stderr,"tempv[%d] = {%f, %f}\n", i, tempv[i].r, tempv[i].i);
-	    ABORT("zcheck_tempv");
-	}
-    }
-}
-
-
-void
-zGenXtrue(int n, int nrhs, doublecomplex *x, int ldx)
-{
-    int  i, j;
-    for (j = 0; j < nrhs; ++j)
-	for (i = 0; i < n; ++i) {
-	    x[i + j*ldx].r = 1.0;
-	    x[i + j*ldx].i = 0.0;
-	}
-}
-
-/*
- * Let rhs[i] = sum of i-th row of A, so the solution vector is all 1's
- */
-void
-zFillRHS(trans_t trans, int nrhs, doublecomplex *x, int ldx,
-         SuperMatrix *A, SuperMatrix *B)
-{
-    NCformat *Astore;
-    doublecomplex   *Aval;
-    DNformat *Bstore;
-    doublecomplex   *rhs;
-    doublecomplex one = {1.0, 0.0};
-    doublecomplex zero = {0.0, 0.0};
-    int      ldc;
-    char transc[1];
-
-    Astore = A->Store;
-    Aval   = (doublecomplex *) Astore->nzval;
-    Bstore = B->Store;
-    rhs    = Bstore->nzval;
-    ldc    = Bstore->lda;
-    
-    if ( trans == NOTRANS ) *(unsigned char *)transc = 'N';
-    else *(unsigned char *)transc = 'T';
-
-    sp_zgemm(transc, "N", A->nrow, nrhs, A->ncol, one, A,
-	     x, ldx, zero, rhs, ldc);
-
-}
-
-/* 
- * Fills a doublecomplex precision array with a given value.
- */
-void 
-zfill(doublecomplex *a, int alen, doublecomplex dval)
-{
-    register int i;
-    for (i = 0; i < alen; i++) a[i] = dval;
-}
-
-
-
-/* 
- * Check the inf-norm of the error vector 
- */
-void zinf_norm_error(int nrhs, SuperMatrix *X, doublecomplex *xtrue)
-{
-    DNformat *Xstore;
-    double err, xnorm;
-    doublecomplex *Xmat, *soln_work;
-    doublecomplex temp;
-    int i, j;
-
-    Xstore = X->Store;
-    Xmat = Xstore->nzval;
-
-    for (j = 0; j < nrhs; j++) {
-      soln_work = &Xmat[j*Xstore->lda];
-      err = xnorm = 0.0;
-      for (i = 0; i < X->nrow; i++) {
-        z_sub(&temp, &soln_work[i], &xtrue[i]);
-	err = SUPERLU_MAX(err, z_abs(&temp));
-	xnorm = SUPERLU_MAX(xnorm, z_abs(&soln_work[i]));
-      }
-      err = err / xnorm;
-      printf("||X - Xtrue||/||X|| = %e\n", err);
-    }
-}
-
-
-
-/* Print performance of the code. */
-void
-zPrintPerf(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage,
-           double rpg, double rcond, double *ferr,
-           double *berr, char *equed, SuperLUStat_t *stat)
-{
-    SCformat *Lstore;
-    NCformat *Ustore;
-    double   *utime;
-    flops_t  *ops;
-    
-    utime = stat->utime;
-    ops   = stat->ops;
-    
-    if ( utime[FACT] != 0. )
-	printf("Factor flops = %e\tMflops = %8.2f\n", ops[FACT],
-	       ops[FACT]*1e-6/utime[FACT]);
-    printf("Identify relaxed snodes	= %8.2f\n", utime[RELAX]);
-    if ( utime[SOLVE] != 0. )
-	printf("Solve flops = %.0f, Mflops = %8.2f\n", ops[SOLVE],
-	       ops[SOLVE]*1e-6/utime[SOLVE]);
-    
-    Lstore = (SCformat *) L->Store;
-    Ustore = (NCformat *) U->Store;
-    printf("\tNo of nonzeros in factor L = %d\n", Lstore->nnz);
-    printf("\tNo of nonzeros in factor U = %d\n", Ustore->nnz);
-    printf("\tNo of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz);
-	
-    printf("L\\U MB %.3f\ttotal MB needed %.3f\texpansions %d\n",
-	   mem_usage->for_lu/1e6, mem_usage->total_needed/1e6,
-	   mem_usage->expansions);
-	
-    printf("\tFactor\tMflops\tSolve\tMflops\tEtree\tEquil\tRcond\tRefine\n");
-    printf("PERF:%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f\n",
-	   utime[FACT], ops[FACT]*1e-6/utime[FACT],
-	   utime[SOLVE], ops[SOLVE]*1e-6/utime[SOLVE],
-	   utime[ETREE], utime[EQUIL], utime[RCOND], utime[REFINE]);
-    
-    printf("\tRpg\t\tRcond\t\tFerr\t\tBerr\t\tEquil?\n");
-    printf("NUM:\t%e\t%e\t%e\t%e\t%s\n",
-	   rpg, rcond, ferr[0], berr[0], equed);
-    
-}
-
-
-
-
-print_doublecomplex_vec(char *what, int n, doublecomplex *vec)
-{
-    int i;
-    printf("%s: n %d\n", what, n);
-    for (i = 0; i < n; ++i) printf("%d\t%f%f\n", i, vec[i].r, vec[i].i);
-    return 0;
-}
-
diff -Nru hypre-2.11.2/src/IJ_mv/IJVector_parcsr.c hypre-2.13.0/src/IJ_mv/IJVector_parcsr.c
--- hypre-2.11.2/src/IJ_mv/IJVector_parcsr.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/IJ_mv/IJVector_parcsr.c	2017-10-20 17:42:22.000000000 +0000
@@ -858,7 +858,7 @@
    			     	      HYPRE_Complex  *off_proc_data)
 {
    MPI_Comm comm = hypre_IJVectorComm(vector);
-   hypre_ParVector *par_vector = hypre_IJVectorObject(vector);
+   hypre_ParVector *par_vector = ( hypre_ParVector *) hypre_IJVectorObject(vector);
    hypre_MPI_Request *requests = NULL;
    hypre_MPI_Status *status = NULL;
    HYPRE_Int i, j, j2, row;
@@ -1315,7 +1315,8 @@
       ex_contact_vec_starts[i+1] = -storage-1; /* need negative for next loop */
    }      
 
-   void_contact_buf = hypre_MAlloc(storage*obj_size_bytes);
+   /*void_contact_buf = hypre_MAlloc(storage*obj_size_bytes);*/
+   void_contact_buf = hypre_CAlloc(storage,obj_size_bytes);
    index_ptr = void_contact_buf; /* step through with this index */
 
    /* set up data to be sent to send procs */
diff -Nru hypre-2.11.2/src/krylov/bicgstab.c hypre-2.13.0/src/krylov/bicgstab.c
--- hypre-2.11.2/src/krylov/bicgstab.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/krylov/bicgstab.c	2017-10-20 17:42:22.000000000 +0000
@@ -19,7 +19,6 @@
 #include "krylov.h"
 #include "_hypre_utilities.h"
 
-
 /*--------------------------------------------------------------------------
  * hypre_BiCGSTABFunctionsCreate
  *--------------------------------------------------------------------------*/
@@ -236,7 +235,7 @@
    HYPRE_Int        iter; 
    HYPRE_Int        my_id, num_procs;
    HYPRE_Real alpha, beta, gamma, epsilon, temp, res, r_norm, b_norm;
-   HYPRE_Real epsmac = 1.e-128; 
+   HYPRE_Real epsmac = HYPRE_REAL_MIN; 
    HYPRE_Real ieee_check = 0.;
    HYPRE_Real cf_ave_0 = 0.0;
    HYPRE_Real cf_ave_1 = 0.0;
diff -Nru hypre-2.11.2/src/krylov/gmres.c hypre-2.13.0/src/krylov/gmres.c
--- hypre-2.11.2/src/krylov/gmres.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/krylov/gmres.c	2017-10-20 17:42:22.000000000 +0000
@@ -833,7 +833,6 @@
       (gmres_data -> rel_residual_norm) = r_norm;
 
    if (iter >= max_iter && r_norm > epsilon) hypre_error(HYPRE_ERROR_CONV);
-   
 
    hypre_TFreeF(c,gmres_functions); 
    hypre_TFreeF(s,gmres_functions); 
diff -Nru hypre-2.11.2/src/krylov/HYPRE_lobpcg.c hypre-2.13.0/src/krylov/HYPRE_lobpcg.c
--- hypre-2.11.2/src/krylov/HYPRE_lobpcg.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/krylov/HYPRE_lobpcg.c	2017-10-20 17:42:22.000000000 +0000
@@ -10,7 +10,6 @@
  * $Revision$
  ***********************************************************************EHEADER*/
 
-
 /******************************************************************************
  *
  * HYPRE_LOBPCG interface
@@ -20,29 +19,6 @@
 #include "_hypre_utilities.h"
 
 #include "HYPRE_config.h"
-#ifdef HYPRE_USING_ESSL
-
-#include <essl.h>
-
-#else
-
-#include "fortran.h"
-#ifdef __cplusplus
-extern "C" {
-#endif
-	
-HYPRE_Int hypre_F90_NAME_LAPACK(dsygv, DSYGV)
-   ( HYPRE_Int *itype, char *jobz, char *uplo, HYPRE_Int *n,
-     HYPRE_Real *a, HYPRE_Int *lda, HYPRE_Real *b, HYPRE_Int *ldb, HYPRE_Real *w,
-     HYPRE_Real *work, HYPRE_Int *lwork, /*@out@*/ HYPRE_Int *info
-      );
-HYPRE_Int hypre_F90_NAME_LAPACK( dpotrf, DPOTRF )
-   (const char* uplo, HYPRE_Int* n, HYPRE_Real* aval, HYPRE_Int* lda, HYPRE_Int* ierr );
-#ifdef __cplusplus
-}
-#endif
-
-#endif
 
 #include "HYPRE_lobpcg.h"
 #include "lobpcg.h"
@@ -50,6 +26,8 @@
 #include "interpreter.h"
 #include "HYPRE_MatvecFunctions.h"
 
+#include "_hypre_lapack.h"
+
 typedef struct
 {
 	HYPRE_Int    (*Precond)(void*,void*,void*,void*);
@@ -107,24 +85,14 @@
                             n, HYPRE_Real *a, HYPRE_Int *lda, HYPRE_Real *b, HYPRE_Int *ldb,
                             HYPRE_Real *w, HYPRE_Real *work, HYPRE_Int *lwork, HYPRE_Int *info)
 {
-#ifdef HYPRE_USING_ESSL
-   dsygv(*itype, a, *lda, b, *ldb, w, a, *lda, *n, work, *lwork );
-#else
-   hypre_F90_NAME_LAPACK( dsygv, DSYGV )( itype, jobz, uplo, n, 
-                                          a, lda, b, ldb,
-                                          w, work, lwork, info );
-#endif
+   hypre_dsygv(itype, jobz, uplo, n, a, lda, b, ldb, w, work, lwork, info);
    return 0;
 }
 
-static HYPRE_Int dpotrf_interface (char *uplo, HYPRE_Int *n, HYPRE_Real *a, HYPRE_Int *
+static HYPRE_Int dpotrf_interface (const char *uplo, HYPRE_Int *n, HYPRE_Real *a, HYPRE_Int *
                              lda, HYPRE_Int *info)
 {
-#ifdef HYPRE_USING_ESSL
-   dpotrf(uplo, *n, a, *lda, info);
-#else
-   hypre_F90_NAME_LAPACK( dpotrf, DPOTRF )(uplo, n, a, lda, info);
-#endif
+   hypre_dpotrf(uplo, n, a, lda, info);
    return 0;
 }
 
diff -Nru hypre-2.11.2/src/krylov/lobpcg.c hypre-2.13.0/src/krylov/lobpcg.c
--- hypre-2.11.2/src/krylov/lobpcg.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/krylov/lobpcg.c	2017-10-20 17:42:22.000000000 +0000
@@ -29,7 +29,7 @@
 
 static HYPRE_Int
 lobpcg_chol( utilities_FortranMatrix* a, 
-        HYPRE_Int (*dpotrf) (char *uplo, HYPRE_Int *n, HYPRE_Real *a, HYPRE_Int *lda, HYPRE_Int *info) )
+        HYPRE_Int (*dpotrf) (const char *uplo, HYPRE_Int *n, HYPRE_Real *a, HYPRE_Int *lda, HYPRE_Int *info) )
 {
 
   HYPRE_Int lda, n;
@@ -121,7 +121,7 @@
 mv_MultiVectorPtr x, mv_MultiVectorPtr y,
 utilities_FortranMatrix* r,
 mv_MultiVectorPtr z,
-HYPRE_Int (*dpotrf) (char *uplo, HYPRE_Int *n, HYPRE_Real *a, HYPRE_Int *lda, HYPRE_Int *info)
+HYPRE_Int (*dpotrf) (const char *uplo, HYPRE_Int *n, HYPRE_Real *a, HYPRE_Int *lda, HYPRE_Int *info)
 
 ){
 
@@ -176,7 +176,7 @@
   for ( i = 0; i < n; i++ ) {
     if ( utilities_FortranMatrixValue( resNorms, i + 1, 1 ) >
 	 utilities_FortranMatrixValue( lambda, i + 1, 1 )*rtol + atol
-	 + DBL_EPSILON ) {
+	 + HYPRE_REAL_EPSILON ) {
       activeMask[i] = 1; 
       notConverged++;
     }
diff -Nru hypre-2.11.2/src/krylov/lobpcg.h hypre-2.13.0/src/krylov/lobpcg.h
--- hypre-2.11.2/src/krylov/lobpcg.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/krylov/lobpcg.h	2017-10-20 17:42:22.000000000 +0000
@@ -15,6 +15,10 @@
 #ifndef LOCALLY_OPTIMAL_BLOCK_PRECONDITIONED_CONJUGATE_GRADIENTS
 #define LOCALLY_OPTIMAL_BLOCK_PRECONDITIONED_CONJUGATE_GRADIENTS
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define PROBLEM_SIZE_TOO_SMALL			       	1
 #define WRONG_BLOCK_SIZE			       	2
 #define WRONG_CONSTRAINTS                               3
@@ -30,7 +34,7 @@
 typedef struct {
 
 /* these pointers should point to 2 functions providing standard lapack  functionality */
-   HYPRE_Int   (*dpotrf) (char *uplo, HYPRE_Int *n, HYPRE_Real *a, HYPRE_Int *
+   HYPRE_Int   (*dpotrf) (const char *uplo, HYPRE_Int *n, HYPRE_Real *a, HYPRE_Int *
         lda, HYPRE_Int *info);
    HYPRE_Int   (*dsygv) (HYPRE_Int *itype, char *jobz, char *uplo, HYPRE_Int *
         n, HYPRE_Real *a, HYPRE_Int *lda, HYPRE_Real *b, HYPRE_Int *ldb,
@@ -38,10 +42,6 @@
 
 } lobpcg_BLASLAPACKFunctions;
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-
 HYPRE_Int
 lobpcg_solve( mv_MultiVectorPtr blockVectorX,
 	      void* operatorAData,
diff -Nru hypre-2.11.2/src/krylov/Makefile hypre-2.13.0/src/krylov/Makefile
--- hypre-2.11.2/src/krylov/Makefile	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/krylov/Makefile	2017-10-20 17:42:22.000000000 +0000
@@ -18,6 +18,8 @@
 C_COMPILE_FLAGS = \
  -I..\
  -I$(srcdir)/..\
+ -I$(srcdir)/../blas\
+ -I$(srcdir)/../lapack\
  -I$(srcdir)/../multivector\
  -I$(srcdir)/../utilities\
  ${CINCLUDES}
diff -Nru hypre-2.11.2/src/krylov/pcg.c hypre-2.13.0/src/krylov/pcg.c
--- hypre-2.11.2/src/krylov/pcg.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/krylov/pcg.c	2017-10-20 17:42:22.000000000 +0000
@@ -645,7 +645,7 @@
          break;
       }
 
-      if ( (gamma<1.0e-292) && ((-gamma)<1.0e-292) ) {
+      if ( (gamma<HYPRE_REAL_MIN) && ((-gamma)<HYPRE_REAL_MIN) ) {
          /* ierr = 1;*/
          hypre_error(HYPRE_ERROR_CONV);
          
@@ -668,7 +668,7 @@
       if (cf_tol > 0.0)
       {
          cf_ave_0 = cf_ave_1;
-         if ( i_prod_0<1.0e-292 ) {
+         if ( i_prod_0<HYPRE_REAL_MIN ) {
             /* i_prod_0 is zero, or (almost) subnormal, yet i_prod wasn't small
                enough to pass the convergence test.  Therefore initial guess was good,
                and we're just calculating garbage - time to bail out before the
diff -Nru hypre-2.11.2/src/lapack/dbdsqr.c hypre-2.13.0/src/lapack/dbdsqr.c
--- hypre-2.11.2/src/lapack/dbdsqr.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dbdsqr.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,6 +1,9 @@
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include "f2c.h"
+#include "hypre_lapack.h"
 
 /*  -- translated by f2c (version 19990503).
    You must link the resulting object file with the libraries:
@@ -14,7 +17,7 @@
 static doublereal c_b49 = 1.;
 static doublereal c_b72 = -1.;
 
-/* Subroutine */ HYPRE_Int dbdsqr_(const char *uplo, integer *n, integer *ncvt, integer *
+/* Subroutine */ integer dbdsqr_(const char *uplo, integer *n, integer *ncvt, integer *
 	nru, integer *ncc, doublereal *d__, doublereal *e, doublereal *vt, 
 	integer *ldvt, doublereal *u, integer *ldu, doublereal *c__, integer *
 	ldc, doublereal *work, integer *info)
@@ -25,7 +28,7 @@
     doublereal d__1, d__2, d__3, d__4;
 
     /* Builtin functions */
-    HYPRE_Real pow_dd(doublereal *, doublereal *), sqrt(doublereal), d_sign(
+    doublereal pow_dd(doublereal *, doublereal *), d_sign(
 	    doublereal *, doublereal *);
 
     /* Local variables */
@@ -36,27 +39,27 @@
     static doublereal cosl;
     static integer isub, iter;
     static doublereal unfl, sinl, cosr, smin, smax, sinr;
-    extern /* Subroutine */ HYPRE_Int drot_(integer *, doublereal *, integer *, 
+    extern /* Subroutine */ integer drot_(integer *, doublereal *, integer *, 
 	    doublereal *, integer *, doublereal *, doublereal *), dlas2_(
 	    doublereal *, doublereal *, doublereal *, doublereal *, 
 	    doublereal *);
     static doublereal f, g, h__;
     static integer i__, j, m;
     static doublereal r__;
-    extern /* Subroutine */ HYPRE_Int dscal_(integer *, doublereal *, doublereal *, 
+    extern /* Subroutine */ integer dscal_(integer *, doublereal *, doublereal *, 
 	    integer *);
     extern logical lsame_(const char *,const char *);
     static doublereal oldcs;
-    extern /* Subroutine */ HYPRE_Int dlasr_(const char *,const char *,const char *, integer *, 
+    extern /* Subroutine */ integer dlasr_(const char *,const char *,const char *, integer *, 
 	    integer *, doublereal *, doublereal *, doublereal *, integer *);
     static integer oldll;
     static doublereal shift, sigmn, oldsn;
-    extern /* Subroutine */ HYPRE_Int dswap_(integer *, doublereal *, integer *, 
+    extern /* Subroutine */ integer dswap_(integer *, doublereal *, integer *, 
 	    doublereal *, integer *);
     static integer maxit;
     static doublereal sminl, sigmx;
     static logical lower;
-    extern /* Subroutine */ HYPRE_Int dlasq1_(integer *, doublereal *, doublereal *,
+    extern /* Subroutine */ integer dlasq1_(integer *, doublereal *, doublereal *,
 	     doublereal *, integer *), dlasv2_(doublereal *, doublereal *, 
 	    doublereal *, doublereal *, doublereal *, doublereal *, 
 	    doublereal *, doublereal *, doublereal *);
@@ -64,7 +67,7 @@
     static integer ll;
     extern doublereal dlamch_(const char *);
     static doublereal sn, mu;
-    extern /* Subroutine */ HYPRE_Int dlartg_(doublereal *, doublereal *, 
+    extern /* Subroutine */ integer dlartg_(doublereal *, doublereal *, 
 	    doublereal *, doublereal *, doublereal *), xerbla_(const char *, 
 	    integer *);
     static doublereal sminoa, thresh;
@@ -893,4 +896,6 @@
 #undef u_ref
 #undef c___ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dgebd2.c hypre-2.13.0/src/lapack/dgebd2.c
--- hypre-2.11.2/src/lapack/dgebd2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dgebd2.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dgebd2_(integer *m, integer *n, doublereal *a, integer *
+/* Subroutine */ integer dgebd2_(integer *m, integer *n, doublereal *a, integer *
 	lda, doublereal *d__, doublereal *e, doublereal *tauq, doublereal *
 	taup, doublereal *work, integer *info)
 {
@@ -134,7 +137,7 @@
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
     /* Local variables */
     static integer i__;
-    extern /* Subroutine */ HYPRE_Int dlarf_(const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dlarf_(const char *, integer *, integer *, 
 	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
 	    doublereal *), dlarfg_(integer *, doublereal *, 
 	    doublereal *, integer *, doublereal *), xerbla_(const char *, integer *);
@@ -276,4 +279,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dgebrd.c hypre-2.13.0/src/lapack/dgebrd.c
--- hypre-2.11.2/src/lapack/dgebrd.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dgebrd.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,10 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dgebrd_(integer *m, integer *n, doublereal *a, integer *
+/* Subroutine */ integer dgebrd_(integer *m, integer *n, doublereal *a, integer *
 	lda, doublereal *d__, doublereal *e, doublereal *tauq, doublereal *
 	taup, doublereal *work, integer *lwork, integer *info)
 {
@@ -152,20 +153,20 @@
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
     /* Local variables */
     static integer i__, j;
-    extern /* Subroutine */ HYPRE_Int dgemm_(const char *,const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dgemm_(const char *,const char *, integer *, integer *, 
 	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
 	    integer *, doublereal *, doublereal *, integer *);
     static integer nbmin, iinfo, minmn;
-    extern /* Subroutine */ HYPRE_Int dgebd2_(integer *, integer *, doublereal *, 
+    extern /* Subroutine */ integer dgebd2_(integer *, integer *, doublereal *, 
 	    integer *, doublereal *, doublereal *, doublereal *, doublereal *,
 	     doublereal *, integer *);
     static integer nb;
-    extern /* Subroutine */ HYPRE_Int dlabrd_(integer *, integer *, integer *, 
+    extern /* Subroutine */ integer dlabrd_(integer *, integer *, integer *, 
 	    doublereal *, integer *, doublereal *, doublereal *, doublereal *,
 	     doublereal *, doublereal *, integer *, doublereal *, integer *);
     static integer nx;
     static doublereal ws;
-    extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
     extern integer ilaenv_(integer *, const char *,const char *, integer *, integer *, 
 	    integer *, integer *, ftnlen, ftnlen);
     static integer ldwrkx, ldwrky, lwkopt;
@@ -320,4 +321,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dgelq2.c hypre-2.13.0/src/lapack/dgelq2.c
--- hypre-2.11.2/src/lapack/dgelq2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dgelq2.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dgelq2_(integer *m, integer *n, doublereal *a, integer *
+/* Subroutine */ integer dgelq2_(integer *m, integer *n, doublereal *a, integer *
 	lda, doublereal *tau, doublereal *work, integer *info)
 {
 /*  -- LAPACK routine (version 3.0) --   
@@ -72,7 +75,7 @@
     integer a_dim1, a_offset, i__1, i__2, i__3;
     /* Local variables */
     static integer i__, k;
-    extern /* Subroutine */ HYPRE_Int dlarf_(const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dlarf_(const char *, integer *, integer *, 
 	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
 	    doublereal *), dlarfg_(integer *, doublereal *, 
 	    doublereal *, integer *, doublereal *), xerbla_(const char *, integer *);
@@ -134,4 +137,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dgelqf.c hypre-2.13.0/src/lapack/dgelqf.c
--- hypre-2.11.2/src/lapack/dgelqf.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dgelqf.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dgelqf_(integer *m, integer *n, doublereal *a, integer *
+/* Subroutine */ integer dgelqf_(integer *m, integer *n, doublereal *a, integer *
 	lda, doublereal *tau, doublereal *work, integer *lwork, integer *info)
 {
 /*  -- LAPACK routine (version 3.0) --   
@@ -89,15 +92,15 @@
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
     /* Local variables */
     static integer i__, k, nbmin, iinfo;
-    extern /* Subroutine */ HYPRE_Int dgelq2_(integer *, integer *, doublereal *, 
+    extern /* Subroutine */ integer dgelq2_(integer *, integer *, doublereal *, 
 	    integer *, doublereal *, doublereal *, integer *);
     static integer ib, nb;
-    extern /* Subroutine */ HYPRE_Int dlarfb_(const char *,const char *,const char *,const char *, 
+    extern /* Subroutine */ integer dlarfb_(const char *,const char *,const char *,const char *, 
 	    integer *, integer *, integer *, doublereal *, integer *, 
 	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
 	    integer *);
     static integer nx;
-    extern /* Subroutine */ HYPRE_Int dlarft_(const char *,const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dlarft_(const char *,const char *, integer *, integer *, 
 	    doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(const char *, integer *);
     extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, 
 	    integer *, integer *, ftnlen, ftnlen);
@@ -234,4 +237,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dgels.c hypre-2.13.0/src/lapack/dgels.c
--- hypre-2.11.2/src/lapack/dgels.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dgels.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,9 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dgels_(char *trans, integer *m, integer *n, integer *
+/* Subroutine */ integer dgels_(char *trans, integer *m, integer *n, integer *
 	nrhs, doublereal *a, integer *lda, doublereal *b, integer *ldb, 
 	doublereal *work, integer *lwork, integer *info)
 {
@@ -131,17 +133,17 @@
     static logical tpsd;
     static integer i__, j, iascl, ibscl;
     extern logical lsame_(const char *,const char *);
-    extern /* Subroutine */ HYPRE_Int dtrsm_(const char *,const char *,const char *,const char *, 
+    extern /* Subroutine */ integer dtrsm_(const char *,const char *,const char *,const char *, 
 	    integer *, integer *, doublereal *, doublereal *, integer *, 
 	    doublereal *, integer *);
     static integer wsize;
     static doublereal rwork[1];
-    extern /* Subroutine */ HYPRE_Int dlabad_(doublereal *, doublereal *);
+    extern /* Subroutine */ integer dlabad_(doublereal *, doublereal *);
     static integer nb;
     extern doublereal dlamch_(const char *), dlange_(const char *, integer *, 
 	    integer *, doublereal *, integer *, doublereal *);
     static integer mn;
-    extern /* Subroutine */ HYPRE_Int dgelqf_(integer *, integer *, doublereal *, 
+    extern /* Subroutine */ integer dgelqf_(integer *, integer *, doublereal *, 
 	    integer *, doublereal *, doublereal *, integer *, integer *), 
 	    dlascl_(const char *, integer *, integer *, doublereal *, doublereal *, 
 	    integer *, integer *, doublereal *, integer *, integer *),
@@ -153,7 +155,7 @@
 	    integer *, integer *, ftnlen, ftnlen);
     static integer scllen;
     static doublereal bignum;
-    extern /* Subroutine */ HYPRE_Int dormlq_(const char *,const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dormlq_(const char *,const char *, integer *, integer *, 
 	    integer *, doublereal *, integer *, doublereal *, doublereal *, 
 	    integer *, doublereal *, integer *, integer *), 
 	    dormqr_(const char *,const char *, integer *, integer *, integer *, 
@@ -473,4 +475,6 @@
 
 #undef b_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dgeqr2.c hypre-2.13.0/src/lapack/dgeqr2.c
--- hypre-2.11.2/src/lapack/dgeqr2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dgeqr2.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dgeqr2_(integer *m, integer *n, doublereal *a, integer *
+/* Subroutine */ integer dgeqr2_(integer *m, integer *n, doublereal *a, integer *
 	lda, doublereal *tau, doublereal *work, integer *info)
 {
 /*  -- LAPACK routine (version 3.0) --   
@@ -75,7 +78,7 @@
     integer a_dim1, a_offset, i__1, i__2, i__3;
     /* Local variables */
     static integer i__, k;
-    extern /* Subroutine */ HYPRE_Int dlarf_(const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dlarf_(const char *, integer *, integer *, 
 	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
 	    doublereal *), dlarfg_(integer *, doublereal *, 
 	    doublereal *, integer *, doublereal *), xerbla_(const char *, integer *);
@@ -138,4 +141,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dgeqrf.c hypre-2.13.0/src/lapack/dgeqrf.c
--- hypre-2.11.2/src/lapack/dgeqrf.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dgeqrf.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dgeqrf_(integer *m, integer *n, doublereal *a, integer *
+/* Subroutine */ integer dgeqrf_(integer *m, integer *n, doublereal *a, integer *
 	lda, doublereal *tau, doublereal *work, integer *lwork, integer *info)
 {
 /*  -- LAPACK routine (version 3.0) --   
@@ -90,15 +93,15 @@
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
     /* Local variables */
     static integer i__, k, nbmin, iinfo;
-    extern /* Subroutine */ HYPRE_Int dgeqr2_(integer *, integer *, doublereal *, 
+    extern /* Subroutine */ integer dgeqr2_(integer *, integer *, doublereal *, 
 	    integer *, doublereal *, doublereal *, integer *);
     static integer ib, nb;
-    extern /* Subroutine */ HYPRE_Int dlarfb_(const char *,const char *,const char *,const char *, 
+    extern /* Subroutine */ integer dlarfb_(const char *,const char *,const char *,const char *, 
 	    integer *, integer *, integer *, doublereal *, integer *, 
 	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
 	    integer *);
     static integer nx;
-    extern /* Subroutine */ HYPRE_Int dlarft_(const char *,const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dlarft_(const char *,const char *, integer *, integer *, 
 	    doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(const char *, integer *);
     extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, 
 	    integer *, integer *, ftnlen, ftnlen);
@@ -235,4 +238,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dgesvd.c hypre-2.13.0/src/lapack/dgesvd.c
--- hypre-2.11.2/src/lapack/dgesvd.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dgesvd.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,6 +1,9 @@
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include "f2c.h"
+#include "hypre_lapack.h"
 
 /*  -- translated by f2c (version 19990503).
    You must link the resulting object file with the libraries:
@@ -17,7 +20,7 @@
 static doublereal c_b416 = 0.;
 static doublereal c_b438 = 1.;
 
-/* Subroutine */ HYPRE_Int dgesvd_(char *jobu, char *jobvt, integer *m, integer *n, 
+/* Subroutine */ integer dgesvd_(char *jobu, char *jobvt, integer *m, integer *n, 
 	doublereal *a, integer *lda, doublereal *s, doublereal *u, integer *
 	ldu, doublereal *vt, integer *ldvt, doublereal *work, integer *lwork, 
 	integer *info)
@@ -29,27 +32,26 @@
     char ch__1[2];
 
     /* Builtin functions   
-       Subroutine */ HYPRE_Int s_cat(char *, char **, integer *, integer *, ftnlen);
-    HYPRE_Real sqrt(doublereal);
+       Subroutine */ integer s_cat(char *, char **, integer *, integer *, ftnlen);
 
     /* Local variables */
     static integer iscl;
     static doublereal anrm;
     static integer ierr, itau, ncvt, nrvt, i__;
-    extern /* Subroutine */ HYPRE_Int dgemm_(const char *,const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dgemm_(const char *,const char *, integer *, integer *, 
 	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
 	    integer *, doublereal *, doublereal *, integer *);
     extern logical lsame_(const char *,const char *);
     static integer chunk, minmn, wrkbl, itaup, itauq, mnthr, iwork;
     static logical wntua, wntva, wntun, wntuo, wntvn, wntvo, wntus, wntvs;
     static integer ie;
-    extern /* Subroutine */ HYPRE_Int dgebrd_(integer *, integer *, doublereal *, 
+    extern /* Subroutine */ integer dgebrd_(integer *, integer *, doublereal *, 
 	    integer *, doublereal *, doublereal *, doublereal *, doublereal *,
 	     doublereal *, integer *, integer *);
     extern doublereal dlamch_(const char *), dlange_(const char *, integer *, 
 	    integer *, doublereal *, integer *, doublereal *);
     static integer ir, bdspac, iu;
-    extern /* Subroutine */ HYPRE_Int dgelqf_(integer *, integer *, doublereal *, 
+    extern /* Subroutine */ integer dgelqf_(integer *, integer *, doublereal *, 
 	    integer *, doublereal *, doublereal *, integer *, integer *), 
 	    dlascl_(const char *, integer *, integer *, doublereal *, doublereal *, 
 	    integer *, integer *, doublereal *, integer *, integer *),
@@ -64,10 +66,10 @@
 	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
 	    integer *);
     static doublereal bignum;
-    extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
     extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, 
 	    integer *, integer *, ftnlen, ftnlen);
-    extern /* Subroutine */ HYPRE_Int dormbr_(const char *,const char *,const char *, integer *, 
+    extern /* Subroutine */ integer dormbr_(const char *,const char *,const char *, integer *, 
 	    integer *, integer *, doublereal *, integer *, doublereal *, 
 	    doublereal *, integer *, doublereal *, integer *, integer *), dorglq_(integer *, integer *, integer *, 
 	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
@@ -4081,4 +4083,6 @@
 #undef u_ref
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dgetf2.c hypre-2.13.0/src/lapack/dgetf2.c
--- hypre-2.11.2/src/lapack/dgetf2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dgetf2.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,9 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dgetf2_(integer *m, integer *n, doublereal *a, integer *
+/* Subroutine */ integer dgetf2_(integer *m, integer *n, doublereal *a, integer *
 	lda, integer *ipiv, integer *info)
 {
 /*  -- LAPACK routine (version 3.0) --   
@@ -69,16 +71,16 @@
     integer a_dim1, a_offset, i__1, i__2, i__3;
     doublereal d__1;
     /* Local variables */
-    extern /* Subroutine */ HYPRE_Int dger_(integer *, integer *, doublereal *, 
+    extern /* Subroutine */ integer dger_(integer *, integer *, doublereal *, 
 	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
 	    integer *);
     static integer j;
-    extern /* Subroutine */ HYPRE_Int dscal_(integer *, doublereal *, doublereal *, 
+    extern /* Subroutine */ integer dscal_(integer *, doublereal *, doublereal *, 
 	    integer *), dswap_(integer *, doublereal *, integer *, doublereal 
 	    *, integer *);
     static integer jp;
     extern integer idamax_(integer *, doublereal *, integer *);
-    extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
 #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1]
 
 
@@ -156,4 +158,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dgetrf.c hypre-2.13.0/src/lapack/dgetrf.c
--- hypre-2.11.2/src/lapack/dgetrf.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dgetrf.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dgetrf_(integer *m, integer *n, doublereal *a, integer *
+/* Subroutine */ integer dgetrf_(integer *m, integer *n, doublereal *a, integer *
 	lda, integer *ipiv, integer *info)
 {
 /*  -- LAPACK routine (version 3.0) --   
@@ -70,20 +73,20 @@
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
     /* Local variables */
     static integer i__, j;
-    extern /* Subroutine */ HYPRE_Int dgemm_(const char *,const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dgemm_(const char *,const char *, integer *, integer *, 
 	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
 	    integer *, doublereal *, doublereal *, integer *);
     static integer iinfo;
-    extern /* Subroutine */ HYPRE_Int dtrsm_(const char *,const char *,const char *,const char *, 
+    extern /* Subroutine */ integer dtrsm_(const char *,const char *,const char *,const char *, 
 	    integer *, integer *, doublereal *, doublereal *, integer *, 
 	    doublereal *, integer *), dgetf2_(
 	    integer *, integer *, doublereal *, integer *, integer *, integer 
 	    *);
     static integer jb, nb;
-    extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
     extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, 
 	    integer *, integer *, ftnlen, ftnlen);
-    extern /* Subroutine */ HYPRE_Int dlaswp_(integer *, doublereal *, integer *, 
+    extern /* Subroutine */ integer dlaswp_(integer *, doublereal *, integer *, 
 	    integer *, integer *, integer *, integer *);
 #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1]
 
@@ -195,4 +198,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dgetrs.c hypre-2.13.0/src/lapack/dgetrs.c
--- hypre-2.11.2/src/lapack/dgetrs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dgetrs.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,9 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dgetrs_(char *trans, integer *n, integer *nrhs, 
+/* Subroutine */ integer dgetrs_(const char *trans, integer *n, integer *nrhs, 
 	doublereal *a, integer *lda, integer *ipiv, doublereal *b, integer *
 	ldb, integer *info)
 {
@@ -74,7 +76,7 @@
     integer a_dim1, a_offset, b_dim1, b_offset, i__1;
     /* Local variables */
     extern logical lsame_(const char *,const char *);
-    extern /* Subroutine */ HYPRE_Int dtrsm_(const char *,const char *,const char *,const char *, 
+    extern /* Subroutine */ integer dtrsm_(const char *,const char *,const char *,const char *, 
 	    integer *, integer *, doublereal *, doublereal *, integer *, 
 	    doublereal *, integer *), xerbla_(
 	    const char *, integer *), dlaswp_(integer *, doublereal *, 
@@ -159,3 +161,6 @@
 
 } /* dgetrs_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlabad.c hypre-2.13.0/src/lapack/dlabad.c
--- hypre-2.11.2/src/lapack/dlabad.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlabad.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dlabad_(doublereal *small, doublereal *large)
+/* Subroutine */ integer dlabad_(doublereal *small, doublereal *large)
 {
 /*  -- LAPACK auxiliary routine (version 3.0) --   
        Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
@@ -41,7 +44,7 @@
        If it looks like we're on a Cray, take the square root of   
        SMALL and LARGE to avoid overflow and underflow problems. */
     /* Builtin functions */
-    HYPRE_Real d_lg10(doublereal *), sqrt(doublereal);
+    doublereal d_lg10(doublereal *);
 
 
     if (d_lg10(large) > 2e3) {
@@ -55,3 +58,6 @@
 
 } /* dlabad_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlabrd.c hypre-2.13.0/src/lapack/dlabrd.c
--- hypre-2.11.2/src/lapack/dlabrd.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlabrd.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dlabrd_(integer *m, integer *n, integer *nb, doublereal *
+/* Subroutine */ integer dlabrd_(integer *m, integer *n, integer *nb, doublereal *
 	a, integer *lda, doublereal *d__, doublereal *e, doublereal *tauq, 
 	doublereal *taup, doublereal *x, integer *ldx, doublereal *y, integer 
 	*ldy)
@@ -149,7 +152,7 @@
 	    i__3;
     /* Local variables */
     static integer i__;
-    extern /* Subroutine */ HYPRE_Int dscal_(integer *, doublereal *, doublereal *, 
+    extern /* Subroutine */ integer dscal_(integer *, doublereal *, doublereal *, 
 	    integer *), dgemv_(const char *, integer *, integer *, doublereal *, 
 	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
 	    doublereal *, integer *), dlarfg_(integer *, doublereal *,
@@ -410,4 +413,6 @@
 #undef x_ref
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlacpy.c hypre-2.13.0/src/lapack/dlacpy.c
--- hypre-2.11.2/src/lapack/dlacpy.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlacpy.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dlacpy_(const char *uplo, integer *m, integer *n, doublereal *
+/* Subroutine */ integer dlacpy_(const char *uplo, integer *m, integer *n, doublereal *
 	a, integer *lda, doublereal *b, integer *ldb)
 {
 /*  -- LAPACK auxiliary routine (version 3.0) --   
@@ -106,4 +109,6 @@
 #undef b_ref
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlae2.c hypre-2.13.0/src/lapack/dlae2.c
--- hypre-2.11.2/src/lapack/dlae2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlae2.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dlae2_(doublereal *a, doublereal *b, doublereal *c__, 
+/* Subroutine */ integer dlae2_(doublereal *a, doublereal *b, doublereal *c__, 
 	doublereal *rt1, doublereal *rt2)
 {
 /*  -- LAPACK auxiliary routine (version 3.0) --   
@@ -58,8 +61,6 @@
        Compute the eigenvalues */
     /* System generated locals */
     doublereal d__1;
-    /* Builtin functions */
-    HYPRE_Real sqrt(doublereal);
     /* Local variables */
     static doublereal acmn, acmx, ab, df, tb, sm, rt, adf;
 
@@ -119,3 +120,6 @@
 
 } /* dlae2_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlaev2.c hypre-2.13.0/src/lapack/dlaev2.c
--- hypre-2.11.2/src/lapack/dlaev2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlaev2.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dlaev2_(doublereal *a, doublereal *b, doublereal *c__, 
+/* Subroutine */ integer dlaev2_(doublereal *a, doublereal *b, doublereal *c__, 
 	doublereal *rt1, doublereal *rt2, doublereal *cs1, doublereal *sn1)
 {
 /*  -- LAPACK auxiliary routine (version 3.0) --   
@@ -69,8 +72,6 @@
        Compute the eigenvalues */
     /* System generated locals */
     doublereal d__1;
-    /* Builtin functions */
-    HYPRE_Real sqrt(doublereal);
     /* Local variables */
     static doublereal acmn, acmx, ab, df, cs, ct, tb, sm, tn, rt, adf, acs;
     static integer sgn1, sgn2;
@@ -164,3 +165,6 @@
 
 } /* dlaev2_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlamch.c hypre-2.13.0/src/lapack/dlamch.c
--- hypre-2.11.2/src/lapack/dlamch.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlamch.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,7 +1,9 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include <stdio.h>
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
 doublereal dlamch_(const char *cmach)
 {
@@ -14,7 +16,7 @@
     Purpose   
     =======   
 
-    DLAMCH determines HYPRE_Real precision machine parameters.   
+    DLAMCH determines doublereal precision machine parameters.   
 
     Arguments   
     =========   
@@ -54,7 +56,7 @@
     integer i__1;
     doublereal ret_val;
     /* Builtin functions */
-    HYPRE_Real pow_di(doublereal *, integer *);
+    doublereal pow_di(doublereal *, integer *);
     /* Local variables */
     static doublereal base;
     static integer beta;
@@ -64,7 +66,7 @@
     static doublereal rmin, rmax, t, rmach;
     extern logical lsame_(const char *,const char *);
     static doublereal small, sfmin;
-    extern /* Subroutine */ HYPRE_Int dlamc2_(integer *, integer *, logical *, 
+    extern /* Subroutine */ integer dlamc2_(integer *, integer *, logical *, 
 	    doublereal *, integer *, doublereal *, integer *, doublereal *);
     static integer it;
     static doublereal rnd, eps;
@@ -129,9 +131,7 @@
 
 } /* dlamch_ */
 
-#include "f2c.h"
-
-/* Subroutine */ HYPRE_Int dlamc1_(integer *beta, integer *t, logical *rnd, logical 
+/* Subroutine */ integer dlamc1_(integer *beta, integer *t, logical *rnd, logical 
 	*ieee1)
 {
 /*  -- LAPACK auxiliary routine (version 3.0) --   
@@ -252,7 +252,7 @@
 /* +       END WHILE   
 
           Now compute the base.  a and c  are neighbouring floating po
-HYPRE_Int   
+integer   
           numbers  in the  interval  ( beta**t, beta**( t + 1 ) )  and
  so   
           their difference is beta. Adding 0.25 to c is to ensure that
@@ -342,9 +342,7 @@
 
 } /* dlamc1_ */
 
-#include "f2c.h"
-
-/* Subroutine */ HYPRE_Int dlamc2_(integer *beta, integer *t, logical *rnd, 
+/* Subroutine */ integer dlamc2_(integer *beta, integer *t, logical *rnd, 
 	doublereal *eps, integer *emin, doublereal *rmin, integer *emax, 
 	doublereal *rmax)
 {
@@ -420,7 +418,7 @@
     integer i__1;
     doublereal d__1, d__2, d__3, d__4, d__5;
     /* Builtin functions */
-    HYPRE_Real pow_di(doublereal *, integer *);
+    doublereal pow_di(doublereal *, integer *);
     /* Local variables */
     static logical ieee;
     static doublereal half;
@@ -432,11 +430,11 @@
     static doublereal small;
     static integer gpmin;
     static doublereal third, lrmin, lrmax, sixth;
-    extern /* Subroutine */ HYPRE_Int dlamc1_(integer *, integer *, logical *, 
+    extern /* Subroutine */ integer dlamc1_(integer *, integer *, logical *, 
 	    logical *);
     extern doublereal dlamc3_(doublereal *, doublereal *);
     static logical lieee1;
-    extern /* Subroutine */ HYPRE_Int dlamc4_(integer *, doublereal *, integer *), 
+    extern /* Subroutine */ integer dlamc4_(integer *, doublereal *, integer *), 
 	    dlamc5_(integer *, integer *, integer *, logical *, integer *, 
 	    doublereal *);
     static integer lt, ngnmin, ngpmin;
@@ -594,7 +592,7 @@
 	if (iwarn) {
 	    first = TRUE_;
 	    hypre_printf("\n\n WARNING. The value EMIN may be incorrect:- ");
-	    hypre_printf("EMIN = %8i\n",(HYPRE_Int)lemin);
+	    hypre_printf("EMIN = %8i\n",(integer)lemin);
 	    hypre_printf("If, after inspection, the value EMIN looks acceptable");
             hypre_printf("please comment out \n the IF block as marked within the"); 
             hypre_printf("code of routine DLAMC2, \n otherwise supply EMIN"); 
@@ -647,8 +645,6 @@
 
 } /* dlamc2_ */
 
-#include "f2c.h"
-
 doublereal dlamc3_(doublereal *a, doublereal *b)
 {
 /*  -- LAPACK auxiliary routine (version 3.0) --   
@@ -688,9 +684,7 @@
 
 } /* dlamc3_ */
 
-#include "f2c.h"
-
-/* Subroutine */ HYPRE_Int dlamc4_(integer *emin, doublereal *start, integer *base)
+/* Subroutine */ integer dlamc4_(integer *emin, doublereal *start, integer *base)
 {
 /*  -- LAPACK auxiliary routine (version 2.0) --   
        Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
@@ -779,9 +773,7 @@
 
 } /* dlamc4_ */
 
-#include "f2c.h"
-
-/* Subroutine */ HYPRE_Int dlamc5_(integer *beta, integer *p, integer *emin, 
+/* Subroutine */ integer dlamc5_(integer *beta, integer *p, integer *emin, 
 	logical *ieee, integer *emax, doublereal *rmax)
 {
 /*  -- LAPACK auxiliary routine (version 3.0) --   
@@ -954,3 +946,6 @@
 
 } /* dlamc5_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlange.c hypre-2.13.0/src/lapack/dlange.c
--- hypre-2.11.2/src/lapack/dlange.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlange.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,6 +1,9 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
 doublereal dlange_(const char *norm, integer *m, integer *n, doublereal *a, integer 
 	*lda, doublereal *work)
@@ -72,13 +75,13 @@
     integer a_dim1, a_offset, i__1, i__2;
     doublereal ret_val, d__1, d__2, d__3;
     /* Builtin functions */
-    /*HYPRE_Real sqrt(doublereal);*/
+    /*doublereal sqrt(doublereal);*/
     /* Local variables */
     static integer i__, j;
     static doublereal scale;
     extern logical lsame_(const char *,const char *);
     static doublereal value;
-    extern /* Subroutine */ HYPRE_Int dlassq_(integer *, doublereal *, integer *, 
+    extern /* Subroutine */ integer dlassq_(integer *, doublereal *, integer *, 
 	    doublereal *, doublereal *);
     static doublereal sum;
 #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1]
@@ -174,4 +177,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlanst.c hypre-2.13.0/src/lapack/dlanst.c
--- hypre-2.11.2/src/lapack/dlanst.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlanst.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,6 +1,9 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
 doublereal dlanst_(const char *norm, integer *n, doublereal *d__, doublereal *e)
 {
@@ -62,14 +65,12 @@
     /* System generated locals */
     integer i__1;
     doublereal ret_val, d__1, d__2, d__3, d__4, d__5;
-    /* Builtin functions */
-    HYPRE_Real sqrt(doublereal);
     /* Local variables */
     static integer i__;
     static doublereal scale;
     extern logical lsame_(const char *,const char *);
     static doublereal anorm;
-    extern /* Subroutine */ HYPRE_Int dlassq_(integer *, doublereal *, integer *, 
+    extern /* Subroutine */ integer dlassq_(integer *, doublereal *, integer *, 
 	    doublereal *, doublereal *);
     static doublereal sum;
 
@@ -138,3 +139,6 @@
 
 } /* dlanst_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlansy.c hypre-2.13.0/src/lapack/dlansy.c
--- hypre-2.11.2/src/lapack/dlansy.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlansy.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,7 +1,9 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
-#include "math.h"
+#include "hypre_lapack.h"
 
 doublereal dlansy_(const char *norm,const char *uplo, integer *n, doublereal *a, integer 
 	*lda, doublereal *work)
@@ -81,14 +83,14 @@
     integer a_dim1, a_offset, i__1, i__2;
     doublereal ret_val, d__1, d__2, d__3;
     /* Builtin functions */
-    /*HYPRE_Real sqrt(doublereal);*/
+    /*doublereal sqrt(doublereal);*/
     /* Local variables */
     static doublereal absa;
     static integer i__, j;
     static doublereal scale;
     extern logical lsame_(const char *,const char *);
     static doublereal value;
-    extern /* Subroutine */ HYPRE_Int dlassq_(integer *, doublereal *, integer *, 
+    extern /* Subroutine */ integer dlassq_(integer *, doublereal *, integer *, 
 	    doublereal *, doublereal *);
     static doublereal sum;
 #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1]
@@ -214,4 +216,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlapy2.c hypre-2.13.0/src/lapack/dlapy2.c
--- hypre-2.11.2/src/lapack/dlapy2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlapy2.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,7 +1,9 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
-#include "math.h"
+#include "hypre_lapack.h"
 
 doublereal dlapy2_(doublereal *x, doublereal *y)
 {
@@ -28,7 +30,7 @@
     /* System generated locals */
     doublereal ret_val, d__1;
     /* Builtin functions */
-    /*HYPRE_Real sqrt(doublereal);*/
+    /*doublereal sqrt(doublereal);*/
     /* Local variables */
     static doublereal xabs, yabs, w, z__;
 
@@ -51,3 +53,6 @@
 
 } /* dlapy2_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlarfb.c hypre-2.13.0/src/lapack/dlarfb.c
--- hypre-2.11.2/src/lapack/dlarfb.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlarfb.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,9 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dlarfb_(const char *side,const char *trans,const char *direct,const char *
+/* Subroutine */ integer dlarfb_(const char *side,const char *trans,const char *direct,const char *
 	storev, integer *m, integer *n, integer *k, doublereal *v, integer *
 	ldv, doublereal *t, integer *ldt, doublereal *c__, integer *ldc, 
 	doublereal *work, integer *ldwork)
@@ -102,11 +104,11 @@
 	    work_offset, i__1, i__2;
     /* Local variables */
     static integer i__, j;
-    extern /* Subroutine */ HYPRE_Int dgemm_(const char *,const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dgemm_(const char *,const char *, integer *, integer *, 
 	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
 	    integer *, doublereal *, doublereal *, integer *);
     extern logical lsame_(const char *,const char *);
-    extern /* Subroutine */ HYPRE_Int dcopy_(integer *, doublereal *, integer *, 
+    extern /* Subroutine */ integer dcopy_(integer *, doublereal *, integer *, 
 	    doublereal *, integer *), dtrmm_(const char *,const char *,const char *,const char *, 
 	    integer *, integer *, doublereal *, doublereal *, integer *, 
 	    doublereal *, integer *);
@@ -704,4 +706,6 @@
 #undef c___ref
 #undef work_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlarf.c hypre-2.13.0/src/lapack/dlarf.c
--- hypre-2.11.2/src/lapack/dlarf.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlarf.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dlarf_(const char *side, integer *m, integer *n, doublereal *v,
+/* Subroutine */ integer dlarf_(const char *side, integer *m, integer *n, doublereal *v,
 	 integer *incv, doublereal *tau, doublereal *c__, integer *ldc, 
 	doublereal *work)
 {
@@ -74,11 +77,11 @@
     integer c_dim1, c_offset;
     doublereal d__1;
     /* Local variables */
-    extern /* Subroutine */ HYPRE_Int dger_(integer *, integer *, doublereal *, 
+    extern /* Subroutine */ integer dger_(integer *, integer *, doublereal *, 
 	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
 	    integer *);
     extern logical lsame_(const char *,const char *);
-    extern /* Subroutine */ HYPRE_Int dgemv_(const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dgemv_(const char *, integer *, integer *, 
 	    doublereal *, doublereal *, integer *, doublereal *, integer *, 
 	    doublereal *, doublereal *, integer *);
 
@@ -131,3 +134,6 @@
 
 } /* dlarf_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlarfg.c hypre-2.13.0/src/lapack/dlarfg.c
--- hypre-2.11.2/src/lapack/dlarfg.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlarfg.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dlarfg_(integer *n, doublereal *alpha, doublereal *x, 
+/* Subroutine */ integer dlarfg_(integer *n, doublereal *alpha, doublereal *x, 
 	integer *incx, doublereal *tau)
 {
 /*  -- LAPACK auxiliary routine (version 3.0) --   
@@ -63,12 +66,12 @@
     integer i__1;
     doublereal d__1;
     /* Builtin functions */
-    HYPRE_Real d_sign(doublereal *, doublereal *);
+    doublereal d_sign(doublereal *, doublereal *);
     /* Local variables */
     static doublereal beta;
     extern doublereal dnrm2_(integer *, doublereal *, integer *);
     static integer j;
-    extern /* Subroutine */ HYPRE_Int dscal_(integer *, doublereal *, doublereal *, 
+    extern /* Subroutine */ integer dscal_(integer *, doublereal *, doublereal *, 
 	    integer *);
     static doublereal xnorm;
     extern doublereal dlapy2_(doublereal *, doublereal *), dlamch_(const char *);
@@ -148,3 +151,6 @@
 
 } /* dlarfg_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlarft.c hypre-2.13.0/src/lapack/dlarft.c
--- hypre-2.11.2/src/lapack/dlarft.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlarft.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,9 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dlarft_(const char *direct,const char *storev, integer *n, integer *
+/* Subroutine */ integer dlarft_(const char *direct,const char *storev, integer *n, integer *
 	k, doublereal *v, integer *ldv, doublereal *tau, doublereal *t, 
 	integer *ldt)
 {
@@ -117,7 +119,7 @@
     /* Local variables */
     static integer i__, j;
     extern logical lsame_(const char *,const char *);
-    extern /* Subroutine */ HYPRE_Int dgemv_(const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dgemv_(const char *, integer *, integer *, 
 	    doublereal *, doublereal *, integer *, doublereal *, integer *, 
 	    doublereal *, doublereal *, integer *), dtrmv_(const char *, 
 	    const char *,const char *, integer *, doublereal *, integer *, doublereal *, 
@@ -257,4 +259,6 @@
 #undef v_ref
 #undef t_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlartg.c hypre-2.13.0/src/lapack/dlartg.c
--- hypre-2.11.2/src/lapack/dlartg.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlartg.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
-#include "math.h"
-/* Subroutine */ HYPRE_Int dlartg_(doublereal *f, doublereal *g, doublereal *cs, 
+#include "hypre_lapack.h"
+
+/* Subroutine */ integer dlartg_(doublereal *f, doublereal *g, doublereal *cs, 
 	doublereal *sn, doublereal *r__)
 {
 /*  -- LAPACK auxiliary routine (version 3.0) --   
@@ -54,24 +57,26 @@
     integer i__1;
     doublereal d__1, d__2;
     /* Builtin functions */
-    //HYPRE_Real log(doublereal), pow_di(doublereal *, integer *), sqrt(doublereal);
+//    doublereal log(doublereal), pow_di(doublereal *, integer *), sqrt(doublereal);
+    doublereal pow_di(doublereal *, integer *);
     /* Local variables */
     static integer i__;
     static doublereal scale;
     static integer count;
     static doublereal f1, g1, safmn2, safmx2;
     extern doublereal dlamch_(const char *);
-    static doublereal safmin, eps;
+//    static doublereal safmin, eps;
 
 
 
     if (first) {
 	first = FALSE_;
-	safmin = dlamch_("S");
-	eps = dlamch_("E");
+//	safmin = dlamch_("S");
+//	eps = dlamch_("E");
 	d__1 = dlamch_("B");
-	i__1 = (integer) (log(safmin / eps) / log(dlamch_("B")) / 
-		2.);
+//	i__1 = (integer) (log(safmin / eps) / log(dlamch_("B")) / 
+//		2.);
+        i__1 = HYPRE_REAL_MIN_EXP>>1;
 	safmn2 = pow_di(&d__1, &i__1);
 	safmx2 = 1. / safmn2;
     }
@@ -158,3 +163,6 @@
 
 } /* dlartg_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlas2.c hypre-2.13.0/src/lapack/dlas2.c
--- hypre-2.11.2/src/lapack/dlas2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlas2.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dlas2_(doublereal *f, doublereal *g, doublereal *h__, 
+/* Subroutine */ integer dlas2_(doublereal *f, doublereal *g, doublereal *h__, 
 	doublereal *ssmin, doublereal *ssmax)
 {
 /*  -- LAPACK auxiliary routine (version 3.0) --   
@@ -60,8 +63,6 @@
     ==================================================================== */
     /* System generated locals */
     doublereal d__1, d__2;
-    /* Builtin functions */
-    HYPRE_Real sqrt(doublereal);
     /* Local variables */
     static doublereal fhmn, fhmx, c__, fa, ga, ha, as, at, au;
 
@@ -121,3 +122,6 @@
 
 } /* dlas2_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlascl.c hypre-2.13.0/src/lapack/dlascl.c
--- hypre-2.11.2/src/lapack/dlascl.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlascl.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dlascl_(const char *type__, integer *kl, integer *ku, 
+/* Subroutine */ integer dlascl_(const char *type__, integer *kl, integer *ku, 
 	doublereal *cfrom, doublereal *cto, integer *m, integer *n, 
 	doublereal *a, integer *lda, integer *info)
 {
@@ -88,7 +91,7 @@
     static doublereal cfrom1;
     extern doublereal dlamch_(const char *);
     static doublereal cfromc;
-    extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
     static doublereal bignum, smlnum, mul, cto1;
 #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1]
 
@@ -313,4 +316,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlaset.c hypre-2.13.0/src/lapack/dlaset.c
--- hypre-2.11.2/src/lapack/dlaset.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlaset.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dlaset_(const char *uplo, integer *m, integer *n, doublereal *
+/* Subroutine */ integer dlaset_(const char *uplo, integer *m, integer *n, doublereal *
 	alpha, doublereal *beta, doublereal *a, integer *lda)
 {
 /*  -- LAPACK auxiliary routine (version 3.0) --   
@@ -131,4 +134,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlasq1.c hypre-2.13.0/src/lapack/dlasq1.c
--- hypre-2.11.2/src/lapack/dlasq1.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlasq1.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,6 +1,9 @@
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include "f2c.h"
+#include "hypre_lapack.h"
 
 /*  -- translated by f2c (version 19990503).
    You must link the resulting object file with the libraries:
@@ -13,33 +16,30 @@
 static integer c__2 = 2;
 static integer c__0 = 0;
 
-/* Subroutine */ HYPRE_Int dlasq1_(integer *n, doublereal *d__, doublereal *e, 
+/* Subroutine */ integer dlasq1_(integer *n, doublereal *d__, doublereal *e, 
 	doublereal *work, integer *info)
 {
     /* System generated locals */
     integer i__1, i__2;
     doublereal d__1, d__2, d__3;
 
-    /* Builtin functions */
-    HYPRE_Real sqrt(doublereal);
-
     /* Local variables */
-    extern /* Subroutine */ HYPRE_Int dlas2_(doublereal *, doublereal *, doublereal 
+    extern /* Subroutine */ integer dlas2_(doublereal *, doublereal *, doublereal 
 	    *, doublereal *, doublereal *);
     static integer i__;
     static doublereal scale;
     static integer iinfo;
     static doublereal sigmn;
-    extern /* Subroutine */ HYPRE_Int dcopy_(integer *, doublereal *, integer *, 
+    extern /* Subroutine */ integer dcopy_(integer *, doublereal *, integer *, 
 	    doublereal *, integer *);
     static doublereal sigmx;
-    extern /* Subroutine */ HYPRE_Int dlasq2_(integer *, doublereal *, integer *);
+    extern /* Subroutine */ integer dlasq2_(integer *, doublereal *, integer *);
     extern doublereal dlamch_(const char *);
-    extern /* Subroutine */ HYPRE_Int dlascl_(const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dlascl_(const char *, integer *, integer *, 
 	    doublereal *, doublereal *, integer *, integer *, doublereal *, 
 	    integer *, integer *);
     static doublereal safmin;
-    extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *), dlasrt_(
+    extern /* Subroutine */ integer xerbla_(const char *, integer *), dlasrt_(
 	    const char *, integer *, doublereal *, integer *);
     static doublereal eps;
 
@@ -192,3 +192,6 @@
 
 } /* dlasq1_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlasq2.c hypre-2.13.0/src/lapack/dlasq2.c
--- hypre-2.11.2/src/lapack/dlasq2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlasq2.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,12 +1,15 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
+#include "f2c.h"
 #include "hypre_lapack.h"
+
 /*  -- translated by f2c (version 19990503).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 
-#include "f2c.h"
-
 /* Table of constant values */
 
 static integer c__1 = 1;
@@ -16,15 +19,12 @@
 static integer c__4 = 4;
 static integer c__11 = 11;
 
-/* Subroutine */ HYPRE_Int dlasq2_(integer *n, doublereal *z__, integer *info)
+/* Subroutine */ integer dlasq2_(integer *n, doublereal *z__, integer *info)
 {
     /* System generated locals */
     integer i__1, i__2, i__3;
     doublereal d__1, d__2;
 
-    /* Builtin functions */
-    HYPRE_Real sqrt(doublereal);
-
     /* Local variables */
     static logical ieee;
     static integer nbig;
@@ -38,16 +38,16 @@
     static integer nfail;
     static doublereal desig, trace, sigma;
     static integer iinfo, i0, i4, n0;
-    extern /* Subroutine */ HYPRE_Int dlasq3_(integer *, integer *, doublereal *, 
+    extern /* Subroutine */ integer dlasq3_(integer *, integer *, doublereal *, 
 	    integer *, doublereal *, doublereal *, doublereal *, doublereal *,
 	     integer *, integer *, integer *, logical *);
     extern doublereal dlamch_(const char *);
     static integer pp, iwhila, iwhilb;
     static doublereal oldemn, safmin;
-    extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
     extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, 
 	    integer *, integer *, ftnlen, ftnlen);
-    extern /* Subroutine */ HYPRE_Int dlasrt_(const char *, integer *, doublereal *, 
+    extern /* Subroutine */ integer dlasrt_(const char *, integer *, doublereal *, 
 	    integer *);
     static doublereal eps, tol;
     static integer ipn4;
@@ -522,3 +522,6 @@
 
 } /* dlasq2_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlasq3.c hypre-2.13.0/src/lapack/dlasq3.c
--- hypre-2.11.2/src/lapack/dlasq3.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlasq3.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dlasq3_(integer *i0, integer *n0, doublereal *z__, 
+/* Subroutine */ integer dlasq3_(integer *i0, integer *n0, doublereal *z__, 
 	integer *pp, doublereal *dmin__, doublereal *sigma, doublereal *desig,
 	 doublereal *qmax, integer *nfail, integer *iter, integer *ndiv, 
 	logical *ieee)
@@ -76,12 +79,10 @@
     /* System generated locals */
     integer i__1;
     doublereal d__1, d__2;
-    /* Builtin functions */
-    HYPRE_Real sqrt(doublereal);
     /* Local variables */
     static doublereal temp, s, t;
     static integer j4;
-    extern /* Subroutine */ HYPRE_Int dlasq4_(integer *, integer *, doublereal *, 
+    extern /* Subroutine */ integer dlasq4_(integer *, integer *, doublereal *, 
 	    integer *, integer *, doublereal *, doublereal *, doublereal *, 
 	    doublereal *, doublereal *, doublereal *, doublereal *, integer *)
 	    , dlasq5_(integer *, integer *, doublereal *, integer *, 
@@ -316,3 +317,6 @@
 
 } /* dlasq3_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlasq4.c hypre-2.13.0/src/lapack/dlasq4.c
--- hypre-2.11.2/src/lapack/dlasq4.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlasq4.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,13 +1,16 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
+#include "f2c.h"
 #include "hypre_lapack.h"
+
 /*  -- translated by f2c (version 19990503).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 
-#include "f2c.h"
-
-/* Subroutine */ HYPRE_Int dlasq4_(integer *i0, integer *n0, doublereal *z__, 
+/* Subroutine */ integer dlasq4_(integer *i0, integer *n0, doublereal *z__, 
 	integer *pp, integer *n0in, doublereal *dmin__, doublereal *dmin1, 
 	doublereal *dmin2, doublereal *dn, doublereal *dn1, doublereal *dn2, 
 	doublereal *tau, integer *ttype)
@@ -20,9 +23,6 @@
     integer i__1;
     doublereal d__1, d__2;
 
-    /* Builtin functions */
-    HYPRE_Real sqrt(doublereal);
-
     /* Local variables */
     static doublereal s, a2, b1, b2;
     static integer i4, nn, np;
@@ -380,3 +380,6 @@
 
 } /* dlasq4_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlasq5.c hypre-2.13.0/src/lapack/dlasq5.c
--- hypre-2.11.2/src/lapack/dlasq5.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlasq5.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dlasq5_(integer *i0, integer *n0, doublereal *z__, 
+/* Subroutine */ integer dlasq5_(integer *i0, integer *n0, doublereal *z__, 
 	integer *pp, doublereal *tau, doublereal *dmin__, doublereal *dmin1, 
 	doublereal *dmin2, doublereal *dn, doublereal *dnm1, doublereal *dnm2,
 	 logical *ieee)
@@ -209,3 +212,6 @@
 
 } /* dlasq5_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlasq6.c hypre-2.13.0/src/lapack/dlasq6.c
--- hypre-2.11.2/src/lapack/dlasq6.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlasq6.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,13 +1,16 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
+#include "f2c.h"
 #include "hypre_lapack.h"
+
 /*  -- translated by f2c (version 19990503).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 
-#include "f2c.h"
-
-/* Subroutine */ HYPRE_Int dlasq6_(integer *i0, integer *n0, doublereal *z__, 
+/* Subroutine */ integer dlasq6_(integer *i0, integer *n0, doublereal *z__, 
 	integer *pp, doublereal *dmin__, doublereal *dmin1, doublereal *dmin2,
 	 doublereal *dn, doublereal *dnm1, doublereal *dnm2)
 {
@@ -187,3 +190,6 @@
 
 } /* dlasq6_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlasr.c hypre-2.13.0/src/lapack/dlasr.c
--- hypre-2.11.2/src/lapack/dlasr.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlasr.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dlasr_(const char *side,const char *pivot,const char *direct, integer *m,
+/* Subroutine */ integer dlasr_(const char *side,const char *pivot,const char *direct, integer *m,
 	 integer *n, doublereal *c__, doublereal *s, doublereal *a, integer *
 	lda)
 {
@@ -114,7 +117,7 @@
     static integer i__, j;
     extern logical lsame_(const char *,const char *);
     static doublereal ctemp, stemp;
-    extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
 #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1]
 
     --c__;
@@ -392,4 +395,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlasrt.c hypre-2.13.0/src/lapack/dlasrt.c
--- hypre-2.11.2/src/lapack/dlasrt.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlasrt.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dlasrt_(const char *id, integer *n, doublereal *d__, integer *
+/* Subroutine */ integer dlasrt_(const char *id, integer *n, doublereal *d__, integer *
 	info)
 {
 /*  -- LAPACK routine (version 3.0) --   
@@ -54,7 +57,7 @@
     static integer stack[64]	/* was [2][32] */;
     static doublereal dmnmx, d1, d2, d3;
     static integer start;
-    extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
     static integer stkpnt, dir;
     static doublereal tmp;
 #define stack_ref(a_1,a_2) stack[(a_2)*2 + a_1 - 3]
@@ -258,4 +261,6 @@
 
 #undef stack_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlassq.c hypre-2.13.0/src/lapack/dlassq.c
--- hypre-2.11.2/src/lapack/dlassq.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlassq.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dlassq_(integer *n, doublereal *x, integer *incx, 
+/* Subroutine */ integer dlassq_(integer *n, doublereal *x, integer *incx, 
 	doublereal *scale, doublereal *sumsq)
 {
 /*  -- LAPACK auxiliary routine (version 3.0) --   
@@ -92,3 +95,6 @@
 
 } /* dlassq_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlasv2.c hypre-2.13.0/src/lapack/dlasv2.c
--- hypre-2.11.2/src/lapack/dlasv2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlasv2.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dlasv2_(doublereal *f, doublereal *g, doublereal *h__, 
+/* Subroutine */ integer dlasv2_(doublereal *f, doublereal *g, doublereal *h__, 
 	doublereal *ssmin, doublereal *ssmax, doublereal *snr, doublereal *
 	csr, doublereal *snl, doublereal *csl)
 {
@@ -83,7 +86,7 @@
     /* System generated locals */
     doublereal d__1;
     /* Builtin functions */
-    HYPRE_Real sqrt(doublereal), d_sign(doublereal *, doublereal *);
+    doublereal d_sign(doublereal *, doublereal *);
     /* Local variables */
     static integer pmax;
     static doublereal temp;
@@ -249,3 +252,6 @@
 
 } /* dlasv2_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlaswp.c hypre-2.13.0/src/lapack/dlaswp.c
--- hypre-2.11.2/src/lapack/dlaswp.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlaswp.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dlaswp_(integer *n, doublereal *a, integer *lda, integer 
+/* Subroutine */ integer dlaswp_(integer *n, doublereal *a, integer *lda, integer 
 	*k1, integer *k2, integer *ipiv, integer *incx)
 {
 /*  -- LAPACK auxiliary routine (version 3.0) --   
@@ -141,4 +144,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dlatrd.c hypre-2.13.0/src/lapack/dlatrd.c
--- hypre-2.11.2/src/lapack/dlatrd.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dlatrd.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dlatrd_(const char *uplo, integer *n, integer *nb, doublereal *
+/* Subroutine */ integer dlatrd_(const char *uplo, integer *n, integer *nb, doublereal *
 	a, integer *lda, doublereal *e, doublereal *tau, doublereal *w, 
 	integer *ldw)
 {
@@ -152,10 +155,10 @@
 	    integer *);
     static integer i__;
     static doublereal alpha;
-    extern /* Subroutine */ HYPRE_Int dscal_(integer *, doublereal *, doublereal *, 
+    extern /* Subroutine */ integer dscal_(integer *, doublereal *, doublereal *, 
 	    integer *);
     extern logical lsame_(const char *,const char *);
-    extern /* Subroutine */ HYPRE_Int dgemv_(const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dgemv_(const char *, integer *, integer *, 
 	    doublereal *, doublereal *, integer *, doublereal *, integer *, 
 	    doublereal *, doublereal *, integer *), daxpy_(integer *, 
 	    doublereal *, doublereal *, integer *, doublereal *, integer *), 
@@ -330,4 +333,6 @@
 #undef w_ref
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dorg2l.c hypre-2.13.0/src/lapack/dorg2l.c
--- hypre-2.11.2/src/lapack/dorg2l.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dorg2l.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dorg2l_(integer *m, integer *n, integer *k, doublereal *
+/* Subroutine */ integer dorg2l_(integer *m, integer *n, integer *k, doublereal *
 	a, integer *lda, doublereal *tau, doublereal *work, integer *info)
 {
 /*  -- LAPACK routine (version 3.0) --   
@@ -69,11 +72,11 @@
     doublereal d__1;
     /* Local variables */
     static integer i__, j, l;
-    extern /* Subroutine */ HYPRE_Int dscal_(integer *, doublereal *, doublereal *, 
+    extern /* Subroutine */ integer dscal_(integer *, doublereal *, doublereal *, 
 	    integer *), dlarf_(const char *, integer *, integer *, doublereal *, 
 	    integer *, doublereal *, doublereal *, integer *, doublereal *);
     static integer ii;
-    extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
 #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1]
 
 
@@ -152,4 +155,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dorg2r.c hypre-2.13.0/src/lapack/dorg2r.c
--- hypre-2.11.2/src/lapack/dorg2r.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dorg2r.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dorg2r_(integer *m, integer *n, integer *k, doublereal *
+/* Subroutine */ integer dorg2r_(integer *m, integer *n, integer *k, doublereal *
 	a, integer *lda, doublereal *tau, doublereal *work, integer *info)
 {
 /*  -- LAPACK routine (version 3.0) --   
@@ -69,7 +72,7 @@
     doublereal d__1;
     /* Local variables */
     static integer i__, j, l;
-    extern /* Subroutine */ HYPRE_Int dscal_(integer *, doublereal *, doublereal *, 
+    extern /* Subroutine */ integer dscal_(integer *, doublereal *, doublereal *, 
 	    integer *), dlarf_(const char *, integer *, integer *, doublereal *, 
 	    integer *, doublereal *, doublereal *, integer *, doublereal *), xerbla_(const char *, integer *);
 #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1]
@@ -152,4 +155,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dorgbr.c hypre-2.13.0/src/lapack/dorgbr.c
--- hypre-2.11.2/src/lapack/dorgbr.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dorgbr.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dorgbr_(const char *vect, integer *m, integer *n, integer *k, 
+/* Subroutine */ integer dorgbr_(const char *vect, integer *m, integer *n, integer *k, 
 	doublereal *a, integer *lda, doublereal *tau, doublereal *work, 
 	integer *lwork, integer *info)
 {
@@ -110,10 +113,10 @@
     static integer iinfo;
     static logical wantq;
     static integer nb, mn;
-    extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
     extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, 
 	    integer *, integer *, ftnlen, ftnlen);
-    extern /* Subroutine */ HYPRE_Int dorglq_(integer *, integer *, integer *, 
+    extern /* Subroutine */ integer dorglq_(integer *, integer *, integer *, 
 	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
 	    integer *), dorgqr_(integer *, integer *, integer *, doublereal *,
 	     integer *, doublereal *, doublereal *, integer *, integer *);
@@ -277,4 +280,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dorgl2.c hypre-2.13.0/src/lapack/dorgl2.c
--- hypre-2.11.2/src/lapack/dorgl2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dorgl2.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dorgl2_(integer *m, integer *n, integer *k, doublereal *
+/* Subroutine */ integer dorgl2_(integer *m, integer *n, integer *k, doublereal *
 	a, integer *lda, doublereal *tau, doublereal *work, integer *info)
 {
 /*  -- LAPACK routine (version 3.0) --   
@@ -65,7 +68,7 @@
     doublereal d__1;
     /* Local variables */
     static integer i__, j, l;
-    extern /* Subroutine */ HYPRE_Int dscal_(integer *, doublereal *, doublereal *, 
+    extern /* Subroutine */ integer dscal_(integer *, doublereal *, doublereal *, 
 	    integer *), dlarf_(const char *, integer *, integer *, doublereal *, 
 	    integer *, doublereal *, doublereal *, integer *, doublereal *), xerbla_(const char *, integer *);
 #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1]
@@ -152,4 +155,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dorglq.c hypre-2.13.0/src/lapack/dorglq.c
--- hypre-2.11.2/src/lapack/dorglq.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dorglq.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dorglq_(integer *m, integer *n, integer *k, doublereal *
+/* Subroutine */ integer dorglq_(integer *m, integer *n, integer *k, doublereal *
 	a, integer *lda, doublereal *tau, doublereal *work, integer *lwork, 
 	integer *info)
 {
@@ -82,15 +85,15 @@
     integer a_dim1, a_offset, i__1, i__2, i__3;
     /* Local variables */
     static integer i__, j, l, nbmin, iinfo;
-    extern /* Subroutine */ HYPRE_Int dorgl2_(integer *, integer *, integer *, 
+    extern /* Subroutine */ integer dorgl2_(integer *, integer *, integer *, 
 	    doublereal *, integer *, doublereal *, doublereal *, integer *);
     static integer ib, nb, ki, kk;
-    extern /* Subroutine */ HYPRE_Int dlarfb_(const char *,const char *,const char *,const char *, 
+    extern /* Subroutine */ integer dlarfb_(const char *,const char *,const char *,const char *, 
 	    integer *, integer *, integer *, doublereal *, integer *, 
 	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
 	    integer *);
     static integer nx;
-    extern /* Subroutine */ HYPRE_Int dlarft_(const char *,const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dlarft_(const char *,const char *, integer *, integer *, 
 	    doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(const char *, integer *);
     extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, 
 	    integer *, integer *, ftnlen, ftnlen);
@@ -261,4 +264,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dorgql.c hypre-2.13.0/src/lapack/dorgql.c
--- hypre-2.11.2/src/lapack/dorgql.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dorgql.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dorgql_(integer *m, integer *n, integer *k, doublereal *
+/* Subroutine */ integer dorgql_(integer *m, integer *n, integer *k, doublereal *
 	a, integer *lda, doublereal *tau, doublereal *work, integer *lwork, 
 	integer *info)
 {
@@ -83,15 +86,15 @@
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
     /* Local variables */
     static integer i__, j, l, nbmin, iinfo;
-    extern /* Subroutine */ HYPRE_Int dorg2l_(integer *, integer *, integer *, 
+    extern /* Subroutine */ integer dorg2l_(integer *, integer *, integer *, 
 	    doublereal *, integer *, doublereal *, doublereal *, integer *);
     static integer ib, nb, kk;
-    extern /* Subroutine */ HYPRE_Int dlarfb_(const char *,const char *,const char *,const char *, 
+    extern /* Subroutine */ integer dlarfb_(const char *,const char *,const char *,const char *, 
 	    integer *, integer *, integer *, doublereal *, integer *, 
 	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
 	    integer *);
     static integer nx;
-    extern /* Subroutine */ HYPRE_Int dlarft_(const char *,const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dlarft_(const char *,const char *, integer *, integer *, 
 	    doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(const char *, integer *);
     extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, 
 	    integer *, integer *, ftnlen, ftnlen);
@@ -262,4 +265,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dorgqr.c hypre-2.13.0/src/lapack/dorgqr.c
--- hypre-2.11.2/src/lapack/dorgqr.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dorgqr.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dorgqr_(integer *m, integer *n, integer *k, doublereal *
+/* Subroutine */ integer dorgqr_(integer *m, integer *n, integer *k, doublereal *
 	a, integer *lda, doublereal *tau, doublereal *work, integer *lwork, 
 	integer *info)
 {
@@ -83,15 +86,15 @@
     integer a_dim1, a_offset, i__1, i__2, i__3;
     /* Local variables */
     static integer i__, j, l, nbmin, iinfo;
-    extern /* Subroutine */ HYPRE_Int dorg2r_(integer *, integer *, integer *, 
+    extern /* Subroutine */ integer dorg2r_(integer *, integer *, integer *, 
 	    doublereal *, integer *, doublereal *, doublereal *, integer *);
     static integer ib, nb, ki, kk;
-    extern /* Subroutine */ HYPRE_Int dlarfb_(const char *,const char *,const char *,const char *, 
+    extern /* Subroutine */ integer dlarfb_(const char *,const char *,const char *,const char *, 
 	    integer *, integer *, integer *, doublereal *, integer *, 
 	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
 	    integer *);
     static integer nx;
-    extern /* Subroutine */ HYPRE_Int dlarft_(const char *,const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dlarft_(const char *,const char *, integer *, integer *, 
 	    doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(const char *, integer *);
     extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, 
 	    integer *, integer *, ftnlen, ftnlen);
@@ -263,4 +266,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dorgtr.c hypre-2.13.0/src/lapack/dorgtr.c
--- hypre-2.11.2/src/lapack/dorgtr.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dorgtr.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dorgtr_(const char *uplo, integer *n, doublereal *a, integer *
+/* Subroutine */ integer dorgtr_(const char *uplo, integer *n, doublereal *a, integer *
 	lda, doublereal *tau, doublereal *work, integer *lwork, integer *info)
 {
 /*  -- LAPACK routine (version 3.0) --   
@@ -81,10 +84,10 @@
     static integer iinfo;
     static logical upper;
     static integer nb;
-    extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
     extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, 
 	    integer *, integer *, ftnlen, ftnlen);
-    extern /* Subroutine */ HYPRE_Int dorgql_(integer *, integer *, integer *, 
+    extern /* Subroutine */ integer dorgql_(integer *, integer *, integer *, 
 	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
 	    integer *), dorgqr_(integer *, integer *, integer *, doublereal *,
 	     integer *, doublereal *, doublereal *, integer *, integer *);
@@ -228,4 +231,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dorm2r.c hypre-2.13.0/src/lapack/dorm2r.c
--- hypre-2.11.2/src/lapack/dorm2r.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dorm2r.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dorm2r_(const char *side,const char *trans, integer *m, integer *n, 
+/* Subroutine */ integer dorm2r_(const char *side,const char *trans, integer *m, integer *n, 
 	integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal *
 	c__, integer *ldc, doublereal *work, integer *info)
 {
@@ -100,12 +103,12 @@
     /* Local variables */
     static logical left;
     static integer i__;
-    extern /* Subroutine */ HYPRE_Int dlarf_(const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dlarf_(const char *, integer *, integer *, 
 	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
 	    doublereal *);
     extern logical lsame_(const char *,const char *);
     static integer i1, i2, i3, ic, jc, mi, ni, nq;
-    extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
     static logical notran;
     static doublereal aii;
 #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1]
@@ -213,4 +216,6 @@
 #undef c___ref
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dormbr.c hypre-2.13.0/src/lapack/dormbr.c
--- hypre-2.11.2/src/lapack/dormbr.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dormbr.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dormbr_(const char *vect,const char *side,const char *trans, integer *m, 
+/* Subroutine */ integer dormbr_(const char *vect,const char *side,const char *trans, integer *m, 
 	integer *n, integer *k, doublereal *a, integer *lda, doublereal *tau, 
 	doublereal *c__, integer *ldc, doublereal *work, integer *lwork, 
 	integer *info)
@@ -133,19 +136,19 @@
     integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3[2];
     char ch__1[2];
     /* Builtin functions   
-       Subroutine */ HYPRE_Int s_cat(char *, char **, integer *, integer *, ftnlen);
+       Subroutine */ integer s_cat(char *, char **, integer *, integer *, ftnlen);
     /* Local variables */
     static logical left;
     extern logical lsame_(const char *,const char *);
     static integer iinfo, i1, i2, nb, mi, ni, nq, nw;
-    extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
     extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, 
 	    integer *, integer *, ftnlen, ftnlen);
-    extern /* Subroutine */ HYPRE_Int dormlq_(const char *,const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dormlq_(const char *,const char *, integer *, integer *, 
 	    integer *, doublereal *, integer *, doublereal *, doublereal *, 
 	    integer *, doublereal *, integer *, integer *);
     static logical notran;
-    extern /* Subroutine */ HYPRE_Int dormqr_(const char *,const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dormqr_(const char *,const char *, integer *, integer *, 
 	    integer *, doublereal *, integer *, doublereal *, doublereal *, 
 	    integer *, doublereal *, integer *, integer *);
     static logical applyq;
@@ -341,4 +344,6 @@
 #undef c___ref
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dorml2.c hypre-2.13.0/src/lapack/dorml2.c
--- hypre-2.11.2/src/lapack/dorml2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dorml2.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dorml2_(const char *side,const char *trans, integer *m, integer *n, 
+/* Subroutine */ integer dorml2_(const char *side,const char *trans, integer *m, integer *n, 
 	integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal *
 	c__, integer *ldc, doublereal *work, integer *info)
 {
@@ -97,12 +100,12 @@
     /* Local variables */
     static logical left;
     static integer i__;
-    extern /* Subroutine */ HYPRE_Int dlarf_(const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dlarf_(const char *, integer *, integer *, 
 	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
 	    doublereal *);
     extern logical lsame_(const char *,const char *);
     static integer i1, i2, i3, ic, jc, mi, ni, nq;
-    extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
     static logical notran;
     static doublereal aii;
 #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1]
@@ -209,4 +212,6 @@
 #undef c___ref
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dormlq.c hypre-2.13.0/src/lapack/dormlq.c
--- hypre-2.11.2/src/lapack/dormlq.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dormlq.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dormlq_(const char *side,const char *trans, integer *m, integer *n, 
+/* Subroutine */ integer dormlq_(const char *side,const char *trans, integer *m, integer *n, 
 	integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal *
 	c__, integer *ldc, doublereal *work, integer *lwork, integer *info)
 {
@@ -112,23 +115,23 @@
 	    i__5;
     char ch__1[2];
     /* Builtin functions   
-       Subroutine */ HYPRE_Int s_cat(char *, char **, integer *, integer *, ftnlen);
+       Subroutine */ integer s_cat(char *, char **, integer *, integer *, ftnlen);
     /* Local variables */
     static logical left;
     static integer i__;
     static doublereal t[4160]	/* was [65][64] */;
     extern logical lsame_(const char *,const char *);
     static integer nbmin, iinfo, i1, i2, i3;
-    extern /* Subroutine */ HYPRE_Int dorml2_(const char *,const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dorml2_(const char *,const char *, integer *, integer *, 
 	    integer *, doublereal *, integer *, doublereal *, doublereal *, 
 	    integer *, doublereal *, integer *);
     static integer ib, ic, jc, nb, mi, ni;
-    extern /* Subroutine */ HYPRE_Int dlarfb_(const char *,const char *,const char *,const char *, 
+    extern /* Subroutine */ integer dlarfb_(const char *,const char *,const char *,const char *, 
 	    integer *, integer *, integer *, doublereal *, integer *, 
 	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
 	    integer *);
     static integer nq, nw;
-    extern /* Subroutine */ HYPRE_Int dlarft_(const char *,const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dlarft_(const char *,const char *, integer *, integer *, 
 	    doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(const char *, integer *);
     extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, 
 	    integer *, integer *, ftnlen, ftnlen);
@@ -314,4 +317,6 @@
 #undef c___ref
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dormqr.c hypre-2.13.0/src/lapack/dormqr.c
--- hypre-2.11.2/src/lapack/dormqr.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dormqr.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dormqr_(const char *side,const char *trans, integer *m, integer *n, 
+/* Subroutine */ integer dormqr_(const char *side,const char *trans, integer *m, integer *n, 
 	integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal *
 	c__, integer *ldc, doublereal *work, integer *lwork, integer *info)
 {
@@ -112,23 +115,23 @@
 	    i__5;
     char ch__1[2];
     /* Builtin functions   
-       Subroutine */ HYPRE_Int s_cat(char *, char **, integer *, integer *, ftnlen);
+       Subroutine */ integer s_cat(char *, char **, integer *, integer *, ftnlen);
     /* Local variables */
     static logical left;
     static integer i__;
     static doublereal t[4160]	/* was [65][64] */;
     extern logical lsame_(const char *,const char *);
     static integer nbmin, iinfo, i1, i2, i3;
-    extern /* Subroutine */ HYPRE_Int dorm2r_(const char *,const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dorm2r_(const char *,const char *, integer *, integer *, 
 	    integer *, doublereal *, integer *, doublereal *, doublereal *, 
 	    integer *, doublereal *, integer *);
     static integer ib, ic, jc, nb, mi, ni;
-    extern /* Subroutine */ HYPRE_Int dlarfb_(const char *,const char *,const char *,const char *, 
+    extern /* Subroutine */ integer dlarfb_(const char *,const char *,const char *,const char *, 
 	    integer *, integer *, integer *, doublereal *, integer *, 
 	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
 	    integer *);
     static integer nq, nw;
-    extern /* Subroutine */ HYPRE_Int dlarft_(const char *,const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dlarft_(const char *,const char *, integer *, integer *, 
 	    doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(const char *, integer *);
     extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, 
 	    integer *, integer *, ftnlen, ftnlen);
@@ -306,4 +309,6 @@
 #undef c___ref
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dpotf2.c hypre-2.13.0/src/lapack/dpotf2.c
--- hypre-2.11.2/src/lapack/dpotf2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dpotf2.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dpotf2_(const char *uplo, integer *n, doublereal *a, integer *
+/* Subroutine */ integer dpotf2_(const char *uplo, integer *n, doublereal *a, integer *
 	lda, integer *info)
 {
 /*  -- LAPACK routine (version 3.0) --   
@@ -73,19 +76,19 @@
     integer a_dim1, a_offset, i__1, i__2, i__3;
     doublereal d__1;
     /* Builtin functions */
-    /*HYPRE_Real sqrt(doublereal);*/
+    /*doublereal sqrt(doublereal);*/
     /* Local variables */
     extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *, 
 	    integer *);
     static integer j;
-    extern /* Subroutine */ HYPRE_Int dscal_(integer *, doublereal *, doublereal *, 
+    extern /* Subroutine */ integer dscal_(integer *, doublereal *, doublereal *, 
 	    integer *);
     extern logical lsame_(const char *,const char *);
-    extern /* Subroutine */ HYPRE_Int dgemv_(const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dgemv_(const char *, integer *, integer *, 
 	    doublereal *, doublereal *, integer *, doublereal *, integer *, 
 	    doublereal *, doublereal *, integer *);
     static logical upper;
-    extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
     static doublereal ajj;
 #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1]
 
@@ -197,4 +200,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dpotrf.c hypre-2.13.0/src/lapack/dpotrf.c
--- hypre-2.11.2/src/lapack/dpotrf.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dpotrf.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dpotrf_(const char *uplo, integer *n, doublereal *a, integer *
+/* Subroutine */ integer dpotrf_(const char *uplo, integer *n, doublereal *a, integer *
 	lda, integer *info)
 {
 /*  -- LAPACK routine (version 3.0) --   
@@ -72,20 +75,20 @@
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
     /* Local variables */
     static integer j;
-    extern /* Subroutine */ HYPRE_Int dgemm_(const char *,const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dgemm_(const char *,const char *, integer *, integer *, 
 	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
 	    integer *, doublereal *, doublereal *, integer *);
     extern logical lsame_(const char *,const char *);
-    extern /* Subroutine */ HYPRE_Int dtrsm_(const char *,const char *,const char *,const char *, 
+    extern /* Subroutine */ integer dtrsm_(const char *,const char *,const char *,const char *, 
 	    integer *, integer *, doublereal *, doublereal *, integer *, 
 	    doublereal *, integer *);
     static logical upper;
-    extern /* Subroutine */ HYPRE_Int dsyrk_(const char *,const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dsyrk_(const char *,const char *, integer *, integer *, 
 	    doublereal *, doublereal *, integer *, doublereal *, doublereal *,
 	     integer *), dpotf2_(const char *, integer *, 
 	    doublereal *, integer *, integer *);
     static integer jb, nb;
-    extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
     extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, 
 	    integer *, integer *, ftnlen, ftnlen);
 #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1]
@@ -223,4 +226,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dpotrs.c hypre-2.13.0/src/lapack/dpotrs.c
--- hypre-2.11.2/src/lapack/dpotrs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dpotrs.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dpotrs_(char *uplo, integer *n, integer *nrhs, 
+/* Subroutine */ integer dpotrs_(char *uplo, integer *n, integer *nrhs, 
 	doublereal *a, integer *lda, doublereal *b, integer *ldb, integer *
 	info)
 {
@@ -64,11 +67,11 @@
     integer a_dim1, a_offset, b_dim1, b_offset, i__1;
     /* Local variables */
     extern logical lsame_(const char *,const char *);
-    extern /* Subroutine */ HYPRE_Int dtrsm_(const char *,const char *,const char *,const char *, 
+    extern /* Subroutine */ integer dtrsm_(const char *,const char *,const char *,const char *, 
 	    integer *, integer *, doublereal *, doublereal *, integer *, 
 	    doublereal *, integer *);
     static logical upper;
-    extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
 
 
     a_dim1 = *lda;
@@ -138,3 +141,6 @@
 
 } /* dpotrs_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dsteqr.c hypre-2.13.0/src/lapack/dsteqr.c
--- hypre-2.11.2/src/lapack/dsteqr.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dsteqr.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dsteqr_(const char *compz, integer *n, doublereal *d__, 
+/* Subroutine */ integer dsteqr_(const char *compz, integer *n, doublereal *d__, 
 	doublereal *e, doublereal *z__, integer *ldz, doublereal *work, 
 	integer *info)
 {
@@ -89,22 +92,22 @@
     integer z_dim1, z_offset, i__1, i__2;
     doublereal d__1, d__2;
     /* Builtin functions */
-    HYPRE_Real sqrt(doublereal), d_sign(doublereal *, doublereal *);
+    doublereal d_sign(doublereal *, doublereal *);
     /* Local variables */
     static integer lend, jtot;
-    extern /* Subroutine */ HYPRE_Int dlae2_(doublereal *, doublereal *, doublereal 
+    extern /* Subroutine */ integer dlae2_(doublereal *, doublereal *, doublereal 
 	    *, doublereal *, doublereal *);
     static doublereal b, c__, f, g;
     static integer i__, j, k, l, m;
     static doublereal p, r__, s;
     extern logical lsame_(const char *,const char *);
-    extern /* Subroutine */ HYPRE_Int dlasr_(const char *,const char *,const char *, integer *, 
+    extern /* Subroutine */ integer dlasr_(const char *,const char *,const char *, integer *, 
 	    integer *, doublereal *, doublereal *, doublereal *, integer *);
     static doublereal anorm;
-    extern /* Subroutine */ HYPRE_Int dswap_(integer *, doublereal *, integer *, 
+    extern /* Subroutine */ integer dswap_(integer *, doublereal *, integer *, 
 	    doublereal *, integer *);
     static integer l1;
-    extern /* Subroutine */ HYPRE_Int dlaev2_(doublereal *, doublereal *, 
+    extern /* Subroutine */ integer dlaev2_(doublereal *, doublereal *, 
 	    doublereal *, doublereal *, doublereal *, doublereal *, 
 	    doublereal *);
     static integer lendm1, lendp1;
@@ -112,17 +115,17 @@
     static integer ii;
     extern doublereal dlamch_(const char *);
     static integer mm, iscale;
-    extern /* Subroutine */ HYPRE_Int dlascl_(const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dlascl_(const char *, integer *, integer *, 
 	    doublereal *, doublereal *, integer *, integer *, doublereal *, 
 	    integer *, integer *), dlaset_(const char *, integer *, integer 
 	    *, doublereal *, doublereal *, doublereal *, integer *);
     static doublereal safmin;
-    extern /* Subroutine */ HYPRE_Int dlartg_(doublereal *, doublereal *, 
+    extern /* Subroutine */ integer dlartg_(doublereal *, doublereal *, 
 	    doublereal *, doublereal *, doublereal *);
     static doublereal safmax;
-    extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
     extern doublereal dlanst_(const char *, integer *, doublereal *, doublereal *);
-    extern /* Subroutine */ HYPRE_Int dlasrt_(const char *, integer *, doublereal *, 
+    extern /* Subroutine */ integer dlasrt_(const char *, integer *, doublereal *, 
 	    integer *);
     static integer lendsv;
     static doublereal ssfmin;
@@ -598,4 +601,6 @@
 
 #undef z___ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dsterf.c hypre-2.13.0/src/lapack/dsterf.c
--- hypre-2.11.2/src/lapack/dsterf.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dsterf.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dsterf_(integer *n, doublereal *d__, doublereal *e, 
+/* Subroutine */ integer dsterf_(integer *n, doublereal *d__, doublereal *e, 
 	integer *info)
 {
 /*  -- LAPACK routine (version 3.0) --   
@@ -54,11 +57,11 @@
     integer i__1;
     doublereal d__1, d__2, d__3;
     /* Builtin functions */
-    HYPRE_Real sqrt(doublereal), d_sign(doublereal *, doublereal *);
+    doublereal d_sign(doublereal *, doublereal *);
     /* Local variables */
     static doublereal oldc;
     static integer lend, jtot;
-    extern /* Subroutine */ HYPRE_Int dlae2_(doublereal *, doublereal *, doublereal 
+    extern /* Subroutine */ integer dlae2_(doublereal *, doublereal *, doublereal 
 	    *, doublereal *, doublereal *);
     static doublereal c__;
     static integer i__, l, m;
@@ -68,14 +71,14 @@
     static doublereal bb;
     extern doublereal dlamch_(const char *);
     static integer iscale;
-    extern /* Subroutine */ HYPRE_Int dlascl_(const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dlascl_(const char *, integer *, integer *, 
 	    doublereal *, doublereal *, integer *, integer *, doublereal *, 
 	    integer *, integer *);
     static doublereal oldgam, safmin;
-    extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
     static doublereal safmax;
     extern doublereal dlanst_(const char *, integer *, doublereal *, doublereal *);
-    extern /* Subroutine */ HYPRE_Int dlasrt_(const char *, integer *, doublereal *, 
+    extern /* Subroutine */ integer dlasrt_(const char *, integer *, doublereal *, 
 	    integer *);
     static integer lendsv;
     static doublereal ssfmin;
@@ -433,3 +436,6 @@
 
 } /* dsterf_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dsyev.c hypre-2.13.0/src/lapack/dsyev.c
--- hypre-2.11.2/src/lapack/dsyev.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dsyev.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dsyev_(const char *jobz,const char *uplo, integer *n, doublereal *a,
+/* Subroutine */ integer dsyev_(const char *jobz,const char *uplo, integer *n, doublereal *a,
 	 integer *lda, doublereal *w, doublereal *work, integer *lwork, 
 	integer *info)
 {
@@ -85,15 +88,13 @@
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2;
     doublereal d__1;
-    /* Builtin functions */
-    HYPRE_Real sqrt(doublereal);
     /* Local variables */
     static integer inde;
     static doublereal anrm;
     static integer imax;
     static doublereal rmin, rmax;
 /***static integer lopt;***/
-    extern /* Subroutine */ HYPRE_Int dscal_(integer *, doublereal *, doublereal *, 
+    extern /* Subroutine */ integer dscal_(integer *, doublereal *, doublereal *, 
 	    integer *);
     static doublereal sigma;
     extern logical lsame_(const char *,const char *);
@@ -102,21 +103,21 @@
     static integer nb;
     extern doublereal dlamch_(const char *);
     static integer iscale;
-    extern /* Subroutine */ HYPRE_Int dlascl_(const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dlascl_(const char *, integer *, integer *, 
 	    doublereal *, doublereal *, integer *, integer *, doublereal *, 
 	    integer *, integer *);
     static doublereal safmin;
     extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, 
 	    integer *, integer *, ftnlen, ftnlen);
-    extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
     static doublereal bignum;
     static integer indtau;
-    extern /* Subroutine */ HYPRE_Int dsterf_(integer *, doublereal *, doublereal *,
+    extern /* Subroutine */ integer dsterf_(integer *, doublereal *, doublereal *,
 	     integer *);
     extern doublereal dlansy_(const char *,const char *, integer *, doublereal *, 
 	    integer *, doublereal *);
     static integer indwrk;
-    extern /* Subroutine */ HYPRE_Int dorgtr_(const char *, integer *, doublereal *, 
+    extern /* Subroutine */ integer dorgtr_(const char *, integer *, doublereal *, 
 	    integer *, doublereal *, doublereal *, integer *, integer *), dsteqr_(const char *, integer *, doublereal *, doublereal *, 
 	    doublereal *, integer *, doublereal *, integer *), 
 	    dsytrd_(const char *, integer *, doublereal *, integer *, doublereal *, 
@@ -261,4 +262,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dsygs2.c hypre-2.13.0/src/lapack/dsygs2.c
--- hypre-2.11.2/src/lapack/dsygs2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dsygs2.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dsygs2_(integer *itype,const char *uplo, integer *n, 
+/* Subroutine */ integer dsygs2_(integer *itype,const char *uplo, integer *n, 
 	doublereal *a, integer *lda, doublereal *b, integer *ldb, integer *
 	info)
 {
@@ -83,21 +86,21 @@
     integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2;
     doublereal d__1;
     /* Local variables */
-    extern /* Subroutine */ HYPRE_Int dsyr2_(const char *, integer *, doublereal *, 
+    extern /* Subroutine */ integer dsyr2_(const char *, integer *, doublereal *, 
 	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
 	    integer *);
     static integer k;
-    extern /* Subroutine */ HYPRE_Int dscal_(integer *, doublereal *, doublereal *, 
+    extern /* Subroutine */ integer dscal_(integer *, doublereal *, doublereal *, 
 	    integer *);
     extern logical lsame_(const char *,const char *);
-    extern /* Subroutine */ HYPRE_Int daxpy_(integer *, doublereal *, doublereal *, 
+    extern /* Subroutine */ integer daxpy_(integer *, doublereal *, doublereal *, 
 	    integer *, doublereal *, integer *);
     static logical upper;
-    extern /* Subroutine */ HYPRE_Int dtrmv_(const char *,const char *,const char *, integer *, 
+    extern /* Subroutine */ integer dtrmv_(const char *,const char *,const char *, integer *, 
 	    doublereal *, integer *, doublereal *, integer *), dtrsv_(const char *,const char *,const char *, integer *, doublereal *, 
 	    integer *, doublereal *, integer *);
     static doublereal ct;
-    extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
     static doublereal akk, bkk;
 #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1]
 #define b_ref(a_1,a_2) b[(a_2)*b_dim1 + a_1]
@@ -272,4 +275,6 @@
 #undef b_ref
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dsygst.c hypre-2.13.0/src/lapack/dsygst.c
--- hypre-2.11.2/src/lapack/dsygst.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dsygst.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dsygst_(integer *itype,const char *uplo, integer *n, 
+/* Subroutine */ integer dsygst_(integer *itype,const char *uplo, integer *n, 
 	doublereal *a, integer *lda, doublereal *b, integer *ldb, integer *
 	info)
 {
@@ -87,24 +90,24 @@
     /* Local variables */
     static integer k;
     extern logical lsame_(const char *,const char *);
-    extern /* Subroutine */ HYPRE_Int dtrmm_(const char *,const char *,const char *,const char *, 
+    extern /* Subroutine */ integer dtrmm_(const char *,const char *,const char *,const char *, 
 	    integer *, integer *, doublereal *, doublereal *, integer *, 
 	    doublereal *, integer *), dsymm_(
 	    const char *,const char *, integer *, integer *, doublereal *, doublereal *, 
 	    integer *, doublereal *, integer *, doublereal *, doublereal *, 
 	    integer *);
     static logical upper;
-    extern /* Subroutine */ HYPRE_Int dtrsm_(const char *,const char *,const char *,const char *, 
+    extern /* Subroutine */ integer dtrsm_(const char *,const char *,const char *,const char *, 
 	    integer *, integer *, doublereal *, doublereal *, integer *, 
 	    doublereal *, integer *), dsygs2_(
 	    integer *,const char *, integer *, doublereal *, integer *, doublereal 
 	    *, integer *, integer *);
     static integer kb;
-    extern /* Subroutine */ HYPRE_Int dsyr2k_(const char *,const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dsyr2k_(const char *,const char *, integer *, integer *, 
 	    doublereal *, doublereal *, integer *, doublereal *, integer *, 
 	    doublereal *, doublereal *, integer *);
     static integer nb;
-    extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
     extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, 
 	    integer *, integer *, ftnlen, ftnlen);
 #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1]
@@ -318,4 +321,6 @@
 #undef b_ref
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dsygv.c hypre-2.13.0/src/lapack/dsygv.c
--- hypre-2.11.2/src/lapack/dsygv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dsygv.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dsygv_(integer *itype, char *jobz, char *uplo, integer *
+/* Subroutine */ integer dsygv_(integer *itype, char *jobz, char *uplo, integer *
 	n, doublereal *a, integer *lda, doublereal *b, integer *ldb, 
 	doublereal *w, doublereal *work, integer *lwork, integer *info)
 {
@@ -118,22 +121,22 @@
     /* Local variables */
     static integer neig;
     extern logical lsame_(const char *,const char *);
-    extern /* Subroutine */ HYPRE_Int dtrmm_(const char *,const char *,const char *,const char *, 
+    extern /* Subroutine */ integer dtrmm_(const char *,const char *,const char *,const char *, 
 	    integer *, integer *, doublereal *, doublereal *, integer *, 
 	    doublereal *, integer *);
     static char trans[1];
-    extern /* Subroutine */ HYPRE_Int dtrsm_(const char *,const char *,const char *,const char *, 
+    extern /* Subroutine */ integer dtrsm_(const char *,const char *,const char *,const char *, 
 	    integer *, integer *, doublereal *, doublereal *, integer *, 
 	    doublereal *, integer *);
     static logical upper;
-    extern /* Subroutine */ HYPRE_Int dsyev_(const char *,const char *, integer *, doublereal *
+    extern /* Subroutine */ integer dsyev_(const char *,const char *, integer *, doublereal *
 	    , integer *, doublereal *, doublereal *, integer *, integer *);
     static logical wantz;
     static integer nb;
-    extern /* Subroutine */ HYPRE_Int xerbla_(const char *, integer *);
+    extern /* Subroutine */ integer xerbla_(const char *, integer *);
     extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, 
 	    integer *, integer *, ftnlen, ftnlen);
-    extern /* Subroutine */ HYPRE_Int dpotrf_(const char *, integer *, doublereal *, 
+    extern /* Subroutine */ integer dpotrf_(const char *, integer *, doublereal *, 
 	    integer *, integer *), dsygst_(integer *,const char *, integer 
 	    *, doublereal *, integer *, doublereal *, integer *, integer *);
     static integer lwkopt;
@@ -254,3 +257,6 @@
 
 } /* dsygv_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dsytd2.c hypre-2.13.0/src/lapack/dsytd2.c
--- hypre-2.11.2/src/lapack/dsytd2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dsytd2.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dsytd2_(const char *uplo, integer *n, doublereal *a, integer *
+/* Subroutine */ integer dsytd2_(const char *uplo, integer *n, doublereal *a, integer *
 	lda, doublereal *d__, doublereal *e, doublereal *tau, integer *info)
 {
 /*  -- LAPACK routine (version 3.0) --   
@@ -127,16 +130,16 @@
     extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *, 
 	    integer *);
     static doublereal taui;
-    extern /* Subroutine */ HYPRE_Int dsyr2_(const char *, integer *, doublereal *, 
+    extern /* Subroutine */ integer dsyr2_(const char *, integer *, doublereal *, 
 	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
 	    integer *);
     static integer i__;
     static doublereal alpha;
     extern logical lsame_(const char *,const char *);
-    extern /* Subroutine */ HYPRE_Int daxpy_(integer *, doublereal *, doublereal *, 
+    extern /* Subroutine */ integer daxpy_(integer *, doublereal *, doublereal *, 
 	    integer *, doublereal *, integer *);
     static logical upper;
-    extern /* Subroutine */ HYPRE_Int dsymv_(const char *, integer *, doublereal *, 
+    extern /* Subroutine */ integer dsymv_(const char *, integer *, doublereal *, 
 	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
 	    doublereal *, integer *), dlarfg_(integer *, doublereal *,
 	     doublereal *, integer *, doublereal *), xerbla_(const char *, integer *
@@ -280,4 +283,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/dsytrd.c hypre-2.13.0/src/lapack/dsytrd.c
--- hypre-2.11.2/src/lapack/dsytrd.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/dsytrd.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,8 +1,11 @@
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int dsytrd_(const char *uplo, integer *n, doublereal *a, integer *
+/* Subroutine */ integer dsytrd_(const char *uplo, integer *n, doublereal *a, integer *
 	lda, doublereal *d__, doublereal *e, doublereal *tau, doublereal *
 	work, integer *lwork, integer *info)
 {
@@ -144,12 +147,12 @@
     extern logical lsame_(const char *,const char *);
     static integer nbmin, iinfo;
     static logical upper;
-    extern /* Subroutine */ HYPRE_Int dsytd2_(const char *, integer *, doublereal *, 
+    extern /* Subroutine */ integer dsytd2_(const char *, integer *, doublereal *, 
 	    integer *, doublereal *, doublereal *, doublereal *, integer *), dsyr2k_(const char *,const char *, integer *, integer *, doublereal 
 	    *, doublereal *, integer *, doublereal *, integer *, doublereal *,
 	     doublereal *, integer *);
     static integer nb, kk, nx;
-    extern /* Subroutine */ HYPRE_Int dlatrd_(const char *, integer *, integer *, 
+    extern /* Subroutine */ integer dlatrd_(const char *, integer *, integer *, 
 	    doublereal *, integer *, doublereal *, doublereal *, doublereal *,
 	     integer *), xerbla_(const char *, integer *);
     extern integer ilaenv_(integer *,const char *,const char *, integer *, integer *, 
@@ -339,4 +342,6 @@
 
 #undef a_ref
 
-
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/f2c.h hypre-2.13.0/src/lapack/f2c.h
--- hypre-2.11.2/src/lapack/f2c.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/f2c.h	2017-10-20 17:42:22.000000000 +0000
@@ -25,6 +25,13 @@
 #define HYPRE_SEQUENTIAL
 #endif
 #include "_hypre_utilities.h"
+#include "math.h"
+
+#if defined(HYPRE_SINGLE)
+#define sqrt sqrtf
+#elif defined(HYPRE_LONG_DOUBLE)
+#define sqrt sqrtl
+#endif
 
 #ifdef HYPRE_BIGINT
 typedef long long int HYPRE_LongInt;
diff -Nru hypre-2.11.2/src/lapack/_hypre_lapack.h hypre-2.13.0/src/lapack/_hypre_lapack.h
--- hypre-2.11.2/src/lapack/_hypre_lapack.h	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/lapack/_hypre_lapack.h	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,202 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+/******************************************************************************
+ *
+ * Header file for HYPRE LAPACK
+ *
+ *****************************************************************************/
+
+#ifndef HYPRE_LAPACK_H
+#define HYPRE_LAPACK_H
+
+#include "_hypre_utilities.h"
+#include "fortran.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*--------------------------------------------------------------------------
+ * Change all 'hypre_' names based on using HYPRE or external library
+ *--------------------------------------------------------------------------*/
+
+#ifndef HYPRE_USING_HYPRE_LAPACK
+
+#define hypre_dbdsqr  hypre_F90_NAME_LAPACK(dbdsqr,DBDSQR)
+#define hypre_dgebd2  hypre_F90_NAME_LAPACK(dgebd2,DGEBD2)
+#define hypre_dgebrd  hypre_F90_NAME_LAPACK(dgebrd,DGEBRD)
+#define hypre_dgelq2  hypre_F90_NAME_LAPACK(dgelq2,DGELQ2)
+#define hypre_dgelqf  hypre_F90_NAME_LAPACK(dgelqf,DGELQF)
+#define hypre_dgels   hypre_F90_NAME_LAPACK(dgels ,DGELS )
+#define hypre_dgeqr2  hypre_F90_NAME_LAPACK(dgeqr2,DGEQR2)
+#define hypre_dgeqrf  hypre_F90_NAME_LAPACK(dgeqrf,DGEQRF)
+#define hypre_dgesvd  hypre_F90_NAME_LAPACK(dgesvd,DGESVD)
+#define hypre_dgetf2  hypre_F90_NAME_LAPACK(dgetf2,DGETF2)
+#define hypre_dgetrf  hypre_F90_NAME_LAPACK(dgetrf,DGETRF)
+#define hypre_dgetrs  hypre_F90_NAME_LAPACK(dgetrs,DGETRS)
+#define hypre_dlasq1  hypre_F90_NAME_LAPACK(dlasq1,DLASQ1)
+#define hypre_dlasq2  hypre_F90_NAME_LAPACK(dlasq2,DLASQ2)
+#define hypre_dlasrt  hypre_F90_NAME_LAPACK(dlasrt,DLASRT)
+#define hypre_dorg2l  hypre_F90_NAME_LAPACK(dorg2l,DORG2L)
+#define hypre_dorg2r  hypre_F90_NAME_LAPACK(dorg2r,DORG2R)
+#define hypre_dorgbr  hypre_F90_NAME_LAPACK(dorgbr,DORGBR)
+#define hypre_dorgl2  hypre_F90_NAME_LAPACK(dorgl2,DORGL2)
+#define hypre_dorglq  hypre_F90_NAME_LAPACK(dorglq,DORGLQ)
+#define hypre_dorgql  hypre_F90_NAME_LAPACK(dorgql,DORGQL)
+#define hypre_dorgqr  hypre_F90_NAME_LAPACK(dorgqr,DORGQR)
+#define hypre_dorgtr  hypre_F90_NAME_LAPACK(dorgtr,DORGTR)
+#define hypre_dorm2r  hypre_F90_NAME_LAPACK(dorm2r,DORM2R)
+#define hypre_dormbr  hypre_F90_NAME_LAPACK(dormbr,DORMBR)
+#define hypre_dorml2  hypre_F90_NAME_LAPACK(dorml2,DORML2)
+#define hypre_dormlq  hypre_F90_NAME_LAPACK(dormlq,DORMLQ)
+#define hypre_dormqr  hypre_F90_NAME_LAPACK(dormqr,DORMQR)
+#define hypre_dpotf2  hypre_F90_NAME_LAPACK(dpotf2,DPOTF2)
+#define hypre_dpotrf  hypre_F90_NAME_LAPACK(dpotrf,DPOTRF)
+#define hypre_dpotrs  hypre_F90_NAME_LAPACK(dpotrs,DPOTRS)
+#define hypre_dsteqr  hypre_F90_NAME_LAPACK(dsteqr,DSTEQR)
+#define hypre_dsterf  hypre_F90_NAME_LAPACK(dsterf,DSTERF)
+#define hypre_dsyev   hypre_F90_NAME_LAPACK(dsyev ,DSYEV )
+#define hypre_dsygs2  hypre_F90_NAME_LAPACK(dsygs2,DSYGS2)
+#define hypre_dsygst  hypre_F90_NAME_LAPACK(dsygst,DSYGST)
+#define hypre_dsygv   hypre_F90_NAME_LAPACK(dsygv ,DSYGV )
+#define hypre_dsytd2  hypre_F90_NAME_LAPACK(dsytd2,DSYTD2)
+#define hypre_dsytrd  hypre_F90_NAME_LAPACK(dsytrd,DSYTRD)
+
+#endif
+
+/*--------------------------------------------------------------------------
+ * Prototypes
+ *--------------------------------------------------------------------------*/
+
+/* dbdsqr.c */
+HYPRE_Int hypre_dbdsqr (const char *uplo , HYPRE_Int *n , HYPRE_Int *ncvt , HYPRE_Int *nru , HYPRE_Int *ncc , HYPRE_Real *d__ , HYPRE_Real *e , HYPRE_Real *vt , HYPRE_Int *ldvt , HYPRE_Real *u , HYPRE_Int *ldu , HYPRE_Real *c__ , HYPRE_Int *ldc , HYPRE_Real *work , HYPRE_Int *info );
+
+/* dgebd2.c */
+HYPRE_Int hypre_dgebd2 ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *d__ , HYPRE_Real *e , HYPRE_Real *tauq , HYPRE_Real *taup , HYPRE_Real *work , HYPRE_Int *info );
+
+/* dgebrd.c */
+HYPRE_Int hypre_dgebrd ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *d__ , HYPRE_Real *e , HYPRE_Real *tauq , HYPRE_Real *taup , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info );
+
+/* dgelq2.c */
+HYPRE_Int hypre_dgelq2 ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *info );
+
+/* dgelqf.c */
+HYPRE_Int hypre_dgelqf ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info );
+
+/* dgels.c */
+HYPRE_Int hypre_dgels ( char *trans , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *nrhs , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *b , HYPRE_Int *ldb , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info );
+
+/* dgeqr2.c */
+HYPRE_Int hypre_dgeqr2 ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *info );
+
+/* dgeqrf.c */
+HYPRE_Int hypre_dgeqrf ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info );
+
+/* dgesvd.c */
+HYPRE_Int hypre_dgesvd ( char *jobu , char *jobvt , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *s , HYPRE_Real *u , HYPRE_Int *ldu , HYPRE_Real *vt , HYPRE_Int *ldvt , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info );
+
+/* dgetf2.c */
+HYPRE_Int hypre_dgetf2 ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Int *ipiv , HYPRE_Int *info );
+
+/* dgetrf.c */
+HYPRE_Int hypre_dgetrf ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Int *ipiv , HYPRE_Int *info );
+
+/* dgetrs.c */
+HYPRE_Int hypre_dgetrs ( const char *trans , HYPRE_Int *n , HYPRE_Int *nrhs , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Int *ipiv , HYPRE_Real *b , HYPRE_Int *ldb , HYPRE_Int *info );
+
+/* dlasq1.c */
+HYPRE_Int hypre_dlasq1 ( HYPRE_Int *n , HYPRE_Real *d__ , HYPRE_Real *e , HYPRE_Real *work , HYPRE_Int *info );
+
+/* dlasq2.c */
+HYPRE_Int hypre_dlasq2 ( HYPRE_Int *n , HYPRE_Real *z__ , HYPRE_Int *info );
+
+/* dlasrt.c */
+HYPRE_Int hypre_dlasrt (const char *id , HYPRE_Int *n , HYPRE_Real *d__ , HYPRE_Int *info );
+
+/* dorg2l.c */
+HYPRE_Int hypre_dorg2l ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *info );
+
+/* dorg2r.c */
+HYPRE_Int hypre_dorg2r ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *info );
+
+/* dorgbr.c */
+HYPRE_Int hypre_dorgbr (const char *vect , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info );
+
+/* dorgl2.c */
+HYPRE_Int hypre_dorgl2 ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *info );
+
+/* dorglq.c */
+HYPRE_Int hypre_dorglq ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info );
+
+/* dorgql.c */
+HYPRE_Int hypre_dorgql ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info );
+
+/* dorgqr.c */
+HYPRE_Int hypre_dorgqr ( HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info );
+
+/* dorgtr.c */
+HYPRE_Int hypre_dorgtr (const char *uplo , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info );
+
+/* dorm2r.c */
+HYPRE_Int hypre_dorm2r (const char *side ,const char *trans , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *c__ , HYPRE_Int *ldc , HYPRE_Real *work , HYPRE_Int *info );
+
+/* dormbr.c */
+HYPRE_Int hypre_dormbr (const char *vect ,const char *side ,const char *trans , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *c__ , HYPRE_Int *ldc , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info );
+
+/* dorml2.c */
+HYPRE_Int hypre_dorml2 (const char *side ,const char *trans , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *c__ , HYPRE_Int *ldc , HYPRE_Real *work , HYPRE_Int *info );
+
+/* dormlq.c */
+HYPRE_Int hypre_dormlq (const char *side ,const char *trans , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *c__ , HYPRE_Int *ldc , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info );
+
+/* dormqr.c */
+HYPRE_Int hypre_dormqr (const char *side ,const char *trans , HYPRE_Int *m , HYPRE_Int *n , HYPRE_Int *k , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *tau , HYPRE_Real *c__ , HYPRE_Int *ldc , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info );
+
+/* dpotf2.c */
+HYPRE_Int hypre_dpotf2 (const char *uplo , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Int *info );
+
+/* dpotrf.c */
+HYPRE_Int hypre_dpotrf (const char *uplo , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Int *info );
+
+/* dpotrs.c */
+HYPRE_Int hypre_dpotrs ( char *uplo , HYPRE_Int *n , HYPRE_Int *nrhs , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *b , HYPRE_Int *ldb , HYPRE_Int *info );
+
+/* dsteqr.c */
+HYPRE_Int hypre_dsteqr (const char *compz , HYPRE_Int *n , HYPRE_Real *d__ , HYPRE_Real *e , HYPRE_Real *z__ , HYPRE_Int *ldz , HYPRE_Real *work , HYPRE_Int *info );
+
+/* dsterf.c */
+HYPRE_Int hypre_dsterf ( HYPRE_Int *n , HYPRE_Real *d__ , HYPRE_Real *e , HYPRE_Int *info );
+
+/* dsyev.c */
+HYPRE_Int hypre_dsyev (const char *jobz ,const char *uplo , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *w , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info );
+
+/* dsygs2.c */
+HYPRE_Int hypre_dsygs2 ( HYPRE_Int *itype ,const char *uplo , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *b , HYPRE_Int *ldb , HYPRE_Int *info );
+
+/* dsygst.c */
+HYPRE_Int hypre_dsygst ( HYPRE_Int *itype ,const char *uplo , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *b , HYPRE_Int *ldb , HYPRE_Int *info );
+
+/* dsygv.c */
+HYPRE_Int hypre_dsygv ( HYPRE_Int *itype , char *jobz , char *uplo , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *b , HYPRE_Int *ldb , HYPRE_Real *w , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info );
+
+/* dsytd2.c */
+HYPRE_Int hypre_dsytd2 (const char *uplo , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *d__ , HYPRE_Real *e , HYPRE_Real *tau , HYPRE_Int *info );
+
+/* dsytrd.c */
+HYPRE_Int hypre_dsytrd (const char *uplo , HYPRE_Int *n , HYPRE_Real *a , HYPRE_Int *lda , HYPRE_Real *d__ , HYPRE_Real *e , HYPRE_Real *tau , HYPRE_Real *work , HYPRE_Int *lwork , HYPRE_Int *info );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff -Nru hypre-2.11.2/src/lapack/hypre_lapack.h hypre-2.13.0/src/lapack/hypre_lapack.h
--- hypre-2.11.2/src/lapack/hypre_lapack.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/hypre_lapack.h	2017-10-20 17:42:22.000000000 +0000
@@ -10,22 +10,46 @@
  * $Revision$
  ***********************************************************************EHEADER*/
 
-/* hypre_lapack.h  --  Contains LAPACK prototypes needed by Hypre */
+/***** DO NOT use this file outside of the LAPACK directory *****/
 
-#ifndef HYPRE_LAPACK_H
-#define HYPRE_LAPACK_H
-#include "f2c.h"
-#include "fortran.h"
-#include "math.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* --------------------------------------------------------------------------
- *  Change all names to hypre_ to avoid link conflicts
- * --------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------
+ * This header renames the functions in LAPACK to avoid conflicts
+ *--------------------------------------------------------------------------*/
+
+/* blas */
+#define dasum_   hypre_dasum
+#define daxpy_   hypre_daxpy
+#define dcopy_   hypre_dcopy
+#define ddot_    hypre_ddot
+#define dgemm_   hypre_dgemm
+#define dgemv_   hypre_dgemv
+#define dger_    hypre_dger
+#define dnrm2_   hypre_dnrm2
+#define drot_    hypre_drot
+#define dscal_   hypre_dscal
+#define dswap_   hypre_dswap
+#define dsymm_   hypre_dsymm
+#define dsymv_   hypre_dsymv
+#define dsyr2_   hypre_dsyr2
+#define dsyr2k_  hypre_dsyr2k
+#define dsyrk_   hypre_dsyrk
+#define dtrmm_   hypre_dtrmm
+#define dtrmv_   hypre_dtrmv
+#define dtrsm_   hypre_dtrsm
+#define dtrsv_   hypre_dtrsv
+#define idamax_  hypre_idamax
+
+/* f2c library routines */
+#define s_cmp    hypre_s_cmp
+#define s_copy   hypre_s_copy
+#define s_cat    hypre_s_cat
+#define d_lg10   hypre_d_lg10
+#define d_sign   hypre_d_sign
+#define pow_dd   hypre_pow_dd
+#define pow_di   hypre_pow_di
 
+/* lapack */
+#define dbdsqr_  hypre_dbdsqr
 #define dgebd2_  hypre_dgebd2
 #define dgebrd_  hypre_dgebrd
 #define dgelq2_  hypre_dgelq2
@@ -37,38 +61,19 @@
 #define dgetf2_  hypre_dgetf2
 #define dgetrf_  hypre_dgetrf
 #define dgetrs_  hypre_dgetrs
-#define dlabad_  hypre_dlabad
-#define dlabrd_  hypre_dlabrd
-#define dlae2_   hypre_dlae2
-#define dlaev2_  hypre_dlaev2
-#define dlamch_  hypre_dlamch
-#define dlamc1_  hypre_dlamc1
-#define dlamc2_  hypre_dlamc2
-#define dlamc3_  hypre_dlamc3
-#define dlamc4_  hypre_dlamc4
-#define dlamc5_  hypre_dlamc5
-#define dlaswp_  hypre_dlaswp
-#define dlange_  hypre_dlange
-#define dlanst_  hypre_dlanst
-#define dlansy_  hypre_dlansy
-#define dlapy2_  hypre_dlapy2
-#define dlarf_   hypre_dlarf
-#define dlarfb_  hypre_dlarfb
-#define dlarfg_  hypre_dlarfg
-#define dlarft_  hypre_dlarft
-#define dlartg_  hypre_dlartg
-#define dlascl_  hypre_dlascl
-#define dlaset_  hypre_dlaset
-#define dlasr_   hypre_dlasr
+#define dlasq1_  hypre_dlasq1
+#define dlasq2_  hypre_dlasq2
 #define dlasrt_  hypre_dlasrt
-#define dlassq_  hypre_dlassq
-#define dlatrd_  hypre_dlatrd
 #define dorg2l_  hypre_dorg2l
 #define dorg2r_  hypre_dorg2r
+#define dorgbr_  hypre_dorgbr
+#define dorgl2_  hypre_dorgl2
+#define dorglq_  hypre_dorglq
 #define dorgql_  hypre_dorgql
 #define dorgqr_  hypre_dorgqr
 #define dorgtr_  hypre_dorgtr
 #define dorm2r_  hypre_dorm2r
+#define dormbr_  hypre_dormbr
 #define dorml2_  hypre_dorml2
 #define dormlq_  hypre_dormlq
 #define dormqr_  hypre_dormqr
@@ -78,273 +83,51 @@
 #define dsteqr_  hypre_dsteqr
 #define dsterf_  hypre_dsterf
 #define dsyev_   hypre_dsyev
+#define dsygs2_  hypre_dsygs2
 #define dsygst_  hypre_dsygst
 #define dsygv_   hypre_dsygv
 #define dsytd2_  hypre_dsytd2
 #define dsytrd_  hypre_dsytrd
-#define ieeeck_  hypre_ieeeck
-#define ilaenv_  hypre_ilaenv
-#define d_lg10_  hypre_d_lg10
-#define d_sign_  hypre_d_sign
-#define pow_di_  hypre_pow_di
-#define pow_dd_  hypre_pow_dd
-#define s_cat_   hypre_s_cat
-#define lsame_   hypre_lsame
-#define xerbla_  hypre_xerbla
-#define dbdsqr_  hypre_dbdsqr
-#define dorgbr_  hypre_dorgbr
-#define dsygs2_  hypre_dsygs2
-#define dorglq_  hypre_dorglq
+
+/* lapack auxiliary routines */
+#define dlabad_  hypre_dlabad
+#define dlabrd_  hypre_dlabrd
 #define dlacpy_  hypre_dlacpy
-#define dormbr_  hypre_dormbr
-#define dlasq1_  hypre_dlasq1
+#define dlae2_   hypre_dlae2
+#define dlaev2_  hypre_dlaev2
+#define dlamch_  hypre_dlamch
+#define dlamc1_  hypre_dlamc1
+#define dlamc2_  hypre_dlamc2
+#define dlamc3_  hypre_dlamc3
+#define dlamc4_  hypre_dlamc4
+#define dlamc5_  hypre_dlamc5
+#define dlange_  hypre_dlange
+#define dlanst_  hypre_dlanst
+#define dlansy_  hypre_dlansy
+#define dlapy2_  hypre_dlapy2
+#define dlarf_   hypre_dlarf
+#define dlarfb_  hypre_dlarfb
+#define dlarfg_  hypre_dlarfg
+#define dlarft_  hypre_dlarft
+#define dlartg_  hypre_dlartg
 #define dlas2_   hypre_dlas2
-#define dlasv2_  hypre_dlasv2
-#define dorgl2_  hypre_dorgl2
-#define dlasq2_  hypre_dlasq2
+#define dlascl_  hypre_dlascl
+#define dlaset_  hypre_dlaset
 #define dlasq3_  hypre_dlasq3
 #define dlasq4_  hypre_dlasq4
 #define dlasq5_  hypre_dlasq5
 #define dlasq6_  hypre_dlasq6
+#define dlasr_   hypre_dlasr
+#define dlassq_  hypre_dlassq
+#define dlasv2_  hypre_dlasv2
+#define dlaswp_  hypre_dlaswp
+#define dlatrd_  hypre_dlatrd
+#define ieeeck_  hypre_ieeeck
+#define ilaenv_  hypre_ilaenv
 
-#define s_cmp    hypre_F90_NAME_BLAS(s_cmp,S_CMP)
-#define s_copy   hypre_F90_NAME_BLAS(s_copy,S_COPY)
-#define d_lg10   hypre_d_lg10
-#define d_sign   hypre_d_sign
-#define pow_dd   hypre_pow_dd
-#define pow_di   hypre_pow_di
-#define s_cat    hypre_s_cat
-
-/* --------------------------------------------------------------------------
- *           Prototypes
- * --------------------------------------------------------------------------*/
-
-/* dgebd2.c */
-HYPRE_Int dgebd2_ ( integer *m , integer *n , doublereal *a , integer *lda , doublereal *d__ , doublereal *e , doublereal *tauq , doublereal *taup , doublereal *work , integer *info );
-
-/* dgebrd.c */
-HYPRE_Int dgebrd_ ( integer *m , integer *n , doublereal *a , integer *lda , doublereal *d__ , doublereal *e , doublereal *tauq , doublereal *taup , doublereal *work , integer *lwork , integer *info );
-
-/* dgelq2.c */
-HYPRE_Int dgelq2_ ( integer *m , integer *n , doublereal *a , integer *lda , doublereal *tau , doublereal *work , integer *info );
-
-/* dgelqf.c */
-HYPRE_Int dgelqf_ ( integer *m , integer *n , doublereal *a , integer *lda , doublereal *tau , doublereal *work , integer *lwork , integer *info );
-
-/* dgels.c */
-HYPRE_Int dgels_ ( char *trans , integer *m , integer *n , integer *nrhs , doublereal *a , integer *lda , doublereal *b , integer *ldb , doublereal *work , integer *lwork , integer *info );
-
-/* dgeqr2.c */
-HYPRE_Int dgeqr2_ ( integer *m , integer *n , doublereal *a , integer *lda , doublereal *tau , doublereal *work , integer *info );
-
-/* dgeqrf.c */
-HYPRE_Int dgeqrf_ ( integer *m , integer *n , doublereal *a , integer *lda , doublereal *tau , doublereal *work , integer *lwork , integer *info );
-
-/* dgesvd.c */
-HYPRE_Int dgesvd_ ( char *jobu , char *jobvt , integer *m , integer *n , doublereal *a , integer *lda , doublereal *s , doublereal *u , integer *ldu , doublereal *vt , integer *ldvt , doublereal *work , integer *lwork , integer *info );
-
-
-/* dgetf2.c */
-HYPRE_Int dgetf2_( integer *m , integer *n , doublereal *a , integer *lda , integer *ipiv , integer *info );
-
-/* dgetrf.c */
-HYPRE_Int dgetrf_( integer *m , integer *n , doublereal *a , integer *lda , integer *ipiv , integer *info );
-
-/* dgetrs.c */
-HYPRE_Int dgetrs_( char *trans , integer *n , integer *nrhs , doublereal *a , integer *lda , integer *ipiv , doublereal *b , integer *ldb , integer *info );
-
-/* dlabad.c */
-HYPRE_Int dlabad_ ( doublereal *small , doublereal *large );
-
-/* dlabrd.c */
-HYPRE_Int dlabrd_ ( integer *m , integer *n , integer *nb , doublereal *a , integer *lda , doublereal *d__ , doublereal *e , doublereal *tauq , doublereal *taup , doublereal *x , integer *ldx , doublereal *y , integer *ldy );
-
-/* dlae2.c */
-HYPRE_Int dlae2_ ( doublereal *a , doublereal *b , doublereal *c__ , doublereal *rt1 , doublereal *rt2 );
-
-/* dlaev2.c */
-HYPRE_Int dlaev2_ ( doublereal *a , doublereal *b , doublereal *c__ , doublereal *rt1 , doublereal *rt2 , doublereal *cs1 , doublereal *sn1 );
-
-/* dlamch.c */
-doublereal dlamch_ (const char *cmach );
-HYPRE_Int dlamc1_ ( integer *beta , integer *t , logical *rnd , logical *ieee1 );
-HYPRE_Int dlamc2_ ( integer *beta , integer *t , logical *rnd , doublereal *eps , integer *emin , doublereal *rmin , integer *emax , doublereal *rmax );
-doublereal dlamc3_ ( doublereal *a , doublereal *b );
-HYPRE_Int dlamc4_ ( integer *emin , doublereal *start , integer *base );
-HYPRE_Int dlamc5_ ( integer *beta , integer *p , integer *emin , logical *ieee , integer *emax , doublereal *rmax );
-
-
-/* dlaswp.c */
-HYPRE_Int dlaswp_( integer *n , doublereal *a , integer *lda , integer *k1 , integer *k2 , integer *ipiv , integer *incx );
-
-/* dlange.c */
-doublereal dlange_ (const char *norm , integer *m , integer *n , doublereal *a , integer *lda , doublereal *work );
-
-/* dlanst.c */
-doublereal dlanst_ (const char *norm , integer *n , doublereal *d__ , doublereal *e );
-
-/* dlansy.c */
-doublereal dlansy_ (const char *norm ,const char *uplo , integer *n , doublereal *a , integer *lda , doublereal *work );
-
-/* dlapy2.c */
-doublereal dlapy2_ ( doublereal *x , doublereal *y );
-
-/* dlarf.c */
-HYPRE_Int dlarf_ (const char *side , integer *m , integer *n , doublereal *v , integer *incv , doublereal *tau , doublereal *c__ , integer *ldc , doublereal *work );
-
-/* dlarfb.c */
-HYPRE_Int dlarfb_ (const char *side ,const char *trans ,const char *direct ,const char *storev , integer *m , integer *n , integer *k , doublereal *v , integer *ldv , doublereal *t , integer *ldt , doublereal *c__ , integer *ldc , doublereal *work , integer *ldwork );
-
-/* dlarfg.c */
-HYPRE_Int dlarfg_ ( integer *n , doublereal *alpha , doublereal *x , integer *incx , doublereal *tau );
-
-/* dlarft.c */
-HYPRE_Int dlarft_ (const char *direct ,const char *storev , integer *n , integer *k , doublereal *v , integer *ldv , doublereal *tau , doublereal *t , integer *ldt );
-
-/* dlartg.c */
-HYPRE_Int dlartg_ ( doublereal *f , doublereal *g , doublereal *cs , doublereal *sn , doublereal *r__ );
-
-/* dlascl.c */
-HYPRE_Int dlascl_ (const char *type__ , integer *kl , integer *ku , doublereal *cfrom , doublereal *cto , integer *m , integer *n , doublereal *a , integer *lda , integer *info );
-
-/* dlaset.c */
-HYPRE_Int dlaset_ (const char *uplo , integer *m , integer *n , doublereal *alpha , doublereal *beta , doublereal *a , integer *lda );
-
-/* dlasr.c */
-HYPRE_Int dlasr_ (const char *side ,const char *pivot ,const char *direct , integer *m , integer *n , doublereal *c__ , doublereal *s , doublereal *a , integer *lda );
-
-/* dlasrt.c */
-HYPRE_Int dlasrt_ (const char *id , integer *n , doublereal *d__ , integer *info );
-
-/* dlassq.c */
-HYPRE_Int dlassq_ ( integer *n , doublereal *x , integer *incx , doublereal *scale , doublereal *sumsq );
-
-/* dlatrd.c */
-HYPRE_Int dlatrd_ (const char *uplo , integer *n , integer *nb , doublereal *a , integer *lda , doublereal *e , doublereal *tau , doublereal *w , integer *ldw );
-
-/* dorg2l.c */
-HYPRE_Int dorg2l_ ( integer *m , integer *n , integer *k , doublereal *a , integer *lda , doublereal *tau , doublereal *work , integer *info );
-
-/* dorg2r.c */
-HYPRE_Int dorg2r_ ( integer *m , integer *n , integer *k , doublereal *a , integer *lda , doublereal *tau , doublereal *work , integer *info );
-
-/* dorgql.c */
-HYPRE_Int dorgql_ ( integer *m , integer *n , integer *k , doublereal *a , integer *lda , doublereal *tau , doublereal *work , integer *lwork , integer *info );
-
-/* dorgqr.c */
-HYPRE_Int dorgqr_ ( integer *m , integer *n , integer *k , doublereal *a , integer *lda , doublereal *tau , doublereal *work , integer *lwork , integer *info );
-
-/* dorgtr.c */
-HYPRE_Int dorgtr_ (const char *uplo , integer *n , doublereal *a , integer *lda , doublereal *tau , doublereal *work , integer *lwork , integer *info );
-
-/* dorm2r.c */
-HYPRE_Int dorm2r_ (const char *side ,const char *trans , integer *m , integer *n , integer *k , doublereal *a , integer *lda , doublereal *tau , doublereal *c__ , integer *ldc , doublereal *work , integer *info );
-
-/* dorml2.c */
-HYPRE_Int dorml2_ (const char *side ,const char *trans , integer *m , integer *n , integer *k , doublereal *a , integer *lda , doublereal *tau , doublereal *c__ , integer *ldc , doublereal *work , integer *info );
-
-/* dormlq.c */
-HYPRE_Int dormlq_ (const char *side ,const char *trans , integer *m , integer *n , integer *k , doublereal *a , integer *lda , doublereal *tau , doublereal *c__ , integer *ldc , doublereal *work , integer *lwork , integer *info );
-
-/* dormqr.c */
-HYPRE_Int dormqr_ (const char *side ,const char *trans , integer *m , integer *n , integer *k , doublereal *a , integer *lda , doublereal *tau , doublereal *c__ , integer *ldc , doublereal *work , integer *lwork , integer *info );
-
-/* dpotf2.c */
-HYPRE_Int dpotf2_ (const char *uplo , integer *n , doublereal *a , integer *lda , integer *info );
-
-/* dpotrf.c */
-HYPRE_Int dpotrf_ (const char *uplo , integer *n , doublereal *a , integer *lda , integer *info );
-
-/* dpotrs.c */
-HYPRE_Int dpotrs_ ( char *uplo , integer *n , integer *nrhs , doublereal *a , integer *lda , doublereal *b , integer *ldb , integer *info );
-
-/* dsteqr.c */
-HYPRE_Int dsteqr_ (const char *compz , integer *n , doublereal *d__ , doublereal *e , doublereal *z__ , integer *ldz , doublereal *work , integer *info );
-
-/* dsterf.c */
-HYPRE_Int dsterf_ ( integer *n , doublereal *d__ , doublereal *e , integer *info );
-
-/* dsyev.c */
-HYPRE_Int dsyev_ (const char *jobz ,const char *uplo , integer *n , doublereal *a , integer *lda , doublereal *w , doublereal *work , integer *lwork , integer *info );
-
-/* dsygst.c */
-HYPRE_Int dsygst_ ( integer *itype ,const char *uplo , integer *n , doublereal *a , integer *lda , doublereal *b , integer *ldb , integer *info );
-
-/* dsygv.c */
-HYPRE_Int dsygv_ ( integer *itype , char *jobz , char *uplo , integer *n , doublereal *a , integer *lda , doublereal *b , integer *ldb , doublereal *w , doublereal *work , integer *lwork , integer *info );
-
-/* dsytd2.c */
-HYPRE_Int dsytd2_ (const char *uplo , integer *n , doublereal *a , integer *lda , doublereal *d__ , doublereal *e , doublereal *tau , integer *info );
-
-/* dsytrd.c */
-HYPRE_Int dsytrd_ (const char *uplo , integer *n , doublereal *a , integer *lda , doublereal *d__ , doublereal *e , doublereal *tau , doublereal *work , integer *lwork , integer *info );
-
-/* ieeeck.c */
-integer ieeeck_ ( integer *ispec , real *zero , real *one );
-
-/* ilaenv.c */
-integer ilaenv_ ( integer *ispec ,const char *name__ ,const char *opts , integer *n1 , integer *n2 , integer *n3 , integer *n4 , ftnlen name_len , ftnlen opts_len );
-
-/* lapack_utils.c */
-HYPRE_Real d_lg10 ( doublereal *x );
-HYPRE_Real d_sign ( doublereal *a , doublereal *b );
-HYPRE_Real pow_di ( doublereal *ap , integer *bp );
-HYPRE_Real pow_dd ( doublereal *ap , doublereal *bp );
-HYPRE_Int s_cat ( char *lp , char *rpp [], ftnlen rnp [], ftnlen *np , ftnlen ll );
-
-/* lsame.c */
-logical lsame_ (const char *ca ,const char *cb );
-
-/* xerbla.c */
-HYPRE_Int xerbla_ (const char *srname , integer *info );
-
-/* dbdsqr.c */
-HYPRE_Int dbdsqr_ (const char *uplo , integer *n , integer *ncvt , integer *nru , integer *ncc , doublereal *d__ , doublereal *e , doublereal *vt , integer *ldvt , doublereal *u , integer *ldu , doublereal *c__ , integer *ldc , doublereal *work , integer *info );
-
-/* dorgbr.c */
-HYPRE_Int dorgbr_ (const char *vect , integer *m , integer *n , integer *k , doublereal *a , integer *lda , doublereal *tau , doublereal *work , integer *lwork , integer *info );
-
-/* dsygs2.c */
-HYPRE_Int dsygs2_ ( integer *itype ,const char *uplo , integer *n , doublereal *a , integer *lda , doublereal *b , integer *ldb , integer *info );
-
-/* dorglq.c */
-HYPRE_Int dorglq_ ( integer *m , integer *n , integer *k , doublereal *a , integer *lda , doublereal *tau , doublereal *work , integer *lwork , integer *info );
-
-/* dlacpy.c */
-HYPRE_Int dlacpy_ (const char *uplo , integer *m , integer *n , doublereal *a , integer *lda , doublereal *b , integer *ldb );
-
-/* dormbr.c */
-HYPRE_Int dormbr_ (const char *vect ,const char *side ,const char *trans , integer *m , integer *n , integer *k , doublereal *a , integer *lda , doublereal *tau , doublereal *c__ , integer *ldc , doublereal *work , integer *lwork , integer *info );
-
-/* dlasq1.c */
-HYPRE_Int dlasq1_ ( integer *n , doublereal *d__ , doublereal *e , doublereal *work , integer *info );
-
-/* dlas2.c */
-HYPRE_Int dlas2_ ( doublereal *f , doublereal *g , doublereal *h__ , doublereal *ssmin , doublereal *ssmax );
-
-/* dlasv2.c */
-HYPRE_Int dlasv2_ ( doublereal *f , doublereal *g , doublereal *h__ , doublereal *ssmin , doublereal *ssmax , doublereal *snr , doublereal *csr , doublereal *snl , doublereal *csl );
-
-/* dorgl2.c */
-HYPRE_Int dorgl2_ ( integer *m , integer *n , integer *k , doublereal *a , integer *lda , doublereal *tau , doublereal *work , integer *info );
-
-/* dlasq2.c */
-HYPRE_Int dlasq2_ ( integer *n , doublereal *z__ , integer *info );
-
-/* dlasq3.c */
-HYPRE_Int dlasq3_ ( integer *i0 , integer *n0 , doublereal *z__ , integer *pp , doublereal *dmin__ , doublereal *sigma , doublereal *desig , doublereal *qmax , integer *nfail , integer *iter , integer *ndiv , logical *ieee );
-
-/* dlasq4.c */
-HYPRE_Int dlasq4_ ( integer *i0 , integer *n0 , doublereal *z__ , integer *pp , integer *n0in , doublereal *dmin__ , doublereal *dmin1 , doublereal *dmin2 , doublereal *dn , doublereal *dn1 , doublereal *dn2 , doublereal *tau , integer *ttype );
-
-/* dlasq5.c */
-HYPRE_Int dlasq5_ ( integer *i0 , integer *n0 , doublereal *z__ , integer *pp , doublereal *tau , doublereal *dmin__ , doublereal *dmin1 , doublereal *dmin2 , doublereal *dn , doublereal *dnm1 , doublereal *dnm2 , logical *ieee );
-
-/* dlasq6.c */
-HYPRE_Int dlasq6_ ( integer *i0 , integer *n0 , doublereal *z__ , integer *pp , doublereal *dmin__ , doublereal *dmin1 , doublereal *dmin2 , doublereal *dn , doublereal *dnm1 , doublereal *dnm2 );
-
-#ifdef __cplusplus
-}
-#endif
+/* these auxiliary routines have a different definition in BLAS */
+#define lsame_   hypre_lapack_lsame
+#define xerbla_  hypre_lapack_xerbla
 
-#endif
+/* this is needed so that lapack can call external BLAS */
+#include "_hypre_blas.h"
diff -Nru hypre-2.11.2/src/lapack/ieeeck.c hypre-2.13.0/src/lapack/ieeeck.c
--- hypre-2.11.2/src/lapack/ieeeck.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/ieeeck.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,6 +1,9 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
 integer ieeeck_(integer *ispec, real *zero, real *one)
 {
@@ -149,3 +152,6 @@
     return ret_val;
 } /* ieeeck_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/ilaenv.c hypre-2.13.0/src/lapack/ilaenv.c
--- hypre-2.11.2/src/lapack/ilaenv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/ilaenv.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,7 +1,9 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "hypre_lapack.h"
 #include "f2c.h"
-#include "hypre_blas.h"
+#include "hypre_lapack.h"
 
 integer ilaenv_(integer *ispec,const char *name__,const char *opts, integer *n1, 
 	integer *n2, integer *n3, integer *n4, ftnlen name_len, ftnlen 
@@ -113,8 +115,9 @@
     /* Builtin functions   
        Subroutine */
 
-	//VOID s_copy( char *, char *, ftnlen, ftnlen);
-    //integer s_cmp( char *, char *, ftnlen, ftnlen);
+    /* Builtin functions */
+    /* Subroutine */ integer s_copy(char *, const char *, ftnlen, ftnlen);
+    integer s_cmp(char *, const char *, ftnlen, ftnlen);
 
     /* Local variables */
     static integer i__;
@@ -228,7 +231,7 @@
 
        In these examples, separate code is provided for setting NB for   
        real and complex.  We assume that NB will take the same value in   
-       single or HYPRE_Real precision. */
+       single or doublereal precision. */
 
     nb = 1;
 
@@ -615,3 +618,6 @@
 
 } /* ilaenv_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/lapack_utils.c hypre-2.13.0/src/lapack/lapack_utils.c
--- hypre-2.11.2/src/lapack/lapack_utils.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/lapack_utils.c	1970-01-01 00:00:00.000000000 +0000
@@ -1,123 +0,0 @@
-/*BHEADER**********************************************************************
- * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
- * Produced at the Lawrence Livermore National Laboratory.
- * This file is part of HYPRE.  See file COPYRIGHT for details.
- *
- * HYPRE is free software; you can redistribute it and/or modify it under the
- * terms of the GNU Lesser General Public License (as published by the Free
- * Software Foundation) version 2.1 dated February 1999.
- *
- * $Revision$
- ***********************************************************************EHEADER*/
-
-
-
-
-#include "f2c.h"
-#include "hypre_lapack.h"
-
-#define log10e 0.43429448190325182765
-
-#ifdef KR_headers
-HYPRE_Real log();
-HYPRE_Real d_lg10(x) doublereal *x;
-#else
-/*
-#undef abs
-#include "math.h"
-*/
-/*FIXME: need to figure out how log calls to optimize this part*/
-#include "math.h"					 
-//HYPRE_Real log(HYPRE_Real); /* declaration added 2/17/00 */
-HYPRE_Real d_lg10(doublereal *x)
-#endif
-{
-return( log10e * log(*x) );
-}
-#include "f2c.h"
-
-#ifdef KR_headers
-HYPRE_Real d_sign(a,b) doublereal *a, *b;
-#else
-HYPRE_Real d_sign(doublereal *a, doublereal *b)
-#endif
-{
-HYPRE_Real x;
-x = (*a >= 0 ? *a : - *a);
-return( *b >= 0 ? x : -x);
-}
-
-#include "f2c.h"
-
-#ifdef KR_headers
-HYPRE_Real pow_di(ap, bp) doublereal *ap; integer *bp;
-#else
-HYPRE_Real pow_di(doublereal *ap, integer *bp)
-#endif
-{
-HYPRE_Real pow, x;
-integer n;
-
-pow = 1;
-x = *ap;
-n = *bp;
-
-if(n != 0)
-	{
-	if(n < 0)
-		{
-		n = -n;
-		x = 1/x;
-		}
-	for( ; ; )
-		{
-		if(n & 01)
-			pow *= x;
-		if(n >>= 1)
-			x *= x;
-		else
-			break;
-		}
-	}
-return(pow);
-}
-#include "f2c.h"
-
-#ifdef KR_headers
-HYPRE_Real pow();
-HYPRE_Real pow_dd(ap, bp) doublereal *ap, *bp;
-#else
-#undef abs
-#include "math.h"
-HYPRE_Real pow_dd(doublereal *ap, doublereal *bp)
-#endif
-{
-return(pow(*ap, *bp) );
-}
-
-#include "f2c.h"
-
-#ifdef KR_headers
-HYPRE_Int s_cat(lp, rpp, rnp, np, ll) char *lp, *rpp[]; ftnlen rnp[], *np, ll;
-#else
-HYPRE_Int s_cat(char *lp, char *rpp[], ftnlen rnp[], ftnlen *np, ftnlen ll)
-#endif
-{
-ftnlen i, n, nc;
-char *f__rp;
-
-n = (HYPRE_Int)*np;
-for(i = 0 ; i < n ; ++i)
-	{
-	nc = ll;
-	if(rnp[i] < nc)
-		nc = rnp[i];
-	ll -= nc;
-	f__rp = rpp[i];
-	while(--nc >= 0)
-		*lp++ = *f__rp++;
-	}
-while(--ll >= 0)
-	*lp++ = ' ';
-return 0;
-}
diff -Nru hypre-2.11.2/src/lapack/lsame.c hypre-2.13.0/src/lapack/lsame.c
--- hypre-2.11.2/src/lapack/lsame.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/lsame.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,7 +1,9 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include "../blas/hypre_blas.h"
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
 logical lsame_(const char *ca,const char *cb)
 {
@@ -104,3 +106,6 @@
     return ret_val;
 } /* lsame_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lapack/Makefile hypre-2.13.0/src/lapack/Makefile
--- hypre-2.11.2/src/lapack/Makefile	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/Makefile	2017-10-20 17:42:22.000000000 +0000
@@ -15,7 +15,7 @@
 
 include ../config/Makefile.config
 
-C_COMPILE_FLAGS = ${INCLUDES} -I.. -I../utilities
+C_COMPILE_FLAGS = ${INCLUDES} -I.. -I../blas -I../utilities
 
 LAPACK_HEADERS =  f2c.h hypre_lapack.h
 LAPACK_FILES = \
@@ -86,7 +86,6 @@
  dsytrd.c\
  ieeeck.c\
  ilaenv.c\
- lapack_utils.c\
  lsame.c\
  xerbla.c
 
@@ -99,6 +98,7 @@
 all: ${OBJS} dlamch.o
 
 install: all
+	cp -fR $(srcdir)/_hypre_lapack.h $(HYPRE_INC_INSTALL)
 
 clean:
 	rm -rf *.o
@@ -113,7 +113,4 @@
 dlamch.o : dlamch.c ${LAPACK_HEADERS}
 	${CC} ${CFLAGS} -c dlamch.c
 
-ilaenv.o : ilaenv.c ${LAPACK_HEADERS}
-	${CC} ${CFLAGS} -I../blas -c ilaenv.c
-
 ${OBJS}: ${LAPACK_HEADERS}
diff -Nru hypre-2.11.2/src/lapack/README hypre-2.13.0/src/lapack/README
--- hypre-2.11.2/src/lapack/README	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/lapack/README	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,5 @@
+
+HYPRE LAPACK README file
+
+- See blas/README file for instructions on how to add LAPACK routines.
+
diff -Nru hypre-2.11.2/src/lapack/xerbla.c hypre-2.13.0/src/lapack/xerbla.c
--- hypre-2.11.2/src/lapack/xerbla.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lapack/xerbla.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,9 +1,11 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-#include <stdio.h>
-#include "hypre_lapack.h"
 #include "f2c.h"
+#include "hypre_lapack.h"
 
-/* Subroutine */ HYPRE_Int xerbla_(const char *srname, integer *info)
+/* Subroutine */ integer xerbla_(const char *srname, integer *info)
 {
 /*  -- LAPACK auxiliary routine (version 2.0) --   
        Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
@@ -36,10 +38,13 @@
 */
 
     hypre_printf("** On entry to %6s, parameter number %2i had an illegal value\n",
-		srname, (HYPRE_Int)*info);
+		srname, (integer)*info);
 
 /*     End of XERBLA */
 
     return 0;
 } /* xerbla_ */
 
+#ifdef __cplusplus
+}
+#endif
diff -Nru hypre-2.11.2/src/lib/Makefile hypre-2.13.0/src/lib/Makefile
--- hypre-2.11.2/src/lib/Makefile	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/lib/Makefile	2017-10-20 17:42:22.000000000 +0000
@@ -12,7 +12,6 @@
 
 include ../config/Makefile.config
 
-SUPERLUFILES = ${HYPRE_FEI_SUPERLU_FILES}
 FEIHYPREFILES = ${HYPRE_FEI_HYPRE_FILES}
 FEMLIFILES = ${HYPRE_FEI_FEMLI_FILES}
 IJMVFILES = ${HYPRE_SRC_TOP_DIR}/IJ_mv/*.o
@@ -32,11 +31,10 @@
 STRUCTLSFILES = ${HYPRE_SRC_TOP_DIR}/struct_ls/*.o
 STRUCTMVFILES = ${HYPRE_SRC_TOP_DIR}/struct_mv/*.o
 UTILITIESFILES = ${HYPRE_SRC_TOP_DIR}/utilities/*.o
-BLASFILES = ${HYPRE_BLAS_FILES}
-LAPACKFILES = ${HYPRE_LAPACK_FILES}
+BLASFILES = ${HYPRE_SRC_TOP_DIR}/blas/*.o
+LAPACKFILES = ${HYPRE_SRC_TOP_DIR}/lapack/*.o
 
 FILES_HYPRE = \
-$(SUPERLUFILES)\
 $(FEIHYPREFILES)\
 $(FEMLIFILES)\
 $(IJMVFILES)\
@@ -86,7 +84,7 @@
 
 libHYPRE.a: ${FILES_HYPRE}
 	@echo  "Building libHYPRE ... "
-	${AR} $@ $(SUPERLUFILES) $(FEIHYPREFILES) $(FEMLIFILES) $(IJMVFILES)
+	${AR} $@ $(FEIHYPREFILES) $(FEMLIFILES) $(IJMVFILES)
 	${AR} $@ $(EUCLIDFILES)
 	${AR} $@ $(PARASAILSFILES)
 	${AR} $@ $(PILUTFILES)
diff -Nru hypre-2.11.2/src/Makefile hypre-2.13.0/src/Makefile
--- hypre-2.11.2/src/Makefile	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/Makefile	2017-10-20 17:42:22.000000000 +0000
@@ -10,14 +10,15 @@
 # $Revision$
 #EHEADER**********************************************************************
 
+default: all
 
 # Include all variables defined by configure
 include config/Makefile.config
 
 # These are the directories for internal blas, lapack and general utilities
 HYPRE_BASIC_DIRS =\
- ${HYPRE_BLAS_SRC_DIR}\
- ${HYPRE_LAPACK_SRC_DIR}\
+ blas\
+ lapack\
  utilities
 
 #These are the directories for multivector
diff -Nru hypre-2.11.2/src/multivector/backup.c hypre-2.13.0/src/multivector/backup.c
--- hypre-2.11.2/src/multivector/backup.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/multivector/backup.c	2017-10-20 17:42:22.000000000 +0000
@@ -132,10 +132,10 @@
 
   hypre_assert( data != NULL );
 
-  srand( seed );
+  hypre_SeedRand( seed );
   for ( i = 0; i < data->numVectors; i++ ) {
     if ( data->mask == NULL || (data->mask)[i] ) {
-      seed = rand();
+      seed = hypre_RandI();
       (data->interpreter->SetRandomValues)(data->vector[i],seed);
     }
   }
diff -Nru hypre-2.11.2/src/multivector/csr_matmultivec.c hypre-2.13.0/src/multivector/csr_matmultivec.c
--- hypre-2.11.2/src/multivector/csr_matmultivec.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/multivector/csr_matmultivec.c	2017-10-20 17:42:22.000000000 +0000
@@ -114,8 +114,8 @@
          temp = y_data[i];
          for (jj = A_i[i]; jj < A_i[i+1]; jj++)
             temp += A_data[jj] * x_data[A_j[jj]];
+         y_data[i] = temp;
       }
-      y_data[i] = temp;
    }
    else
    {
diff -Nru hypre-2.11.2/src/parcsr_block_mv/csr_block_matrix.c hypre-2.13.0/src/parcsr_block_mv/csr_block_matrix.c
--- hypre-2.11.2/src/parcsr_block_mv/csr_block_matrix.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_block_mv/csr_block_matrix.c	2017-10-20 17:42:22.000000000 +0000
@@ -18,16 +18,6 @@
 
 #include "_hypre_parcsr_block_mv.h"
 
-#define LB_VERSION 0 /* lapack and blas version 5/12/06 - preliminary testing
-                        shows this is slower for block sizes < 4 (did not try
-                        larger block sizes)- need to edit the Makefile to find
-                        blas and lapack */
-
-#if LB_VERSION
-#include "hypre_blas.h"
-#include "hypre_lapack.h"
-#endif
-
 /*--------------------------------------------------------------------------
  * hypre_CSRBlockMatrixCreate
  *--------------------------------------------------------------------------*/
diff -Nru hypre-2.11.2/src/parcsr_ls/ame.c hypre-2.13.0/src/parcsr_ls/ame.c
--- hypre-2.11.2/src/parcsr_ls/ame.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/ame.c	2017-10-20 17:42:22.000000000 +0000
@@ -10,10 +10,6 @@
  * $Revision$
  ***********************************************************************EHEADER*/
 
-
-
-
-
 #include "_hypre_parcsr_ls.h"
 #include "float.h"
 #include "ams.h"
@@ -588,13 +584,8 @@
    lobpcg_Tolerance lobpcg_tol;
    HYPRE_Real *residuals;
 
-#ifdef HYPRE_USING_ESSL
-   blap_fn.dsygv  = dsygv;
-   blap_fn.dpotrf = dpotrf;
-#else
-   blap_fn.dsygv  = hypre_F90_NAME_LAPACK(dsygv,DSYGV);
-   blap_fn.dpotrf = hypre_F90_NAME_LAPACK(dpotrf,DPOTRF);
-#endif
+   blap_fn.dsygv  = hypre_dsygv;
+   blap_fn.dpotrf = hypre_dpotrf;
    lobpcg_tol.relative = ame_data -> rtol;
    lobpcg_tol.absolute = ame_data -> atol;
    residuals = hypre_TAlloc(HYPRE_Real, ame_data -> block_size);
diff -Nru hypre-2.11.2/src/parcsr_ls/ame.h hypre-2.13.0/src/parcsr_ls/ame.h
--- hypre-2.11.2/src/parcsr_ls/ame.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/ame.h	2017-10-20 17:42:22.000000000 +0000
@@ -10,10 +10,6 @@
  * $Revision$
  ***********************************************************************EHEADER*/
 
-
-
-
-
 #ifndef hypre_AME_HEADER
 #define hypre_AME_HEADER
 
@@ -56,16 +52,6 @@
 
 } hypre_AMEData;
 
-#include "fortran.h"
-#ifdef __cplusplus
-extern "C" {
-#endif
-    
-HYPRE_Int hypre_F90_NAME_LAPACK(dpotrf,DPOTRF)(char *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *);
-HYPRE_Int hypre_F90_NAME_LAPACK(dsygv,DSYGV)(HYPRE_Int *, char *, char *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *,
-                                       HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *);
-#ifdef __cplusplus
-}
-#endif
+#include "_hypre_lapack.h"
 
 #endif
diff -Nru hypre-2.11.2/src/parcsr_ls/amg_hybrid.c hypre-2.13.0/src/parcsr_ls/amg_hybrid.c
--- hypre-2.11.2/src/parcsr_ls/amg_hybrid.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/amg_hybrid.c	2017-10-20 17:42:22.000000000 +0000
@@ -123,13 +123,13 @@
    (AMGhybrid_data -> strong_threshold)  = 0.25;
    (AMGhybrid_data -> max_row_sum)  = 0.9;
    (AMGhybrid_data -> trunc_factor)  = 0.0;
-   (AMGhybrid_data -> pmax)  = 0;
+   (AMGhybrid_data -> pmax)  = 4;
    (AMGhybrid_data -> max_levels)  = 25;
    (AMGhybrid_data -> measure_type)  = 0;
-   (AMGhybrid_data -> coarsen_type)  = 6;
-   (AMGhybrid_data -> interp_type)  = 0;
+   (AMGhybrid_data -> coarsen_type)  = 10;
+   (AMGhybrid_data -> interp_type)  = 6;
    (AMGhybrid_data -> cycle_type)  = 1;
-   (AMGhybrid_data -> relax_order)  = 1;
+   (AMGhybrid_data -> relax_order)  = 0;
    (AMGhybrid_data -> max_coarse_size)  = 9;
    (AMGhybrid_data -> min_coarse_size)  = 1;
    (AMGhybrid_data -> seq_threshold)  = 0;
@@ -1860,9 +1860,15 @@
 	    boom_grt = hypre_CTAlloc(HYPRE_Int,4);
 	    for (i=0; i < 4; i++)
 	       boom_grt[i] = grid_relax_type[i];
-	    if (solver_type == 1 && grid_relax_type[1] == 3 &&
-		grid_relax_type[2] == 3)
-	       boom_grt[2] = 4;
+   	    hypre_BoomerAMGSetGridRelaxType(pcg_precond, boom_grt);
+         }
+   	 else
+         {
+	    boom_grt = hypre_CTAlloc(HYPRE_Int,4);
+	    boom_grt[0] = 3;
+	    boom_grt[1] = 13;
+	    boom_grt[2] = 14;
+	    boom_grt[3] = 9;
    	    hypre_BoomerAMGSetGridRelaxType(pcg_precond, boom_grt);
          }
    	 if (relax_weight)
diff -Nru hypre-2.11.2/src/parcsr_ls/ams.c hypre-2.13.0/src/parcsr_ls/ams.c
--- hypre-2.11.2/src/parcsr_ls/ams.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/ams.c	2017-10-20 17:42:22.000000000 +0000
@@ -10,10 +10,6 @@
  * $Revision$
  ***********************************************************************EHEADER*/
 
-
-
-
-
 #include "_hypre_parcsr_ls.h"
 #include "float.h"
 #include "ams.h"
@@ -24,7 +20,7 @@
  * Relaxation on the ParCSR matrix A with right-hand side f and
  * initial guess u. Possible values for relax_type are:
  *
- * 1 = l1-scaled Jacobi
+ * 1 = l1-scaled (or weighted) Jacobi
  * 2 = l1-scaled block Gauss-Seidel/SSOR
  * 3 = Kaczmarz
  * 4 = truncated version of 2 (Remark 6.2 in smoothers paper)
@@ -70,14 +66,27 @@
    {
       if (relax_type == 1) /* l1-scaled Jacobi */
       {
-         HYPRE_Int i, num_rows = hypre_ParCSRMatrixNumRows(A);
-
+	 PUSH_RANGE_PAYLOAD("RELAX",4,sweep);
+	 HYPRE_Int i, num_rows = hypre_ParCSRMatrixNumRows(A);
+#ifdef HYPRE_USE_GPU
+	 if (sweep==0){
+	   hypre_SeqVectorPrefetchToDevice(hypre_ParVectorLocalVector(v));
+	   hypre_SeqVectorPrefetchToDevice(hypre_ParVectorLocalVector(f));
+	 }
+	 VecCopy(v_data,f_data,hypre_VectorSize(hypre_ParVectorLocalVector(v)),HYPRE_STREAM(4));
+#else
          hypre_ParVectorCopy(f,v);
+#endif
          hypre_ParCSRMatrixMatvec(-relax_weight, A, u, relax_weight, v);
-
+#ifdef HYPRE_USE_GPU
+	 
+	 VecScale(u_data,v_data,l1_norms,num_rows,HYPRE_STREAM(4));
+#else
          /* u += w D^{-1}(f - A u), where D_ii = ||A(i,:)||_1 */
          for (i = 0; i < num_rows; i++)
             u_data[i] += v_data[i] / l1_norms[i];
+#endif
+	 POP_RANGE;
       }
       else if (relax_type == 2 || relax_type == 4) /* offd-l1-scaled block GS */
       {
@@ -704,6 +713,18 @@
             l1_norm[i] = diag;
       }
    }
+   else if (option == 5) /*stores diagonal of A for Jacobi using matvec, rlx 7 */
+   {
+      for (i = 0; i < num_rows; i++)
+      {
+         diag = A_diag_data[A_diag_I[i]];
+         if (diag != 0.0) l1_norm[i] = diag;
+	 else l1_norm[i] = 1.0;
+      }
+      *l1_norm_ptr = l1_norm;
+
+      return hypre_error_flag;
+   }
 
    /* Handle negative definite matrices */
    for (i = 0; i < num_rows; i++)
@@ -718,6 +739,7 @@
          break;
       }
 
+   //for (i = 0; i < num_rows; i++) l1_norm[i]=1.0/l1_norm[i];
    hypre_TFree(cf_marker_offd);
 
    *l1_norm_ptr = l1_norm;
@@ -1077,7 +1099,7 @@
    ams_data -> A_Pi = A_Pi;
 
    /* Penalize the eliminated degrees of freedom */
-   hypre_ParCSRMatrixSetDiagRows(A_Pi, DBL_MAX);
+   hypre_ParCSRMatrixSetDiagRows(A_Pi, HYPRE_REAL_MAX);
 
    /* Make sure that the first entry in each row is the diagonal one. */
    /* hypre_CSRMatrixReorder(hypre_ParCSRMatrixDiag(A_Pi)); */
@@ -1106,7 +1128,7 @@
    else
    {
       /* Penalize the eliminated degrees of freedom */
-      hypre_ParCSRMatrixSetDiagRows(A_G, DBL_MAX);
+      hypre_ParCSRMatrixSetDiagRows(A_G, HYPRE_REAL_MAX);
 
       /* Make sure that the first entry in each row is the diagonal one. */
       /* hypre_CSRMatrixReorder(hypre_ParCSRMatrixDiag(A_G)); */
diff -Nru hypre-2.11.2/src/parcsr_ls/F90_HYPRE_parcsr_amg.c hypre-2.13.0/src/parcsr_ls/F90_HYPRE_parcsr_amg.c
--- hypre-2.11.2/src/parcsr_ls/F90_HYPRE_parcsr_amg.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/F90_HYPRE_parcsr_amg.c	2017-10-20 17:42:22.000000000 +0000
@@ -1743,6 +1743,54 @@
 }
 
 /*--------------------------------------------------------------------------
+ * HYPRE_BoomerAMGSetChebyScale
+ *--------------------------------------------------------------------------*/
+
+void
+hypre_F90_IFACE(hypre_boomeramgsetchebyscale, HYPRE_BOOMERAMGSETCHEBYSCALE)
+   ( hypre_F90_Obj *solver,
+     hypre_F90_Int *cheby_scale,
+     hypre_F90_Int *ierr          )
+{
+   *ierr = (hypre_F90_Int)
+      ( HYPRE_BoomerAMGSetChebyScale(
+           hypre_F90_PassObj (HYPRE_Solver, solver),
+           hypre_F90_PassInt (cheby_scale) ) );
+}
+
+/*--------------------------------------------------------------------------
+ * HYPRE_BoomerAMGSetChebyVariant
+ *--------------------------------------------------------------------------*/
+
+void
+hypre_F90_IFACE(hypre_boomeramgsetchebyvariant, HYPRE_BOOMERAMGSETCHEBYVARIANT)
+   ( hypre_F90_Obj *solver,
+     hypre_F90_Int *cheby_variant,
+     hypre_F90_Int *ierr          )
+{
+   *ierr = (hypre_F90_Int)
+      ( HYPRE_BoomerAMGSetChebyVariant(
+           hypre_F90_PassObj (HYPRE_Solver, solver),
+           hypre_F90_PassInt (cheby_variant) ) );
+}
+
+/*--------------------------------------------------------------------------
+ * HYPRE_BoomerAMGSetChebyEigEst
+ *--------------------------------------------------------------------------*/
+
+void
+hypre_F90_IFACE(hypre_boomeramgsetchebyeigest, HYPRE_BOOMERAMGSETCHEBYEIGEST)
+   ( hypre_F90_Obj *solver,
+     hypre_F90_Int *cheby_eig_est,
+     hypre_F90_Int *ierr          )
+{
+   *ierr = (hypre_F90_Int)
+      ( HYPRE_BoomerAMGSetChebyEigEst(
+           hypre_F90_PassObj (HYPRE_Solver, solver),
+           hypre_F90_PassInt (cheby_eig_est) ) );
+}
+
+/*--------------------------------------------------------------------------
  * HYPRE_BoomerAMGSetKeepTranspose
  *--------------------------------------------------------------------------*/
 
@@ -1775,7 +1823,7 @@
 }
 
 /*--------------------------------------------------------------------------
- * HYPRE_BoomerAMGSetAdditive
+ * HYPRE_BoomerAMGSetAdditive, HYPRE_BoomerAMGGetAdditive
  *--------------------------------------------------------------------------*/
 
 void
@@ -1790,8 +1838,20 @@
            hypre_F90_PassInt (add_lvl) ) );
 }
 
+void
+hypre_F90_IFACE(hypre_boomeramggetadditive, HYPRE_BOOMERAMGGETADDITIVE)
+   ( hypre_F90_Obj *solver,
+     hypre_F90_Int *add_lvl,
+     hypre_F90_Int *ierr          )
+{
+   *ierr = (hypre_F90_Int)
+      ( HYPRE_BoomerAMGGetAdditive(
+           hypre_F90_PassObj (HYPRE_Solver, solver),
+           hypre_F90_PassIntRef (add_lvl) ) );
+}
+
 /*--------------------------------------------------------------------------
- * HYPRE_BoomerAMGSetMultAdditive
+ * HYPRE_BoomerAMGSetMultAdditive, HYPRE BoomerAMGGetMultAdditive
  *--------------------------------------------------------------------------*/
 
 void
@@ -1806,8 +1866,20 @@
            hypre_F90_PassInt (add_lvl) ) );
 }
 
+void
+hypre_F90_IFACE(hypre_boomeramggetmultadd, HYPRE_BOOMERAMGGETMULTADD)
+   ( hypre_F90_Obj *solver,
+     hypre_F90_Int *add_lvl,
+     hypre_F90_Int *ierr          )
+{
+   *ierr = (hypre_F90_Int)
+      ( HYPRE_BoomerAMGGetMultAdditive(
+           hypre_F90_PassObj (HYPRE_Solver, solver),
+           hypre_F90_PassIntRef (add_lvl) ) );
+}
+
 /*--------------------------------------------------------------------------
- * HYPRE_BoomerAMGSetSimple
+ * HYPRE_BoomerAMGSetSimple, HYPRE_BoomerAMGGetSimple
  *--------------------------------------------------------------------------*/
 
 void
@@ -1822,6 +1894,34 @@
            hypre_F90_PassInt (add_lvl) ) );
 }
 
+void
+hypre_F90_IFACE(hypre_boomeramggetsimple, HYPRE_BOOMERAMGGETSIMPLE)
+   ( hypre_F90_Obj *solver,
+     hypre_F90_Int *add_lvl,
+     hypre_F90_Int *ierr          )
+{
+   *ierr = (hypre_F90_Int)
+      ( HYPRE_BoomerAMGGetSimple(
+           hypre_F90_PassObj (HYPRE_Solver, solver),
+           hypre_F90_PassIntRef (add_lvl) ) );
+}
+
+/*--------------------------------------------------------------------------
+ * HYPRE_BoomerAMGSetAddLastLvl
+ *--------------------------------------------------------------------------*/
+
+void
+hypre_F90_IFACE(hypre_boomeramgsetaddlastlvl, HYPRE_BOOMERAMGSETADDLASTLVL)
+   ( hypre_F90_Obj *solver,
+     hypre_F90_Int *add_last_lvl,
+     hypre_F90_Int *ierr          )
+{
+   *ierr = (hypre_F90_Int)
+      ( HYPRE_BoomerAMGSetAddLastLvl(
+           hypre_F90_PassObj (HYPRE_Solver, solver),
+           hypre_F90_PassInt (add_last_lvl) ) );
+}
+
 /*--------------------------------------------------------------------------
  * HYPRE_BoomerAMGSetMultAddTruncFactor
  *--------------------------------------------------------------------------*/
diff -Nru hypre-2.11.2/src/parcsr_ls/gen_redcs_mat.c hypre-2.13.0/src/parcsr_ls/gen_redcs_mat.c
--- hypre-2.11.2/src/parcsr_ls/gen_redcs_mat.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/gen_redcs_mat.c	2017-10-20 17:42:22.000000000 +0000
@@ -495,7 +495,7 @@
       *new_comm_ptr = new_comm;
       return 0;
    }
-   ranks = hypre_CTAlloc(HYPRE_Int, new_num_procs+2);
+   ranks = hypre_HostCTAlloc(HYPRE_Int, new_num_procs+2);
    if (new_num_procs == 1)
    {
       if (participate) my_info = my_id;
@@ -503,8 +503,9 @@
    }
    else
    {
-      info = hypre_CTAlloc(HYPRE_Int, new_num_procs+2);
-      list_len = hypre_CTAlloc(HYPRE_Int, 1);
+      info = hypre_HostCTAlloc(HYPRE_Int, new_num_procs+2);
+      list_len = hypre_HostCTAlloc(HYPRE_Int, 1);
+
 
       if (participate) 
       {
@@ -522,8 +523,9 @@
       hypre_MPI_Allreduce(info, ranks, list_len[0], HYPRE_MPI_INT, hypre_MPI_MERGE, comm);
 
       hypre_MPI_Op_free (&hypre_MPI_MERGE);
-      hypre_TFree(list_len);
-      hypre_TFree(info);
+
+      hypre_HostTFree(list_len);
+      hypre_HostTFree(info);
    }
    hypre_MPI_Comm_size(comm,&num_procs);
    hypre_MPI_Comm_group(comm, &orig_group);
@@ -532,7 +534,7 @@
    hypre_MPI_Group_free(&new_group);
    hypre_MPI_Group_free(&orig_group);
 
-   hypre_TFree(ranks);
+   hypre_HostTFree(ranks);
    
    *new_comm_ptr = new_comm;
    
diff -Nru hypre-2.11.2/src/parcsr_ls/HYPRE_parcsr_amg.c hypre-2.13.0/src/parcsr_ls/HYPRE_parcsr_amg.c
--- hypre-2.11.2/src/parcsr_ls/HYPRE_parcsr_amg.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/HYPRE_parcsr_amg.c	2017-10-20 17:42:22.000000000 +0000
@@ -1466,7 +1466,7 @@
    return( hypre_BoomerAMGSetChebyOrder( (void *) solver, order ) );
 }
 /*--------------------------------------------------------------------------
- * HYPRE_BoomerAMGSetChebyEigRatio
+ * HYPRE_BoomerAMGSetChebyFraction
  *--------------------------------------------------------------------------*/
 
 HYPRE_Int
@@ -1475,10 +1475,43 @@
 {
    return( hypre_BoomerAMGSetChebyFraction( (void *) solver, ratio ) );
 }
+
 /*--------------------------------------------------------------------------
- * HYPRE_BoomerAMGSetInterpVectors
+ * HYPRE_BoomerAMGSetChebyScale
+ *--------------------------------------------------------------------------*/
+
+HYPRE_Int
+HYPRE_BoomerAMGSetChebyScale( HYPRE_Solver  solver,
+                                 HYPRE_Int     scale )
+{
+   return( hypre_BoomerAMGSetChebyScale( (void *) solver, scale ) );
+}
+
+/*--------------------------------------------------------------------------
+ * HYPRE_BoomerAMGSetChebyVariant
  *--------------------------------------------------------------------------*/
+
+HYPRE_Int
+HYPRE_BoomerAMGSetChebyVariant( HYPRE_Solver  solver,
+                                 HYPRE_Int     variant )
+{
+   return( hypre_BoomerAMGSetChebyVariant( (void *) solver, variant ) );
+}
+
+/*--------------------------------------------------------------------------
+ * HYPRE_BoomerAMGSetChebyEigEst
+ *--------------------------------------------------------------------------*/
+
+HYPRE_Int
+HYPRE_BoomerAMGSetChebyEigEst( HYPRE_Solver  solver,
+                                 HYPRE_Int     eig_est )
+{
+   return( hypre_BoomerAMGSetChebyEigEst( (void *) solver, eig_est ) );
+}
                                                                                                        
+/*--------------------------------------------------------------------------
+ * HYPRE_BoomerAMGSetInterpVectors
+ *--------------------------------------------------------------------------*/
 HYPRE_Int
 HYPRE_BoomerAMGSetInterpVectors (HYPRE_Solver solver, HYPRE_Int num_vectors,
                                  HYPRE_ParVector *vectors)
@@ -1604,6 +1637,17 @@
 }
 
 /*--------------------------------------------------------------------------
+ * HYPRE_BoomerAMGSetAddLastLvl
+ *--------------------------------------------------------------------------*/
+
+HYPRE_Int
+HYPRE_BoomerAMGSetAddLastLvl( HYPRE_Solver solver,
+                          HYPRE_Int          add_last_lvl  )
+{
+   return( hypre_BoomerAMGSetAddLastLvl( (void *) solver, add_last_lvl ) );
+}
+
+/*--------------------------------------------------------------------------
  * HYPRE_BoomerAMGSetNonGalerkinTol
  *--------------------------------------------------------------------------*/
 
@@ -1660,3 +1704,11 @@
    return (hypre_BoomerAMGSetKeepTranspose ( (void *) solver, keepTranspose ) );
 }
 
+HYPRE_Int
+HYPRE_BoomerAMGSetCpointsToKeep(HYPRE_Solver solver,
+				HYPRE_Int  cpt_coarse_level,
+				HYPRE_Int  num_cpt_coarse,
+				HYPRE_Int *cpt_coarse_index)
+{
+   return (hypre_BoomerAMGSetCpointsToKeep( (void *) solver, cpt_coarse_level, num_cpt_coarse, cpt_coarse_index));
+}
diff -Nru hypre-2.11.2/src/parcsr_ls/_hypre_parcsr_ls.h hypre-2.13.0/src/parcsr_ls/_hypre_parcsr_ls.h
--- hypre-2.11.2/src/parcsr_ls/_hypre_parcsr_ls.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/_hypre_parcsr_ls.h	2017-10-20 17:42:22.000000000 +0000
@@ -167,8 +167,13 @@
 
    HYPRE_Real          *max_eig_est;
    HYPRE_Real          *min_eig_est;
-   HYPRE_Int                  cheby_order;
+   HYPRE_Int           cheby_eig_est;
+   HYPRE_Int            cheby_order;
+   HYPRE_Int           cheby_variant;
+   HYPRE_Int           cheby_scale;
    HYPRE_Real           cheby_fraction;
+   HYPRE_Real         **cheby_ds;
+   HYPRE_Real         **cheby_coefs;
 
    /* data needed for non-Galerkin option */
    HYPRE_Int           nongalerk_num_tol;
@@ -240,18 +245,23 @@
    HYPRE_Int      additive;
    HYPRE_Int      mult_additive;
    HYPRE_Int      simple;
+   HYPRE_Int      add_last_lvl;
    HYPRE_Int      add_P_max_elmts;
    HYPRE_Real     add_trunc_factor;
    HYPRE_Int      add_rlx_type;
    HYPRE_Real     add_rlx_wt;
    hypre_ParCSRMatrix *Lambda;
+   hypre_ParCSRMatrix *Atilde;
    hypre_ParVector *Rtilde;
    hypre_ParVector *Xtilde;
    HYPRE_Real *D_inv;
 
    HYPRE_Int rap2;
    HYPRE_Int keepTranspose;
-
+/* information for preserving indexes as coarse grid points */
+	HYPRE_Int C_point_keep_level;
+	HYPRE_Int num_C_point_marker;
+	HYPRE_Int   **C_point_marker_array;
 } hypre_ParAMGData;
 
 /*--------------------------------------------------------------------------
@@ -363,7 +373,11 @@
 #define hypre_ParAMGDataMinEigEst(amg_data) ((amg_data)->min_eig_est)	
 #define hypre_ParAMGDataChebyOrder(amg_data) ((amg_data)->cheby_order)
 #define hypre_ParAMGDataChebyFraction(amg_data) ((amg_data)->cheby_fraction)
-
+#define hypre_ParAMGDataChebyEigEst(amg_data) ((amg_data)->cheby_eig_est)
+#define hypre_ParAMGDataChebyVariant(amg_data) ((amg_data)->cheby_variant)
+#define hypre_ParAMGDataChebyScale(amg_data) ((amg_data)->cheby_scale)
+#define hypre_ParAMGDataChebyDS(amg_data) ((amg_data)->cheby_ds)
+#define hypre_ParAMGDataChebyCoefs(amg_data) ((amg_data)->cheby_coefs)
 
 /* block */
 #define hypre_ParAMGDataABlockArray(amg_data) ((amg_data)->A_block_array)
@@ -437,11 +451,13 @@
 #define hypre_ParAMGDataAdditive(amg_data) ((amg_data)->additive)
 #define hypre_ParAMGDataMultAdditive(amg_data) ((amg_data)->mult_additive)
 #define hypre_ParAMGDataSimple(amg_data) ((amg_data)->simple)
+#define hypre_ParAMGDataAddLastLvl(amg_data) ((amg_data)->add_last_lvl)
 #define hypre_ParAMGDataMultAddPMaxElmts(amg_data) ((amg_data)->add_P_max_elmts)
 #define hypre_ParAMGDataMultAddTruncFactor(amg_data) ((amg_data)->add_trunc_factor)
 #define hypre_ParAMGDataAddRelaxType(amg_data) ((amg_data)->add_rlx_type)
 #define hypre_ParAMGDataAddRelaxWt(amg_data) ((amg_data)->add_rlx_wt)
 #define hypre_ParAMGDataLambda(amg_data) ((amg_data)->Lambda)
+#define hypre_ParAMGDataAtilde(amg_data) ((amg_data)->Atilde)
 #define hypre_ParAMGDataRtilde(amg_data) ((amg_data)->Rtilde)
 #define hypre_ParAMGDataXtilde(amg_data) ((amg_data)->Xtilde)
 #define hypre_ParAMGDataDinv(amg_data) ((amg_data)->D_inv)
@@ -454,6 +470,11 @@
 
 #define hypre_ParAMGDataRAP2(amg_data) ((amg_data)->rap2)
 #define hypre_ParAMGDataKeepTranspose(amg_data) ((amg_data)->keepTranspose)
+
+/*indeces for the dof which will keep coarsening to the coarse level */
+#define hypre_ParAMGDataCPointKeepMarkerArray(amg_data) ((amg_data)-> C_point_marker_array)
+#define hypre_ParAMGDataCPointKeepLevel(amg_data) ((amg_data)-> C_point_keep_level)
+#define hypre_ParAMGDataNumCPointKeep(amg_data) ((amg_data)-> num_C_point_marker)
 #endif
 
 
@@ -532,6 +553,7 @@
 HYPRE_Int hypre_AMGHybridSetNumSweeps ( void *AMGhybrid_vdata , HYPRE_Int num_sweeps );
 HYPRE_Int hypre_AMGHybridSetCycleNumSweeps ( void *AMGhybrid_vdata , HYPRE_Int num_sweeps , HYPRE_Int k );
 HYPRE_Int hypre_AMGHybridSetRelaxType ( void *AMGhybrid_vdata , HYPRE_Int relax_type );
+HYPRE_Int hypre_AMGHybridSetSplittingStrategy( void *AMGhybrid_vdata , HYPRE_Int splitting_strategy );
 HYPRE_Int hypre_AMGHybridSetCycleRelaxType ( void *AMGhybrid_vdata , HYPRE_Int relax_type , HYPRE_Int k );
 HYPRE_Int hypre_AMGHybridSetRelaxOrder ( void *AMGhybrid_vdata , HYPRE_Int relax_order );
 HYPRE_Int hypre_AMGHybridSetMaxCoarseSize ( void *AMGhybrid_vdata , HYPRE_Int max_coarse_size );
@@ -839,6 +861,9 @@
 HYPRE_Int HYPRE_BoomerAMGSetCoordinates ( HYPRE_Solver solver , float *coordinates );
 HYPRE_Int HYPRE_BoomerAMGSetChebyOrder ( HYPRE_Solver solver , HYPRE_Int order );
 HYPRE_Int HYPRE_BoomerAMGSetChebyFraction ( HYPRE_Solver solver , HYPRE_Real ratio );
+HYPRE_Int HYPRE_BoomerAMGSetChebyEigEst ( HYPRE_Solver solver , HYPRE_Int eig_est );
+HYPRE_Int HYPRE_BoomerAMGSetChebyVariant ( HYPRE_Solver solver , HYPRE_Int variant );
+HYPRE_Int HYPRE_BoomerAMGSetChebyScale ( HYPRE_Solver solver , HYPRE_Int scale );
 HYPRE_Int HYPRE_BoomerAMGSetInterpVectors ( HYPRE_Solver solver , HYPRE_Int num_vectors , HYPRE_ParVector *vectors );
 HYPRE_Int HYPRE_BoomerAMGSetInterpVecVariant ( HYPRE_Solver solver , HYPRE_Int num );
 HYPRE_Int HYPRE_BoomerAMGSetInterpVecQMax ( HYPRE_Solver solver , HYPRE_Int q_max );
@@ -852,11 +877,13 @@
 HYPRE_Int HYPRE_BoomerAMGGetMultAdditive ( HYPRE_Solver solver , HYPRE_Int *mult_additive );
 HYPRE_Int HYPRE_BoomerAMGSetSimple ( HYPRE_Solver solver , HYPRE_Int simple );
 HYPRE_Int HYPRE_BoomerAMGGetSimple ( HYPRE_Solver solver , HYPRE_Int *simple );
+HYPRE_Int HYPRE_BoomerAMGSetAddLastLvl ( HYPRE_Solver solver , HYPRE_Int add_last_lvl );
 HYPRE_Int HYPRE_BoomerAMGSetNonGalerkinTol ( HYPRE_Solver solver , HYPRE_Real nongalerkin_tol );
 HYPRE_Int HYPRE_BoomerAMGSetLevelNonGalerkinTol ( HYPRE_Solver solver , HYPRE_Real nongalerkin_tol , HYPRE_Int level );
 HYPRE_Int HYPRE_BoomerAMGSetNonGalerkTol ( HYPRE_Solver solver , HYPRE_Int nongalerk_num_tol , HYPRE_Real *nongalerk_tol );
 HYPRE_Int HYPRE_BoomerAMGSetRAP2 ( HYPRE_Solver solver , HYPRE_Int rap2 );
 HYPRE_Int HYPRE_BoomerAMGSetKeepTranspose ( HYPRE_Solver solver , HYPRE_Int keepTranspose );
+HYPRE_Int HYPRE_BoomerAMGSetCpointsToKeep( HYPRE_Solver solver, HYPRE_Int cpt_coarse_level, HYPRE_Int num_cpt_coarse,HYPRE_Int *cpt_coarse_index);
 
 /* HYPRE_parcsr_bicgstab.c */
 HYPRE_Int HYPRE_ParCSRBiCGSTABCreate ( MPI_Comm comm , HYPRE_Solver *solver );
@@ -1261,6 +1288,9 @@
 HYPRE_Int hypre_BoomerAMGSetEuBJ ( void *data , HYPRE_Int eu_bj );
 HYPRE_Int hypre_BoomerAMGSetChebyOrder ( void *data , HYPRE_Int order );
 HYPRE_Int hypre_BoomerAMGSetChebyFraction ( void *data , HYPRE_Real ratio );
+HYPRE_Int hypre_BoomerAMGSetChebyEigEst ( void *data , HYPRE_Int eig_est );
+HYPRE_Int hypre_BoomerAMGSetChebyVariant ( void *data , HYPRE_Int variant );
+HYPRE_Int hypre_BoomerAMGSetChebyScale ( void *data , HYPRE_Int scale );
 HYPRE_Int hypre_BoomerAMGSetInterpVectors ( void *solver , HYPRE_Int num_vectors , hypre_ParVector **interp_vectors );
 HYPRE_Int hypre_BoomerAMGSetInterpVecVariant ( void *solver , HYPRE_Int var );
 HYPRE_Int hypre_BoomerAMGSetInterpVecQMax ( void *data , HYPRE_Int q_max );
@@ -1274,11 +1304,13 @@
 HYPRE_Int hypre_BoomerAMGGetMultAdditive ( void *data , HYPRE_Int *mult_additive );
 HYPRE_Int hypre_BoomerAMGSetSimple ( void *data , HYPRE_Int simple );
 HYPRE_Int hypre_BoomerAMGGetSimple ( void *data , HYPRE_Int *simple );
+HYPRE_Int hypre_BoomerAMGSetAddLastLvl ( void *data , HYPRE_Int add_last_lvl );
 HYPRE_Int hypre_BoomerAMGSetNonGalerkinTol ( void *data , HYPRE_Real nongalerkin_tol );
 HYPRE_Int hypre_BoomerAMGSetLevelNonGalerkinTol ( void *data , HYPRE_Real nongalerkin_tol , HYPRE_Int level );
 HYPRE_Int hypre_BoomerAMGSetNonGalerkTol ( void *data , HYPRE_Int nongalerk_num_tol , HYPRE_Real *nongalerk_tol );
 HYPRE_Int hypre_BoomerAMGSetRAP2 ( void *data , HYPRE_Int rap2 );
 HYPRE_Int hypre_BoomerAMGSetKeepTranspose ( void *data , HYPRE_Int keepTranspose );
+HYPRE_Int hypre_BoomerAMGSetCpointsToKeep(void *data, HYPRE_Int cpt_coarse_level, HYPRE_Int  num_cpt_coarse, HYPRE_Int *cpt_coarse_index);
 
 /* par_amg_setup.c */
 HYPRE_Int hypre_BoomerAMGSetup ( void *amg_vdata , hypre_ParCSRMatrix *A , hypre_ParVector *f , hypre_ParVector *u );
@@ -1303,6 +1335,10 @@
 HYPRE_Int hypre_BoomerAMGCGRelaxWt ( void *amg_vdata , HYPRE_Int level , HYPRE_Int num_cg_sweeps , HYPRE_Real *rlx_wt_ptr );
 HYPRE_Int hypre_Bisection ( HYPRE_Int n , HYPRE_Real *diag , HYPRE_Real *offd , HYPRE_Real y , HYPRE_Real z , HYPRE_Real tol , HYPRE_Int k , HYPRE_Real *ev_ptr );
 
+/* par_cheby.c */
+HYPRE_Int hypre_ParCSRRelax_Cheby_Setup ( hypre_ParCSRMatrix *A , HYPRE_Real max_eig , HYPRE_Real min_eig , HYPRE_Real fraction , HYPRE_Int order , HYPRE_Int scale , HYPRE_Int variant , HYPRE_Real **coefs_ptr , HYPRE_Real **ds_ptr );
+HYPRE_Int hypre_ParCSRRelax_Cheby_Solve ( hypre_ParCSRMatrix *A , hypre_ParVector *f , HYPRE_Real *ds_data , HYPRE_Real *coefs , HYPRE_Int order , HYPRE_Int scale , HYPRE_Int variant , hypre_ParVector *u , hypre_ParVector *v , hypre_ParVector *r );
+
 /* par_coarsen.c */
 HYPRE_Int hypre_BoomerAMGCoarsen ( hypre_ParCSRMatrix *S , hypre_ParCSRMatrix *A , HYPRE_Int CF_init , HYPRE_Int debug_flag , HYPRE_Int **CF_marker_ptr );
 HYPRE_Int hypre_BoomerAMGCoarsenRuge ( hypre_ParCSRMatrix *S , hypre_ParCSRMatrix *A , HYPRE_Int measure_type , HYPRE_Int coarsen_type , HYPRE_Int debug_flag , HYPRE_Int **CF_marker_ptr );
@@ -1408,6 +1444,7 @@
 HYPRE_Int hypre_IntersectTwoArrays ( HYPRE_Int *x , HYPRE_Real *x_data , HYPRE_Int x_length , HYPRE_Int *y , HYPRE_Int y_length , HYPRE_Int *z , HYPRE_Real *output_x_data , HYPRE_Int *intersect_length );
 HYPRE_Int hypre_SortedCopyParCSRData ( hypre_ParCSRMatrix *A , hypre_ParCSRMatrix *B );
 HYPRE_Int hypre_BoomerAMG_MyCreateS ( hypre_ParCSRMatrix *A , HYPRE_Real strength_threshold , HYPRE_Real max_row_sum , HYPRE_Int num_functions , HYPRE_Int *dof_func , hypre_ParCSRMatrix **S_ptr );
+HYPRE_Int hypre_BoomerAMGCreateSFromCFMarker(hypre_ParCSRMatrix    *A, HYPRE_Real strength_threshold, HYPRE_Real max_row_sum, HYPRE_Int *CF_marker, HYPRE_Int SMRK, hypre_ParCSRMatrix    **S_ptr);
 HYPRE_Int hypre_NonGalerkinIJBufferInit ( HYPRE_Int *ijbuf_cnt , HYPRE_Int *ijbuf_rowcounter , HYPRE_Int *ijbuf_numcols );
 HYPRE_Int hypre_NonGalerkinIJBufferNewRow ( HYPRE_Int *ijbuf_rownums , HYPRE_Int *ijbuf_numcols , HYPRE_Int *ijbuf_rowcounter , HYPRE_Int new_row );
 HYPRE_Int hypre_NonGalerkinIJBufferCompressRow ( HYPRE_Int *ijbuf_cnt , HYPRE_Int ijbuf_rowcounter , HYPRE_Real *ijbuf_data , HYPRE_Int *ijbuf_cols , HYPRE_Int *ijbuf_rownums , HYPRE_Int *ijbuf_numcols );
@@ -1430,7 +1467,7 @@
 HYPRE_Int hypre_BoomerAMGRelax ( hypre_ParCSRMatrix *A , hypre_ParVector *f , HYPRE_Int *cf_marker , HYPRE_Int relax_type , HYPRE_Int relax_points , HYPRE_Real relax_weight , HYPRE_Real omega , HYPRE_Real *l1_norms , hypre_ParVector *u , hypre_ParVector *Vtemp , hypre_ParVector *Ztemp );
 HYPRE_Int hypre_GaussElimSetup ( hypre_ParAMGData *amg_data , HYPRE_Int level , HYPRE_Int relax_type );
 HYPRE_Int hypre_GaussElimSolve ( hypre_ParAMGData *amg_data , HYPRE_Int level , HYPRE_Int relax_type );
-HYPRE_Int gselim ( HYPRE_Real *A , HYPRE_Real *x , HYPRE_Int n );
+HYPRE_CUDA_GLOBAL HYPRE_Int gselim ( HYPRE_Real *A , HYPRE_Real *x , HYPRE_Int n );
 
 /* par_relax_interface.c */
 HYPRE_Int hypre_BoomerAMGRelaxIF ( hypre_ParCSRMatrix *A , hypre_ParVector *f , HYPRE_Int *cf_marker , HYPRE_Int relax_type , HYPRE_Int relax_order , HYPRE_Int cycle_type , HYPRE_Real relax_weight , HYPRE_Real omega , HYPRE_Real *l1_norms , hypre_ParVector *u , hypre_ParVector *Vtemp , hypre_ParVector *Ztemp );
@@ -1509,6 +1546,19 @@
 HYPRE_Real rfun ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz );
 HYPRE_Real bndfun ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz );
 
+/* par_vardifconv_rs.c */
+HYPRE_ParCSRMatrix GenerateRSVarDifConv ( MPI_Comm comm , HYPRE_Int nx , HYPRE_Int ny , HYPRE_Int nz , HYPRE_Int P , HYPRE_Int Q , HYPRE_Int R , HYPRE_Int p , HYPRE_Int q , HYPRE_Int r , HYPRE_Real eps , HYPRE_ParVector *rhs_ptr, HYPRE_Int type );
+HYPRE_Real afun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz );
+HYPRE_Real bfun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz );
+HYPRE_Real cfun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz );
+HYPRE_Real dfun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz );
+HYPRE_Real efun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz );
+HYPRE_Real ffun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz );
+HYPRE_Real gfun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz );
+HYPRE_Real rfun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz );
+HYPRE_Real bndfun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz );
+
+
 /* pcg_par.c */
 char *hypre_ParKrylovCAlloc ( HYPRE_Int count , HYPRE_Int elt_size );
 HYPRE_Int hypre_ParKrylovFree ( char *ptr );
@@ -1552,6 +1602,59 @@
 HYPRE_Int hypre_ParGenerateScale ( hypre_ParCSRMatrix *A , hypre_CSRMatrix *domain_structure , HYPRE_Real relaxation_weight , HYPRE_Real **scale_pointer );
 HYPRE_Int hypre_ParGenerateHybridScale ( hypre_ParCSRMatrix *A , hypre_CSRMatrix *domain_structure , hypre_CSRMatrix **A_boundary_pointer , HYPRE_Real **scale_pointer );
 
+/* par_mgr.c */
+void *hypre_MGRCreate ( void );
+HYPRE_Int hypre_MGRDestroy ( void *mgr_vdata );
+HYPRE_Int hypre_MGRCycle( void *mgr_vdata, hypre_ParVector **F_array, hypre_ParVector **U_array );
+void *hypre_MGRCreateFrelaxVcycleData();
+HYPRE_Int hypre_MGRDestroyFrelaxVcycleData( void *mgr_vdata );
+HYPRE_Int hypre_MGRSetupFrelaxVcycleData( void *mgr_vdata, hypre_ParCSRMatrix *A, hypre_ParVector *f, hypre_ParVector *u, HYPRE_Int level);
+HYPRE_Int hypre_MGRFrelaxVcycle ( void *mgr_vdata );
+HYPRE_Int hypre_MGRSetCpointsByBlock( void *mgr_vdata, HYPRE_Int  block_size, HYPRE_Int  max_num_levels, HYPRE_Int *block_num_coarse_points, HYPRE_Int  **block_coarse_indexes);
+HYPRE_Int hypre_MGRCoarsen(hypre_ParCSRMatrix *S,  hypre_ParCSRMatrix *A,HYPRE_Int final_coarse_size,HYPRE_Int *final_coarse_indexes,HYPRE_Int debug_flag,HYPRE_Int **CF_marker,HYPRE_Int last_level);
+HYPRE_Int hypre_MGRSetReservedCoarseNodes(void      *mgr_vdata, HYPRE_Int reserved_coarse_size, HYPRE_Int *reserved_coarse_nodes);
+HYPRE_Int hypre_MGRSetMaxGlobalsmoothIters( void *mgr_vdata, HYPRE_Int max_iter );
+HYPRE_Int hypre_MGRSetGlobalsmoothType( void *mgr_vdata, HYPRE_Int iter_type );
+HYPRE_Int hypre_MGRSetNonCpointsToFpoints( void      *mgr_vdata, HYPRE_Int nonCptToFptFlag);
+
+//HYPRE_Int hypre_MGRInitCFMarker(HYPRE_Int num_variables, HYPRE_Int *CF_marker, HYPRE_Int initial_coarse_size,HYPRE_Int *initial_coarse_indexes);
+//HYPRE_Int hypre_MGRUpdateCoarseIndexes(HYPRE_Int num_variables, HYPRE_Int *CF_marker, HYPRE_Int initial_coarse_size,HYPRE_Int *initial_coarse_indexes);
+HYPRE_Int hypre_MGRBuildInterp(hypre_ParCSRMatrix *A, HYPRE_Int *CF_marker, hypre_ParCSRMatrix *S, HYPRE_Int *num_cpts_global, HYPRE_Int num_functions, HYPRE_Int *dof_func, HYPRE_Int debug_flag, HYPRE_Real trunc_factor, HYPRE_Int max_elmts, HYPRE_Int *col_offd_S_to_A, hypre_ParCSRMatrix  **P, HYPRE_Int last_level, HYPRE_Int level, HYPRE_Int numsweeps);
+//HYPRE_Int hypre_MGRBuildRestrictionToper(hypre_ParCSRMatrix *AT, HYPRE_Int *CF_marker, hypre_ParCSRMatrix *ST, HYPRE_Int *num_cpts_global,HYPRE_Int num_functions,HYPRE_Int *dof_func,HYPRE_Int debug_flag,HYPRE_Real trunc_factor, HYPRE_Int max_elmts, HYPRE_Int  *col_offd_ST_to_AT,hypre_ParCSRMatrix  **RT,HYPRE_Int last_level,HYPRE_Int level, HYPRE_Int numsweeps);
+//HYPRE_Int hypre_BoomerAMGBuildInjectionInterp( hypre_ParCSRMatrix   *A, HYPRE_Int *CF_marker, HYPRE_Int *num_cpts_global, HYPRE_Int num_functions, HYPRE_Int debug_flag,HYPRE_Int init_data,hypre_ParCSRMatrix  **P_ptr);
+HYPRE_Int hypre_MGRSetCoarseSolver( void  *mgr_vdata, HYPRE_Int  (*coarse_grid_solver_solve)(void*,void*,void*,void*), HYPRE_Int  (*coarse_grid_solver_setup)(void*,void*,void*,void*), void  *coarse_grid_solver );
+HYPRE_Int hypre_MGRSetup( void *mgr_vdata, hypre_ParCSRMatrix *A, hypre_ParVector    *f, hypre_ParVector    *u );
+HYPRE_Int hypre_MGRSolve( void *mgr_vdata, hypre_ParCSRMatrix *A, hypre_ParVector *f, hypre_ParVector  *u );
+HYPRE_Int hypre_block_jacobi_scaling(hypre_ParCSRMatrix *A,hypre_ParCSRMatrix **B_ptr,void               *mgr_vdata,HYPRE_Int             debug_flag);
+HYPRE_Int hypre_block_jacobi (hypre_ParCSRMatrix *A,hypre_ParVector    *f,hypre_ParVector    *u,HYPRE_Real         blk_size,HYPRE_Int           n_block,HYPRE_Int           left_size,HYPRE_Real *diaginv,hypre_ParVector    *Vtemp);
+HYPRE_Int hypre_blockRelax_setup(hypre_ParCSRMatrix *A,HYPRE_Int blk_size, HYPRE_Int reserved_coarse_size, HYPRE_Real **diaginvptr);
+HYPRE_Int hypre_blockRelax(hypre_ParCSRMatrix *A,hypre_ParVector *f,hypre_ParVector *u,HYPRE_Int blk_size,HYPRE_Int reserved_coarse_size,hypre_ParVector *Vtemp,hypre_ParVector *Ztemp);
+
+HYPRE_Int hypre_MGRBuildAff( MPI_Comm comm, HYPRE_Int local_num_variables, HYPRE_Int num_functions, 
+HYPRE_Int *dof_func, HYPRE_Int *CF_marker, HYPRE_Int **coarse_dof_func_ptr, HYPRE_Int **coarse_pnts_global_ptr,
+hypre_ParCSRMatrix *A, HYPRE_Int debug_flag, hypre_ParCSRMatrix **P_f_ptr, hypre_ParCSRMatrix **A_ff_ptr );
+	
+HYPRE_Int hypre_MGRSetAffSolverType( void *systg_vdata, HYPRE_Int *aff_solver_type );	
+HYPRE_Int hypre_MGRSetCoarseSolverType( void *systg_vdata, HYPRE_Int coarse_solver_type );
+HYPRE_Int hypre_MGRSetCoarseSolverIter( void *systg_vdata, HYPRE_Int coarse_solver_iter );
+HYPRE_Int hypre_MGRSetFineSolverIter( void *systg_vdata, HYPRE_Int fine_solver_iter );
+HYPRE_Int hypre_MGRSetFineSolverMaxLevels( void *systg_vdata, HYPRE_Int fine_solver_max_levels );
+HYPRE_Int hypre_MGRSetMaxCoarseLevels( void *mgr_vdata, HYPRE_Int maxlev );
+HYPRE_Int hypre_MGRSetBlockSize( void *mgr_vdata, HYPRE_Int bsize );
+HYPRE_Int hypre_MGRSetRelaxType( void *mgr_vdata, HYPRE_Int relax_type );
+HYPRE_Int hypre_MGRSetFRelaxMethod( void *mgr_vdata, HYPRE_Int relax_method);
+HYPRE_Int hypre_MGRSetRestrictType( void *mgr_vdata, HYPRE_Int interpType);
+HYPRE_Int hypre_MGRSetInterpType( void *mgr_vdata, HYPRE_Int interpType);
+HYPRE_Int hypre_MGRSetNumRelaxSweeps( void *mgr_vdata, HYPRE_Int nsweeps );
+HYPRE_Int hypre_MGRSetNumInterpSweeps( void *mgr_vdata, HYPRE_Int nsweeps );
+HYPRE_Int hypre_MGRSetPrintLevel( void *mgr_vdata, HYPRE_Int print_level );
+HYPRE_Int hypre_MGRSetLogging( void *mgr_vdata, HYPRE_Int logging );
+HYPRE_Int hypre_MGRSetMaxIter( void *mgr_vdata, HYPRE_Int max_iter );
+HYPRE_Int hypre_MGRSetTol( void *mgr_vdata, HYPRE_Real tol );
+// Accessor functions
+HYPRE_Int hypre_MGRGetNumIterations( void *mgr_vdata, HYPRE_Int *num_iterations );
+HYPRE_Int hypre_MGRGetFinalRelativeResidualNorm( void *mgr_vdata, HYPRE_Real *res_norm );
+	
 #ifdef __cplusplus
 }
 #endif
diff -Nru hypre-2.11.2/src/parcsr_ls/HYPRE_parcsr_ls.h hypre-2.13.0/src/parcsr_ls/HYPRE_parcsr_ls.h
--- hypre-2.11.2/src/parcsr_ls/HYPRE_parcsr_ls.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/HYPRE_parcsr_ls.h	2017-10-20 17:42:22.000000000 +0000
@@ -274,7 +274,7 @@
  * \hline
  * \end{tabular}
  * 
- * The default is 6. 
+ * The default is 10. 
  **/
 HYPRE_Int HYPRE_BoomerAMGSetCoarsenType(HYPRE_Solver solver,
                                         HYPRE_Int    coarsen_type);
@@ -406,21 +406,21 @@
  * \hline
  * \end{tabular}
  * 
- * The default is 0. 
+ * The default is ext+i interpolation (interp_type 6) trunctated to at most 4 \\
+ * elements per row. (see HYPRE_BoomerAMGSetPMaxElmts).
  **/
 HYPRE_Int HYPRE_BoomerAMGSetInterpType(HYPRE_Solver solver,
                                        HYPRE_Int    interp_type);
 
 /**
- * (Optional) Defines a truncation factor for the interpolation.
- * The default is 0.
+ * (Optional) Defines a truncation factor for the interpolation. The default is 0.
  **/
 HYPRE_Int HYPRE_BoomerAMGSetTruncFactor(HYPRE_Solver solver,
                                         HYPRE_Real   trunc_factor);
 
 /**
  * (Optional) Defines the maximal number of elements per row for the interpolation.
- * The default is 0.
+ * The default is 4. To turn off truncation, it needs to be set to 0.
  **/
 HYPRE_Int HYPRE_BoomerAMGSetPMaxElmts(HYPRE_Solver solver,
                                       HYPRE_Int    P_max_elmts);
@@ -582,6 +582,16 @@
                                    HYPRE_Int    addlvl);
 
 /**
+ * (Optional) Defines last level where additive, mult-additive
+ * or simple cycle is used.
+ * The multiplicative approach is used on levels > add_last_lvl.
+ * 
+ * Can only be used when AMG is used as a preconditioner !!!
+ **/
+HYPRE_Int HYPRE_BoomerAMGSetAddLastLvl(HYPRE_Solver solver,
+                                     HYPRE_Int    add_last_lvl);
+
+/**
  * (Optional) Defines the truncation factor for the 
  * smoothed interpolation used for mult-additive or simple method.
  * The default is 0.
@@ -676,7 +686,8 @@
  * (Optional) Defines the smoother to be used. It uses the given
  * smoother on the fine grid, the up and 
  * the down cycle and sets the solver on the coarsest level to Gaussian
- * elimination (9). The default is Gauss-Seidel (3).
+ * elimination (9). The default is $\ell_1$-Gauss-Seidel, forward solve (13)
+ * on the down cycle and backward solve (14) on the up cycle.
  *
  * There are the following options for relax\_type:
  *
@@ -690,6 +701,8 @@
  * 6 & hybrid symmetric Gauss-Seidel or SSOR \\
  * 8 & $\ell_1$-scaled hybrid symmetric Gauss-Seidel\\
  * 9 & Gaussian elimination (only on coarsest level) \\
+ * 13 & $\ell_1$ Gauss-Seidel, forward solve \\
+ * 14 & $\ell_1$ Gauss-Seidel, backward solve \\
  * 15 & CG (warning - not a fixed smoother - may require FGMRES)\\
  * 16 & Chebyshev\\
  * 17 & FCF-Jacobi\\
@@ -733,7 +746,7 @@
  * \hline
  * \end{tabular}
  *
- * The default is 1 (CF-relaxation).
+ * The default is 0. 
  **/
 HYPRE_Int HYPRE_BoomerAMGSetRelaxOrder(HYPRE_Solver  solver,
                                        HYPRE_Int     relax_order);
@@ -838,6 +851,31 @@
 HYPRE_Int HYPRE_BoomerAMGSetChebyFraction (HYPRE_Solver solver,
                                            HYPRE_Real   ratio);
 
+/*
+ * (Optional) Defines whether matrix should be scaled.
+ *  The default is 1 (i.e., scaled).
+ **/
+HYPRE_Int HYPRE_BoomerAMGSetChebyScale (HYPRE_Solver solver,
+                                           HYPRE_Int   scale);
+/*
+ * (Optional) Defines which polynomial variant should be used.
+ *  The default is 0 (i.e., scaled).
+ **/
+HYPRE_Int HYPRE_BoomerAMGSetChebyVariant (HYPRE_Solver solver,
+                                           HYPRE_Int   variant);
+
+/*
+ * (Optional) Defines how to estimate eigenvalues.
+ *  The default is 10 (i.e., 10 CG iterations are used to find extreme 
+ *  eigenvalues.) If eig_est=0, the largest eigenvalue is estimated
+ *  using Gershgorin, the smallest is set to 0.
+ *  If eig_est is a positive number n, n iterations of CG are used to
+ *  determine the smallest and largest eigenvalue.
+ **/
+HYPRE_Int HYPRE_BoomerAMGSetChebyEigEst (HYPRE_Solver solver,
+                                           HYPRE_Int   eig_est);
+
+
 /**
  * (Optional) Enables the use of more complex smoothers.
  * The following options exist for smooth\_type:
@@ -1107,6 +1145,19 @@
 HYPRE_Int HYPRE_BoomerAMGSetCoordinates (HYPRE_Solver  solver,
                                          float        *coordinates);
 
+/**
+ * (Optional) Fix C points to be kept till a specified coarse level.
+ *
+ * @param solver [IN] solver or preconditioner
+ * @param cpt_coarse_level [IN] coarse level up to which to keep C points
+ * @param num_cpt_coarse [IN] number of C points to be kept
+ * @param cpt_coarse_index [IN] indexes of C points to be kept
+ **/
+HYPRE_Int HYPRE_BoomerAMGSetCpointsToKeep(HYPRE_Solver solver,
+				HYPRE_Int  cpt_coarse_level,
+				HYPRE_Int  num_cpt_coarse,
+				HYPRE_Int *cpt_coarse_index);
+
 /*@}*/
 
 /*--------------------------------------------------------------------------
@@ -2578,16 +2629,18 @@
                                        HYPRE_Int    logging);
 
 /**
- * Set print level (default: 0, no printing).
+ * Set print level (default: 0, no printing)
+ * 2 will print residual norms per iteration
+ * 10 will print AMG setup information if AMG is used
+ * 12 both Setup information and iterations.
  **/
 HYPRE_Int HYPRE_ParCSRHybridSetPrintLevel(HYPRE_Solver solver,
                                           HYPRE_Int    print_level);
 
 /**
  * (Optional) Sets AMG strength threshold. The default is 0.25.
- * For 2d Laplace operators, 0.25 is a good value, for 3d Laplace
- * operators, 0.5 or 0.6 is a better value. For elasticity problems,
- * a large strength threshold, such as 0.9, is often better.
+ * For elasticity problems, a larger strength threshold, such as 0.7 or 0.8, 
+ * is often better.
  **/
 HYPRE_Int
 HYPRE_ParCSRHybridSetStrongThreshold(HYPRE_Solver solver,
@@ -2655,7 +2708,7 @@
  * \hline
  * \end{tabular}
  *
- * The default is 6.
+ * The default is 10.
  **/
 HYPRE_Int
 HYPRE_ParCSRHybridSetCoarsenType(HYPRE_Solver solver,
@@ -2663,7 +2716,7 @@
 
 /*
  * (Optional) Specifies which interpolation operator is used
- * The default is modified ''classical" interpolation.
+ * The default is ext+i interpolation truncated to at most 4 elements per row.
  **/
 HYPRE_Int
 HYPRE_ParCSRHybridSetInterpType(HYPRE_Solver solver,
@@ -2721,7 +2774,8 @@
  * (Optional) Defines the smoother to be used. It uses the given
  * smoother on the fine grid, the up and
  * the down cycle and sets the solver on the coarsest level to Gaussian
- * elimination (9). The default is Gauss-Seidel (3).
+ * elimination (9). The default is l1-Gauss-Seidel, forward solve on the down
+ * cycle (13) and backward solve on the up cycle (14).
  *
  * There are the following options for relax\_type:
  *
@@ -2731,8 +2785,11 @@
  * 2 &  Gauss-Seidel, interior points in parallel, boundary sequential (slow!) \\
  * 3 &  hybrid Gauss-Seidel or SOR, forward solve \\
  * 4 &  hybrid Gauss-Seidel or SOR, backward solve \\
- * 5 &  hybrid chaotic Gauss-Seidel (works only with OpenMP) \\
  * 6 &  hybrid symmetric Gauss-Seidel or SSOR \\
+ * 8 &  hybrid symmetric l1-Gauss-Seidel or SSOR \\
+ * 13 &  l1-Gauss-Seidel, forward solve \\
+ * 14 &  l1-Gauss-Seidel, backward solve \\
+ * 18 &  l1-Jacobi \\
  * 9 &  Gaussian elimination (only on coarsest level) \\
  * \hline
  * \end{tabular}
@@ -2775,7 +2832,7 @@
  * \hline
  * \end{tabular}
  *
- * The default is 1 (CF-relaxation).
+ * The default is 0 (CF-relaxation).
  **/
 HYPRE_Int
 HYPRE_ParCSRHybridSetRelaxOrder(HYPRE_Solver solver,
@@ -3056,6 +3113,353 @@
                                                        HYPRE_Real   *norm);
 
 /*--------------------------------------------------------------------------
+ *--------------------------------------------------------------------------*/
+	
+/**
+ * @name ParCSR MGR Solver
+ *
+ * Parallel multigrid reduction solver and preconditioner. 
+ * This solver or preconditioner is designed with systems of 
+ * PDEs in mind. However, it can also be used for scalar linear 
+ * systems, particularly for problems where the user can exploit 
+ * information from the physics of the problem. In this way, the 
+ * MGR solver could potentially be used as a foundation 
+ * for a physics-based preconditioner. 
+ **/                                                       
+/*@{*/
+
+/**
+ * Create a solver object
+ **/
+HYPRE_Int HYPRE_MGRCreate( HYPRE_Solver *solver );
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRDestroy
+ *--------------------------------------------------------------------------*/
+/**
+ * Destroy a solver object
+ **/
+HYPRE_Int HYPRE_MGRDestroy( HYPRE_Solver solver );
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetup
+ *--------------------------------------------------------------------------*/
+/**
+ * Setup the MGR solver or preconditioner.
+ * If used as a preconditioner, this function should be passed
+ * to the iterative solver {\tt SetPrecond} function.  
+ *
+ * @param solver [IN] object to be set up.
+ * @param A [IN] ParCSR matrix used to construct the solver/preconditioner.
+ * @param b right-hand-side of the linear system to be solved (Ignored by this function).
+ * @param x approximate solution of the linear system to be solved (Ignored by this function).  
+ **/
+HYPRE_Int HYPRE_MGRSetup( HYPRE_Solver solver,
+                         HYPRE_ParCSRMatrix A,
+                         HYPRE_ParVector b,
+                         HYPRE_ParVector x      );
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSolve
+ *--------------------------------------------------------------------------*/
+ /**
+ * Solve the system or apply MGR as a preconditioner.
+ * If used as a preconditioner, this function should be passed
+ * to the iterative solver {\tt SetPrecond} function.
+ *
+ * @param solver [IN] solver or preconditioner object to be applied.
+ * @param A [IN] ParCSR matrix, matrix of the linear system to be solved
+ * @param b [IN] right hand side of the linear system to be solved
+ * @param x [OUT] approximated solution of the linear system to be solved
+ **/
+HYPRE_Int HYPRE_MGRSolve( HYPRE_Solver solver,
+                         HYPRE_ParCSRMatrix A,
+                         HYPRE_ParVector b,
+                         HYPRE_ParVector x      );
+
+/*--------------------------------------------------------------------------
+ * HYPRE_Int HYPRE_MGRSetCpointsByBlock
+ *--------------------------------------------------------------------------*/
+/**
+ * Set the block data and prescribe the coarse indexes per block 
+ * for each reduction level.
+ *
+ * @param solver [IN] solver or preconditioner object
+ * @param block_size [IN] system block size
+ * @param max_num_levels [IN] maximum number of reduction levels
+ * @param num_block_coarse_points [IN] number of coarse points per block per level
+ * @param block_coarse_indexes [IN] index for each block coarse point per level
+ **/
+HYPRE_Int HYPRE_MGRSetCpointsByBlock( HYPRE_Solver solver,
+                         HYPRE_Int  block_size, 
+                         HYPRE_Int max_num_levels,
+                         HYPRE_Int *num_block_coarse_points, 
+                         HYPRE_Int  **block_coarse_indexes);
+
+/**
+ * (Optional) Set non C-points to F-points.
+ * This routine determines how the coarse points are selected for the next level
+ * reduction. Options for {\tt nonCptToFptFlag} are:
+ * 
+ * \begin{tabular}{|c|l|} \hline
+ * 0 & Allow points not prescribed as C points to be potentially set as C points \\
+ * & using classical AMG coarsening strategies (currently uses CLJP-coarsening). \\
+ * 1 & Fix points not prescribed as C points to be F points for the next reduction \\
+ * \hline
+ * \end{tabular}
+ *
+ **/                         
+HYPRE_Int
+HYPRE_MGRSetNonCpointsToFpoints( HYPRE_Solver solver, HYPRE_Int nonCptToFptFlag);
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetMaxCoarseLevels
+ *--------------------------------------------------------------------------*/
+/**
+ * (Optional) Set maximum number of coarsening (or reduction) levels.
+ * The default is 10.
+ **/
+HYPRE_Int
+HYPRE_MGRSetMaxCoarseLevels( HYPRE_Solver solver, HYPRE_Int maxlev );
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetBlockSize
+ *--------------------------------------------------------------------------*/
+/**
+ * (Optional) Set the system block size.
+ * This should match the block size set in the MGRSetCpointsByBlock function.
+ * The default is 1.
+ **/
+HYPRE_Int
+HYPRE_MGRSetBlockSize( HYPRE_Solver solver, HYPRE_Int bsize );
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetReservedCoarseNodes
+ *--------------------------------------------------------------------------*/
+/**
+ * (Optional) Defines indexes of coarse nodes to be kept to the coarsest level.
+ * These indexes are passed down through the MGR hierarchy to the coarsest grid 
+ * of the coarse grid (BoomerAMG) solver.
+ *
+ * @param solver [IN] solver or preconditioner object
+ * @param reserved_coarse_size [IN] number of reserved coarse points
+ * @param reserved_coarse_nodes [IN] (global) indexes of reserved coarse points
+ **/
+HYPRE_Int
+HYPRE_MGRSetReservedCoarseNodes( HYPRE_Solver solver, HYPRE_Int reserved_coarse_size, HYPRE_Int *reserved_coarse_nodes );
+	
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetRelaxType
+ *--------------------------------------------------------------------------*/
+/**
+ * (Optional) Set the relaxation type for F-relaxation.
+ * Currently supports the following flavors of relaxation types
+ * as described in the {\tt BoomerAMGSetRelaxType}: 
+ * relax\_types 0 - 8, 13, 14, 18, 19, 98.
+ **/
+HYPRE_Int
+HYPRE_MGRSetRelaxType(HYPRE_Solver solver, HYPRE_Int relax_type );
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetRelaxMethod
+ *--------------------------------------------------------------------------*/
+/**
+ * (Optional) Set the strategy for F-relaxation.
+ * Options for {\tt relax\_method} are:
+ *
+ * \begin{tabular}{|c|l|} \hline
+ * 0 & Single-level relaxation sweeps for F-relaxation as prescribed by {\tt MGRSetRelaxType} \\
+ * 1 & Multi-level relaxation strategy for F-relaxation (V(1,0) cycle currently supported). \\
+ * \hline
+ * \end{tabular} 
+ **/
+HYPRE_Int
+HYPRE_MGRSetFRelaxMethod(HYPRE_Solver solver, HYPRE_Int relax_method );
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetRestrictType
+ *--------------------------------------------------------------------------*/
+/** 
+ * (Optional) Set the strategy for computing the MGR restriction operator. 
+ *
+ * Options for {\tt restrict\_type} are:
+ *
+ * \begin{tabular}{|c|l|} \hline
+ * 0 & injection $[0  I]$ \\
+ * 1 & unscaled (not recommended) \\
+ * 2 & diagonal scaling (Jacobi) \\
+ * else & use classical modified interpolation \\
+ * \hline
+ * \end{tabular} 
+ *
+ * These options are currently active for the last stage reduction. Intermediate 
+ * reduction levels use injection. The default is injection.
+ **/
+HYPRE_Int
+HYPRE_MGRSetRestrictType( HYPRE_Solver solver, HYPRE_Int restrict_type);
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetInterpType
+ *--------------------------------------------------------------------------*/
+/** 
+ * (Optional) Set the strategy for computing the MGR restriction operator. 
+ * Options for {\tt interp\_type} are:
+ *
+ * \begin{tabular}{|c|l|} \hline
+ * 0 & injection $[0  I]^{T}$ \\
+ * 1 & unscaled (not recommended) \\
+ * 2 & diagonal scaling (Jacobi) \\
+ * else & use default (classical modified interpolation) \\
+ * \hline
+ * \end{tabular} 
+ *
+ * These options are currently active for the last stage reduction. Intermediate 
+ * reduction levels use diagonal scaling.
+ **/
+HYPRE_Int
+HYPRE_MGRSetInterpType( HYPRE_Solver solver, HYPRE_Int interp_type );
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetNumRelaxSweeps
+ *--------------------------------------------------------------------------*/
+/**
+ * (Optional) Set number of relaxation sweeps.
+ * This option is for the `single level' F-relaxation (relax\_method = 0).
+ **/ 
+HYPRE_Int
+HYPRE_MGRSetNumRelaxSweeps( HYPRE_Solver solver, HYPRE_Int nsweeps );
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetNumInterpSweeps
+ *--------------------------------------------------------------------------*/
+/**
+ * (Optional) Set number of interpolation sweeps.
+ * This option is for interp\_type > 2.
+ **/
+HYPRE_Int
+HYPRE_MGRSetNumInterpSweeps( HYPRE_Solver solver, HYPRE_Int nsweeps );                         
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetCoarseSolver
+ *--------------------------------------------------------------------------*/
+/**
+ * (Optional) Set the coarse grid solver.
+ * Currently uses BoomerAMG.
+ * The default, if not set, is BoomerAMG with default options.
+ *
+ * @param solver [IN] solver or preconditioner object
+ * @param coarse_grid_solver_solve [IN] solve routine for BoomerAMG
+ * @param coarse_grid_solver_setup [IN] setup routine for BoomerAMG
+ * @param coarse_grid_solver [IN] BoomerAMG solver 
+ **/
+HYPRE_Int HYPRE_MGRSetCoarseSolver(HYPRE_Solver          solver,
+                             HYPRE_PtrToParSolverFcn  coarse_grid_solver_solve,
+                             HYPRE_PtrToParSolverFcn  coarse_grid_solver_setup,
+                             HYPRE_Solver          coarse_grid_solver );
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetPrintLevel
+ *--------------------------------------------------------------------------*/
+/**
+ * (Optional) Set the print level to print setup and solve information.
+ *
+ * \begin{tabular}{|c|l|} \hline
+ * 0 & no printout (default) \\
+ * 1 & print setup information \\
+ * 2 & print solve information \\
+ * 3 & print both setup and solve information \\
+ * \hline
+ * \end{tabular} 
+ **/
+HYPRE_Int
+HYPRE_MGRSetPrintLevel( HYPRE_Solver solver, HYPRE_Int print_level );
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetLogging
+ *--------------------------------------------------------------------------*/
+/**
+ * (Optional) Requests logging of solver diagnostics.
+ * Requests additional computations for diagnostic and similar
+ * data to be logged by the user. Default to 0 for do nothing.  The latest
+ * residual will be available if logging > 1.
+ **/
+HYPRE_Int
+HYPRE_MGRSetLogging( HYPRE_Solver solver, HYPRE_Int logging );
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetMaxIter
+ *--------------------------------------------------------------------------*/
+/**
+ * (Optional) Set maximum number of iterations if used as a solver.
+ * Set this to 1 if MGR is used as a preconditioner. The default is 20.
+ **/
+HYPRE_Int
+HYPRE_MGRSetMaxIter( HYPRE_Solver solver, HYPRE_Int max_iter );
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetTol
+ *--------------------------------------------------------------------------*/
+/**
+ * (Optional) Set the convergence tolerance for the MGR solver.
+ * Use tol = 0.0 if MGR is used as a preconditioner. The default is 1.e-7. 
+ **/
+HYPRE_Int
+HYPRE_MGRSetTol( HYPRE_Solver solver, HYPRE_Real tol );
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetMaxGlobalsmoothIters
+ *--------------------------------------------------------------------------*/
+/**
+ * (Optional) Determines how many sweeps of global smoothing to do.
+ * Default is 0 (no global smoothing).
+ **/
+HYPRE_Int
+HYPRE_MGRSetMaxGlobalsmoothIters( HYPRE_Solver solver, HYPRE_Int smooth_iter );
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetGlobalsmoothType
+ *--------------------------------------------------------------------------*/	
+/**
+ * (Optional) Determines type of global smoother.
+ * Options for {\tt smooth\_type} are:
+ *
+ * \begin{tabular}{|c|l|} \hline
+ * 0 & block Jacobi (default) \\
+ * 1 & Jacobi \\
+ * 2 & Gauss-Seidel, sequential (very slow!) \\
+ * 3 & Gauss-Seidel, interior points in parallel, boundary sequential (slow!) \\
+ * 4 & hybrid Gauss-Seidel or SOR, forward solve \\
+ * 5 & hybrid Gauss-Seidel or SOR, backward solve \\
+ * 6 & hybrid chaotic Gauss-Seidel (works only with OpenMP) \\
+ * 7 & hybrid symmetric Gauss-Seidel or SSOR \\
+ * 8 & Euclid (ILU) \\
+ * \hline
+ * \end{tabular} 
+ **/
+HYPRE_Int
+HYPRE_MGRSetGlobalsmoothType( HYPRE_Solver solver, HYPRE_Int smooth_type );
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRGetNumIterations
+ *--------------------------------------------------------------------------*/
+/**
+ * (Optional) Return the number of MGR iterations.
+ **/
+HYPRE_Int
+HYPRE_MGRGetNumIterations( HYPRE_Solver solver, HYPRE_Int *num_iterations );
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRGetResidualNorm
+ *--------------------------------------------------------------------------*/
+/**
+ * (Optional) Return the norm of the final relative residual.
+ **/
+HYPRE_Int
+HYPRE_MGRGetFinalRelativeResidualNorm(  HYPRE_Solver solver, HYPRE_Real *res_norm );
+
+/*@}*/
+
+/*--------------------------------------------------------------------------
+ *--------------------------------------------------------------------------*/
+
+/*--------------------------------------------------------------------------
  * Miscellaneous: These probably do not belong in the interface.
  *--------------------------------------------------------------------------*/
 
@@ -3129,6 +3533,21 @@
                    HYPRE_Real       eps,
                    HYPRE_ParVector *rhs_ptr);
 
+HYPRE_ParCSRMatrix
+GenerateRSVarDifConv(MPI_Comm         comm,
+                     HYPRE_Int        nx,
+                     HYPRE_Int        ny,
+                     HYPRE_Int        nz,
+                     HYPRE_Int        P,
+                     HYPRE_Int        Q,
+                     HYPRE_Int        R,
+                     HYPRE_Int        p,
+                     HYPRE_Int        q,
+                     HYPRE_Int        r,
+                     HYPRE_Real       eps,
+                     HYPRE_ParVector *rhs_ptr,
+                     HYPRE_Int        type);
+
 float*
 GenerateCoordinates(MPI_Comm  comm,
                     HYPRE_Int nx,
diff -Nru hypre-2.11.2/src/parcsr_ls/HYPRE_parcsr_mgr.c hypre-2.13.0/src/parcsr_ls/HYPRE_parcsr_mgr.c
--- hypre-2.11.2/src/parcsr_ls/HYPRE_parcsr_mgr.c	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/HYPRE_parcsr_mgr.c	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,249 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+#include "_hypre_parcsr_ls.h"
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRCreate
+ *--------------------------------------------------------------------------*/
+
+HYPRE_Int
+HYPRE_MGRCreate( HYPRE_Solver *solver )
+{
+   if (!solver)
+   {
+      hypre_error_in_arg(2);
+      return hypre_error_flag;
+   }
+   *solver = ( (HYPRE_Solver) hypre_MGRCreate( ) );
+   return hypre_error_flag;
+}
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRDestroy
+ *--------------------------------------------------------------------------*/
+
+HYPRE_Int 
+HYPRE_MGRDestroy( HYPRE_Solver solver )
+{
+   return( hypre_MGRDestroy( (void *) solver ) );
+}
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetup
+ *--------------------------------------------------------------------------*/
+
+HYPRE_Int 
+HYPRE_MGRSetup( HYPRE_Solver solver,
+                         HYPRE_ParCSRMatrix A,
+                         HYPRE_ParVector b,
+                         HYPRE_ParVector x      )
+{
+   return( hypre_MGRSetup( (void *) solver,
+                                 (hypre_ParCSRMatrix *) A,
+                                 (hypre_ParVector *) b,
+                                 (hypre_ParVector *) x ) );
+}
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSolve
+ *--------------------------------------------------------------------------*/
+
+HYPRE_Int 
+HYPRE_MGRSolve( HYPRE_Solver solver,
+                         HYPRE_ParCSRMatrix A,
+                         HYPRE_ParVector b,
+                         HYPRE_ParVector x      )
+{
+   return( hypre_MGRSolve( (void *) solver,
+                                 (hypre_ParCSRMatrix *) A,
+                                 (hypre_ParVector *) b,
+                                 (hypre_ParVector *) x ) );
+}
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetCpointsByBlock
+ *--------------------------------------------------------------------------*/
+
+HYPRE_Int 
+HYPRE_MGRSetCpointsByBlock( HYPRE_Solver solver, 
+			HYPRE_Int  block_size, 
+			HYPRE_Int  max_num_levels, 
+			HYPRE_Int *block_num_coarse_points, 
+			HYPRE_Int  **block_coarse_indexes)
+{
+   return( hypre_MGRSetCpointsByBlock( (void *) solver, block_size, max_num_levels, block_num_coarse_points, block_coarse_indexes));
+}
+
+HYPRE_Int
+HYPRE_MGRSetNonCpointsToFpoints( HYPRE_Solver solver, HYPRE_Int nonCptToFptFlag)
+{   
+   return hypre_MGRSetNonCpointsToFpoints((void *) solver, nonCptToFptFlag);
+}
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetCoarseSolver
+ *--------------------------------------------------------------------------*/
+
+HYPRE_Int
+HYPRE_MGRSetCoarseSolver(HYPRE_Solver          solver,
+                             HYPRE_PtrToParSolverFcn  coarse_grid_solver_solve,
+                             HYPRE_PtrToParSolverFcn  coarse_grid_solver_setup,
+                             HYPRE_Solver          coarse_grid_solver )
+{
+   return( hypre_MGRSetCoarseSolver( (void *) solver,
+									   (HYPRE_Int (*)(void*, void*, void*, void*)) coarse_grid_solver_solve,
+									   (HYPRE_Int (*)(void*, void*, void*, void*)) coarse_grid_solver_setup,
+									   (void *) coarse_grid_solver ) );
+}
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetMaxLevels
+ *--------------------------------------------------------------------------*/
+HYPRE_Int
+HYPRE_MGRSetMaxCoarseLevels( HYPRE_Solver solver, HYPRE_Int maxlev )
+{
+   return hypre_MGRSetMaxCoarseLevels(solver, maxlev);
+}
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetBlockSize
+ *--------------------------------------------------------------------------*/
+HYPRE_Int
+HYPRE_MGRSetBlockSize( HYPRE_Solver solver, HYPRE_Int bsize )
+{
+   return hypre_MGRSetBlockSize(solver, bsize );
+}
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetReservedCoarseNodes
+ *--------------------------------------------------------------------------*/
+HYPRE_Int
+HYPRE_MGRSetReservedCoarseNodes( HYPRE_Solver solver, HYPRE_Int reserved_coarse_size, HYPRE_Int *reserved_coarse_indexes )
+{
+   return hypre_MGRSetReservedCoarseNodes(solver, reserved_coarse_size, reserved_coarse_indexes );
+}
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetRestrictType
+ *--------------------------------------------------------------------------*/
+HYPRE_Int
+HYPRE_MGRSetRestrictType(HYPRE_Solver solver, HYPRE_Int restrict_type )
+{
+   return hypre_MGRSetRestrictType(solver, restrict_type );
+}
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetRelaxMethod
+ *--------------------------------------------------------------------------*/
+HYPRE_Int
+HYPRE_MGRSetFRelaxMethod(HYPRE_Solver solver, HYPRE_Int relax_method )
+{
+   return hypre_MGRSetFRelaxMethod(solver, relax_method );
+}
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetRelaxType
+ *--------------------------------------------------------------------------*/
+HYPRE_Int
+HYPRE_MGRSetRelaxType(HYPRE_Solver solver, HYPRE_Int relax_type )
+{
+   return hypre_MGRSetRelaxType(solver, relax_type );
+}
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetNumRelaxSweeps
+ *--------------------------------------------------------------------------*/
+HYPRE_Int
+HYPRE_MGRSetNumRelaxSweeps( HYPRE_Solver solver, HYPRE_Int nsweeps )
+{
+   return hypre_MGRSetNumRelaxSweeps(solver, nsweeps);
+}
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetInterpType
+ *--------------------------------------------------------------------------*/
+HYPRE_Int
+HYPRE_MGRSetInterpType( HYPRE_Solver solver, HYPRE_Int interpType )
+{
+   return hypre_MGRSetInterpType(solver, interpType);
+}
+
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetNumInterpSweeps
+ *--------------------------------------------------------------------------*/
+HYPRE_Int
+HYPRE_MGRSetNumInterpSweeps( HYPRE_Solver solver, HYPRE_Int nsweeps )
+{
+   return hypre_MGRSetNumInterpSweeps(solver, nsweeps);
+}
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetPrintLevel
+ *--------------------------------------------------------------------------*/
+HYPRE_Int
+HYPRE_MGRSetPrintLevel( HYPRE_Solver solver, HYPRE_Int print_level )
+{
+   return hypre_MGRSetPrintLevel( solver, print_level );
+}
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetLogging
+ *--------------------------------------------------------------------------*/
+HYPRE_Int
+HYPRE_MGRSetLogging( HYPRE_Solver solver, HYPRE_Int logging )
+{
+   return hypre_MGRSetLogging(solver, logging );
+}
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetMaxIter
+ *--------------------------------------------------------------------------*/
+HYPRE_Int
+HYPRE_MGRSetMaxIter( HYPRE_Solver solver, HYPRE_Int max_iter )
+{
+   return hypre_MGRSetMaxIter( solver, max_iter );
+}
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetTol
+ *--------------------------------------------------------------------------*/
+HYPRE_Int
+HYPRE_MGRSetTol( HYPRE_Solver solver, HYPRE_Real tol )
+{
+   return hypre_MGRSetTol( solver, tol );
+}
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetMaxGlobalsmoothIters
+ *--------------------------------------------------------------------------*/
+HYPRE_Int
+HYPRE_MGRSetMaxGlobalsmoothIters( HYPRE_Solver solver, HYPRE_Int max_iter )
+{
+	return hypre_MGRSetMaxGlobalsmoothIters(solver, max_iter);
+}
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRSetGlobalsmoothType
+ *--------------------------------------------------------------------------*/
+
+HYPRE_Int
+HYPRE_MGRSetGlobalsmoothType( HYPRE_Solver solver, HYPRE_Int iter_type )
+{
+	return hypre_MGRSetGlobalsmoothType(solver, iter_type);
+}
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRGetNumIterations
+ *--------------------------------------------------------------------------*/
+HYPRE_Int
+HYPRE_MGRGetNumIterations( HYPRE_Solver solver, HYPRE_Int *num_iterations )
+{
+   return hypre_MGRGetNumIterations( solver, num_iterations );
+}
+/*--------------------------------------------------------------------------
+ * HYPRE_MGRGetFinalRelativeResidualNorm
+ *--------------------------------------------------------------------------*/
+HYPRE_Int
+HYPRE_MGRGetFinalRelativeResidualNorm(  HYPRE_Solver solver, HYPRE_Real *res_norm )
+{
+   return hypre_MGRGetFinalRelativeResidualNorm(solver, res_norm);
+}
diff -Nru hypre-2.11.2/src/parcsr_ls/Makefile hypre-2.13.0/src/parcsr_ls/Makefile
--- hypre-2.11.2/src/parcsr_ls/Makefile	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/Makefile	2017-10-20 17:42:22.000000000 +0000
@@ -10,7 +10,6 @@
 # $Revision$
 #EHEADER**********************************************************************
 
-
 include ../config/Makefile.config
 
 ## Euclid specific macros taken from distributed_ls/Euclid
@@ -22,6 +21,8 @@
  -I../distributed_ls/Euclid\
  -I$(srcdir)\
  -I$(srcdir)/..\
+ -I$(srcdir)/../blas\
+ -I$(srcdir)/../lapack\
  -I$(srcdir)/../multivector\
  -I$(srcdir)/../utilities\
  -I$(srcdir)/../krylov\
@@ -72,6 +73,7 @@
  HYPRE_parcsr_lgmres.c\
  HYPRE_parcsr_hybrid.c\
  HYPRE_parcsr_int.c\
+ HYPRE_parcsr_mgr.c\
  HYPRE_parcsr_ParaSails.c\
  HYPRE_parcsr_pcg.c\
  HYPRE_parcsr_pilut.c\
@@ -86,6 +88,7 @@
  par_cg_relax_wt.c\
  par_coarsen.c\
  par_cgc_coarsen.c\
+ par_cheby.c\
  par_coarse_parms.c\
  par_coordinates.c\
  par_cr.c\
@@ -101,12 +104,16 @@
  par_laplace_9pt.c\
  par_laplace.c\
  par_lr_interp.c\
+ par_mgr.c\
+ par_mgr_setup.c\
+ par_mgr_solve.c\
  par_nongalerkin.c\
  par_nodal_systems.c\
  par_rap.c\
  par_rap_communication.c\
  par_rotate_7pt.c\
  par_vardifconv.c\
+ par_vardifconv_rs.c\
  par_relax.c\
  par_relax_more.c\
  par_relax_interface.c\
diff -Nru hypre-2.11.2/src/parcsr_ls/par_add_cycle.c hypre-2.13.0/src/parcsr_ls/par_add_cycle.c
--- hypre-2.11.2/src/parcsr_ls/par_add_cycle.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/par_add_cycle.c	2017-10-20 17:42:22.000000000 +0000
@@ -38,6 +38,7 @@
    hypre_ParCSRMatrix    **P_array;
    hypre_ParCSRMatrix    **R_array;
    hypre_ParCSRMatrix    *Lambda;
+   hypre_ParCSRMatrix    *Atilde;
    hypre_ParVector    **F_array;
    hypre_ParVector    **U_array;
    hypre_ParVector    *Vtemp;
@@ -46,11 +47,12 @@
    HYPRE_Int      **CF_marker_array;
 
    HYPRE_Int       num_levels;
-   HYPRE_Int       addlvl;
+   HYPRE_Int       addlvl, add_end;
    HYPRE_Int       additive;
    HYPRE_Int       mult_additive;
    HYPRE_Int       simple;
-   HYPRE_Int       i, num_rows;
+   HYPRE_Int       add_last_lvl;
+   HYPRE_Int       i, j, num_rows;
    HYPRE_Int       n_global;
    HYPRE_Int       rlx_order;
 
@@ -61,7 +63,9 @@
    HYPRE_Int       fine_grid;
    HYPRE_Int       rlx_down;
    HYPRE_Int       rlx_up;
+   HYPRE_Int       rlx_coarse;
    HYPRE_Int      *grid_relax_type;
+   HYPRE_Int      *num_grid_sweeps;
    HYPRE_Real      **l1_norms;
    HYPRE_Real    alpha, beta;
    HYPRE_Real *u_data;
@@ -92,8 +96,10 @@
    additive          = hypre_ParAMGDataAdditive(amg_data);
    mult_additive     = hypre_ParAMGDataMultAdditive(amg_data);
    simple            = hypre_ParAMGDataSimple(amg_data);
+   add_last_lvl      = hypre_ParAMGDataAddLastLvl(amg_data);
    grid_relax_type   = hypre_ParAMGDataGridRelaxType(amg_data);
    Lambda            = hypre_ParAMGDataLambda(amg_data);
+   Atilde            = hypre_ParAMGDataAtilde(amg_data);
    Xtilde            = hypre_ParAMGDataXtilde(amg_data);
    Rtilde            = hypre_ParAMGDataRtilde(amg_data);
    l1_norms          = hypre_ParAMGDataL1Norms(amg_data);
@@ -101,11 +107,14 @@
    relax_weight      = hypre_ParAMGDataRelaxWeight(amg_data);
    omega             = hypre_ParAMGDataOmega(amg_data);
    rlx_order         = hypre_ParAMGDataRelaxOrder(amg_data);
+   num_grid_sweeps   = hypre_ParAMGDataNumGridSweeps(amg_data);
 
    /* Initialize */
 
    addlvl = hypre_max(additive, mult_additive);
    addlvl = hypre_max(addlvl, simple);
+   if (add_last_lvl == -1 ) add_end = num_levels-1;
+   else add_end = add_last_lvl;
    Solve_err_flag = 0;
 
    /*---------------------------------------------------------------------
@@ -115,6 +124,7 @@
    /* down cycle */
    rlx_down = grid_relax_type[1];
    rlx_up = grid_relax_type[2];
+   rlx_coarse = grid_relax_type[3];
    for (level = 0; level < num_levels-1; level++)
    {
       fine_grid = level;
@@ -126,7 +136,7 @@
 
       hypre_ParVectorSetConstantValues(U_array[coarse_grid], 0.0); 
 
-      if (level < addlvl) /* multiplicative version */
+      if (level < addlvl || level > add_end) /* multiplicative version */
       {
          /* smoothing step */
 
@@ -134,33 +144,42 @@
          {
             HYPRE_Real *A_data = hypre_CSRMatrixData(hypre_ParCSRMatrixDiag(A_array[fine_grid]));
             HYPRE_Int *A_i = hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(A_array[fine_grid]));
-            hypre_ParVectorCopy(F_array[fine_grid],Vtemp);
             num_rows = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A_array[fine_grid]));
+            for (j=0; j < num_grid_sweeps[1]; j++)
+            {
+             hypre_ParVectorCopy(F_array[fine_grid],Vtemp);
 #ifdef HYPRE_USING_OPENMP
 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
 #endif
-            for (i = 0; i < num_rows; i++)
+             for (i = 0; i < num_rows; i++)
                u_data[i] = relax_weight[level]*v_data[i] / A_data[A_i[i]];
+            }
          }
 
          else if (rlx_down != 18)
          {
             /*hypre_BoomerAMGRelax(A_array[fine_grid],F_array[fine_grid],NULL,rlx_down,0,*/
-            hypre_BoomerAMGRelaxIF(A_array[fine_grid],F_array[fine_grid],
-	     CF_marker_array[fine_grid], rlx_down,rlx_order,1,
-             relax_weight[fine_grid], omega[fine_grid],
-             l1_norms[level], U_array[fine_grid], Vtemp, Ztemp);
-            hypre_ParVectorCopy(F_array[fine_grid],Vtemp);
+            for (j=0; j < num_grid_sweeps[1]; j++)
+            {
+               hypre_BoomerAMGRelaxIF(A_array[fine_grid],F_array[fine_grid],
+	           CF_marker_array[fine_grid], rlx_down,rlx_order,1,
+                   relax_weight[fine_grid], omega[fine_grid],
+                   l1_norms[level], U_array[fine_grid], Vtemp, Ztemp);
+               hypre_ParVectorCopy(F_array[fine_grid],Vtemp);
+            }
          }
          else
          {
-            hypre_ParVectorCopy(F_array[fine_grid],Vtemp);
             num_rows = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A_array[fine_grid]));
+            for (j=0; j < num_grid_sweeps[1]; j++)
+            {
+             hypre_ParVectorCopy(F_array[fine_grid],Vtemp);
 #ifdef HYPRE_USING_OPENMP
 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
 #endif
-	    for (i = 0; i < num_rows; i++)
+	     for (i = 0; i < num_rows; i++)
                u_data[i] += v_data[i] / l1_norms_lvl[i];
+            }
          }
      
          alpha = -1.0;
@@ -188,7 +207,7 @@
       }
    }
 
-   /* solve coarse grid */ 
+   /* additive smoothing and solve coarse grid */ 
    if (addlvl < num_levels)
    {
       if (simple > -1)
@@ -203,16 +222,38 @@
 	    x_global[i] += D_inv[i]*r_global[i];
       }
       else
-	 hypre_ParCSRMatrixMatvec(1.0, Lambda, Rtilde, 1.0, Xtilde);
+      {
+         if (num_grid_sweeps[1] > 1)
+         {
+            n_global = hypre_VectorSize(hypre_ParVectorLocalVector(Rtilde));
+            hypre_ParVector *Tmptilde = hypre_CTAlloc(hypre_ParVector, 1);
+            hypre_Vector *Tmptilde_local = hypre_SeqVectorCreate(n_global);   
+            hypre_SeqVectorInitialize(Tmptilde_local);
+            hypre_ParVectorLocalVector(Tmptilde) = Tmptilde_local;   
+            hypre_ParVectorOwnsData(Tmptilde) = 1;
+            hypre_ParCSRMatrixMatvec(1.0, Lambda, Rtilde, 0.0, Tmptilde);
+            hypre_ParVectorScale(2.0,Rtilde);
+            hypre_ParCSRMatrixMatvec(-1.0, Atilde, Tmptilde, 1.0, Rtilde);
+            hypre_ParVectorDestroy(Tmptilde);
+         }
+         hypre_ParCSRMatrixMatvec(1.0, Lambda, Rtilde, 1.0, Xtilde);
+      }
       if (addlvl == 0) hypre_ParVectorCopy(Xtilde, U_array[0]);
    }
-   else
+   if (add_end < num_levels -1)
    {
       fine_grid = num_levels -1;
-      hypre_ParCSRRelax(A_array[fine_grid], F_array[fine_grid],
+      for (j=0; j < num_grid_sweeps[3]; j++)
+         if (rlx_coarse == 18)
+	    hypre_ParCSRRelax(A_array[fine_grid], F_array[fine_grid],
                               1, 1, l1_norms[fine_grid],
                               1.0, 1.0 ,0,0,0,0,
                               U_array[fine_grid], Vtemp, Ztemp);
+         else
+            hypre_BoomerAMGRelaxIF(A_array[fine_grid],F_array[fine_grid],
+		NULL, rlx_coarse,0,0,
+                relax_weight[fine_grid], omega[fine_grid],
+                l1_norms[fine_grid], U_array[fine_grid], Vtemp, Ztemp);
    }
 
    /* up cycle */
@@ -221,7 +262,7 @@
       fine_grid = level - 1;
       coarse_grid = level;
 
-      if (level <= addlvl) /* multiplicative version */
+      if (level <= addlvl || level > add_end+1) /* multiplicative version */
       {
          alpha = 1.0;
          beta = 1.0;
@@ -230,7 +271,8 @@
                                      beta, U_array[fine_grid]);            
          if (rlx_up != 18)
             /*hypre_BoomerAMGRelax(A_array[fine_grid],F_array[fine_grid],NULL,rlx_up,0,*/
-            hypre_BoomerAMGRelaxIF(A_array[fine_grid],F_array[fine_grid],
+            for (j=0; j < num_grid_sweeps[2]; j++)
+              hypre_BoomerAMGRelaxIF(A_array[fine_grid],F_array[fine_grid],
 		CF_marker_array[fine_grid],
 		rlx_up,rlx_order,2,
                 relax_weight[fine_grid], omega[fine_grid],
@@ -240,6 +282,7 @@
             HYPRE_Int loc_relax_points[2];
             loc_relax_points[0] = -1;
             loc_relax_points[1] = 1;
+            for (j=0; j < num_grid_sweeps[2]; j++)
             for (i=0; i < 2; i++)
                 hypre_ParCSRRelax_L1_Jacobi(A_array[fine_grid],F_array[fine_grid],
                                             CF_marker_array[fine_grid],
@@ -248,6 +291,7 @@
                                             U_array[fine_grid], Vtemp);
          }
          else 
+            for (j=0; j < num_grid_sweeps[2]; j++)
             hypre_ParCSRRelax(A_array[fine_grid], F_array[fine_grid],
                                  1, 1, l1_norms[fine_grid],
                                  1.0, 1.0 ,0,0,0,0,
@@ -281,6 +325,11 @@
    hypre_ParCSRMatrix *Lambda;
    hypre_CSRMatrix *L_diag;
    hypre_CSRMatrix *L_offd;
+   hypre_ParCSRMatrix *Atilde;
+   hypre_CSRMatrix *Atilde_diag;
+   hypre_CSRMatrix *Atilde_offd;
+   HYPRE_Real    *Atilde_diag_data;
+   HYPRE_Real    *Atilde_offd_data;
    hypre_CSRMatrix *A_tmp_diag;
    hypre_CSRMatrix *A_tmp_offd;
    hypre_ParVector *Xtilde;
@@ -305,6 +354,10 @@
    HYPRE_Int *L_diag_j;
    HYPRE_Int *L_offd_i;
    HYPRE_Int *L_offd_j;
+   HYPRE_Int *Atilde_diag_i;
+   HYPRE_Int *Atilde_diag_j;
+   HYPRE_Int *Atilde_offd_i;
+   HYPRE_Int *Atilde_offd_j;
    HYPRE_Int *A_tmp_diag_i;
    HYPRE_Int *A_tmp_offd_i;
    HYPRE_Int *A_tmp_diag_j;
@@ -340,10 +393,12 @@
    HYPRE_Int       num_cols_offd = 0;
    HYPRE_Int       level, i, j, k;
    HYPRE_Int       this_proc, cnt, cnt_diag, cnt_offd;
+   HYPRE_Int       A_cnt_diag, A_cnt_offd;
    HYPRE_Int       cnt_recv, cnt_send, cnt_row, row_start;
    HYPRE_Int       start_diag, start_offd, indx, cnt_map;
    HYPRE_Int       start, j_indx, index, cnt_level;
    HYPRE_Int       max_sends, max_recvs;
+   HYPRE_Int       ns;
 
  /* Local variables  */ 
    HYPRE_Int       Solve_err_flag = 0;
@@ -354,6 +409,7 @@
    /*HYPRE_Real   *relax_weight = NULL;
    HYPRE_Int      relax_type; */
    HYPRE_Int       add_rlx;
+   HYPRE_Int       add_last_lvl, add_end;
    HYPRE_Real  add_rlx_wt;
 
    /* Acquire data and allocate storage */
@@ -363,19 +419,23 @@
    U_array           = hypre_ParAMGDataUArray(amg_data);
    additive          = hypre_ParAMGDataAdditive(amg_data);
    mult_additive     = hypre_ParAMGDataMultAdditive(amg_data);
+   add_last_lvl      = hypre_ParAMGDataAddLastLvl(amg_data);
    num_levels        = hypre_ParAMGDataNumLevels(amg_data);
    /*relax_weight      = hypre_ParAMGDataRelaxWeight(amg_data);
    relax_type        = hypre_ParAMGDataGridRelaxType(amg_data)[1];*/
    comm              = hypre_ParCSRMatrixComm(A_array[0]);
    add_rlx           = hypre_ParAMGDataAddRelaxType(amg_data);
    add_rlx_wt        = hypre_ParAMGDataAddRelaxWt(amg_data);
+   ns                = hypre_ParAMGDataNumGridSweeps(amg_data)[1];
 
    hypre_MPI_Comm_size(comm,&num_procs);
 
    l1_norms_ptr      = hypre_ParAMGDataL1Norms(amg_data); 
 
    addlvl = hypre_max(additive, mult_additive);
-   num_add_lvls = num_levels+1-addlvl;
+   if (add_last_lvl != -1) add_end = add_last_lvl+1;
+   else add_end = num_levels;
+   num_add_lvls = add_end+1-addlvl;
 
    level_start = hypre_CTAlloc(HYPRE_Int, num_add_lvls+1);
    send_data_L = 0;
@@ -387,7 +447,7 @@
    cnt = 1;
    max_sends = 0;
    max_recvs = 0;
-   for (i=addlvl; i < num_levels; i++)
+   for (i=addlvl; i < add_end; i++)
    {
       A_tmp = A_array[i];
       A_tmp_diag = hypre_ParCSRMatrixDiag(A_tmp);
@@ -426,7 +486,7 @@
    {
       if (max_sends < num_procs && max_recvs < num_procs)
       {
-         for (i=addlvl; i < num_levels; i++)
+         for (i=addlvl; i < add_end; i++)
          {
             A_tmp = A_array[i];
             comm_pkg = hypre_ParCSRMatrixCommPkg(A_tmp);
@@ -482,7 +542,7 @@
          L_recv_ptr = hypre_CTAlloc(HYPRE_Int, num_recvs_L+1);
          L_send_ptr = hypre_CTAlloc(HYPRE_Int, num_sends_L+1);
 
-         for (i=addlvl; i < num_levels; i++)
+         for (i=addlvl; i < add_end; i++)
          {
             A_tmp = A_array[i];
             comm_pkg = hypre_ParCSRMatrixCommPkg(A_tmp);
@@ -524,7 +584,7 @@
       {
          num_recvs_L = 0;
          num_sends_L = 0;
-         for (i=addlvl; i < num_levels; i++)
+         for (i=addlvl; i < add_end; i++)
          {
             A_tmp = A_array[i];
             comm_pkg = hypre_ParCSRMatrixCommPkg(A_tmp);
@@ -591,6 +651,7 @@
    L_offd = hypre_CSRMatrixCreate(num_rows_L, num_cols_offd_L, num_nonzeros_offd);
    hypre_CSRMatrixInitialize(L_diag);
    hypre_CSRMatrixInitialize(L_offd);
+
    if (num_nonzeros_diag)
    {
       L_diag_data = hypre_CSRMatrixData(L_diag);
@@ -604,6 +665,26 @@
    }
    L_offd_i = hypre_CSRMatrixI(L_offd);
 
+   if (ns > 1)  
+   {
+      Atilde_diag = hypre_CSRMatrixCreate(num_rows_L, num_rows_L, num_nonzeros_diag);
+      Atilde_offd = hypre_CSRMatrixCreate(num_rows_L, num_cols_offd_L, num_nonzeros_offd);
+      hypre_CSRMatrixInitialize(Atilde_diag);
+      hypre_CSRMatrixInitialize(Atilde_offd);
+      if (num_nonzeros_diag)
+      {
+         Atilde_diag_data = hypre_CSRMatrixData(Atilde_diag);
+         Atilde_diag_j = hypre_CSRMatrixJ(Atilde_diag);
+      }
+      Atilde_diag_i = hypre_CSRMatrixI(Atilde_diag);
+      if (num_nonzeros_offd)
+      {
+         Atilde_offd_data = hypre_CSRMatrixData(Atilde_offd);
+         Atilde_offd_j = hypre_CSRMatrixJ(Atilde_offd);
+      }
+      Atilde_offd_i = hypre_CSRMatrixI(Atilde_offd);
+   }
+
    if (num_rows_L) D_data = hypre_CTAlloc(HYPRE_Real,num_rows_L);
    if (send_data_L)
    {
@@ -639,7 +720,14 @@
    cnt_row = 1; 
    L_diag_i[0] = 0;
    L_offd_i[0] = 0;
-   for (level=addlvl; level < num_levels; level++)
+   if (ns > 1) 
+   {
+      A_cnt_diag = 0; 
+      A_cnt_offd = 0; 
+      Atilde_diag_i[0] = 0;
+      Atilde_offd_i[0] = 0;
+   }
+   for (level=addlvl; level < add_end; level++)
    {
       row_start = level_start[cnt_level];
       if (level != 0)
@@ -716,11 +804,17 @@
 #pragma omp for private(i) HYPRE_SMP_SCHEDULE
 #endif
          for (i=0; i < num_rows_tmp; i++)
-        {
+         {
            D_data[i] = add_rlx_wt/A_tmp_diag_data[A_tmp_diag_i[i]];
            L_diag_i[cnt_row+i] = start_diag + A_tmp_diag_i[i+1];
            L_offd_i[cnt_row+i] = start_offd + A_tmp_offd_i[i+1];
-        }
+         }
+         if (ns > 1)
+           for (i=0; i < num_rows_tmp; i++)
+           {
+             Atilde_diag_i[cnt_row+i] = start_diag + A_tmp_diag_i[i+1];
+             Atilde_offd_i[cnt_row+i] = start_offd + A_tmp_offd_i[i+1];
+           }
       }
       else
       {
@@ -734,6 +828,12 @@
            L_diag_i[cnt_row+i] = start_diag + A_tmp_diag_i[i+1];
            L_offd_i[cnt_row+i] = start_offd + A_tmp_offd_i[i+1];
         }
+        if (ns > 1)
+          for (i=0; i < num_rows_tmp; i++)
+          {
+            Atilde_diag_i[cnt_row+i] = start_diag + A_tmp_diag_i[i+1];
+            Atilde_offd_i[cnt_row+i] = start_offd + A_tmp_offd_i[i+1];
+          }
       }
  
       if (num_procs > 1)
@@ -754,6 +854,11 @@
       for (i = 0; i < num_rows_tmp; i++)
       {
          j_indx = A_tmp_diag_i[i];
+         if (ns > 1)
+         {
+            Atilde_diag_data[A_cnt_diag] = A_tmp_diag_data[j_indx];
+            Atilde_diag_j[A_cnt_diag++] = i+row_start;
+         }
          L_diag_data[cnt_diag] = (2.0 - A_tmp_diag_data[j_indx]*D_data[i])*D_data[i];
          L_diag_j[cnt_diag++] = i+row_start;
          for (j=A_tmp_diag_i[i]+1; j < A_tmp_diag_i[i+1]; j++)
@@ -768,6 +873,21 @@
              L_offd_data[cnt_offd] = (- A_tmp_offd_data[j]*D_data_offd[j_indx])*D_data[i];
              L_offd_j[cnt_offd++] = remap[j_indx];
          }
+         if (ns > 1)
+         {
+            for (j=A_tmp_diag_i[i]+1; j < A_tmp_diag_i[i+1]; j++)
+            {
+               j_indx = A_tmp_diag_j[j];
+               Atilde_diag_data[A_cnt_diag] = A_tmp_diag_data[j];
+               Atilde_diag_j[A_cnt_diag++] = j_indx+row_start;
+            }
+            for (j=A_tmp_offd_i[i]; j < A_tmp_offd_i[i+1]; j++)
+            {
+               j_indx = A_tmp_offd_j[j];
+               Atilde_offd_data[A_cnt_offd] = A_tmp_offd_data[j];
+               Atilde_offd_j[A_cnt_offd++] = remap[j_indx];
+            }
+         }
       }
       cnt_row += num_rows_tmp;
    }
@@ -809,6 +929,52 @@
    hypre_ParCSRMatrixComm(Lambda) = comm;
    hypre_ParCSRMatrixOwnsData(Lambda) = 1;
 
+   if (ns > 1)
+   {
+      /*hypre_ParCSRCommPkg *A_comm_pkg = NULL;
+      HYPRE_Int *A_recv_ptr = NULL;
+      HYPRE_Int *A_send_ptr = NULL;
+      HYPRE_Int *A_recv_procs = NULL;
+      HYPRE_Int *A_send_procs = NULL;
+      HYPRE_Int *A_send_map_elmts = NULL;
+
+      A_comm_pkg = hypre_CTAlloc(hypre_ParCSRCommPkg,1);
+
+      A_recv_ptr = hypre_CTAlloc(HYPRE_Int, num_recvs+1);
+      A_send_ptr = hypre_CTAlloc(HYPRE_Int, num_sends+1);
+      A_recv_procs = hypre_CTAlloc(HYPRE_Int, num_recvs_L);
+      A_send_procs = hypre_CTAlloc(HYPRE_Int, num_sends_L);
+      A_send_map_elmts = hypre_CTAlloc(HYPRE_Int, L_send_ptr[num_sends_L]);
+
+      for (i=0; i<num_recvs_L+1; i++)
+	 A_recv_ptr[i] = L_recv_ptr[i];
+      for (i=0; i<num_sends_L+1; i++)
+	 A_send_ptr[i] = L_send_ptr[i];
+      for (i=0; i<num_recvs_L; i++)
+	 A_recv_procs[i] = L_recv_procs[i];
+      for (i=0; i<num_sends_L; i++)
+	 A_send_procs[i] = L_send_procs[i];
+      for (i=0; i < L_send_ptr[num_sends_L]; i++)
+	 A_send_map_elmts[i] = L_send_map_elmts[i];
+    
+      hypre_ParCSRCommPkgNumRecvs(A_comm_pkg) = num_recvs_L;
+      hypre_ParCSRCommPkgNumSends(A_comm_pkg) = num_sends_L;
+      hypre_ParCSRCommPkgRecvProcs(A_comm_pkg) = A_recv_procs;
+      hypre_ParCSRCommPkgSendProcs(A_comm_pkg) = A_send_procs;
+      hypre_ParCSRCommPkgRecvVecStarts(A_comm_pkg) = A_recv_ptr;
+      hypre_ParCSRCommPkgSendMapStarts(A_comm_pkg) = A_send_ptr;
+      hypre_ParCSRCommPkgSendMapElmts(A_comm_pkg) = A_send_map_elmts;
+      hypre_ParCSRCommPkgComm(A_comm_pkg) = comm; */
+
+      Atilde = hypre_CTAlloc(hypre_ParCSRMatrix, 1);
+      hypre_ParCSRMatrixDiag(Atilde) = Atilde_diag;
+      hypre_ParCSRMatrixOffd(Atilde) = Atilde_offd;
+      hypre_ParCSRMatrixCommPkg(Atilde) = L_comm_pkg;
+      hypre_ParCSRMatrixComm(Atilde) = comm;
+      hypre_ParCSRMatrixOwnsData(Atilde) = 1;
+      hypre_ParAMGDataAtilde(amg_data) = Atilde;
+   }
+
    hypre_ParAMGDataLambda(amg_data) = Lambda;
    hypre_ParAMGDataRtilde(amg_data) = Rtilde;
    hypre_ParAMGDataXtilde(amg_data) = Xtilde;
@@ -852,6 +1018,7 @@
    HYPRE_Int       level, i;
    HYPRE_Int       add_rlx;
    HYPRE_Real      add_rlx_wt;
+   HYPRE_Int       add_last_lvl, add_end;
 
  /* Local variables  */ 
    HYPRE_Int       Solve_err_flag = 0;
@@ -869,14 +1036,17 @@
    num_levels        = hypre_ParAMGDataNumLevels(amg_data);
    add_rlx_wt        = hypre_ParAMGDataAddRelaxWt(amg_data);
    add_rlx           = hypre_ParAMGDataAddRelaxType(amg_data);
+   add_last_lvl      = hypre_ParAMGDataAddLastLvl(amg_data);
    /*relax_weight      = hypre_ParAMGDataRelaxWeight(amg_data);
    relax_type        = hypre_ParAMGDataGridRelaxType(amg_data)[1];*/
 
    l1_norms_ptr      = hypre_ParAMGDataL1Norms(amg_data); 
    /* smooth_option       = hypre_ParAMGDataSmoothOption(amg_data); */
+   if (add_last_lvl == -1 ) add_end = num_levels;
+   else add_end = add_last_lvl;
 
    num_rows_L  = 0;
-   for (i=addlvl; i < num_levels; i++)
+   for (i=addlvl; i < add_end; i++)
    {
       A_tmp = A_array[i];
       A_tmp_diag = hypre_ParCSRMatrixDiag(A_tmp);
@@ -901,7 +1071,7 @@
    D_inv = hypre_CTAlloc(HYPRE_Real, num_rows_L);
 
    l1_start = 0;
-   for (level=addlvl; level < num_levels; level++)
+   for (level=addlvl; level < add_end; level++)
    {
       if (level != 0)
       {
diff -Nru hypre-2.11.2/src/parcsr_ls/par_amg.c hypre-2.13.0/src/parcsr_ls/par_amg.c
--- hypre-2.11.2/src/parcsr_ls/par_amg.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/par_amg.c	2017-10-20 17:42:22.000000000 +0000
@@ -97,6 +97,9 @@
    char    *euclidfile;
 
    HYPRE_Int cheby_order;
+   HYPRE_Int cheby_eig_est;
+   HYPRE_Int cheby_variant;
+   HYPRE_Int cheby_scale;
    HYPRE_Real cheby_eig_ratio;
 
    HYPRE_Int block_mode;
@@ -104,6 +107,7 @@
    HYPRE_Int        additive;
    HYPRE_Int        mult_additive;
    HYPRE_Int        simple;
+   HYPRE_Int        add_last_lvl;
    HYPRE_Real   add_trunc_factor;
    HYPRE_Int      add_P_max_elmts;
    HYPRE_Int      add_rlx_type;
@@ -198,6 +202,9 @@
    outer_wt = 1.0;
 
    cheby_order = 2;
+   cheby_variant = 0;
+   cheby_scale = 1;
+   cheby_eig_est = 10;
    cheby_eig_ratio = .3;
 
    block_mode = 0;
@@ -205,6 +212,7 @@
    additive = -1;
    mult_additive = -1;
    simple = -1;
+   add_last_lvl = -1;
    add_trunc_factor = 0.0;
    add_P_max_elmts = 0;
    add_rlx_type = 18;
@@ -301,6 +309,9 @@
 
    hypre_BoomerAMGSetChebyOrder(amg_data, cheby_order);
    hypre_BoomerAMGSetChebyFraction(amg_data, cheby_eig_ratio);
+   hypre_BoomerAMGSetChebyEigEst(amg_data, cheby_eig_est);
+   hypre_BoomerAMGSetChebyVariant(amg_data, cheby_variant);
+   hypre_BoomerAMGSetChebyScale(amg_data, cheby_scale);
 
    hypre_BoomerAMGSetNumIterations(amg_data, num_iterations);
 
@@ -311,6 +322,7 @@
    hypre_BoomerAMGSetMultAddTruncFactor(amg_data, add_trunc_factor);
    hypre_BoomerAMGSetAddRelaxType(amg_data, add_rlx_type);
    hypre_BoomerAMGSetAddRelaxWt(amg_data, add_rlx_wt);
+   hypre_ParAMGDataAddLastLvl(amg_data) = add_last_lvl;
    hypre_ParAMGDataLambda(amg_data) = NULL;
    hypre_ParAMGDataXtilde(amg_data) = NULL;
    hypre_ParAMGDataRtilde(amg_data) = NULL;
@@ -353,8 +365,11 @@
    /* this can not be set by the user currently */
    hypre_ParAMGDataBlockMode(amg_data) = block_mode;
 
+   /* Stuff for Chebyshev smoothing */
    hypre_ParAMGDataMaxEigEst(amg_data) = NULL;
    hypre_ParAMGDataMinEigEst(amg_data) = NULL;
+   hypre_ParAMGDataChebyDS(amg_data) = NULL;
+   hypre_ParAMGDataChebyCoefs(amg_data) = NULL;
 
    /* BM Oct 22, 2006 */
    hypre_ParAMGDataPlotGrids(amg_data) = 0;
@@ -396,9 +411,14 @@
 
    hypre_ParAMGDataRAP2(amg_data) = 0;
    hypre_ParAMGDataKeepTranspose(amg_data) = 0;
-
-   HYPRE_ANNOTATION_END("BoomerAMG.create");
    
+/* information for preserving indices as coarse grid points */
+   hypre_ParAMGDataCPointKeepMarkerArray(amg_data) = NULL;
+   hypre_ParAMGDataCPointKeepLevel(amg_data) = 0;
+   hypre_ParAMGDataNumCPointKeep(amg_data)   = 0;
+   
+   HYPRE_ANNOTATION_END("BoomerAMG.create");
+
    return (void *) amg_data;
 }
 
@@ -497,12 +517,19 @@
     
         if (hypre_ParAMGDataPBlockArray(amg_data)[i-1])
            hypre_ParCSRBlockMatrixDestroy(hypre_ParAMGDataPBlockArray(amg_data)[i-1]);
-
    }
 
    if (hypre_ParAMGDataLambda(amg_data))
       hypre_ParCSRMatrixDestroy(hypre_ParAMGDataLambda(amg_data));
 
+   if (hypre_ParAMGDataAtilde(amg_data))
+   {
+      hypre_ParCSRMatrix *Atilde = hypre_ParAMGDataAtilde(amg_data);
+      hypre_CSRMatrixDestroy(hypre_ParCSRMatrixDiag(Atilde));
+      hypre_CSRMatrixDestroy(hypre_ParCSRMatrixOffd(Atilde));
+      hypre_TFree (Atilde);
+   }
+
    if (hypre_ParAMGDataXtilde(amg_data))
       hypre_ParVectorDestroy(hypre_ParAMGDataXtilde(amg_data));
 
@@ -517,6 +544,22 @@
       hypre_TFree(hypre_ParAMGDataL1Norms(amg_data));
    }
 
+   if (hypre_ParAMGDataChebyCoefs(amg_data))
+   {
+      for (i=0; i < num_levels; i++)
+         if (hypre_ParAMGDataChebyCoefs(amg_data)[i])
+           hypre_TFree(hypre_ParAMGDataChebyCoefs(amg_data)[i]);
+      hypre_TFree(hypre_ParAMGDataChebyCoefs(amg_data));
+   }
+
+   if (hypre_ParAMGDataChebyDS(amg_data))
+   {
+      for (i=0; i < num_levels; i++)
+         if (hypre_ParAMGDataChebyDS(amg_data)[i])
+           hypre_TFree(hypre_ParAMGDataChebyDS(amg_data)[i]);
+      hypre_TFree(hypre_ParAMGDataChebyDS(amg_data));
+   }
+
    if (hypre_ParAMGDataDinv(amg_data))
       hypre_TFree(hypre_ParAMGDataDinv(amg_data));
 
@@ -539,6 +582,7 @@
    hypre_TFree(hypre_ParAMGDataPBlockArray(amg_data));
    hypre_TFree(hypre_ParAMGDataPArray(amg_data));
    hypre_TFree(hypre_ParAMGDataCFMarkerArray(amg_data));
+
    if (hypre_ParAMGDataRtemp(amg_data))
       hypre_ParVectorDestroy(hypre_ParAMGDataRtemp(amg_data));
    if (hypre_ParAMGDataPtemp(amg_data))
@@ -646,6 +690,21 @@
    if (hypre_ParAMGDataFCoarse(amg_data))
       hypre_ParVectorDestroy(hypre_ParAMGDataFCoarse(amg_data));
 
+   /* destroy Cpoint_keep data */
+   if(hypre_ParAMGDataCPointKeepMarkerArray(amg_data))
+   {
+      for(i=0; i<hypre_ParAMGDataCPointKeepLevel(amg_data); i++)
+      {        
+          if(hypre_ParAMGDataCPointKeepMarkerArray(amg_data)[i])
+          {
+             hypre_TFree(hypre_ParAMGDataCPointKeepMarkerArray(amg_data)[i]);
+             hypre_ParAMGDataCPointKeepMarkerArray(amg_data)[i] = NULL;
+          }
+      }
+      hypre_TFree(hypre_ParAMGDataCPointKeepMarkerArray(amg_data));
+      hypre_ParAMGDataCPointKeepMarkerArray(amg_data) = NULL;
+   }
+
    if (hypre_ParAMGDataAMat(amg_data)) hypre_TFree(hypre_ParAMGDataAMat(amg_data));
    if (hypre_ParAMGDataBVec(amg_data)) hypre_TFree(hypre_ParAMGDataBVec(amg_data));
    if (hypre_ParAMGDataCommInfo(amg_data)) hypre_TFree(hypre_ParAMGDataCommInfo(amg_data));
@@ -3464,6 +3523,57 @@
 
    return hypre_error_flag;
 }
+HYPRE_Int
+hypre_BoomerAMGSetChebyEigEst( void     *data,
+                              HYPRE_Int     cheby_eig_est)
+{
+   hypre_ParAMGData  *amg_data = (hypre_ParAMGData*) data;
+ 
+   if (!amg_data)
+   {
+      hypre_error_in_arg(1);
+      return hypre_error_flag;
+   } 
+   if (cheby_eig_est < 0)
+   {
+      hypre_error_in_arg(2);
+      return hypre_error_flag;
+   } 
+   hypre_ParAMGDataChebyEigEst(amg_data) = cheby_eig_est;
+
+   return hypre_error_flag;
+}
+HYPRE_Int
+hypre_BoomerAMGSetChebyVariant( void     *data,
+                              HYPRE_Int     cheby_variant)
+{
+   hypre_ParAMGData  *amg_data = (hypre_ParAMGData*) data;
+ 
+   if (!amg_data)
+   {
+      hypre_error_in_arg(1);
+      return hypre_error_flag;
+   } 
+   hypre_ParAMGDataChebyVariant(amg_data) = cheby_variant;
+
+   return hypre_error_flag;
+}
+HYPRE_Int
+hypre_BoomerAMGSetChebyScale( void     *data,
+                              HYPRE_Int     cheby_scale)
+{
+   hypre_ParAMGData  *amg_data = (hypre_ParAMGData*) data;
+ 
+   if (!amg_data)
+   {
+      hypre_error_in_arg(1);
+      return hypre_error_flag;
+   } 
+   hypre_ParAMGDataChebyScale(amg_data) = cheby_scale;
+
+   return hypre_error_flag;
+}
+
 
 /*--------------------------------------------------------------------------
  * hypre_BoomerAMGSetInterpVectors
@@ -3719,6 +3829,23 @@
 }
 
 HYPRE_Int
+hypre_BoomerAMGSetAddLastLvl( void *data,
+                          HYPRE_Int   add_last_lvl )
+{
+   hypre_ParAMGData  *amg_data = (hypre_ParAMGData*) data;
+
+   if (!amg_data)
+   {
+      hypre_error_in_arg(1);
+      return hypre_error_flag;
+   }
+
+   hypre_ParAMGDataAddLastLvl(amg_data) = add_last_lvl;
+
+   return hypre_error_flag;
+}
+
+HYPRE_Int
 hypre_BoomerAMGSetNonGalerkinTol( void   *data,
                             HYPRE_Real nongalerkin_tol)
 {
@@ -3825,3 +3952,74 @@
   return hypre_error_flag;
 }
 
+HYPRE_Int
+hypre_BoomerAMGSetCpointsToKeep(void      *data,
+				HYPRE_Int  cpt_coarse_level,
+				HYPRE_Int  num_cpt_coarse,
+				HYPRE_Int *cpt_coarse_index)
+{
+	hypre_ParAMGData *amg_data = (hypre_ParAMGData*) data;
+
+	HYPRE_Int **C_point_marker_array = NULL;
+	HYPRE_Int *C_point_marker = NULL;
+	HYPRE_Int cpt_level;
+	HYPRE_Int i;
+
+	if (!amg_data)
+	{
+	    hypre_printf("Warning! AMG object empty!\n");
+	    hypre_error_in_arg(1);
+	    return hypre_error_flag;
+	}
+	if(cpt_coarse_level < 0)
+	{
+	   hypre_printf("Warning! cpt_coarse_level < 0 !\n");
+	   hypre_error_in_arg(2);
+           return hypre_error_flag;
+	}
+	if(num_cpt_coarse < 0)
+	{
+	   hypre_printf("Warning! num_cpt_coarse < 0 !\n");
+	   hypre_error_in_arg(2);
+           return hypre_error_flag;
+	}
+
+	/* free data not previously destroyed */
+	if(hypre_ParAMGDataCPointKeepLevel(amg_data))
+	{
+	   for(i=0; i<hypre_ParAMGDataCPointKeepLevel(amg_data); i++)
+	   {
+	      if(hypre_ParAMGDataCPointKeepMarkerArray(amg_data)[i])
+	      {
+	         hypre_TFree(hypre_ParAMGDataCPointKeepMarkerArray(amg_data)[i]);
+	         hypre_ParAMGDataCPointKeepMarkerArray(amg_data)[i] = NULL;
+	      }
+	   }
+	   hypre_TFree(hypre_ParAMGDataCPointKeepMarkerArray(amg_data));
+	   hypre_ParAMGDataCPointKeepMarkerArray(amg_data) = NULL;
+	}
+	/* set Cpoint_keep data */
+	if (hypre_ParAMGDataMaxLevels(amg_data) < cpt_coarse_level)
+	{
+		cpt_level = hypre_ParAMGDataNumLevels(amg_data);
+	} else {
+		cpt_level = cpt_coarse_level;
+	}
+        
+        if(cpt_level)
+        {
+           C_point_marker_array = hypre_CTAlloc(HYPRE_Int*, cpt_level);
+           C_point_marker = hypre_CTAlloc(HYPRE_Int, num_cpt_coarse);
+           /* copy Cpoint indexes */
+           for(i=0; i<num_cpt_coarse; i++)
+           {
+              C_point_marker[i] = cpt_coarse_index[i];
+           }
+           C_point_marker_array[0] = C_point_marker;     
+        }
+	hypre_ParAMGDataCPointKeepMarkerArray(amg_data) = C_point_marker_array;        
+	hypre_ParAMGDataNumCPointKeep(amg_data) = num_cpt_coarse;	
+	hypre_ParAMGDataCPointKeepLevel(amg_data) = cpt_level;
+
+	return hypre_error_flag;
+}
diff -Nru hypre-2.11.2/src/parcsr_ls/par_amg.h hypre-2.13.0/src/parcsr_ls/par_amg.h
--- hypre-2.11.2/src/parcsr_ls/par_amg.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/par_amg.h	2017-10-20 17:42:22.000000000 +0000
@@ -105,7 +105,6 @@
    HYPRE_Int                  num_levels;
    HYPRE_Real         **l1_norms;
 
-
    /* Block data */
    hypre_ParCSRBlockMatrix **A_block_array;
    hypre_ParCSRBlockMatrix **P_block_array;
@@ -136,8 +135,13 @@
 
    HYPRE_Real          *max_eig_est;
    HYPRE_Real          *min_eig_est;
-   HYPRE_Int                  cheby_order;
+   HYPRE_Int            cheby_eig_est;
+   HYPRE_Int            cheby_order;
+   HYPRE_Int            cheby_variant;
+   HYPRE_Int            cheby_scale;
    HYPRE_Real           cheby_fraction;
+   HYPRE_Real         **cheby_ds;
+   HYPRE_Real         **cheby_coefs;
 
    /* data needed for non-Galerkin option */
    HYPRE_Int           nongalerk_num_tol;
@@ -209,11 +213,13 @@
    HYPRE_Int      additive;
    HYPRE_Int      mult_additive;
    HYPRE_Int      simple;
+   HYPRE_Int      add_last_lvl;
    HYPRE_Int      add_P_max_elmts;
    HYPRE_Real     add_trunc_factor;
    HYPRE_Int      add_rlx_type;
    HYPRE_Real     add_rlx_wt;
    hypre_ParCSRMatrix *Lambda;
+   hypre_ParCSRMatrix *Atilde;
    hypre_ParVector *Rtilde;
    hypre_ParVector *Xtilde;
    HYPRE_Real *D_inv;
@@ -222,7 +228,11 @@
    HYPRE_Int rap2;
 
    HYPRE_Int keepTranspose;
-
+	
+/* information for preserving indeces as coarse grid points */
+	HYPRE_Int C_point_coarse_level;
+	HYPRE_Int num_C_point_marker;
+	HYPRE_Int   **C_point_marker_array;
 } hypre_ParAMGData;
 
 /*--------------------------------------------------------------------------
@@ -296,6 +306,7 @@
 
 /* data generated by the setup phase */
 #define hypre_ParAMGDataCFMarkerArray(amg_data) ((amg_data)-> CF_marker_array)
+#define hypre_ParAMGDataCPointMarkerArray(amg_data) ((amg_data)-> C_point_marker_array)
 #define hypre_ParAMGDataAArray(amg_data) ((amg_data)->A_array)
 #define hypre_ParAMGDataFArray(amg_data) ((amg_data)->F_array)
 #define hypre_ParAMGDataUArray(amg_data) ((amg_data)->U_array)
@@ -332,9 +343,13 @@
 
 #define hypre_ParAMGDataMaxEigEst(amg_data) ((amg_data)->max_eig_est)	
 #define hypre_ParAMGDataMinEigEst(amg_data) ((amg_data)->min_eig_est)	
+#define hypre_ParAMGDataChebyEigEst(amg_data) ((amg_data)->cheby_eig_est)
+#define hypre_ParAMGDataChebyVariant(amg_data) ((amg_data)->cheby_variant)
+#define hypre_ParAMGDataChebyScale(amg_data) ((amg_data)->cheby_scale)
 #define hypre_ParAMGDataChebyOrder(amg_data) ((amg_data)->cheby_order)
 #define hypre_ParAMGDataChebyFraction(amg_data) ((amg_data)->cheby_fraction)
-
+#define hypre_ParAMGDataChebyDS(amg_data) ((amg_data)->cheby_ds)
+#define hypre_ParAMGDataChebyCoefs(amg_data) ((amg_data)->cheby_coefs)
 
 /* block */
 #define hypre_ParAMGDataABlockArray(amg_data) ((amg_data)->A_block_array)
@@ -408,11 +423,13 @@
 #define hypre_ParAMGDataAdditive(amg_data) ((amg_data)->additive)
 #define hypre_ParAMGDataMultAdditive(amg_data) ((amg_data)->mult_additive)
 #define hypre_ParAMGDataSimple(amg_data) ((amg_data)->simple)
+#define hypre_ParAMGDataAddLastLvl(amg_data) ((amg_data)->add_last_lvl)
 #define hypre_ParAMGDataMultAddPMaxElmts(amg_data) ((amg_data)->add_P_max_elmts)
 #define hypre_ParAMGDataMultAddTruncFactor(amg_data) ((amg_data)->add_trunc_factor)
 #define hypre_ParAMGDataAddRelaxType(amg_data) ((amg_data)->add_rlx_type)
 #define hypre_ParAMGDataAddRelaxWt(amg_data) ((amg_data)->add_rlx_wt)
 #define hypre_ParAMGDataLambda(amg_data) ((amg_data)->Lambda)
+#define hypre_ParAMGDataAtilde(amg_data) ((amg_data)->Atilde)
 #define hypre_ParAMGDataRtilde(amg_data) ((amg_data)->Rtilde)
 #define hypre_ParAMGDataXtilde(amg_data) ((amg_data)->Xtilde)
 #define hypre_ParAMGDataDinv(amg_data) ((amg_data)->D_inv)
@@ -425,6 +442,8 @@
 
 #define hypre_ParAMGDataRAP2(amg_data) ((amg_data)->rap2)
 #define hypre_ParAMGDataKeepTranspose(amg_data) ((amg_data)->keepTranspose)
+
+
 #endif
 
 
diff -Nru hypre-2.11.2/src/parcsr_ls/par_amg_setup.c hypre-2.13.0/src/parcsr_ls/par_amg_setup.c
--- hypre-2.11.2/src/parcsr_ls/par_amg_setup.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/par_amg_setup.c	2017-10-20 17:42:22.000000000 +0000
@@ -79,10 +79,11 @@
    HYPRE_Int      mult_additive = hypre_ParAMGDataMultAdditive(amg_data);
    HYPRE_Int      additive = hypre_ParAMGDataAdditive(amg_data);
    HYPRE_Int      simple = hypre_ParAMGDataSimple(amg_data);
+   HYPRE_Int      add_last_lvl = hypre_ParAMGDataAddLastLvl(amg_data);
    HYPRE_Int      add_P_max_elmts = hypre_ParAMGDataMultAddPMaxElmts(amg_data);
    HYPRE_Real     add_trunc_factor = hypre_ParAMGDataMultAddTruncFactor(amg_data);
    HYPRE_Int      add_rlx = hypre_ParAMGDataAddRelaxType(amg_data);
-   HYPRE_Real      add_rlx_wt = hypre_ParAMGDataAddRelaxWt(amg_data);
+   HYPRE_Real     add_rlx_wt = hypre_ParAMGDataAddRelaxWt(amg_data);
 
    hypre_ParCSRBlockMatrix **A_block_array, **P_block_array;
  
@@ -99,8 +100,11 @@
    hypre_ParCSRMatrix  *AN = NULL;
    hypre_ParCSRMatrix  *P1;
    hypre_ParCSRMatrix  *P2;
+   hypre_ParCSRMatrix  *Pnew = NULL;
    HYPRE_Real          *SmoothVecs = NULL;
    HYPRE_Real         **l1_norms = NULL;
+   HYPRE_Real         **cheby_ds = NULL;
+   HYPRE_Real         **cheby_coefs = NULL;
 
    HYPRE_Int       old_num_levels, num_levels;
    HYPRE_Int       level;
@@ -183,7 +187,28 @@
    HYPRE_Int rap2 = hypre_ParAMGDataRAP2(amg_data);
    HYPRE_Int keepTranspose = hypre_ParAMGDataKeepTranspose(amg_data);
 
+   HYPRE_Int                **C_point_marker_array;
+   HYPRE_Int    local_coarse_size;
+   HYPRE_Int    num_C_point_coarse = hypre_ParAMGDataNumCPointKeep(amg_data);
+   HYPRE_Int   *C_point_keep;
+   
+   HYPRE_Int *num_grid_sweeps = hypre_ParAMGDataNumGridSweeps(amg_data);
+   HYPRE_Int ns = num_grid_sweeps[1];
    HYPRE_Real    wall_time;   /* for debugging instrumentation */
+   HYPRE_Int      add_end;
+
+#ifdef HYPRE_USE_GPU
+   if (!hypre_ParCSRMatrixIsManaged(A)){
+     hypre_fprintf(stderr,"ERROR:: INVALID A in hypre_BoomerAMGSetup::Address %p\n",A);
+     //exit(2);
+   } else if(!hypre_ParVectorIsManaged(f)){
+     hypre_fprintf(stderr,"ERROR:: INVALID f in hypre_BoomerAMGSetup::Address %p\n",f);
+     //exit(2);
+   } else if (!hypre_ParVectorIsManaged(u)){
+     hypre_fprintf(stderr,"ERROR:: INVALID u in hypre_BoomerAMGSetup::Address %p\n",u);
+     //exit(2);
+   } 
+#endif
 
    /*hypre_CSRMatrix *A_new;*/
 
@@ -196,6 +221,8 @@
    hypre_CSRMatrixPrint(A_new, "Atestnew"); */
    old_num_levels = hypre_ParAMGDataNumLevels(amg_data);
    max_levels = hypre_ParAMGDataMaxLevels(amg_data);
+   add_end = hypre_min(add_last_lvl, max_levels-1);
+   if (add_end == -1) add_end = max_levels-1;
    amg_logging = hypre_ParAMGDataLogging(amg_data);
    amg_print_level = hypre_ParAMGDataPrintLevel(amg_data);
    coarsen_type = hypre_ParAMGDataCoarsenType(amg_data);
@@ -246,6 +273,8 @@
 
    grid_relax_type[3] = hypre_ParAMGDataUserCoarseRelaxType(amg_data); 
 
+   C_point_marker_array = hypre_ParAMGDataCPointKeepMarkerArray(amg_data);
+
    HYPRE_ANNOTATION_BEGIN("BoomerAMG.setup");
    
    /* change in definition of standard and multipass interpolation, by
@@ -596,6 +625,7 @@
 
    dof_func_array[0] = dof_func;
    hypre_ParAMGDataCFMarkerArray(amg_data) = CF_marker_array;
+   hypre_ParAMGDataCPointKeepMarkerArray(amg_data) = C_point_marker_array;
    hypre_ParAMGDataDofFuncArray(amg_data) = dof_func_array;
    hypre_ParAMGDataAArray(amg_data) = A_array;
    hypre_ParAMGDataPArray(amg_data) = P_array;
@@ -745,7 +775,6 @@
    while (not_finished_coarsening)
    {
 
-
       /* only do nodal coarsening on a fixed number of levels */
       if (level >= nodal_levels)
       {
@@ -1086,6 +1115,43 @@
               AN = NULL;
            }
          }
+
+		 		 /**************************************************/
+		 /*********Set the fixed index to CF_marker*********/
+		 //num_C_point_coarse
+
+		 if (hypre_ParAMGDataCPointKeepLevel(amg_data) > 0)
+		 {
+			 if (block_mode)
+			 {
+				 printf("Keeping coarse nodes in block mode is not implemented\n");
+			 }
+			 else if  (level < hypre_ParAMGDataCPointKeepLevel(amg_data))
+			 {
+				 C_point_keep = C_point_marker_array[level];
+				 if (level < hypre_ParAMGDataCPointKeepLevel(amg_data)-1)
+					 C_point_marker_array[level+1] = hypre_CTAlloc(HYPRE_Int, num_C_point_coarse);
+				 
+				 for(j = 0;j < num_C_point_coarse;j++)
+				 {
+					 CF_marker[C_point_keep[j]] = 2;
+				 }
+				 
+				 local_coarse_size = 0;
+				 k = 0;
+				 for (j = 0; j < local_num_vars; j ++)
+				 {
+					 if (CF_marker[j] == 1) local_coarse_size++;
+					 if (CF_marker[j] == 2) {
+						 if (level < hypre_ParAMGDataCPointKeepLevel(amg_data)-1)
+							 C_point_marker_array[level+1][k++] = local_coarse_size;
+						 local_coarse_size++;
+						 CF_marker[j] = 1;					 
+					 }
+				 }
+			 }
+		 }
+		 
    /*****xxxxxxxxxxxxx changes for min_coarse_size */
          /* here we will determine the coarse grid size to be able to determine if it is not smaller 
 	    than requested minimal size */
@@ -1144,7 +1210,8 @@
 	    if (S) hypre_ParCSRMatrixDestroy(S);
 	    if (SN) hypre_ParCSRMatrixDestroy(SN);
 	    if (AN) hypre_ParCSRMatrixDestroy(AN);
-            hypre_TFree(CF_marker);
+	    if (num_functions > 1) hypre_TFree(coarse_dof_func);
+	    hypre_TFree(CF_marker);
             hypre_TFree(coarse_pnts_global);
             if (level > 0)
             {
@@ -1389,7 +1456,6 @@
 	    fflush(NULL);
          }
 
-
             if (debug_flag==1) wall_time = time_getWallclockSeconds();
 
             if (interp_type == 4) 
@@ -1895,11 +1961,9 @@
 
       if (!block_mode)
       {
-         if (mult_addlvl > -1 && level >= mult_addlvl)
+         if (mult_addlvl > -1 && level >= mult_addlvl && level <= add_end)
          {
             HYPRE_Real *d_diag;
-            hypre_ParCSRMatrix *Q = NULL;
-            Q = hypre_ParMatmul(A_array[level],P);
             if (add_rlx == 0)
             {
                hypre_CSRMatrix *lvl_Adiag = hypre_ParCSRMatrixDiag(A_array[level]);
@@ -1919,52 +1983,110 @@
                else 
                   hypre_ParCSRComputeL1NormsThreads(A_array[level], 1, num_threads, NULL, &d_diag);
             }
-            hypre_ParCSRMatrixAminvDB(P,Q,d_diag,&P_array[level]);
-            A_H = hypre_ParTMatmul(P,Q);
-            hypre_ParCSRMatrixRowStarts(A_H) = hypre_ParCSRMatrixColStarts(A_H);
-            hypre_ParCSRMatrixOwnsRowStarts(A_H) = 1;
-            hypre_ParCSRMatrixOwnsColStarts(A_H) = 0;
-            hypre_ParCSRMatrixOwnsColStarts(P) = 0; 
-            if (num_procs > 1) hypre_MatvecCommPkgCreate(A_H); 
-            /*hypre_ParCSRMatrixDestroy(P); */
-            hypre_TFree(d_diag); 
-	    /*hypre_BoomerAMGBuildCoarseOperator(P, A_array[level] , P, &A_H); 
-            hypre_ParCSRMatrix *C = NULL;
-            HYPRE_Int *num_grid_sweeps
-                        = hypre_ParAMGDataNumGridSweeps(amg_data);
-            if (grid_relax_type[1] == 18)
-		C = hypre_CreateC(A_array[level], 0.0);
-            else
-		C = hypre_CreateC(A_array[level], relax_weight[level]);
-            if (num_grid_sweeps[1] > 1)
+            if (ns == 1)
             {
-                  hypre_ParCSRMatrix *Pnew = NULL;
-                  Pnew = hypre_ParMatmul(C,P);
-                  P_array[level] = hypre_ParMatmul(C,Pnew);
-                  hypre_ParCSRMatrixDestroy(Pnew);
+               hypre_ParCSRMatrix *Q = NULL;
+               Q = hypre_ParMatmul(A_array[level],P);
+               hypre_ParCSRMatrixAminvDB(P,Q,d_diag,&P_array[level]);
+               A_H = hypre_ParTMatmul(P,Q);
+               hypre_ParCSRMatrixRowStarts(A_H) = hypre_ParCSRMatrixColStarts(A_H);
+               hypre_ParCSRMatrixOwnsRowStarts(A_H) = 1;
+               hypre_ParCSRMatrixOwnsColStarts(A_H) = 0;
+               hypre_ParCSRMatrixOwnsColStarts(P) = 0; 
+               if (num_procs > 1) hypre_MatvecCommPkgCreate(A_H); 
+               /*hypre_ParCSRMatrixDestroy(P); */
+               hypre_TFree(d_diag); 
+               /* Set NonGalerkin drop tol on each level */
+               if (level < nongalerk_num_tol) nongalerk_tol_l = nongalerk_tol[level];
+               if (nongal_tol_array) nongalerk_tol_l = nongal_tol_array[level];
+               if (nongalerk_tol_l > 0.0)
+               {
+               /* Build Non-Galerkin Coarse Grid */
+                  hypre_ParCSRMatrix *Q = NULL;
+                  hypre_BoomerAMGBuildNonGalerkinCoarseOperator(&A_H, Q,
+                    0.333*strong_threshold, max_row_sum, num_functions, 
+                    dof_func_array[level+1], S_commpkg_switch, CF_marker_array[level], 
+                    /* nongalerk_tol, sym_collapse, lump_percent, beta );*/
+                      nongalerk_tol_l,      1,            0.5,    1.0 );
+            
+                  hypre_ParCSRMatrixColStarts(P_array[level]) = hypre_ParCSRMatrixRowStarts(A_H);
+                  if (!hypre_ParCSRMatrixCommPkg(A_H))
+                     hypre_MatvecCommPkgCreate(A_H);
+               }
+               hypre_ParCSRMatrixDestroy(Q);
+			
             }
-            else
-                  P_array[level] = hypre_ParMatmul(C,P);
-            hypre_ParCSRMatrixDestroy(C); */
-
-            /* Set NonGalerkin drop tol on each level */
-            if (level < nongalerk_num_tol) nongalerk_tol_l = nongalerk_tol[level];
-            if (nongal_tol_array) nongalerk_tol_l = nongal_tol_array[level];
-            if (nongalerk_tol_l > 0.0)
+            else 
             {
-            /* Build Non-Galerkin Coarse Grid */
-               hypre_BoomerAMGBuildNonGalerkinCoarseOperator(&A_H, Q,
+               HYPRE_Int ns_tmp = ns;
+               hypre_ParCSRMatrix *C = NULL;
+               hypre_ParCSRMatrix *Ptmp = NULL;
+               /* Set NonGalerkin drop tol on each level */
+               if (level < nongalerk_num_tol)
+                   nongalerk_tol_l = nongalerk_tol[level];
+               if (nongal_tol_array) nongalerk_tol_l = nongal_tol_array[level];
+
+               if (nongalerk_tol_l > 0.0)
+               {
+                  /* Construct AP, and then RAP */
+                  hypre_ParCSRMatrix *Q = NULL;
+                  Q = hypre_ParMatmul(A_array[level],P_array[level]);
+                  A_H = hypre_ParTMatmul(P_array[level],Q);
+                  hypre_ParCSRMatrixRowStarts(A_H) = hypre_ParCSRMatrixColStarts(A_H);
+                  hypre_ParCSRMatrixOwnsRowStarts(A_H) = 1;
+                  hypre_ParCSRMatrixOwnsColStarts(A_H) = 0;
+                  hypre_ParCSRMatrixOwnsColStarts(P_array[level]) = 0;
+                  if (num_procs > 1) hypre_MatvecCommPkgCreate(A_H);
+            
+                  /* Build Non-Galerkin Coarse Grid */
+                  hypre_BoomerAMGBuildNonGalerkinCoarseOperator(&A_H, Q,
                     0.333*strong_threshold, max_row_sum, num_functions, 
                     dof_func_array[level+1], S_commpkg_switch, CF_marker_array[level], 
                     /* nongalerk_tol, sym_collapse, lump_percent, beta );*/
                       nongalerk_tol_l,      1,            0.5,    1.0 );
             
-               hypre_ParCSRMatrixColStarts(P_array[level]) = hypre_ParCSRMatrixRowStarts(A_H);
-               if (!hypre_ParCSRMatrixCommPkg(A_H))
-                   hypre_MatvecCommPkgCreate(A_H);
-			
+                  if (!hypre_ParCSRMatrixCommPkg(A_H))
+                     hypre_MatvecCommPkgCreate(A_H);
+            
+                  /* Delete AP */
+                  hypre_ParCSRMatrixDestroy(Q);
+               }
+               else if (rap2)
+               {
+                  /* Use two matrix products to generate A_H */
+                  hypre_ParCSRMatrix *Q = NULL;
+                  Q = hypre_ParMatmul(A_array[level],P_array[level]);
+                  A_H = hypre_ParTMatmul(P_array[level],Q);
+                  hypre_ParCSRMatrixOwnsRowStarts(A_H) = 1;
+                  hypre_ParCSRMatrixOwnsColStarts(A_H) = 0;
+                  hypre_ParCSRMatrixOwnsColStarts(P_array[level]) = 0;
+                  if (num_procs > 1) hypre_MatvecCommPkgCreate(A_H);
+                  /* Delete AP */
+                  hypre_ParCSRMatrixDestroy(Q);
+               }
+               else
+	          hypre_BoomerAMGBuildCoarseOperatorKT(P, A_array[level] , P, 
+			keepTranspose, &A_H); 
+	
+               if (add_rlx == 18)
+	          C = hypre_CreateC(A_array[level], 0.0);
+               else
+		  C = hypre_CreateC(A_array[level], add_rlx_wt);
+               Ptmp = P;
+	       while (ns_tmp > 0)
+               {
+                  Pnew = Ptmp;
+                  Ptmp = NULL;
+		  Ptmp = hypre_ParMatmul(C,Pnew);
+                  if (ns_tmp < ns)
+			hypre_ParCSRMatrixDestroy(Pnew);
+		  ns_tmp--;
+               }
+               Pnew = Ptmp;
+               P_array[level] = Pnew;
+               hypre_ParCSRMatrixDestroy(C); 
             }
-            hypre_ParCSRMatrixDestroy(Q);
+
 
 
             if (add_P_max_elmts || add_trunc_factor)
@@ -2012,7 +2134,7 @@
          A_block_array[level+1] = A_H_block;
 
       }
-      else if (mult_addlvl == -1 || level < mult_addlvl)
+      else if (mult_addlvl == -1 || level < mult_addlvl || level > add_end)
       {
          /* Set NonGalerkin drop tol on each level */
          if (level < nongalerk_num_tol)
@@ -2062,6 +2184,12 @@
             /* Compute standard Galerkin coarse-grid product */
             hypre_BoomerAMGBuildCoarseOperatorKT(P_array[level], A_array[level] , 
                                         P_array[level], keepTranspose, &A_H);
+            if (Pnew && ns==1) 
+            {
+               hypre_ParCSRMatrixDestroy(P);
+               P_array[level] = Pnew;
+            }
+	
          }
 
       }
@@ -2103,6 +2231,7 @@
    if (  (seq_threshold >= coarse_threshold) && (coarse_size > coarse_threshold) && (level != max_levels-1))
    {
       hypre_seqAMGSetup( amg_data, level, coarse_threshold);
+
    }
    else if (grid_relax_type[3] == 9 || grid_relax_type[3] == 99)  /*use of Gaussian elimination on coarsest level */
    {
@@ -2153,7 +2282,7 @@
          hypre_ParVectorSetPartitioningOwner(U_array[level],0);
       }   
    }
-   
+
    /*-----------------------------------------------------------------------
     * enter all the stuff created, A[level], P[level], CF_marker[level],
     * for levels 1 through coarsest, into amg_data data structure
@@ -2170,6 +2299,7 @@
     *-----------------------------------------------------------------------*/
 
    if (addlvl > -1 || 
+	grid_relax_type[1] == 7 || grid_relax_type[2] == 7 || grid_relax_type[3] == 7 ||
 	grid_relax_type[1] == 8 || grid_relax_type[2] == 8 || grid_relax_type[3] == 8 ||
 	grid_relax_type[1] == 13 || grid_relax_type[2] == 13 || grid_relax_type[3] == 13 ||
 	grid_relax_type[1] == 14 || grid_relax_type[2] == 14 || grid_relax_type[3] == 14 ||
@@ -2185,6 +2315,10 @@
       min_eig_est = hypre_CTAlloc(HYPRE_Real, num_levels);
       hypre_ParAMGDataMaxEigEst(amg_data) = max_eig_est;
       hypre_ParAMGDataMinEigEst(amg_data) = min_eig_est;
+      cheby_ds = hypre_CTAlloc(HYPRE_Real *, num_levels);
+      cheby_coefs = hypre_CTAlloc(HYPRE_Real *, num_levels);
+      hypre_ParAMGDataChebyDS(amg_data) = cheby_ds;
+      hypre_ParAMGDataChebyCoefs(amg_data) = cheby_coefs;
    }
    if (grid_relax_type[0] == 15 ||grid_relax_type[1] == 15 ||  grid_relax_type[2] == 15 || grid_relax_type[3] == 15)
       /* CG */
@@ -2253,7 +2387,7 @@
          }
       }
    }
-   for (j = addlvl; j < num_levels; j++)
+   for (j = addlvl; j < hypre_min(add_end+1, num_levels) ; j++)
    {
       if (add_rlx == 18 )
       {
@@ -2263,15 +2397,92 @@
                hypre_ParCSRComputeL1NormsThreads(A_array[j], 1, num_threads, NULL, &l1_norms[j]);
       }
    }
+   for (j = add_end+1; j < num_levels; j++)
+   {
+      if (num_threads == 1)
+      {
+         if (j < num_levels-1 && (grid_relax_type[1] == 8 || grid_relax_type[1] == 13 || 
+		grid_relax_type[1] == 14 || grid_relax_type[2] == 8 || grid_relax_type[2] == 13 ||
+		grid_relax_type[2] == 14)) 
+         {
+            if (relax_order)
+               hypre_ParCSRComputeL1Norms(A_array[j], 4, CF_marker_array[j], &l1_norms[j]);
+            else
+               hypre_ParCSRComputeL1Norms(A_array[j], 4, NULL, &l1_norms[j]);
+         }
+         else if ((grid_relax_type[3] == 8 || grid_relax_type[3] == 13 || grid_relax_type[3] == 14) 
+		&& j == num_levels-1)
+         {
+            hypre_ParCSRComputeL1Norms(A_array[j], 4, NULL, &l1_norms[j]);
+         }
+         if ((grid_relax_type[1] == 18 || grid_relax_type[2] == 18)  && j < num_levels-1)
+         {
+            if (relax_order)
+               hypre_ParCSRComputeL1Norms(A_array[j], 1, CF_marker_array[j], &l1_norms[j]);
+            else
+               hypre_ParCSRComputeL1Norms(A_array[j], 1, NULL, &l1_norms[j]);
+         }
+         else if (grid_relax_type[3] == 18 && j == num_levels-1)
+         {
+            hypre_ParCSRComputeL1Norms(A_array[j], 1, NULL, &l1_norms[j]);
+         }
+      }
+      else
+      {
+         if (j < num_levels-1 && (grid_relax_type[1] == 8 || grid_relax_type[1] == 13 || 
+		grid_relax_type[1] == 14 || grid_relax_type[2] == 8 || grid_relax_type[2] == 13 ||
+		grid_relax_type[2] == 14)) 
+         {
+            if (relax_order)
+               hypre_ParCSRComputeL1NormsThreads(A_array[j], 4, num_threads, CF_marker_array[j] , &l1_norms[j]);
+            else
+               hypre_ParCSRComputeL1NormsThreads(A_array[j], 4, num_threads, NULL, &l1_norms[j]);
+         }
+         else if ((grid_relax_type[3] == 8 || grid_relax_type[3] == 13 || grid_relax_type[3] == 14) 
+		&& j == num_levels-1)
+         {
+            hypre_ParCSRComputeL1NormsThreads(A_array[j], 4, num_threads, NULL, &l1_norms[j]);
+         }
+         if ((grid_relax_type[1] == 18 || grid_relax_type[2] == 18)  && j < num_levels-1)
+         {
+            if (relax_order)
+               hypre_ParCSRComputeL1NormsThreads(A_array[j], 1, num_threads, CF_marker_array[j], &l1_norms[j]);
+            else
+               hypre_ParCSRComputeL1NormsThreads(A_array[j], 1, num_threads, NULL, &l1_norms[j]);
+         }
+         else if (grid_relax_type[3] == 18 && j == num_levels-1)
+         {
+            hypre_ParCSRComputeL1NormsThreads(A_array[j], 1, num_threads, NULL, &l1_norms[j]);
+         }
+      }
+   }
    for (j = 0; j < num_levels; j++)
    {
-      if (grid_relax_type[1] == 16 || grid_relax_type[2] == 16 || (grid_relax_type[3] == 16 && j== (num_levels-1)))
+      if (grid_relax_type[1] == 7 || grid_relax_type[2] == 7 || (grid_relax_type[3] == 7 && j== (num_levels-1)))
       {
-         HYPRE_Int scale = 1;
-         HYPRE_Real temp_d, temp_d2;
-         hypre_ParCSRMaxEigEstimateCG(A_array[j], scale, 10, &temp_d, &temp_d2);
-         max_eig_est[j] = temp_d;
-         min_eig_est[j] = temp_d2;
+          hypre_ParCSRComputeL1Norms(A_array[j], 5, NULL, &l1_norms[j]);
+      }
+      else if (grid_relax_type[1] == 16 || grid_relax_type[2] == 16 || (grid_relax_type[3] == 16 && j== (num_levels-1)))
+      {
+         HYPRE_Int scale = hypre_ParAMGDataChebyScale(amg_data);;
+         HYPRE_Int variant = hypre_ParAMGDataChebyVariant(amg_data);
+         HYPRE_Real max_eig, min_eig = 0;
+         HYPRE_Real *coefs = NULL;
+         HYPRE_Real *ds = NULL;
+         HYPRE_Int cheby_order = hypre_ParAMGDataChebyOrder(amg_data);
+         HYPRE_Int cheby_eig_est = hypre_ParAMGDataChebyEigEst(amg_data);
+         HYPRE_Real cheby_fraction = hypre_ParAMGDataChebyFraction(amg_data);
+         if (cheby_eig_est)
+	    hypre_ParCSRMaxEigEstimateCG(A_array[j], scale, cheby_eig_est, 
+		&max_eig, &min_eig);
+         else
+	    hypre_ParCSRMaxEigEstimate(A_array[j], scale, &max_eig);
+         max_eig_est[j] = max_eig;
+         min_eig_est[j] = min_eig;
+         hypre_ParCSRRelax_Cheby_Setup(A_array[j],max_eig, min_eig, 
+		cheby_fraction, cheby_order, scale, variant, &coefs, &ds);
+         cheby_coefs[j] = coefs;
+         cheby_ds[j] = ds;
       }
      else if (grid_relax_type[1] == 15 || (grid_relax_type[3] == 15 && j == (num_levels-1))  )
      {
diff -Nru hypre-2.11.2/src/parcsr_ls/par_cheby.c hypre-2.13.0/src/parcsr_ls/par_cheby.c
--- hypre-2.11.2/src/parcsr_ls/par_cheby.c	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/par_cheby.c	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,338 @@
+/******************************************************************************
+ *
+ * Chebyshev setup and solve
+ *
+ *****************************************************************************/
+
+#include "_hypre_parcsr_ls.h"
+#include "_hypre_parcsr_mv.h"
+#include "float.h"
+
+
+/******************************************************************************
+
+Chebyshev relaxation
+
+ 
+Can specify order 1-4 (this is the order of the resid polynomial)- here we
+explicitly code the coefficients (instead of
+iteratively determining)
+
+
+variant 0: standard chebyshev
+this is rlx 11 if scale = 0, and 16 if scale == 1
+
+variant 1: modified cheby: T(t)* f(t) where f(t) = (1-b/t)
+this is rlx 15 if scale = 0, and 17 if scale == 1
+
+ratio indicates the percentage of the whole spectrum to use (so .5
+means half, and .1 means 10percent)
+
+
+*******************************************************************************/
+
+HYPRE_Int hypre_ParCSRRelax_Cheby_Setup(hypre_ParCSRMatrix *A, /* matrix to relax with */
+                            HYPRE_Real max_eig,      
+                            HYPRE_Real min_eig,     
+                            HYPRE_Real fraction,   
+                            HYPRE_Int order,            /* polynomial order */
+                            HYPRE_Int scale,            /* scale by diagonal?*/
+                            HYPRE_Int variant,           
+                            HYPRE_Real **coefs_ptr,
+                            HYPRE_Real **ds_ptr)   /* initial/updated approximation */
+{
+   hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
+   HYPRE_Real     *A_diag_data  = hypre_CSRMatrixData(A_diag);
+   HYPRE_Int            *A_diag_i     = hypre_CSRMatrixI(A_diag);
+
+   HYPRE_Real theta, delta;
+   
+   HYPRE_Real den;
+   HYPRE_Real upper_bound, lower_bound;
+   
+   HYPRE_Int j;
+   HYPRE_Int num_rows = hypre_CSRMatrixNumRows(A_diag);
+ 
+   HYPRE_Real *coefs = NULL;
+   
+   HYPRE_Int cheby_order;
+
+   HYPRE_Real *ds_data = NULL;
+   HYPRE_Real  diag;
+
+   /* u = u + p(A)r */
+
+   if (order > 4)
+      order = 4;
+   if (order < 1)
+      order = 1;
+
+   coefs = hypre_CTAlloc(HYPRE_Real, order+1);
+   /* we are using the order of p(A) */
+   cheby_order = order -1;
+   
+    /* make sure we are large enough -  Adams et al. 2003 */
+   upper_bound = max_eig * 1.1;
+   /* lower_bound = max_eig/fraction; */
+   lower_bound = (upper_bound - min_eig)* fraction + min_eig; 
+
+
+   /* theta and delta */
+   theta = (upper_bound + lower_bound)/2;
+   delta = (upper_bound - lower_bound)/2;
+
+   if (variant == 1 )
+   {
+      switch ( cheby_order ) /* these are the corresponding cheby polynomials: u = u_o + s(A)r_0  - so order is
+                                one less that  resid poly: r(t) = 1 - t*s(t) */ 
+      {
+         case 0: 
+            coefs[0] = 1.0/theta;     
+            
+            break;
+            
+         case 1:  /* (del - t + 2*th)/(th^2 + del*th) */
+            den = (theta*theta + delta*theta);
+            
+            coefs[0] = (delta + 2*theta)/den;     
+            coefs[1] = -1.0/den;
+            
+            break;
+            
+         case 2:  /* (4*del*th - del^2 - t*(2*del + 6*th) + 2*t^2 + 6*th^2)/(2*del*th^2 - del^2*th - del^3 + 2*th^3)*/
+            den = 2*delta*theta*theta - delta*delta*theta - pow(delta,3) + 2*pow(theta,3);
+            
+            coefs[0] = (4*delta*theta - pow(delta,2) +  6*pow(theta,2))/den;
+            coefs[1] = -(2*delta + 6*theta)/den;
+            coefs[2] =  2/den;
+            
+            break;
+            
+         case 3: /* -(6*del^2*th - 12*del*th^2 - t^2*(4*del + 16*th) + t*(12*del*th - 3*del^2 + 24*th^2) + 3*del^3 + 4*t^3 - 16*th^3)/(4*del*th^3 - 3*del^2*th^2 - 3*del^3*th + 4*th^4)*/
+            den = - (4*delta*pow(theta,3) - 3*pow(delta,2)*pow(theta,2) - 3*pow(delta,3)*theta + 4*pow(theta,4) );
+            
+            coefs[0] = (6*pow(delta,2)*theta - 12*delta*pow(theta,2) + 3*pow(delta,3) - 16*pow(theta,3)   )/den;
+            coefs[1] = (12*delta*theta - 3*pow(delta,2) + 24*pow(theta,2))/den;
+            coefs[2] =  -( 4*delta + 16*theta)/den;
+            coefs[3] = 4/den;
+            
+            break;
+      }
+   }
+   
+   else /* standard chebyshev */
+   {
+   
+      switch ( cheby_order ) /* these are the corresponding cheby polynomials: u = u_o + s(A)r_0  - so order is
+                                one less thatn resid poly: r(t) = 1 - t*s(t) */ 
+      {
+         case 0: 
+            coefs[0] = 1.0/theta;     
+            break;
+            
+         case 1:  /* (  2*t - 4*th)/(del^2 - 2*th^2) */
+            den = delta*delta - 2*theta*theta;
+            
+            coefs[0] = -4*theta/den;     
+            coefs[1] = 2/den;   
+            
+            break;
+            
+         case 2: /* (3*del^2 - 4*t^2 + 12*t*th - 12*th^2)/(3*del^2*th - 4*th^3)*/
+            den = 3*(delta*delta)*theta - 4*(theta*theta*theta);
+            
+            coefs[0] = (3*delta*delta - 12 *theta*theta)/den;
+            coefs[1] = 12*theta/den;
+            coefs[2] = -4/den; 
+            
+            break;
+            
+         case 3: /*(t*(8*del^2 - 48*th^2) - 16*del^2*th + 32*t^2*th - 8*t^3 + 32*th^3)/(del^4 - 8*del^2*th^2 + 8*th^4)*/
+            den = pow(delta,4) - 8*delta*delta*theta*theta + 8*pow(theta,4);
+            
+            coefs[0] = (32*pow(theta,3)- 16*delta*delta*theta)/den;
+            coefs[1] = (8*delta*delta - 48*theta*theta)/den;
+            coefs[2] = 32*theta/den;
+            coefs[3] = -8/den;
+            
+            break;
+      }
+   }
+   *coefs_ptr = coefs;
+
+   if (scale)
+   {
+      /*grab 1/sqrt(diagonal) */
+      ds_data = hypre_CTAlloc(HYPRE_Real, num_rows);
+      
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(j,diag) HYPRE_SMP_SCHEDULE 
+#endif
+      for (j = 0; j < num_rows; j++)
+      {
+         diag = A_diag_data[A_diag_i[j]];
+         ds_data[j] = 1/sqrt(diag);
+      }
+
+   }/* end of scaling code */
+   *ds_ptr = ds_data;
+
+   return hypre_error_flag;
+}
+
+HYPRE_Int hypre_ParCSRRelax_Cheby_Solve(hypre_ParCSRMatrix *A, /* matrix to relax with */
+                            hypre_ParVector *f,    /* right-hand side */
+                            HYPRE_Real *ds_data,
+                            HYPRE_Real *coefs,
+                            HYPRE_Int order,            /* polynomial order */
+                            HYPRE_Int scale,            /* scale by diagonal?*/
+                            HYPRE_Int variant,           
+                            hypre_ParVector *u,   /* initial/updated approximation */
+                            hypre_ParVector *v    /* temporary vector */,
+                            hypre_ParVector *r    /*another temp vector */  )
+{
+   hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
+   HYPRE_Real *u_data = hypre_VectorData(hypre_ParVectorLocalVector(u));
+   HYPRE_Real *f_data = hypre_VectorData(hypre_ParVectorLocalVector(f));
+   HYPRE_Real *v_data = hypre_VectorData(hypre_ParVectorLocalVector(v));
+
+   HYPRE_Real  *r_data = hypre_VectorData(hypre_ParVectorLocalVector(r));
+
+   HYPRE_Int i, j;
+   HYPRE_Int num_rows = hypre_CSRMatrixNumRows(A_diag);
+ 
+   HYPRE_Real mult;
+   HYPRE_Real *orig_u;
+   
+   HYPRE_Int cheby_order;
+
+   HYPRE_Real  *tmp_data;
+
+   hypre_ParVector    *tmp_vec;
+
+   /* u = u + p(A)r */
+
+   if (order > 4)
+      order = 4;
+   if (order < 1)
+      order = 1;
+
+   /* we are using the order of p(A) */
+   cheby_order = order -1;
+   
+   orig_u = hypre_CTAlloc(HYPRE_Real, num_rows);
+
+   if (!scale)
+   {
+      /* get residual: r = f - A*u */
+      hypre_ParVectorCopy(f, r); 
+      hypre_ParCSRMatrixMatvec(-1.0, A, u, 1.0, r);
+
+      for ( i = 0; i < num_rows; i++ ) 
+      {
+         orig_u[i] = u_data[i];
+         u_data[i] = r_data[i] * coefs[cheby_order]; 
+      }
+      for (i = cheby_order - 1; i >= 0; i-- ) 
+      {
+         hypre_ParCSRMatrixMatvec(1.0, A, u, 0.0, v);
+         mult = coefs[i];
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE 
+#endif
+         for ( j = 0; j < num_rows; j++ )
+         {
+            u_data[j] = mult * r_data[j] + v_data[j];
+         }
+      }
+
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE 
+#endif
+      for ( i = 0; i < num_rows; i++ ) 
+      {
+         u_data[i] = orig_u[i] + u_data[i];
+      }
+   }
+   else /* scaling! */
+   {
+      
+      /*grab 1/sqrt(diagonal) */
+      
+      tmp_vec = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A),
+                                      hypre_ParCSRMatrixGlobalNumRows(A),
+                                      hypre_ParCSRMatrixRowStarts(A));
+      hypre_ParVectorInitialize(tmp_vec);
+      hypre_ParVectorSetPartitioningOwner(tmp_vec,0);
+      tmp_data = hypre_VectorData(hypre_ParVectorLocalVector(tmp_vec));
+
+    /* get ds_data and get scaled residual: r = D^(-1/2)f -
+       * D^(-1/2)A*u */
+
+      hypre_ParCSRMatrixMatvec(-1.0, A, u, 0.0, tmp_vec);
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE 
+#endif
+      for ( j = 0; j < num_rows; j++ ) 
+      {
+         r_data[j] = ds_data[j] * (f_data[j] + tmp_data[j]);
+      }
+
+      /* save original u, then start 
+         the iteration by multiplying r by the cheby coef.*/
+
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE 
+#endif
+      for ( j = 0; j < num_rows; j++ ) 
+      {
+         orig_u[j] = u_data[j]; /* orig, unscaled u */
+
+         u_data[j] = r_data[j] * coefs[cheby_order]; 
+      }
+
+      /* now do the other coefficients */   
+      for (i = cheby_order - 1; i >= 0; i-- ) 
+      {
+         /* v = D^(-1/2)AD^(-1/2)u */
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE 
+#endif
+         for ( j = 0; j < num_rows; j++ )
+         {
+            tmp_data[j]  =  ds_data[j] * u_data[j];
+         }
+         hypre_ParCSRMatrixMatvec(1.0, A, tmp_vec, 0.0, v);
+
+         /* u_new = coef*r + v*/
+         mult = coefs[i];
+
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE 
+#endif
+         for ( j = 0; j < num_rows; j++ )
+         {
+            u_data[j] = mult * r_data[j] + ds_data[j]*v_data[j];
+         }
+         
+      } /* end of cheby_order loop */
+
+      /* now we have to scale u_data before adding it to u_orig*/
+
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE 
+#endif
+      for ( j = 0; j < num_rows; j++ ) 
+      {
+         u_data[j] = orig_u[j] + ds_data[j]*u_data[j];
+      }
+   
+      hypre_ParVectorDestroy(tmp_vec);  
+
+   }/* end of scaling code */
+
+   hypre_TFree(orig_u);
+  
+   return hypre_error_flag;
+}
+
diff -Nru hypre-2.11.2/src/parcsr_ls/par_coarsen.c hypre-2.13.0/src/parcsr_ls/par_coarsen.c
--- hypre-2.11.2/src/parcsr_ls/par_coarsen.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/par_coarsen.c	2017-10-20 17:42:22.000000000 +0000
@@ -1118,7 +1118,8 @@
          }
          else
          {
-            if (measure < 0) hypre_printf("negative measure!\n");
+	    if (measure < 0) hypre_error_w_msg(HYPRE_ERROR_GENERIC,"negative measure!\n");
+            /*if (measure < 0) hypre_printf("negative measure!\n");*/
             CF_marker[j] = f_pnt;
             for (k = S_i[j]; k < S_i[j+1]; k++)
             {
diff -Nru hypre-2.11.2/src/parcsr_ls/par_cr.c hypre-2.13.0/src/parcsr_ls/par_cr.c
--- hypre-2.11.2/src/parcsr_ls/par_cr.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/par_cr.c	2017-10-20 17:42:22.000000000 +0000
@@ -90,7 +90,7 @@
    hypre_fprintf(stdout,"-----------------------\n");
 
    for (i = 0; i < n; i++) 
-      e1[i] = 1.0e0+.1*rand();
+      e1[i] = 1.0e0+.1*hypre_RandI();
    
   /* stages */
    while(1){
@@ -133,8 +133,8 @@
 	    if (cf[i] ==  cpt) 
                nc+=1.0e0;
 	    else if (cf[i] ==  fpt){ 
-               e0[i] = 1.0e0+.1*rand();
-               e1[i] = 1.0e0+.1*rand();
+               e0[i] = 1.0e0+.1*hypre_RandI();
+               e1[i] = 1.0e0+.1*hypre_RandI();
             }
          }
          nstages += 1;
@@ -2735,7 +2735,7 @@
 
    for (i = 0; i < num_variables; i++) 
       e1[i] = 1.0e0;
-      /*e1[i] = 1.0e0+.1*rand();*/
+      /*e1[i] = 1.0e0+.1*hypre_RandI();*/
    
   /* stages */
    while(1)
@@ -2852,7 +2852,7 @@
                if (CF_marker[i] ==  fpt)
                {
                   e1[i] = 1.0e0;
-                  /*e1[i] = 1.0e0+.1*rand();*/
+                  /*e1[i] = 1.0e0+.1*hypre_RandI();*/
                   e0[i] = e1[i];
                }
             }
@@ -3011,8 +3011,8 @@
                   num_coarse++;
 	       else if (CF_marker[i] ==  fpt)
                { 
-                  e0[i] = 1.0e0+.1*rand();
-                  e1[i] = 1.0e0+.1*rand();
+                  e0[i] = 1.0e0+.1*hypre_RandI();
+                  e1[i] = 1.0e0+.1*hypre_RandI();
                }
             }
          }
@@ -3035,8 +3035,8 @@
                   for (j=0; j < num_functions; j++)
                   { 
 		     /*CF_marker[jj] = CFN_marker[i];
-                     e0[jj] = 1.0e0+.1*rand();
-                     e1[jj++] = 1.0e0+.1*rand();*/
+                     e0[jj] = 1.0e0+.1*hypre_RandI();
+                     e1[jj++] = 1.0e0+.1*hypre_RandI();*/
                      e0[jj] = 1.0e0;
                      e1[jj++] = 1.0e0;
                   } 
diff -Nru hypre-2.11.2/src/parcsr_ls/par_cycle.c hypre-2.13.0/src/parcsr_ls/par_cycle.c
--- hypre-2.11.2/src/parcsr_ls/par_cycle.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/par_cycle.c	2017-10-20 17:42:22.000000000 +0000
@@ -76,10 +76,7 @@
 
    HYPRE_Int     block_mode;
 
-   HYPRE_Real  *max_eig_est;
-   HYPRE_Real  *min_eig_est;
    HYPRE_Int      cheby_order;
-   HYPRE_Real   cheby_fraction;
 
  /* Local variables  */
    HYPRE_Int      *lev_counter;
@@ -111,6 +108,8 @@
    HYPRE_Real    alpha;
    HYPRE_Real  **l1_norms = NULL;
    HYPRE_Real   *l1_norms_level;
+   HYPRE_Real   **ds = hypre_ParAMGDataChebyDS(amg_data);
+   HYPRE_Real   **coefs = hypre_ParAMGDataChebyCoefs(amg_data);
 
    HYPRE_Int seq_cg = 0;
 
@@ -158,10 +157,10 @@
    l1_norms            = hypre_ParAMGDataL1Norms(amg_data);
    /* smooth_option       = hypre_ParAMGDataSmoothOption(amg_data); */
 
-   max_eig_est = hypre_ParAMGDataMaxEigEst(amg_data);
+   /*max_eig_est = hypre_ParAMGDataMaxEigEst(amg_data);
    min_eig_est = hypre_ParAMGDataMinEigEst(amg_data);
+   cheby_fraction = hypre_ParAMGDataChebyFraction(amg_data);*/
    cheby_order = hypre_ParAMGDataChebyOrder(amg_data);
-   cheby_fraction = hypre_ParAMGDataChebyFraction(amg_data);
 
    cycle_op_count = hypre_ParAMGDataCycleOpCount(amg_data);
 
@@ -434,6 +433,18 @@
                  }
                  else /* not CF - so use through AMS */
                  {
+#ifdef HYPRE_USE_GPU
+		   hypre_ParCSRRelax(A_array[level], 
+                                       Aux_F,
+                                       1,
+                                       1,
+                                       l1_norms_level,
+                                       relax_weight[level],
+                                       omega[level],0,0,0,0,
+                                       Aux_U,
+                                       Vtemp, 
+				       Ztemp);
+#else
                     if (num_threads == 1)
                        hypre_ParCSRRelax(A_array[level],
                                        Aux_F,
@@ -457,6 +468,7 @@
                                               Aux_U,
                                               Vtemp,
                                               Ztemp);
+#endif
                  }
               }
               else if (relax_type == 15)
@@ -470,13 +482,11 @@
               }
               else if (relax_type == 16)
               { /* scaled Chebyshev */
-                 HYPRE_Int scale = 1;
-                 HYPRE_Int variant = 0;
-                 hypre_ParCSRRelax_Cheby(A_array[level],
-                                       Aux_F,
-                                       max_eig_est[level],
-                                       min_eig_est[level],
-                                       cheby_fraction, cheby_order, scale,
+                 HYPRE_Int scale = hypre_ParAMGDataChebyScale(amg_data);
+                 HYPRE_Int variant = hypre_ParAMGDataChebyVariant(amg_data);
+                 hypre_ParCSRRelax_Cheby_Solve(A_array[level], Aux_F,
+                                       ds[level], coefs[level],
+                                       cheby_order, scale,
                                        variant, Aux_U, Vtemp, Ztemp );
               }
               else if (relax_type ==17)
diff -Nru hypre-2.11.2/src/parcsr_ls/par_gsmg.c hypre-2.13.0/src/parcsr_ls/par_gsmg.c
--- hypre-2.11.2/src/parcsr_ls/par_gsmg.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/par_gsmg.c	2017-10-20 17:42:22.000000000 +0000
@@ -10,10 +10,6 @@
  * $Revision$
  ***********************************************************************EHEADER*/
 
-
-
-
-
 /******************************************************************************
  *
  * Geometrically smooth interpolation multigrid
@@ -27,20 +23,7 @@
 #include "_hypre_parcsr_ls.h"
 #include "par_amg.h"
 
-#ifdef HYPRE_USING_ESSL
-#include <essl.h>
-#else
-#include "fortran.h"
-#ifdef __cplusplus
-extern "C" {
-#endif
-HYPRE_Int hypre_F90_NAME_LAPACK(dgels, DGELS)(char *, HYPRE_Int *, HYPRE_Int *, HYPRE_Int *, HYPRE_Real *, 
-  HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *);
-#ifdef __cplusplus
-}
-#endif
-
-#endif
+#include "_hypre_lapack.h"
 
 #ifndef ABS
 #define ABS(x) ((x)>0 ? (x) : -(x))
@@ -563,7 +546,7 @@
    for (sample=0; sample<nsamples; sample++)
    {
        for (i=0; i<n_local; i++)
-           datax[i] = (rand()/(HYPRE_Real)RAND_MAX) - .5;
+           datax[i] = hypre_Rand() - .5;
 
        for (i=0; i<num_sweeps; i++)
        {
@@ -732,14 +715,10 @@
    for (i=0; i<num; i++)
       b[i] = V[i*n+ip];
 
-#ifdef HYPRE_USING_ESSL
-   dgells(0, a, num, b, num, val, nc, NULL, 1.e-12, num, nc, 1, 
-      &info, work, work_size);
-#else
    {
    char trans = 'N';
    HYPRE_Int  one   = 1;
-   hypre_F90_NAME_LAPACK(dgels, DGELS)(&trans, &num, &nc, &one, a, &num,
+   hypre_dgels(&trans, &num, &nc, &one, a, &num,
       b, &temp, work, &work_size, &info);
 
    if (info != 0)
@@ -749,7 +728,6 @@
    for (j=0; j<nc; j++)
       val[j] = b[j];
    }
-#endif
 
    hypre_TFree(b);
    hypre_TFree(a);
diff -Nru hypre-2.11.2/src/parcsr_ls/par_jacobi_interp.c hypre-2.13.0/src/parcsr_ls/par_jacobi_interp.c
--- hypre-2.11.2/src/parcsr_ls/par_jacobi_interp.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/par_jacobi_interp.c	2017-10-20 17:42:22.000000000 +0000
@@ -189,7 +189,7 @@
    {
       if ( J_marker[i]<0 )
       {
-         if ( ((HYPRE_Real)rand())/RAND_MAX < randthresh )
+         if ( ((HYPRE_Real)hypre_Rand()) < randthresh )
          {
             hypre_printf( "%i: ", i );
             for ( m=P_diag_i[i]; m<P_diag_i[i+1]; ++m )
diff -Nru hypre-2.11.2/src/parcsr_ls/par_mgr.c hypre-2.13.0/src/parcsr_ls/par_mgr.c
--- hypre-2.11.2/src/parcsr_ls/par_mgr.c	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/par_mgr.c	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,2680 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2015,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+/******************************************************************************
+ *
+ * Two-grid system solver
+ *
+ *****************************************************************************/
+
+#include "_hypre_parcsr_ls.h"
+#include "par_amg.h"
+#include "par_mgr.h"
+#include <assert.h>
+
+/* Create */
+void *
+hypre_MGRCreate()
+{
+  hypre_ParMGRData  *mgr_data;
+
+  mgr_data = hypre_CTAlloc(hypre_ParMGRData, 1);
+
+  /* block data */
+  (mgr_data -> block_size) = 1;
+  (mgr_data -> num_coarse_indexes) = 1;
+  (mgr_data -> block_num_coarse_indexes) = NULL;
+  (mgr_data -> block_cf_marker) = NULL;
+
+  /* general data */
+  (mgr_data -> max_num_coarse_levels) = 10;
+  (mgr_data -> A_array) = NULL;
+  (mgr_data -> P_array) = NULL;
+  (mgr_data -> RT_array) = NULL;
+  (mgr_data -> RAP) = NULL;
+  (mgr_data -> CF_marker_array) = NULL;
+  (mgr_data -> coarse_indices_lvls) = NULL;
+
+  (mgr_data -> F_array) = NULL;
+  (mgr_data -> U_array) = NULL;
+  (mgr_data -> residual) = NULL;
+  (mgr_data -> rel_res_norms) = NULL;
+  (mgr_data -> Vtemp) = NULL;
+  (mgr_data -> Ztemp) = NULL;
+  (mgr_data -> Utemp) = NULL;
+  (mgr_data -> Ftemp) = NULL;
+
+  (mgr_data -> num_iterations) = 0;
+  (mgr_data -> num_interp_sweeps) = 1;
+  (mgr_data -> trunc_factor) = 0.0;
+  (mgr_data -> max_row_sum) = 0.9;
+  (mgr_data -> strong_threshold) = 0.25;
+  (mgr_data -> S_commpkg_switch) = 1.0;
+  (mgr_data -> P_max_elmts) = 0;
+
+  (mgr_data -> coarse_grid_solver) = NULL;
+  (mgr_data -> coarse_grid_solver_setup) = NULL;
+  (mgr_data -> coarse_grid_solver_solve) = NULL;
+
+  (mgr_data -> global_smoother) = NULL;
+
+  (mgr_data -> use_default_cgrid_solver) = 1;
+  (mgr_data -> omega) = 1.;
+  (mgr_data -> max_iter) = 20;
+  (mgr_data -> tol) = 1.0e-7;
+  (mgr_data -> relax_type) = 0;
+  (mgr_data -> relax_order) = 1;
+  (mgr_data -> interp_type) = 2;
+  (mgr_data -> restrict_type) = 0;
+  (mgr_data -> num_relax_sweeps) = 1;
+  (mgr_data -> relax_weight) = 1.0;
+
+  (mgr_data -> logging) = 0;
+  (mgr_data -> print_level) = 0;
+
+  (mgr_data -> l1_norms) = NULL;
+
+  (mgr_data -> reserved_coarse_size) = 0;
+  (mgr_data -> reserved_coarse_indexes) = NULL;
+  (mgr_data -> reserved_Cpoint_local_indexes) = NULL;    
+  
+  (mgr_data -> diaginv) = NULL;
+  (mgr_data -> global_smooth_iters) = 1;
+  (mgr_data -> global_smooth_type) = 0;
+  
+  (mgr_data -> set_non_Cpoints_to_F) = 0;
+  
+  (mgr_data -> Frelax_method) = 0;
+  (mgr_data -> FrelaxVcycleData) = NULL;
+  (mgr_data -> max_local_lvls) = 10;
+
+  return (void *) mgr_data;
+}
+
+/*--------------------------------------------------------------------------
+ *--------------------------------------------------------------------------*/
+/* Destroy */
+HYPRE_Int
+hypre_MGRDestroy( void *data )
+{
+  hypre_ParMGRData * mgr_data = (hypre_ParMGRData*) data;
+
+  HYPRE_Int i;
+  HYPRE_Int num_coarse_levels = (mgr_data -> num_coarse_levels);
+
+  /* block info data */
+  if ((mgr_data -> block_cf_marker))
+  {
+    for (i=0; i < (mgr_data -> max_num_coarse_levels); i++)
+    {
+       if ((mgr_data -> block_cf_marker)[i])
+       {
+         hypre_TFree((mgr_data -> block_cf_marker)[i]);
+       }
+    }
+    hypre_TFree((mgr_data -> block_cf_marker));
+    (mgr_data -> block_cf_marker) = NULL;
+  }
+
+  if(mgr_data -> block_num_coarse_indexes)
+  {
+     hypre_TFree(mgr_data -> block_num_coarse_indexes);
+     (mgr_data -> block_num_coarse_indexes) = NULL;
+  }
+  
+  /* final residual vector */
+  if((mgr_data -> residual))
+  {
+    hypre_ParVectorDestroy( (mgr_data -> residual) );
+    (mgr_data -> residual) = NULL;
+  }
+  if((mgr_data -> rel_res_norms))
+  {
+    hypre_TFree( (mgr_data -> rel_res_norms) );
+    (mgr_data -> rel_res_norms) = NULL;
+  }
+  /* temp vectors for solve phase */
+  if((mgr_data -> Vtemp))
+  {
+    hypre_ParVectorDestroy( (mgr_data -> Vtemp) );
+    (mgr_data -> Vtemp) = NULL;
+  }
+  if((mgr_data -> Ztemp))
+  {
+    hypre_ParVectorDestroy( (mgr_data -> Ztemp) );
+    (mgr_data -> Ztemp) = NULL;
+  }
+  if((mgr_data -> Utemp))
+  {
+    hypre_ParVectorDestroy( (mgr_data -> Utemp) );
+    (mgr_data -> Utemp) = NULL;
+  }
+  if((mgr_data -> Ftemp))
+  {
+    hypre_ParVectorDestroy( (mgr_data -> Ftemp) );
+    (mgr_data -> Ftemp) = NULL;
+  }
+  /* coarse grid solver */
+  if((mgr_data -> use_default_cgrid_solver))
+  {
+    if((mgr_data -> coarse_grid_solver))
+       hypre_BoomerAMGDestroy( (mgr_data -> coarse_grid_solver) );
+    (mgr_data -> coarse_grid_solver) = NULL;
+  }
+  /* l1_norms */
+  if ((mgr_data -> l1_norms))
+  {
+    for (i=0; i < (num_coarse_levels); i++)
+       if ((mgr_data -> l1_norms)[i])
+         hypre_TFree((mgr_data -> l1_norms)[i]);
+    hypre_TFree((mgr_data -> l1_norms));
+  }
+
+  /* coarse_indices_lvls */
+  if ((mgr_data -> coarse_indices_lvls))
+  {
+    for (i=0; i < (num_coarse_levels); i++)
+       if ((mgr_data -> coarse_indices_lvls)[i])
+         hypre_TFree((mgr_data -> coarse_indices_lvls)[i]);
+    hypre_TFree((mgr_data -> coarse_indices_lvls));
+  }
+
+  /* linear system and cf marker array */
+  if(mgr_data -> A_array || mgr_data -> P_array || mgr_data -> RT_array || mgr_data -> CF_marker_array)
+  {
+    for (i=1; i < num_coarse_levels+1; i++) {
+      hypre_ParVectorDestroy((mgr_data -> F_array)[i]);
+      hypre_ParVectorDestroy((mgr_data -> U_array)[i]);
+
+      if ((mgr_data -> P_array)[i-1])
+          hypre_ParCSRMatrixDestroy((mgr_data -> P_array)[i-1]);
+
+      if ((mgr_data -> RT_array)[i-1])
+          hypre_ParCSRMatrixDestroy((mgr_data -> RT_array)[i-1]);
+
+      hypre_TFree((mgr_data -> CF_marker_array)[i-1]);
+    }
+    for (i=1; i < (num_coarse_levels); i++) {
+      if ((mgr_data -> A_array)[i])
+        hypre_ParCSRMatrixDestroy((mgr_data -> A_array)[i]);
+    }
+  }
+
+  if((mgr_data -> F_array))
+  {
+  	hypre_TFree((mgr_data -> F_array));
+  	(mgr_data -> F_array) = NULL;
+  }
+  if((mgr_data -> U_array))
+  {
+  	hypre_TFree((mgr_data -> U_array));
+  	(mgr_data -> U_array) = NULL;
+  }
+  if((mgr_data -> A_array))
+  {
+  	hypre_TFree((mgr_data -> A_array));
+  	(mgr_data -> A_array) = NULL;
+  }
+  if((mgr_data -> P_array))
+  {
+  	hypre_TFree((mgr_data -> P_array));
+  	(mgr_data -> P_array) = NULL;
+  }
+  if((mgr_data -> RT_array))
+  {
+  	hypre_TFree((mgr_data -> RT_array));
+  	(mgr_data -> RT_array) = NULL;
+  }
+  if((mgr_data -> CF_marker_array))
+  {
+  	hypre_TFree((mgr_data -> CF_marker_array));
+  	(mgr_data -> CF_marker_array) = NULL;
+  }
+  if((mgr_data -> reserved_Cpoint_local_indexes))
+  {
+  	hypre_TFree((mgr_data -> reserved_Cpoint_local_indexes));
+  	(mgr_data -> reserved_Cpoint_local_indexes) = NULL;
+  }
+
+  /* data for V-cycle F-relaxation */
+  if (mgr_data -> FrelaxVcycleData) {
+    for (i = 0; i < num_coarse_levels; i++) {
+      if ((mgr_data -> FrelaxVcycleData)[i]) {
+        hypre_MGRDestroyFrelaxVcycleData((mgr_data -> FrelaxVcycleData)[i]);
+        (mgr_data -> FrelaxVcycleData)[i] = NULL;
+      }
+    }
+    hypre_TFree(mgr_data -> FrelaxVcycleData);
+    mgr_data -> FrelaxVcycleData = NULL;
+  }  
+  /* data for reserved coarse nodes */
+  if(mgr_data -> reserved_coarse_indexes)
+  {
+     hypre_TFree(mgr_data -> reserved_coarse_indexes);
+     (mgr_data -> reserved_coarse_indexes) = NULL;
+  }
+  /* coarse level matrix - RAP */
+  if ((mgr_data -> RAP))
+    hypre_ParCSRMatrixDestroy((mgr_data -> RAP));
+  if ((mgr_data -> diaginv))
+    hypre_TFree((mgr_data -> diaginv));
+  /* mgr data */
+  hypre_TFree(mgr_data);
+
+  return hypre_error_flag;
+}
+
+/* Create data for V-cycle F-relaxtion */
+void *
+hypre_MGRCreateFrelaxVcycleData()
+{
+  hypre_ParAMGData  *vdata = hypre_CTAlloc(hypre_ParAMGData, 1);
+
+   hypre_ParAMGDataAArray(vdata) = NULL;
+   hypre_ParAMGDataPArray(vdata) = NULL;
+   hypre_ParAMGDataFArray(vdata) = NULL;
+   hypre_ParAMGDataCFMarkerArray(vdata) = NULL;
+   hypre_ParAMGDataVtemp(vdata)  = NULL;
+   hypre_ParAMGDataAMat(vdata)  = NULL;
+   hypre_ParAMGDataBVec(vdata)  = NULL;
+   hypre_ParAMGDataZtemp(vdata)  = NULL;
+   hypre_ParAMGDataCommInfo(vdata) = NULL;
+   hypre_ParAMGDataUArray(vdata) = NULL;
+   hypre_ParAMGDataNewComm(vdata) = hypre_MPI_COMM_NULL;
+   hypre_ParAMGDataNumLevels(vdata) = 0;
+   hypre_ParAMGDataMaxLevels(vdata) = 10;
+
+  return (void *) vdata;
+}
+/* Destroy data for V-cycle F-relaxation */
+HYPRE_Int
+hypre_MGRDestroyFrelaxVcycleData( void *data )
+{
+  hypre_ParAMGData * vdata = (hypre_ParAMGData*) data;
+  HYPRE_Int i;
+  HYPRE_Int num_levels = hypre_ParAMGDataNumLevels(vdata);
+  MPI_Comm new_comm = hypre_ParAMGDataNewComm(vdata);
+
+  for (i=1; i < num_levels; i++)
+  {
+	hypre_ParVectorDestroy(hypre_ParAMGDataFArray(vdata)[i]);
+	hypre_ParVectorDestroy(hypre_ParAMGDataUArray(vdata)[i]);
+
+        if (hypre_ParAMGDataAArray(vdata)[i])
+           hypre_ParCSRMatrixDestroy(hypre_ParAMGDataAArray(vdata)[i]);
+
+        if (hypre_ParAMGDataPArray(vdata)[i-1])
+           hypre_ParCSRMatrixDestroy(hypre_ParAMGDataPArray(vdata)[i-1]);
+
+	hypre_TFree(hypre_ParAMGDataCFMarkerArray(vdata)[i-1]);
+  }
+
+  /* see comments in par_coarsen.c regarding special case for CF_marker */
+  if (num_levels == 1)
+  {
+     hypre_TFree(hypre_ParAMGDataCFMarkerArray(vdata)[0]);
+  }
+
+/* Points to vtemp of mgr_data, which is already destroyed */
+//  hypre_ParVectorDestroy(hypre_ParAMGDataVtemp(vdata)); 
+  hypre_TFree(hypre_ParAMGDataFArray(vdata));
+  hypre_TFree(hypre_ParAMGDataUArray(vdata));
+  hypre_TFree(hypre_ParAMGDataAArray(vdata));
+  hypre_TFree(hypre_ParAMGDataPArray(vdata));
+  hypre_TFree(hypre_ParAMGDataCFMarkerArray(vdata));
+
+/* Points to ztemp of mgr_data, which is already destroyed */
+/*
+  if (hypre_ParAMGDataZtemp(vdata))
+      hypre_ParVectorDestroy(hypre_ParAMGDataZtemp(vdata));
+*/
+
+  if (hypre_ParAMGDataAMat(vdata)) hypre_TFree(hypre_ParAMGDataAMat(vdata));
+  if (hypre_ParAMGDataBVec(vdata)) hypre_TFree(hypre_ParAMGDataBVec(vdata));
+  if (hypre_ParAMGDataCommInfo(vdata)) hypre_TFree(hypre_ParAMGDataCommInfo(vdata));
+   
+  if (new_comm != hypre_MPI_COMM_NULL) 
+  {
+     hypre_MPI_Comm_free (&new_comm);
+  }
+  hypre_TFree(vdata);
+
+  return hypre_error_flag;
+}
+
+/* Set C-point variables for each reduction level */
+/* Currently not implemented */
+HYPRE_Int
+hypre_MGRSetReductionLevelCpoints( void      *mgr_vdata,
+                         HYPRE_Int  nlevels,
+                         HYPRE_Int *num_coarse_points,
+                         HYPRE_Int  **level_coarse_indexes)
+{
+   hypre_ParMGRData   *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+   (mgr_data -> num_coarse_levels) = nlevels;
+   (mgr_data -> num_coarse_per_level) = num_coarse_points;
+   (mgr_data -> level_coarse_indexes) = level_coarse_indexes;      
+   return hypre_error_flag;
+}                         
+
+/* Initialize some data */
+/* Set whether non-coarse points on each level should be explicitly tagged as F-points */
+HYPRE_Int
+hypre_MGRSetNonCpointsToFpoints( void      *mgr_vdata, HYPRE_Int nonCptToFptFlag)
+{
+   hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+   (mgr_data -> set_non_Cpoints_to_F) = nonCptToFptFlag;
+   
+   return hypre_error_flag;
+}
+
+/* Initialize/ set block data information */
+HYPRE_Int
+hypre_MGRSetCpointsByBlock( void      *mgr_vdata,
+                         HYPRE_Int  block_size,
+                         HYPRE_Int  max_num_levels,
+                         HYPRE_Int  *block_num_coarse_points,
+                         HYPRE_Int  **block_coarse_indexes)
+{
+  HYPRE_Int  i,j;
+  HYPRE_Int  **block_cf_marker = NULL;
+  HYPRE_Int *block_num_coarse_indexes = NULL;
+
+  hypre_ParMGRData   *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+
+  /* free block cf_marker data if not previously destroyed */
+  if((mgr_data -> block_cf_marker) != NULL)
+  {
+    for (i=0; i < (mgr_data -> max_num_coarse_levels); i++)
+    {
+      if ((mgr_data -> block_cf_marker)[i])
+      {
+        hypre_TFree ((mgr_data -> block_cf_marker)[i]);
+        (mgr_data -> block_cf_marker)[i] = NULL;
+      }
+    }
+    hypre_TFree (mgr_data -> block_cf_marker);
+    (mgr_data -> block_cf_marker) = NULL;
+  }
+   if((mgr_data -> block_num_coarse_indexes))
+   {
+      hypre_TFree((mgr_data -> block_num_coarse_indexes));
+      (mgr_data -> block_num_coarse_indexes) = NULL;
+   }
+   
+   /* store block cf_marker */
+   block_cf_marker = hypre_CTAlloc(HYPRE_Int *,max_num_levels);
+   for (i = 0; i < max_num_levels; i++) 
+   {
+     block_cf_marker[i] = hypre_CTAlloc(HYPRE_Int,block_size);
+     memset(block_cf_marker[i], FMRK, block_size*sizeof(HYPRE_Int));
+   }  
+   for (i = 0; i < max_num_levels; i++) 
+   {
+     for(j=0; j<block_num_coarse_points[i]; j++) 
+     {
+       (block_cf_marker[i])[block_coarse_indexes[i][j]] = CMRK;
+     }
+   }
+   
+   /* store block_num_coarse_points */
+   if(max_num_levels > 0)
+   {
+      block_num_coarse_indexes = hypre_CTAlloc(HYPRE_Int, max_num_levels);
+      for(i=0; i<max_num_levels; i++)
+         block_num_coarse_indexes[i] = block_num_coarse_points[i];
+   }   
+   /* set block data */
+   (mgr_data -> max_num_coarse_levels) = max_num_levels;
+   (mgr_data -> block_size) = block_size;
+   (mgr_data -> block_num_coarse_indexes) = block_num_coarse_indexes;
+   (mgr_data -> block_cf_marker) = block_cf_marker;
+  
+   return hypre_error_flag;
+}
+
+/*Set number of points that remain part of the coarse grid throughout the hierarchy */
+HYPRE_Int
+hypre_MGRSetReservedCoarseNodes(void      *mgr_vdata,
+					   HYPRE_Int reserved_coarse_size,
+					   HYPRE_Int *reserved_cpt_index)
+{
+	hypre_ParMGRData   *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+	HYPRE_Int *reserved_coarse_indexes = NULL;
+	HYPRE_Int i;
+
+	if (!mgr_data)
+	{
+		hypre_printf("Warning! MGR object empty!\n");
+		hypre_error_in_arg(1);
+		return hypre_error_flag;
+	}
+	
+	if(reserved_coarse_size < 0)
+	{
+	  hypre_error_in_arg(2);
+          return hypre_error_flag;
+	}
+	/* free data not previously destroyed */
+	if((mgr_data -> reserved_coarse_indexes))
+	{
+	   hypre_TFree((mgr_data -> reserved_coarse_indexes));
+	   (mgr_data -> reserved_coarse_indexes) = NULL;
+	}
+        
+        /* set reserved coarse nodes */
+        if(reserved_coarse_size > 0)
+        {
+           reserved_coarse_indexes = hypre_CTAlloc(HYPRE_Int, reserved_coarse_size);
+           for(i=0; i<reserved_coarse_size; i++)
+              reserved_coarse_indexes[i] = reserved_cpt_index[i];
+        }
+	(mgr_data -> reserved_coarse_size) = reserved_coarse_size;
+	(mgr_data -> reserved_coarse_indexes) = reserved_coarse_indexes;
+	
+	return hypre_error_flag;
+}
+
+/* Set CF marker array */
+HYPRE_Int
+hypre_MGRCoarsen(hypre_ParCSRMatrix *S,
+				   hypre_ParCSRMatrix *A,
+				   HYPRE_Int fixed_coarse_size,
+				   HYPRE_Int *fixed_coarse_indexes,
+				   HYPRE_Int debug_flag,
+				   HYPRE_Int **CF_marker,
+				   HYPRE_Int cflag)
+{
+  HYPRE_Int *cf_marker, i, row, nc;
+  HYPRE_Int *cindexes = fixed_coarse_indexes;
+
+  HYPRE_Int nloc =  hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A));
+
+  /* If this is the last level, coarsen onto fixed coarse set */
+  if(cflag)
+  {
+    if(*CF_marker != NULL)
+    {
+       hypre_TFree (*CF_marker);
+    }
+    cf_marker = hypre_CTAlloc(HYPRE_Int,nloc);
+    memset(cf_marker, FMRK, nloc*sizeof(HYPRE_Int));
+
+    /* first mark fixed coarse set */
+    nc = fixed_coarse_size;
+    for(i = 0; i < nc; i++)
+    {
+       cf_marker[cindexes[i]] = CMRK;
+    }
+  }
+  else {
+    /* First coarsen to get initial CF splitting.
+     * This is then followed by updating the CF marker to pass
+     * coarse information to the next levels. NOTE: It may be
+     * convenient to implement this way (allows the use of multiple
+     * coarsening strategies without changing too much code),
+     * but not necessarily the best option, compared to initializing
+     * CF_marker first and then coarsening on subgraph which excludes
+     * the initialized coarse nodes.
+    */
+    hypre_BoomerAMGCoarsen(S, A, 0, debug_flag, &cf_marker);
+
+    /* Update CF_marker to correct Cpoints marked as Fpoints. */
+    nc = fixed_coarse_size;
+    for(i = 0; i < nc; i++)
+    {
+       cf_marker[cindexes[i]] = CMRK;
+    }
+    /* set F-points to FMRK. This is necessary since the different coarsening schemes differentiate
+     * between type of F-points (example Ruge coarsening). We do not need that distinction here.
+    */
+    for (row = 0; row <nloc; row++)
+    {
+       if(cf_marker[row] == CMRK) continue;
+       cf_marker[row] = FMRK;
+    }    
+#if 0
+    /* IMPORTANT: Update coarse_indexes array to define the positions of the fixed coarse points
+     * in the next level.
+     */
+    nc = 0;
+    index_i = 0;
+    for (row = 0; row <nloc; row++)
+    {
+       /* loop through new c-points */
+       if(cf_marker[row] == CMRK) nc++;
+       else if(cf_marker[row] == S_CMRK)
+       {
+          /* previously marked c-point is part of fixed coarse set. Track its current local index */
+          cindexes[index_i++] = nc;
+          /* reset c-point from S_CMRK to CMRK */
+          cf_marker[row] = CMRK;
+          nc++;
+       }
+       /* set F-points to FMRK. This is necessary since the different coarsening schemes differentiate
+        * between type of F-points (example Ruge coarsening). We do not need that distinction here.
+        */
+       else
+       {
+          cf_marker[row] = FMRK;
+       }
+    }
+    /* check if this should be last level */
+    if( nc == fixed_coarse_size)
+       last_level = 1;
+    //printf(" nc = %d and fixed coarse size = %d \n", nc, fixed_coarse_size);
+#endif
+  }
+  /* set CF_marker */
+  *CF_marker = cf_marker;
+
+  return hypre_error_flag;
+}
+
+/* Interpolation for MGR - Adapted from BoomerAMGBuildInterp */
+HYPRE_Int
+hypre_MGRBuildP( hypre_ParCSRMatrix   *A,
+				   HYPRE_Int            *CF_marker,
+				   HYPRE_Int            *num_cpts_global,
+				   HYPRE_Int             method,
+				   HYPRE_Int             debug_flag,
+				   hypre_ParCSRMatrix  **P_ptr)
+{
+	MPI_Comm 	      comm = hypre_ParCSRMatrixComm(A);
+	hypre_ParCSRCommPkg     *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
+	hypre_ParCSRCommHandle  *comm_handle;
+
+	hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
+	HYPRE_Real      *A_diag_data = hypre_CSRMatrixData(A_diag);
+	HYPRE_Int             *A_diag_i = hypre_CSRMatrixI(A_diag);
+	HYPRE_Int             *A_diag_j = hypre_CSRMatrixJ(A_diag);
+
+	hypre_CSRMatrix *A_offd         = hypre_ParCSRMatrixOffd(A);
+	HYPRE_Real      *A_offd_data    = hypre_CSRMatrixData(A_offd);
+	HYPRE_Int             *A_offd_i = hypre_CSRMatrixI(A_offd);
+	HYPRE_Int             *A_offd_j = hypre_CSRMatrixJ(A_offd);
+	HYPRE_Int              num_cols_A_offd = hypre_CSRMatrixNumCols(A_offd);
+	HYPRE_Real       *a_diag;
+
+	hypre_ParCSRMatrix    *P;
+	HYPRE_Int		      *col_map_offd_P;
+
+	HYPRE_Int             *CF_marker_offd = NULL;
+
+	hypre_CSRMatrix    *P_diag;
+	hypre_CSRMatrix    *P_offd;
+
+	HYPRE_Real      *P_diag_data;
+	HYPRE_Int             *P_diag_i;
+	HYPRE_Int             *P_diag_j;
+	HYPRE_Real      *P_offd_data;
+	HYPRE_Int             *P_offd_i;
+	HYPRE_Int             *P_offd_j;
+
+	HYPRE_Int              P_diag_size, P_offd_size;
+
+	HYPRE_Int             *P_marker, *P_marker_offd;
+
+	HYPRE_Int              jj_counter,jj_counter_offd;
+	HYPRE_Int             *jj_count, *jj_count_offd;
+//	HYPRE_Int              jj_begin_row,jj_begin_row_offd;
+//	HYPRE_Int              jj_end_row,jj_end_row_offd;
+
+	HYPRE_Int              start_indexing = 0; /* start indexing for P_data at 0 */
+
+	HYPRE_Int              n_fine = hypre_CSRMatrixNumRows(A_diag);
+
+	HYPRE_Int             *fine_to_coarse;
+	HYPRE_Int             *fine_to_coarse_offd;
+	HYPRE_Int             *coarse_counter;
+	HYPRE_Int              coarse_shift;
+	HYPRE_Int              total_global_cpts;
+	HYPRE_Int              num_cols_P_offd,my_first_cpt;
+
+	HYPRE_Int              i,i1;
+	HYPRE_Int              j,jl,jj;
+	HYPRE_Int              start;
+
+	HYPRE_Real       one  = 1.0;
+
+	HYPRE_Int              my_id;
+	HYPRE_Int              num_procs;
+	HYPRE_Int              num_threads;
+	HYPRE_Int              num_sends;
+	HYPRE_Int              index;
+	HYPRE_Int              ns, ne, size, rest;
+	
+	HYPRE_Int             *int_buf_data;
+
+	HYPRE_Real       wall_time;  /* for debugging instrumentation  */
+
+	hypre_MPI_Comm_size(comm, &num_procs);
+	hypre_MPI_Comm_rank(comm,&my_id);
+	num_threads = hypre_NumThreads();
+
+#ifdef HYPRE_NO_GLOBAL_PARTITION
+	my_first_cpt = num_cpts_global[0];
+	if (my_id == (num_procs -1)) total_global_cpts = num_cpts_global[1];
+	hypre_MPI_Bcast(&total_global_cpts, 1, HYPRE_MPI_INT, num_procs-1, comm);
+#else
+	my_first_cpt = num_cpts_global[my_id];
+	total_global_cpts = num_cpts_global[num_procs];
+#endif
+
+	/*-------------------------------------------------------------------
+	 * Get the CF_marker data for the off-processor columns
+	 *-------------------------------------------------------------------*/
+
+	if (debug_flag < 0)
+	{
+		debug_flag = -debug_flag;
+	}
+
+	if (debug_flag==4) wall_time = time_getWallclockSeconds();
+
+	if (num_cols_A_offd) CF_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd);
+
+	if (!comm_pkg)
+	{
+		hypre_MatvecCommPkgCreate(A);
+		comm_pkg = hypre_ParCSRMatrixCommPkg(A);
+	}
+
+	num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
+	int_buf_data = hypre_CTAlloc(HYPRE_Int, hypre_ParCSRCommPkgSendMapStart(comm_pkg,
+																			num_sends));
+
+	index = 0;
+	for (i = 0; i < num_sends; i++)
+	{
+		start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
+		for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
+			int_buf_data[index++]
+				= CF_marker[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
+	}
+
+	comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data,
+												CF_marker_offd);
+	hypre_ParCSRCommHandleDestroy(comm_handle);
+
+	if (debug_flag==4)
+	{
+		wall_time = time_getWallclockSeconds() - wall_time;
+		hypre_printf("Proc = %d     Interp: Comm 1 CF_marker =    %f\n",
+					 my_id, wall_time);
+		fflush(NULL);
+	}
+
+	/*-----------------------------------------------------------------------
+	 *  First Pass: Determine size of P and fill in fine_to_coarse mapping.
+	 *-----------------------------------------------------------------------*/
+
+	/*-----------------------------------------------------------------------
+	 *  Intialize counters and allocate mapping vector.
+	 *-----------------------------------------------------------------------*/
+
+	coarse_counter = hypre_CTAlloc(HYPRE_Int, num_threads);
+	jj_count = hypre_CTAlloc(HYPRE_Int, num_threads);
+	jj_count_offd = hypre_CTAlloc(HYPRE_Int, num_threads);
+
+	fine_to_coarse = hypre_CTAlloc(HYPRE_Int, n_fine);
+#if 0
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
+#endif
+#endif
+	for (i = 0; i < n_fine; i++) fine_to_coarse[i] = -1;
+
+	jj_counter = start_indexing;
+	jj_counter_offd = start_indexing;
+
+	/*-----------------------------------------------------------------------
+	 *  Loop over fine grid.
+	 *-----------------------------------------------------------------------*/
+
+/* RDF: this looks a little tricky, but doable */
+#if 0
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(i,j,i1,jj,ns,ne,size,rest) HYPRE_SMP_SCHEDULE
+#endif
+#endif
+	for (j = 0; j < num_threads; j++)
+	{
+		size = n_fine/num_threads;
+		rest = n_fine - size*num_threads;
+
+		if (j < rest)
+		{
+			ns = j*size+j;
+			ne = (j+1)*size+j+1;
+		}
+		else
+		{
+			ns = j*size+rest;
+			ne = (j+1)*size+rest;
+		}
+		for (i = ns; i < ne; i++)
+		{
+			/*--------------------------------------------------------------------
+			 *  If i is a C-point, interpolation is the identity. Also set up
+			 *  mapping vector.
+			 *--------------------------------------------------------------------*/
+
+			if (CF_marker[i] >= 0)
+			{
+				jj_count[j]++;
+				fine_to_coarse[i] = coarse_counter[j];
+				coarse_counter[j]++;
+			}
+			/*--------------------------------------------------------------------
+			 *  If i is an F-point, interpolation is the approximation of A_{ff}^{-1}A_{fc}
+			 *--------------------------------------------------------------------*/
+			 else
+			 {
+				 for (jj = A_diag_i[i]; jj < A_diag_i[i+1]; jj++)
+				 {
+					 i1 = A_diag_j[jj];
+					 if (CF_marker[i1] >= 0)
+					 {
+						 jj_count[j]++;
+					 }
+				 }
+
+				 if (num_procs > 1)
+				 {
+					 for (jj = A_offd_i[i]; jj < A_offd_i[i+1]; jj++)
+					 {
+						 i1 = A_offd_j[jj];
+						 if (CF_marker_offd[i1] >= 0)
+						 {
+							 jj_count_offd[j]++;
+						 }
+					 }
+				 }
+			 }
+		}
+	}
+
+	/*-----------------------------------------------------------------------
+	 *  Allocate  arrays.
+	 *-----------------------------------------------------------------------*/
+	for (i=0; i < num_threads-1; i++)
+	{
+		coarse_counter[i+1] += coarse_counter[i];
+		jj_count[i+1] += jj_count[i];
+		jj_count_offd[i+1] += jj_count_offd[i];
+	}
+	i = num_threads-1;
+	jj_counter = jj_count[i];
+	jj_counter_offd = jj_count_offd[i];
+
+	P_diag_size = jj_counter;
+
+	P_diag_i    = hypre_CTAlloc(HYPRE_Int, n_fine+1);
+	P_diag_j    = hypre_CTAlloc(HYPRE_Int, P_diag_size);
+	P_diag_data = hypre_CTAlloc(HYPRE_Real, P_diag_size);
+
+	P_diag_i[n_fine] = jj_counter;
+
+
+	P_offd_size = jj_counter_offd;
+
+	P_offd_i    = hypre_CTAlloc(HYPRE_Int, n_fine+1);
+	P_offd_j    = hypre_CTAlloc(HYPRE_Int, P_offd_size);
+	P_offd_data = hypre_CTAlloc(HYPRE_Real, P_offd_size);
+
+	/*-----------------------------------------------------------------------
+	 *  Intialize some stuff.
+	 *-----------------------------------------------------------------------*/
+
+	jj_counter = start_indexing;
+	jj_counter_offd = start_indexing;
+
+	if (debug_flag==4)
+	{
+		wall_time = time_getWallclockSeconds() - wall_time;
+		hypre_printf("Proc = %d     Interp: Internal work 1 =     %f\n",
+					 my_id, wall_time);
+		fflush(NULL);
+	}
+
+	/*-----------------------------------------------------------------------
+	 *  Send and receive fine_to_coarse info.
+	 *-----------------------------------------------------------------------*/
+
+	if (debug_flag==4) wall_time = time_getWallclockSeconds();
+
+	fine_to_coarse_offd = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd);
+
+#if 0
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(i,j,ns,ne,size,rest,coarse_shift) HYPRE_SMP_SCHEDULE
+#endif
+#endif
+	for (j = 0; j < num_threads; j++)
+	{
+		coarse_shift = 0;
+		if (j > 0) coarse_shift = coarse_counter[j-1];
+		size = n_fine/num_threads;
+		rest = n_fine - size*num_threads;
+		if (j < rest)
+		{
+			ns = j*size+j;
+			ne = (j+1)*size+j+1;
+		}
+		else
+		{
+			ns = j*size+rest;
+			ne = (j+1)*size+rest;
+		}
+		for (i = ns; i < ne; i++)
+			fine_to_coarse[i] += my_first_cpt+coarse_shift;
+	}
+
+	index = 0;
+	for (i = 0; i < num_sends; i++)
+	{
+		start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
+		for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
+			int_buf_data[index++]
+				= fine_to_coarse[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
+	}
+
+	comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data,
+												fine_to_coarse_offd);
+
+	hypre_ParCSRCommHandleDestroy(comm_handle);
+
+	if (debug_flag==4)
+	{
+		wall_time = time_getWallclockSeconds() - wall_time;
+		hypre_printf("Proc = %d     Interp: Comm 4 FineToCoarse = %f\n",
+					 my_id, wall_time);
+		fflush(NULL);
+	}
+
+	if (debug_flag==4) wall_time = time_getWallclockSeconds();
+
+#if 0
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
+#endif
+#endif
+	for (i = 0; i < n_fine; i++) fine_to_coarse[i] -= my_first_cpt;
+
+	/*-----------------------------------------------------------------------
+	 *  Loop over fine grid points.
+	 *-----------------------------------------------------------------------*/
+	a_diag = hypre_CTAlloc(HYPRE_Real, n_fine);
+	for (i = 0; i < n_fine; i++)
+	{
+		for (jj = A_diag_i[i]; jj < A_diag_i[i+1]; jj++)
+		{
+			i1 = A_diag_j[jj];
+			if ( i==i1 )  /* diagonal of A only */
+			{
+				a_diag[i] = 1.0/A_diag_data[jj];
+			}
+		}
+	}
+
+#if 0
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(i,j,jl,i1,jj,ns,ne,size,rest,P_marker,P_marker_offd,jj_counter,jj_counter_offd,jj_begin_row,jj_end_row,jj_begin_row_offd,jj_end_row_offd) HYPRE_SMP_SCHEDULE
+#endif
+#endif
+	for (jl = 0; jl < num_threads; jl++)
+	{
+		size = n_fine/num_threads;
+		rest = n_fine - size*num_threads;
+		if (jl < rest)
+		{
+			ns = jl*size+jl;
+			ne = (jl+1)*size+jl+1;
+		}
+		else
+		{
+			ns = jl*size+rest;
+			ne = (jl+1)*size+rest;
+		}
+		jj_counter = 0;
+		if (jl > 0) jj_counter = jj_count[jl-1];
+		jj_counter_offd = 0;
+		if (jl > 0) jj_counter_offd = jj_count_offd[jl-1];
+		P_marker = hypre_CTAlloc(HYPRE_Int, n_fine);
+		if (num_cols_A_offd)
+			P_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd);
+		else
+			P_marker_offd = NULL;
+
+		for (i = 0; i < n_fine; i++)
+		{
+			P_marker[i] = -1;
+		}
+		for (i = 0; i < num_cols_A_offd; i++)
+		{
+			P_marker_offd[i] = -1;
+		}
+		for (i = ns; i < ne; i++)
+		{
+			/*--------------------------------------------------------------------
+			 *  If i is a c-point, interpolation is the identity.
+			 *--------------------------------------------------------------------*/
+			if (CF_marker[i] >= 0)
+			{
+				P_diag_i[i] = jj_counter;
+				P_diag_j[jj_counter]    = fine_to_coarse[i];
+				P_diag_data[jj_counter] = one;
+				jj_counter++;
+			}
+			/*--------------------------------------------------------------------
+			 *  If i is an F-point, build interpolation.
+			 *--------------------------------------------------------------------*/
+			else
+			{
+				/* Diagonal part of P */
+				P_diag_i[i] = jj_counter;
+				for (jj = A_diag_i[i]; jj < A_diag_i[i+1]; jj++)
+				{
+					i1 = A_diag_j[jj];
+
+					/*--------------------------------------------------------------
+					 * If neighbor i1 is a C-point, set column number in P_diag_j
+					 * and initialize interpolation weight to zero.
+					 *--------------------------------------------------------------*/
+
+					if (CF_marker[i1] >= 0)
+					{
+						P_marker[i1] = jj_counter;
+						P_diag_j[jj_counter]    = fine_to_coarse[i1];
+						if(method == 0)
+						{
+							P_diag_data[jj_counter] = 0.0;
+						}
+						else if (method == 1)
+						{
+							P_diag_data[jj_counter] = - A_diag_data[jj];
+						}
+						else if (method == 2)
+						{
+								P_diag_data[jj_counter] = - A_diag_data[jj]*a_diag[i];
+
+						}
+						jj_counter++;
+					}
+				}
+
+				/* Off-Diagonal part of P */
+				P_offd_i[i] = jj_counter_offd;
+
+				if (num_procs > 1)
+				{
+					for (jj = A_offd_i[i]; jj < A_offd_i[i+1]; jj++)
+					{
+						i1 = A_offd_j[jj];
+
+						/*-----------------------------------------------------------
+						 * If neighbor i1 is a C-point, set column number in P_offd_j
+						 * and initialize interpolation weight to zero.
+						 *-----------------------------------------------------------*/
+
+						if (CF_marker_offd[i1] >= 0)
+						{
+							P_marker_offd[i1] = jj_counter_offd;
+							/*P_offd_j[jj_counter_offd]  = fine_to_coarse_offd[i1];*/
+							P_offd_j[jj_counter_offd]  = i1;
+							if(method == 0)
+						{
+						    P_offd_data[jj_counter_offd] = 0.0;
+						}
+						else if (method == 1)
+						{
+							P_offd_data[jj_counter_offd] = - A_offd_data[jj];
+						}
+						else if (method == 2)
+						{
+							P_offd_data[jj_counter_offd] = - A_offd_data[jj]*a_diag[i];
+						}
+
+							jj_counter_offd++;
+						}
+					}
+				}
+			}
+			P_offd_i[i+1] = jj_counter_offd;
+		}
+		hypre_TFree(P_marker);
+		hypre_TFree(P_marker_offd);
+	}
+	hypre_TFree(a_diag);
+	P = hypre_ParCSRMatrixCreate(comm,
+								 hypre_ParCSRMatrixGlobalNumRows(A),
+								 total_global_cpts,
+								 hypre_ParCSRMatrixColStarts(A),
+								 num_cpts_global,
+								 0,
+								 P_diag_i[n_fine],
+								 P_offd_i[n_fine]);
+
+	P_diag = hypre_ParCSRMatrixDiag(P);
+	hypre_CSRMatrixData(P_diag) = P_diag_data;
+	hypre_CSRMatrixI(P_diag) = P_diag_i;
+	hypre_CSRMatrixJ(P_diag) = P_diag_j;
+	P_offd = hypre_ParCSRMatrixOffd(P);
+	hypre_CSRMatrixData(P_offd) = P_offd_data;
+	hypre_CSRMatrixI(P_offd) = P_offd_i;
+	hypre_CSRMatrixJ(P_offd) = P_offd_j;
+	hypre_ParCSRMatrixOwnsRowStarts(P) = 0;
+
+	num_cols_P_offd = 0;
+
+	if (P_offd_size)
+	{
+		P_marker = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd);
+#if 0
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
+#endif
+#endif
+		for (i=0; i < num_cols_A_offd; i++)
+			P_marker[i] = 0;
+		num_cols_P_offd = 0;
+		for (i=0; i < P_offd_size; i++)
+		{
+			index = P_offd_j[i];
+			if (!P_marker[index])
+			{
+				num_cols_P_offd++;
+				P_marker[index] = 1;
+			}
+		}
+
+		col_map_offd_P = hypre_CTAlloc(HYPRE_Int,num_cols_P_offd);
+		index = 0;
+		for (i=0; i < num_cols_P_offd; i++)
+		{
+			while (P_marker[index]==0) index++;
+			col_map_offd_P[i] = index++;
+		}
+
+#if 0
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
+#endif
+#endif
+		for (i=0; i < P_offd_size; i++)
+			P_offd_j[i] = hypre_BinarySearch(col_map_offd_P,
+											 P_offd_j[i],
+											 num_cols_P_offd);
+		hypre_TFree(P_marker);
+	}
+
+	for (i=0; i < n_fine; i++)
+		if (CF_marker[i] == -3) CF_marker[i] = -1;
+	if (num_cols_P_offd)
+	{
+		hypre_ParCSRMatrixColMapOffd(P) = col_map_offd_P;
+		hypre_CSRMatrixNumCols(P_offd) = num_cols_P_offd;
+	}
+	hypre_GetCommPkgRTFromCommPkgA(P,A, fine_to_coarse_offd);
+
+	*P_ptr = P;
+
+	hypre_TFree(CF_marker_offd);
+	hypre_TFree(int_buf_data);
+	hypre_TFree(fine_to_coarse);
+	hypre_TFree(fine_to_coarse_offd);
+	hypre_TFree(coarse_counter);
+	hypre_TFree(jj_count);
+	hypre_TFree(jj_count_offd);
+
+	return(0);
+}
+
+
+/* Interpolation for MGR - Dynamic Row Sum method */
+
+HYPRE_Int
+hypre_MGRBuildPDRS( hypre_ParCSRMatrix   *A,
+					  HYPRE_Int            *CF_marker,
+					  HYPRE_Int            *num_cpts_global,
+					  HYPRE_Int             blk_size,
+					  HYPRE_Int             reserved_coarse_size,
+					  HYPRE_Int             debug_flag,
+					  hypre_ParCSRMatrix  **P_ptr)
+{
+	MPI_Comm 	      comm = hypre_ParCSRMatrixComm(A);
+	hypre_ParCSRCommPkg     *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
+	hypre_ParCSRCommHandle  *comm_handle;
+
+	hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
+	HYPRE_Real      *A_diag_data = hypre_CSRMatrixData(A_diag);
+	HYPRE_Int             *A_diag_i = hypre_CSRMatrixI(A_diag);
+	HYPRE_Int             *A_diag_j = hypre_CSRMatrixJ(A_diag);
+
+	hypre_CSRMatrix *A_offd         = hypre_ParCSRMatrixOffd(A);
+	HYPRE_Real      *A_offd_data    = hypre_CSRMatrixData(A_offd);
+	HYPRE_Int             *A_offd_i = hypre_CSRMatrixI(A_offd);
+	HYPRE_Int             *A_offd_j = hypre_CSRMatrixJ(A_offd);
+	HYPRE_Int              num_cols_A_offd = hypre_CSRMatrixNumCols(A_offd);
+	HYPRE_Real       *a_diag;
+
+	hypre_ParCSRMatrix    *P;
+	HYPRE_Int		      *col_map_offd_P;
+
+	HYPRE_Int             *CF_marker_offd = NULL;
+
+	hypre_CSRMatrix    *P_diag;
+	hypre_CSRMatrix    *P_offd;
+
+	HYPRE_Real      *P_diag_data;
+	HYPRE_Int             *P_diag_i;
+	HYPRE_Int             *P_diag_j;
+	HYPRE_Real      *P_offd_data;
+	HYPRE_Int             *P_offd_i;
+	HYPRE_Int             *P_offd_j;
+
+	HYPRE_Int              P_diag_size, P_offd_size;
+
+	HYPRE_Int             *P_marker, *P_marker_offd;
+
+	HYPRE_Int              jj_counter,jj_counter_offd;
+	HYPRE_Int             *jj_count, *jj_count_offd;
+//	HYPRE_Int              jj_begin_row,jj_begin_row_offd;
+//	HYPRE_Int              jj_end_row,jj_end_row_offd;
+
+	HYPRE_Int              start_indexing = 0; /* start indexing for P_data at 0 */
+
+	HYPRE_Int              n_fine  = hypre_CSRMatrixNumRows(A_diag);
+
+	HYPRE_Int             *fine_to_coarse;
+	HYPRE_Int             *fine_to_coarse_offd;
+	HYPRE_Int             *coarse_counter;
+	HYPRE_Int              coarse_shift;
+	HYPRE_Int              total_global_cpts;
+	HYPRE_Int              num_cols_P_offd,my_first_cpt;
+
+	HYPRE_Int              i,i1;
+	HYPRE_Int              j,jl,jj;
+	HYPRE_Int              start;
+
+	HYPRE_Real       one  = 1.0;
+
+	HYPRE_Int              my_id;
+	HYPRE_Int              num_procs;
+	HYPRE_Int              num_threads;
+	HYPRE_Int              num_sends;
+	HYPRE_Int              index;
+	HYPRE_Int              ns, ne, size, rest;
+
+	HYPRE_Int             *int_buf_data;
+
+	HYPRE_Real       wall_time;  /* for debugging instrumentation  */
+
+	hypre_MPI_Comm_size(comm, &num_procs);
+	hypre_MPI_Comm_rank(comm,&my_id);
+	num_threads = hypre_NumThreads();
+
+#ifdef HYPRE_NO_GLOBAL_PARTITION
+	my_first_cpt = num_cpts_global[0];
+	if (my_id == (num_procs -1)) total_global_cpts = num_cpts_global[1];
+	hypre_MPI_Bcast(&total_global_cpts, 1, HYPRE_MPI_INT, num_procs-1, comm);
+#else
+	my_first_cpt = num_cpts_global[my_id];
+	total_global_cpts = num_cpts_global[num_procs];
+#endif
+
+	/*-------------------------------------------------------------------
+	 * Get the CF_marker data for the off-processor columns
+	 *-------------------------------------------------------------------*/
+
+	if (debug_flag < 0)
+	{
+		debug_flag = -debug_flag;
+	}
+
+	if (debug_flag==4) wall_time = time_getWallclockSeconds();
+
+	if (num_cols_A_offd) CF_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd);
+
+	if (!comm_pkg)
+	{
+		hypre_MatvecCommPkgCreate(A);
+		comm_pkg = hypre_ParCSRMatrixCommPkg(A);
+	}
+
+	num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
+	int_buf_data = hypre_CTAlloc(HYPRE_Int, hypre_ParCSRCommPkgSendMapStart(comm_pkg,
+																			num_sends));
+
+	index = 0;
+	for (i = 0; i < num_sends; i++)
+	{
+		start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
+		for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
+			int_buf_data[index++]
+				= CF_marker[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
+	}
+
+	comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data,
+												CF_marker_offd);
+	hypre_ParCSRCommHandleDestroy(comm_handle);
+
+	if (debug_flag==4)
+	{
+		wall_time = time_getWallclockSeconds() - wall_time;
+		hypre_printf("Proc = %d     Interp: Comm 1 CF_marker =    %f\n",
+					 my_id, wall_time);
+		fflush(NULL);
+	}
+
+	/*-----------------------------------------------------------------------
+	 *  First Pass: Determine size of P and fill in fine_to_coarse mapping.
+	 *-----------------------------------------------------------------------*/
+
+	/*-----------------------------------------------------------------------
+	 *  Intialize counters and allocate mapping vector.
+	 *-----------------------------------------------------------------------*/
+
+	coarse_counter = hypre_CTAlloc(HYPRE_Int, num_threads);
+	jj_count = hypre_CTAlloc(HYPRE_Int, num_threads);
+	jj_count_offd = hypre_CTAlloc(HYPRE_Int, num_threads);
+
+	fine_to_coarse = hypre_CTAlloc(HYPRE_Int, n_fine);
+#if 0
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
+#endif
+#endif
+	for (i = 0; i < n_fine; i++) fine_to_coarse[i] = -1;
+
+	jj_counter = start_indexing;
+	jj_counter_offd = start_indexing;
+
+	/*-----------------------------------------------------------------------
+	 *  Loop over fine grid.
+	 *-----------------------------------------------------------------------*/
+
+/* RDF: this looks a little tricky, but doable */
+#if 0
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(i,j,i1,jj,ns,ne,size,rest) HYPRE_SMP_SCHEDULE
+#endif
+#endif
+	for (j = 0; j < num_threads; j++)
+	{
+		size = n_fine/num_threads;
+		rest = n_fine - size*num_threads;
+
+		if (j < rest)
+		{
+			ns = j*size+j;
+			ne = (j+1)*size+j+1;
+		}
+		else
+		{
+			ns = j*size+rest;
+			ne = (j+1)*size+rest;
+		}
+		for (i = ns; i < ne; i++)
+		{
+			/*--------------------------------------------------------------------
+			 *  If i is a C-point, interpolation is the identity. Also set up
+			 *  mapping vector.
+			 *--------------------------------------------------------------------*/
+
+			if (CF_marker[i] >= 0)
+			{
+				jj_count[j]++;
+				fine_to_coarse[i] = coarse_counter[j];
+				coarse_counter[j]++;
+			}
+			/*--------------------------------------------------------------------
+			 *  If i is an F-point, interpolation is the approximation of A_{ff}^{-1}A_{fc}
+			 *--------------------------------------------------------------------*/
+			 else
+			 {
+				 for (jj = A_diag_i[i]; jj < A_diag_i[i+1]; jj++)
+				 {
+					 i1 = A_diag_j[jj];
+					 if (CF_marker[i1] >= 0)
+					 {
+						 jj_count[j]++;
+					 }
+				 }
+
+				 if (num_procs > 1)
+				 {
+					 for (jj = A_offd_i[i]; jj < A_offd_i[i+1]; jj++)
+					 {
+						 i1 = A_offd_j[jj];
+						 if (CF_marker_offd[i1] >= 0)
+						 {
+							 jj_count_offd[j]++;
+						 }
+					 }
+				 }
+			 }
+			/*--------------------------------------------------------------------
+			 *  Set up the indexes for the DRS method
+			 *--------------------------------------------------------------------*/
+
+		}
+	}
+
+	/*-----------------------------------------------------------------------
+	 *  Allocate  arrays.
+	 *-----------------------------------------------------------------------*/
+	for (i=0; i < num_threads-1; i++)
+	{
+		coarse_counter[i+1] += coarse_counter[i];
+		jj_count[i+1] += jj_count[i];
+		jj_count_offd[i+1] += jj_count_offd[i];
+	}
+	i = num_threads-1;
+	jj_counter = jj_count[i];
+	jj_counter_offd = jj_count_offd[i];
+
+	P_diag_size = jj_counter;
+
+	P_diag_i    = hypre_CTAlloc(HYPRE_Int, n_fine+1);
+	P_diag_j    = hypre_CTAlloc(HYPRE_Int, P_diag_size);
+	P_diag_data = hypre_CTAlloc(HYPRE_Real, P_diag_size);
+
+	P_diag_i[n_fine] = jj_counter;
+
+
+	P_offd_size = jj_counter_offd;
+
+	P_offd_i    = hypre_CTAlloc(HYPRE_Int, n_fine+1);
+	P_offd_j    = hypre_CTAlloc(HYPRE_Int, P_offd_size);
+	P_offd_data = hypre_CTAlloc(HYPRE_Real, P_offd_size);
+
+	/*-----------------------------------------------------------------------
+	 *  Intialize some stuff.
+	 *-----------------------------------------------------------------------*/
+
+	jj_counter = start_indexing;
+	jj_counter_offd = start_indexing;
+
+	if (debug_flag==4)
+	{
+		wall_time = time_getWallclockSeconds() - wall_time;
+		hypre_printf("Proc = %d     Interp: Internal work 1 =     %f\n",
+					 my_id, wall_time);
+		fflush(NULL);
+	}
+
+	/*-----------------------------------------------------------------------
+	 *  Send and receive fine_to_coarse info.
+	 *-----------------------------------------------------------------------*/
+
+	if (debug_flag==4) wall_time = time_getWallclockSeconds();
+
+	fine_to_coarse_offd = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd);
+
+#if 0
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(i,j,ns,ne,size,rest,coarse_shift) HYPRE_SMP_SCHEDULE
+#endif
+#endif
+	for (j = 0; j < num_threads; j++)
+	{
+		coarse_shift = 0;
+		if (j > 0) coarse_shift = coarse_counter[j-1];
+		size = n_fine/num_threads;
+		rest = n_fine - size*num_threads;
+		if (j < rest)
+		{
+			ns = j*size+j;
+			ne = (j+1)*size+j+1;
+		}
+		else
+		{
+			ns = j*size+rest;
+			ne = (j+1)*size+rest;
+		}
+		for (i = ns; i < ne; i++)
+			fine_to_coarse[i] += my_first_cpt+coarse_shift;
+	}
+
+	index = 0;
+	for (i = 0; i < num_sends; i++)
+	{
+		start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
+		for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
+			int_buf_data[index++]
+				= fine_to_coarse[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
+	}
+
+	comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data,
+												fine_to_coarse_offd);
+
+	hypre_ParCSRCommHandleDestroy(comm_handle);
+
+	if (debug_flag==4)
+	{
+		wall_time = time_getWallclockSeconds() - wall_time;
+		hypre_printf("Proc = %d     Interp: Comm 4 FineToCoarse = %f\n",
+					 my_id, wall_time);
+		fflush(NULL);
+	}
+
+	if (debug_flag==4) wall_time = time_getWallclockSeconds();
+
+#if 0
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
+#endif
+#endif
+	for (i = 0; i < n_fine; i++) fine_to_coarse[i] -= my_first_cpt;
+
+	/*-----------------------------------------------------------------------
+	 *  Loop over fine grid points.
+	 *-----------------------------------------------------------------------*/
+	a_diag = hypre_CTAlloc(HYPRE_Real, n_fine);
+	for (i = 0; i < n_fine; i++)
+	{
+		for (jj = A_diag_i[i]; jj < A_diag_i[i+1]; jj++)
+		{
+			i1 = A_diag_j[jj];
+			if ( i==i1 )  /* diagonal of A only */
+			{
+				a_diag[i] = 1.0/A_diag_data[jj];
+			}
+		}
+	}
+
+#if 0
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(i,j,jl,i1,jj,ns,ne,size,rest,P_marker,P_marker_offd,jj_counter,jj_counter_offd,jj_begin_row,jj_end_row,jj_begin_row_offd,jj_end_row_offd) HYPRE_SMP_SCHEDULE
+#endif
+#endif
+	for (jl = 0; jl < num_threads; jl++)
+	{
+		size = n_fine/num_threads;
+		rest = n_fine - size*num_threads;
+		if (jl < rest)
+		{
+			ns = jl*size+jl;
+			ne = (jl+1)*size+jl+1;
+		}
+		else
+		{
+			ns = jl*size+rest;
+			ne = (jl+1)*size+rest;
+		}
+		jj_counter = 0;
+		if (jl > 0) jj_counter = jj_count[jl-1];
+		jj_counter_offd = 0;
+		if (jl > 0) jj_counter_offd = jj_count_offd[jl-1];
+		P_marker = hypre_CTAlloc(HYPRE_Int, n_fine);
+		if (num_cols_A_offd)
+			P_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd);
+		else
+			P_marker_offd = NULL;
+
+		for (i = 0; i < n_fine; i++)
+		{
+			P_marker[i] = -1;
+		}
+		for (i = 0; i < num_cols_A_offd; i++)
+		{
+			P_marker_offd[i] = -1;
+		}
+		for (i = ns; i < ne; i++)
+		{
+			/*--------------------------------------------------------------------
+			 *  If i is a c-point, interpolation is the identity.
+			 *--------------------------------------------------------------------*/
+			if (CF_marker[i] >= 0)
+			{
+				P_diag_i[i] = jj_counter;
+				P_diag_j[jj_counter]    = fine_to_coarse[i];
+				P_diag_data[jj_counter] = one;
+				jj_counter++;
+			}
+			/*--------------------------------------------------------------------
+			 *  If i is an F-point, build interpolation.
+			 *--------------------------------------------------------------------*/
+			else
+			{
+				/* Diagonal part of P */
+				P_diag_i[i] = jj_counter;
+				for (jj = A_diag_i[i]; jj < A_diag_i[i+1]; jj++)
+				{
+					i1 = A_diag_j[jj];
+
+					/*--------------------------------------------------------------
+					 * If neighbor i1 is a C-point, set column number in P_diag_j
+					 * and initialize interpolation weight to zero.
+					 *--------------------------------------------------------------*/
+
+					if (CF_marker[i1] >= 0)
+					{
+						P_marker[i1] = jj_counter;
+						P_diag_j[jj_counter]    = fine_to_coarse[i1];
+						P_diag_data[jj_counter] = - A_diag_data[jj]*a_diag[i];
+
+						jj_counter++;
+					}
+				}
+
+				/* Off-Diagonal part of P */
+				P_offd_i[i] = jj_counter_offd;
+
+				if (num_procs > 1)
+				{
+					for (jj = A_offd_i[i]; jj < A_offd_i[i+1]; jj++)
+					{
+						i1 = A_offd_j[jj];
+
+						/*-----------------------------------------------------------
+						 * If neighbor i1 is a C-point, set column number in P_offd_j
+						 * and initialize interpolation weight to zero.
+						 *-----------------------------------------------------------*/
+
+						if (CF_marker_offd[i1] >= 0)
+						{
+							P_marker_offd[i1] = jj_counter_offd;
+							/*P_offd_j[jj_counter_offd]  = fine_to_coarse_offd[i1];*/
+							P_offd_j[jj_counter_offd]  = i1;
+							P_offd_data[jj_counter_offd] = - A_offd_data[jj]*a_diag[i];
+
+							jj_counter_offd++;
+						}
+					}
+				}
+			}
+			P_offd_i[i+1] = jj_counter_offd;
+		}
+		hypre_TFree(P_marker);
+		hypre_TFree(P_marker_offd);
+	}
+	hypre_TFree(a_diag);
+	P = hypre_ParCSRMatrixCreate(comm,
+								 hypre_ParCSRMatrixGlobalNumRows(A),
+								 total_global_cpts,
+								 hypre_ParCSRMatrixColStarts(A),
+								 num_cpts_global,
+								 0,
+								 P_diag_i[n_fine],
+								 P_offd_i[n_fine]);
+
+	P_diag = hypre_ParCSRMatrixDiag(P);
+	hypre_CSRMatrixData(P_diag) = P_diag_data;
+	hypre_CSRMatrixI(P_diag) = P_diag_i;
+	hypre_CSRMatrixJ(P_diag) = P_diag_j;
+	P_offd = hypre_ParCSRMatrixOffd(P);
+	hypre_CSRMatrixData(P_offd) = P_offd_data;
+	hypre_CSRMatrixI(P_offd) = P_offd_i;
+	hypre_CSRMatrixJ(P_offd) = P_offd_j;
+	hypre_ParCSRMatrixOwnsRowStarts(P) = 0;
+
+	num_cols_P_offd = 0;
+
+	if (P_offd_size)
+	{
+		P_marker = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd);
+#if 0
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
+#endif
+#endif
+		for (i=0; i < num_cols_A_offd; i++)
+			P_marker[i] = 0;
+		num_cols_P_offd = 0;
+		for (i=0; i < P_offd_size; i++)
+		{
+			index = P_offd_j[i];
+			if (!P_marker[index])
+			{
+				num_cols_P_offd++;
+				P_marker[index] = 1;
+			}
+		}
+
+		col_map_offd_P = hypre_CTAlloc(HYPRE_Int,num_cols_P_offd);
+		index = 0;
+		for (i=0; i < num_cols_P_offd; i++)
+		{
+			while (P_marker[index]==0) index++;
+			col_map_offd_P[i] = index++;
+		}
+
+#if 0
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
+#endif
+#endif
+		for (i=0; i < P_offd_size; i++)
+			P_offd_j[i] = hypre_BinarySearch(col_map_offd_P,
+											 P_offd_j[i],
+											 num_cols_P_offd);
+		hypre_TFree(P_marker);
+	}
+
+	for (i=0; i < n_fine; i++)
+		if (CF_marker[i] == -3) CF_marker[i] = -1;
+	if (num_cols_P_offd)
+	{
+		hypre_ParCSRMatrixColMapOffd(P) = col_map_offd_P;
+		hypre_CSRMatrixNumCols(P_offd) = num_cols_P_offd;
+	}
+	hypre_GetCommPkgRTFromCommPkgA(P,A, fine_to_coarse_offd);
+
+	*P_ptr = P;
+
+	hypre_TFree(CF_marker_offd);
+	hypre_TFree(int_buf_data);
+	hypre_TFree(fine_to_coarse);
+	hypre_TFree(fine_to_coarse_offd);
+	hypre_TFree(coarse_counter);
+	hypre_TFree(jj_count);
+	hypre_TFree(jj_count_offd);
+
+	return(0);
+}
+
+/* Setup interpolation operator */
+HYPRE_Int
+hypre_MGRBuildInterp(hypre_ParCSRMatrix     *A,
+			HYPRE_Int             *CF_marker,
+                         hypre_ParCSRMatrix   *S,
+                         HYPRE_Int            *num_cpts_global,
+                         HYPRE_Int            num_functions,
+                         HYPRE_Int            *dof_func,
+                         HYPRE_Int            debug_flag,
+                         HYPRE_Real           trunc_factor,
+                         HYPRE_Int	      max_elmts,
+                         HYPRE_Int 	      *col_offd_S_to_A,
+			hypre_ParCSRMatrix    **P,
+			HYPRE_Int	      last_level,
+			HYPRE_Int	      method,
+			HYPRE_Int	      numsweeps)
+{
+   HYPRE_Int i;
+   hypre_ParCSRMatrix    *P_ptr = NULL;
+   HYPRE_Real		 jac_trunc_threshold = trunc_factor;
+   HYPRE_Real		 jac_trunc_threshold_minus = 0.5*jac_trunc_threshold;
+
+   /* Build interpolation operator using (hypre default) */
+   if(!last_level)
+   {
+           hypre_MGRBuildP( A,CF_marker,num_cpts_global,2,debug_flag,&P_ptr);
+         
+//           hypre_BoomerAMGBuildInterp(A, CF_marker, S, num_cpts_global,1, NULL,debug_flag,
+//	   	        trunc_factor, max_elmts, col_offd_S_to_A, &P_ptr);		
+   }
+   /* Do Jacobi interpolation for last level */
+   else
+   {
+	   if (method <3)
+	   {
+		   hypre_MGRBuildP( A,CF_marker,num_cpts_global,method,debug_flag,&P_ptr);
+		   /* Could do a few sweeps of Jacobi to further improve P */
+                   // for(i=0; i<numsweeps; i++)
+		   //	 hypre_BoomerAMGJacobiInterp(A, &P_ptr, S,1, NULL, CF_marker, 0, jac_trunc_threshold, jac_trunc_threshold_minus );
+	   }
+	   else //if (method == 5)
+	   {
+		   /* clone or copy nonzero pattern of A to B */
+		   hypre_ParCSRMatrix	*B;
+		   hypre_ParCSRMatrixClone( A, &B, 0 );
+		   /* Build interp with B to initialize P as injection [0 I] operator */
+		   hypre_BoomerAMGBuildInterp(B, CF_marker, S, num_cpts_global,1, NULL,debug_flag,
+									  trunc_factor, max_elmts, col_offd_S_to_A, &P_ptr);
+		   hypre_ParCSRMatrixDestroy(B);
+
+
+	   /* Do k steps of Jacobi build W for P = [-W I].
+		* Note that BoomerAMGJacobiInterp assumes you have some initial P,
+		* hence we need to initialize P as above, before calling this routine.
+		* If numsweeps = 0, the following step is skipped and P is returned as the
+		* injection operator.
+		* Looping here is equivalent to improving P by Jacobi interpolation
+           */
+		   for(i=0; i<numsweeps; i++)
+			   hypre_BoomerAMGJacobiInterp(A, &P_ptr, S,1, NULL, CF_marker,
+										   0, jac_trunc_threshold,
+										   jac_trunc_threshold_minus );
+
+	   }
+
+   }
+   /* set pointer to P */
+   *P = P_ptr;
+
+   return hypre_error_flag;
+}
+
+void hypre_blas_smat_inv_n4 (HYPRE_Real *a)
+{
+    const HYPRE_Real a11 = a[0],  a12 = a[1],  a13 = a[2],  a14 = a[3];
+    const HYPRE_Real a21 = a[4],  a22 = a[5],  a23 = a[6],  a24 = a[7];
+    const HYPRE_Real a31 = a[8],  a32 = a[9],  a33 = a[10], a34 = a[11];
+    const HYPRE_Real a41 = a[12], a42 = a[13], a43 = a[14], a44 = a[15];
+
+    const HYPRE_Real M11 = a22*a33*a44 + a23*a34*a42 + a24*a32*a43 - a22*a34*a43 - a23*a32*a44 - a24*a33*a42;
+    const HYPRE_Real M12 = a12*a34*a43 + a13*a32*a44 + a14*a33*a42 - a12*a33*a44 - a13*a34*a42 - a14*a32*a43;
+    const HYPRE_Real M13 = a12*a23*a44 + a13*a24*a42 + a14*a22*a43 - a12*a24*a43 - a13*a22*a44 - a14*a23*a42;
+    const HYPRE_Real M14 = a12*a24*a33 + a13*a22*a34 + a14*a23*a32 - a12*a23*a34 - a13*a24*a32 - a14*a22*a33;
+    const HYPRE_Real M21 = a21*a34*a43 + a23*a31*a44 + a24*a33*a41 - a21*a33*a44 - a23*a34*a41 - a24*a31*a43;
+    const HYPRE_Real M22 = a11*a33*a44 + a13*a34*a41 + a14*a31*a43 - a11*a34*a43 - a13*a31*a44 - a14*a33*a41;
+    const HYPRE_Real M23 = a11*a24*a43 + a13*a21*a44 + a14*a23*a41 - a11*a23*a44 - a13*a24*a41 - a14*a21*a43;
+    const HYPRE_Real M24 = a11*a23*a34 + a13*a24*a31 + a14*a21*a33 - a11*a24*a33 - a13*a21*a34 - a14*a23*a31;
+    const HYPRE_Real M31 = a21*a32*a44 + a22*a34*a41 + a24*a31*a42 - a21*a34*a42 - a22*a31*a44 - a24*a32*a41;
+    const HYPRE_Real M32 = a11*a34*a42 + a12*a31*a44 + a14*a32*a41 - a11*a32*a44 - a12*a34*a41 - a14*a31*a42;
+    const HYPRE_Real M33 = a11*a22*a44 + a12*a24*a41 + a14*a21*a42 - a11*a24*a42 - a12*a21*a44 - a14*a22*a41;
+    const HYPRE_Real M34 = a11*a24*a32 + a12*a21*a34 + a14*a22*a31 - a11*a22*a34 - a12*a24*a31 - a14*a21*a32;
+    const HYPRE_Real M41 = a21*a33*a42 + a22*a31*a43 + a23*a32*a41 - a21*a32*a43 - a22*a33*a41 - a23*a31*a42;
+    const HYPRE_Real M42 = a11*a32*a43 + a12*a33*a41 + a13*a31*a42 - a11*a33*a42 - a12*a31*a43 - a13*a32*a41;
+    const HYPRE_Real M43 = a11*a23*a42 + a12*a21*a43 + a13*a22*a41 - a11*a22*a43 - a12*a23*a41 - a13*a21*a42;
+    const HYPRE_Real M44 = a11*a22*a33 + a12*a23*a31 + a13*a21*a32 - a11*a23*a32 - a12*a21*a33 - a13*a22*a31;
+
+    const HYPRE_Real det = a11*M11 + a12*M21 + a13*M31 + a14*M41;
+    HYPRE_Real det_inv;
+
+    if ( fabs(det) < 1e-22 ) {
+        hypre_printf("### WARNING: Matrix is nearly singular! det = %e\n", det);
+        /*
+         printf("##----------------------------------------------\n");
+         printf("## %12.5e %12.5e %12.5e \n", a0, a1, a2);
+         printf("## %12.5e %12.5e %12.5e \n", a3, a4, a5);
+         printf("## %12.5e %12.5e %12.5e \n", a5, a6, a7);
+         printf("##----------------------------------------------\n");
+         getchar();
+         */
+    }
+
+    det_inv = 1.0/det;
+
+    a[0] = M11*det_inv;  a[1] = M12*det_inv;  a[2] = M13*det_inv;  a[3] = M14*det_inv;
+    a[4] = M21*det_inv;  a[5] = M22*det_inv;  a[6] = M23*det_inv;  a[7] = M24*det_inv;
+    a[8] = M31*det_inv;  a[9] = M32*det_inv;  a[10] = M33*det_inv; a[11] = M34*det_inv;
+    a[12] = M41*det_inv; a[13] = M42*det_inv; a[14] = M43*det_inv; a[15] = M44*det_inv;
+
+}
+
+void hypre_blas_mat_inv(HYPRE_Real *a,
+						HYPRE_Int n)
+{
+	HYPRE_Int i,j,k,l,u,kn,in;
+	HYPRE_Real alinv;
+	if (n == 4)
+	{
+		hypre_blas_smat_inv_n4(a);
+	}
+	else
+	{
+		for (k=0; k<n; ++k) {
+			kn = k*n;
+			l  = kn+k;
+
+			//if (fabs(a[l]) < SMALLREAL) {
+			//	printf("### WARNING: Diagonal entry is close to zero!");
+			//	printf("### WARNING: diag_%d=%e\n", k, a[l]);
+			//	a[l] = SMALLREAL;
+			//}
+			alinv = 1.0/a[l];
+			a[l] = alinv;
+
+			for (j=0; j<k; ++j) {
+				u = kn+j; a[u] *= alinv;
+			}
+
+			for (j=k+1; j<n; ++j) {
+				u = kn+j; a[u] *= alinv;
+			}
+
+			for (i=0; i<k; ++i) {
+				in = i*n;
+				for (j=0; j<n; ++j)
+					if (j!=k) {
+						u = in+j; a[u] -= a[in+k]*a[kn+j];
+					} // end if (j!=k)
+			}
+
+			for (i=k+1; i<n; ++i) {
+				in = i*n;
+				for (j=0; j<n; ++j)
+					if (j!=k) {
+						u = in+j; a[u] -= a[in+k]*a[kn+j];
+					} // end if (j!=k)
+			}
+
+			for (i=0; i<k; ++i) {
+				u=i*n+k; a[u] *= -alinv;
+			}
+
+			for (i=k+1; i<n; ++i) {
+				u=i*n+k; a[u] *= -alinv;
+			}
+		} // end for (k=0; k<n; ++k)
+	}// end if
+}
+
+HYPRE_Int hypre_block_jacobi_scaling(hypre_ParCSRMatrix *A, hypre_ParCSRMatrix **B_ptr,
+				     void *mgr_vdata, HYPRE_Int debug_flag)
+{
+	MPI_Comm 	         comm = hypre_ParCSRMatrixComm(A);
+
+	hypre_ParMGRData   *mgr_data =  (hypre_ParMGRData*) mgr_vdata;
+
+	HYPRE_Int		   num_procs,  my_id;
+
+	HYPRE_Int    blk_size  = (mgr_data -> block_size);
+	HYPRE_Int    reserved_coarse_size = (mgr_data -> reserved_coarse_size);
+
+	hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
+	HYPRE_Real      *A_diag_data = hypre_CSRMatrixData(A_diag);
+	HYPRE_Int             *A_diag_i = hypre_CSRMatrixI(A_diag);
+	HYPRE_Int             *A_diag_j = hypre_CSRMatrixJ(A_diag);
+
+	hypre_ParCSRMatrix    *B;
+
+	hypre_CSRMatrix *B_diag;
+	HYPRE_Real      *B_diag_data;
+	HYPRE_Int       *B_diag_i;
+	HYPRE_Int       *B_diag_j;
+
+	hypre_CSRMatrix *B_offd;
+	HYPRE_Int              i,ii;
+	HYPRE_Int              j,jj;
+	HYPRE_Int              k;
+
+	HYPRE_Int              n = hypre_CSRMatrixNumRows(A_diag);
+	HYPRE_Int n_block, left_size,inv_size;
+
+//	HYPRE_Real       wall_time;  /* for debugging instrumentation  */
+	HYPRE_Int        bidx,bidxm1,bidxp1;
+	HYPRE_Real       * diaginv;
+
+	const HYPRE_Int     nb2 = blk_size*blk_size;
+
+	HYPRE_Int block_scaling_error = 0;
+
+	hypre_MPI_Comm_size(comm,&num_procs);
+	hypre_MPI_Comm_rank(comm,&my_id);
+//	HYPRE_Int num_threads = hypre_NumThreads();
+
+	//printf("n = %d\n",n);
+
+	if (my_id == num_procs)
+	{
+		n_block   = (n - reserved_coarse_size) / blk_size;
+		left_size = n - blk_size*n_block;
+	}
+	else
+	{
+		n_block = n / blk_size;
+		left_size = n - blk_size*n_block;
+	}
+
+	inv_size  = nb2*n_block + left_size*left_size;
+
+	//printf("inv_size = %d\n",inv_size);
+
+	hypre_blockRelax_setup(A,blk_size,reserved_coarse_size,&(mgr_data -> diaginv));
+
+//	if (debug_flag==4) wall_time = time_getWallclockSeconds();
+
+	/*-----------------------------------------------------------------------
+	 *  First Pass: Determine size of B and fill in
+	 *-----------------------------------------------------------------------*/
+
+	B_diag_i    = hypre_CTAlloc(HYPRE_Int, n+1);
+	B_diag_j    = hypre_CTAlloc(HYPRE_Int, inv_size);
+	B_diag_data = hypre_CTAlloc(HYPRE_Real, inv_size);
+
+	B_diag_i[n] = inv_size;
+
+	//B_offd_i    = hypre_CTAlloc(HYPRE_Int, n+1);
+	//B_offd_j    = hypre_CTAlloc(HYPRE_Int, 1);
+	//B_offd_data = hypre_CTAlloc(HYPRE_Real,1);
+
+	//B_offd_i[n] = 1;
+    /*-----------------------------------------------------------------
+	 * Get all the diagonal sub-blocks
+	 *-----------------------------------------------------------------*/
+	diaginv = hypre_CTAlloc(HYPRE_Real, nb2);
+	//printf("n_block = %d\n",n_block);
+	for (i = 0;i < n_block; i++)
+	{
+		bidxm1 = i*blk_size;
+		bidxp1 = (i+1)*blk_size;
+
+		for (k = 0;k < blk_size; k++)
+		{
+			for (j = 0;j < blk_size; j++)
+			{
+				bidx = k*blk_size + j;
+				diaginv[bidx] = 0.0;
+			}
+
+			for (ii = A_diag_i[bidxm1+k]; ii < A_diag_i[bidxm1+k+1]; ii++)
+			{
+
+				jj = A_diag_j[ii];
+
+				if (jj >= bidxm1 && jj < bidxp1 && fabs(A_diag_data[ii]) > SMALLREAL)
+				{
+					bidx = k*blk_size + jj - bidxm1;
+					//printf("jj = %d,val = %e, bidx = %d\n",jj,A_diag_data[ii],bidx);
+					diaginv[bidx] = A_diag_data[ii];
+				}
+			}
+		}
+
+		/* for (k = 0;k < blk_size; k++) */
+		/* { */
+		/* 	for (j = 0;j < blk_size; j++) */
+		/* 	{ */
+		/* 		bidx = k*blk_size + j; */
+		/* 		printf("diaginv[%d] = %e\n",bidx,diaginv[bidx]); */
+		/* 	} */
+		/* } */
+
+		hypre_blas_mat_inv(diaginv, blk_size);
+
+		for (k = 0;k < blk_size; k++)
+		{
+			B_diag_i[i*blk_size+k] = i*nb2 + k*blk_size;
+			//B_offd_i[i*nb2+k] = 0;
+
+			for (j = 0;j < blk_size; j++)
+			{
+				bidx = i*nb2 + k*blk_size + j;
+				B_diag_j[bidx] = i*blk_size + j;
+				B_diag_data[bidx] = diaginv[k*blk_size + j];
+			}
+		}
+	}
+
+	//printf("Before create\n");
+	B = hypre_ParCSRMatrixCreate(comm,
+	 							 hypre_ParCSRMatrixGlobalNumRows(A),
+	 							 hypre_ParCSRMatrixGlobalNumCols(A),
+	 							 hypre_ParCSRMatrixRowStarts(A),
+	 							 hypre_ParCSRMatrixColStarts(A),
+	 							 0,
+	 							 inv_size,
+								 0);
+	//printf("After create\n");
+	B_diag = hypre_ParCSRMatrixDiag(B);
+	hypre_CSRMatrixData(B_diag) = B_diag_data;
+	hypre_CSRMatrixI(B_diag) = B_diag_i;
+	hypre_CSRMatrixJ(B_diag) = B_diag_j;
+	B_offd = hypre_ParCSRMatrixOffd(B);
+	hypre_CSRMatrixData(B_offd) = NULL;
+	hypre_CSRMatrixI(B_offd) = NULL;
+	hypre_CSRMatrixJ(B_offd) = NULL;
+	/* hypre_ParCSRMatrixOwnsRowStarts(B) = 0; */
+
+	*B_ptr = B;
+
+	return(block_scaling_error);
+}
+
+HYPRE_Int hypre_block_jacobi (hypre_ParCSRMatrix *A,
+							  hypre_ParVector    *f,
+							  hypre_ParVector    *u,
+							  HYPRE_Real         blk_size,
+							  HYPRE_Int           n_block,
+							  HYPRE_Int           left_size,
+                              HYPRE_Real *diaginv,
+							  hypre_ParVector    *Vtemp)
+{
+	MPI_Comm	   comm = hypre_ParCSRMatrixComm(A);
+	hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
+	HYPRE_Real     *A_diag_data  = hypre_CSRMatrixData(A_diag);
+	HYPRE_Int            *A_diag_i     = hypre_CSRMatrixI(A_diag);
+	HYPRE_Int            *A_diag_j     = hypre_CSRMatrixJ(A_diag);
+	hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
+	HYPRE_Int            *A_offd_i     = hypre_CSRMatrixI(A_offd);
+	HYPRE_Real     *A_offd_data  = hypre_CSRMatrixData(A_offd);
+	HYPRE_Int            *A_offd_j     = hypre_CSRMatrixJ(A_offd);
+	hypre_ParCSRCommPkg  *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
+	hypre_ParCSRCommHandle *comm_handle;
+
+	HYPRE_Int             n       = hypre_CSRMatrixNumRows(A_diag);
+	HYPRE_Int             num_cols_offd = hypre_CSRMatrixNumCols(A_offd);
+
+	hypre_Vector   *u_local = hypre_ParVectorLocalVector(u);
+	HYPRE_Real     *u_data  = hypre_VectorData(u_local);
+
+	hypre_Vector   *f_local = hypre_ParVectorLocalVector(f);
+	HYPRE_Real     *f_data  = hypre_VectorData(f_local);
+
+	hypre_Vector   *Vtemp_local = hypre_ParVectorLocalVector(Vtemp);
+	HYPRE_Real     *Vtemp_data = hypre_VectorData(Vtemp_local);
+	HYPRE_Real 	  *Vext_data = NULL;
+	HYPRE_Real 	  *v_buf_data;
+
+	HYPRE_Int             i, j, k;
+	HYPRE_Int             ii, jj;
+	HYPRE_Int             bidx,bidx1;
+	HYPRE_Int             relax_error = 0;
+	HYPRE_Int		   num_sends;
+	HYPRE_Int		   index, start;
+	HYPRE_Int		   num_procs, my_id;
+	HYPRE_Real	    *res;
+
+    const HYPRE_Int     nb2 = blk_size*blk_size;
+
+	hypre_MPI_Comm_size(comm,&num_procs);
+	hypre_MPI_Comm_rank(comm,&my_id);
+//	HYPRE_Int num_threads = hypre_NumThreads();
+
+	res = hypre_CTAlloc(HYPRE_Real, blk_size);
+
+	if (num_procs > 1)
+	{
+		num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
+
+		v_buf_data = hypre_CTAlloc(HYPRE_Real,
+								   hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends));
+
+		Vext_data = hypre_CTAlloc(HYPRE_Real,num_cols_offd);
+
+		if (num_cols_offd)
+		{
+			A_offd_j = hypre_CSRMatrixJ(A_offd);
+			A_offd_data = hypre_CSRMatrixData(A_offd);
+		}
+
+		index = 0;
+		for (i = 0; i < num_sends; i++)
+		{
+        	start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
+        	for (j=start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
+				v_buf_data[index++]
+                 	= u_data[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
+		}
+
+		comm_handle = hypre_ParCSRCommHandleCreate( 1, comm_pkg, v_buf_data,
+													Vext_data);
+	}
+
+	/*-----------------------------------------------------------------
+	 * Copy current approximation into temporary vector.
+	 *-----------------------------------------------------------------*/
+
+#if 0
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
+#endif
+#endif
+	for (i = 0; i < n; i++)
+	{
+		Vtemp_data[i] = u_data[i];
+		//printf("u_old[%d] = %e\n",i,Vtemp_data[i]);
+	}
+	if (num_procs > 1)
+	{
+		hypre_ParCSRCommHandleDestroy(comm_handle);
+		comm_handle = NULL;
+	}
+
+	/*-----------------------------------------------------------------
+	 * Relax points block by block
+	 *-----------------------------------------------------------------*/
+	for (i = 0;i < n_block; i++)
+	{
+		for (j = 0;j < blk_size; j++)
+		{
+			bidx = i*blk_size +j;
+			res[j] = f_data[bidx];
+			for (jj = A_diag_i[bidx]; jj < A_diag_i[bidx+1]; jj++)
+			{
+				ii = A_diag_j[jj];
+				res[j] -= A_diag_data[jj] * Vtemp_data[ii];
+				//printf("%d: Au= %e * %e =%e\n",ii,A_diag_data[jj],Vtemp_data[ii], res[j]);
+			}
+			for (jj = A_offd_i[bidx]; jj < A_offd_i[bidx+1]; jj++)
+			{
+				ii = A_offd_j[jj];
+				res[j] -= A_offd_data[jj] * Vext_data[ii];
+			}
+			//printf("%d: res = %e\n",bidx,res[j]);
+		}
+
+		for (j = 0;j < blk_size; j++)
+		{
+			bidx1 = i*blk_size +j;
+			for (k = 0;k < blk_size; k++)
+			{
+				bidx  = i*nb2 +j*blk_size+k;
+				u_data[bidx1] += res[k]*diaginv[bidx];
+				//printf("u[%d] = %e, diaginv[%d] = %e\n",bidx1,u_data[bidx1],bidx,diaginv[bidx]);
+			}
+			//printf("u[%d] = %e\n",bidx1,u_data[bidx1]);
+		}
+	}
+
+	if (num_procs > 1)
+	{
+		hypre_TFree(Vext_data);
+		hypre_TFree(v_buf_data);
+	}
+	hypre_TFree(res);
+	return(relax_error);
+}
+
+/*Block smoother*/
+HYPRE_Int
+hypre_blockRelax_setup(hypre_ParCSRMatrix *A,
+					   HYPRE_Int          blk_size,
+					   HYPRE_Int          reserved_coarse_size,
+					   HYPRE_Real        **diaginvptr)
+{
+	MPI_Comm	   comm = hypre_ParCSRMatrixComm(A);
+	hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
+	HYPRE_Real     *A_diag_data  = hypre_CSRMatrixData(A_diag);
+	HYPRE_Int            *A_diag_i     = hypre_CSRMatrixI(A_diag);
+	HYPRE_Int            *A_diag_j     = hypre_CSRMatrixJ(A_diag);
+	HYPRE_Int             n       = hypre_CSRMatrixNumRows(A_diag);
+
+	HYPRE_Int             i, j,k;
+	HYPRE_Int             ii, jj;
+	HYPRE_Int             bidx,bidxm1,bidxp1;
+	HYPRE_Int		   num_procs, my_id;
+
+    const HYPRE_Int     nb2 = blk_size*blk_size;
+	HYPRE_Int           n_block;
+	HYPRE_Int           left_size,inv_size;
+	HYPRE_Real        *diaginv = *diaginvptr;
+
+
+	hypre_MPI_Comm_size(comm,&num_procs);
+	hypre_MPI_Comm_rank(comm,&my_id);
+//	HYPRE_Int num_threads = hypre_NumThreads();
+
+	if (my_id == num_procs)
+	{
+		n_block   = (n - reserved_coarse_size) / blk_size;
+		left_size = n - blk_size*n_block;
+	}
+	else
+	{
+		n_block = n / blk_size;
+		left_size = n - blk_size*n_block;
+	}
+
+	inv_size  = nb2*n_block + left_size*left_size;
+
+	if (diaginv !=NULL)
+	{
+		hypre_TFree(diaginv);
+		diaginv = hypre_CTAlloc(HYPRE_Real, inv_size);
+	}
+	else {
+		diaginv = hypre_CTAlloc(HYPRE_Real, inv_size);
+	}
+
+	/*-----------------------------------------------------------------
+	 * Get all the diagonal sub-blocks
+	 *-----------------------------------------------------------------*/
+	for (i = 0;i < n_block; i++)
+	{
+		bidxm1 = i*blk_size;
+		bidxp1 = (i+1)*blk_size;
+		//printf("bidxm1 = %d,bidxp1 = %d\n",bidxm1,bidxp1);
+
+		for (k = 0;k < blk_size; k++)
+		{
+			for (j = 0;j < blk_size; j++)
+			{
+				bidx = i*nb2 + k*blk_size + j;
+				diaginv[bidx] = 0.0;
+			}
+
+			for (ii = A_diag_i[bidxm1+k]; ii < A_diag_i[bidxm1+k+1]; ii++)
+			{
+
+				jj = A_diag_j[ii];
+
+				if (jj >= bidxm1 && jj < bidxp1 && fabs(A_diag_data[ii]) > SMALLREAL)
+				{
+					bidx = i*nb2 + k*blk_size + jj - bidxm1;
+					//printf("jj = %d,val = %e, bidx = %d\n",jj,A_diag_data[ii],bidx);
+					diaginv[bidx] = A_diag_data[ii];
+				}
+			}
+		}
+	}
+
+
+
+	for (i = 0;i < left_size; i++)
+	{
+		bidxm1 =n_block*nb2 + i*blk_size;
+		bidxp1 =n_block*nb2 + (i+1)*blk_size;
+		for (j = 0;j < left_size; j++)
+		{
+			bidx = n_block*nb2 + i*blk_size +j;
+			diaginv[bidx] = 0.0;
+		}
+
+		for (ii = A_diag_i[n_block*blk_size + i]; ii < A_diag_i[n_block*blk_size+i+1]; ii++)
+		{
+			jj = A_diag_j[ii];
+			if (jj > n_block*blk_size)
+			{
+				bidx = n_block*nb2 + i*blk_size + jj - n_block*blk_size;
+				diaginv[bidx] = A_diag_data[ii];
+			}
+		}
+	}
+
+
+	/*-----------------------------------------------------------------
+	 * compute the inverses of all the diagonal sub-blocks
+	 *-----------------------------------------------------------------*/
+	if (blk_size > 1)
+	{
+		for (i = 0;i < n_block; i++)
+		{
+			hypre_blas_mat_inv(diaginv+i*nb2, blk_size);
+		}
+		hypre_blas_mat_inv(diaginv+(HYPRE_Int)(blk_size*nb2),left_size);
+	}
+	else
+	{
+		for (i = 0;i < n; i++)
+		{
+			// FIX-ME: zero-diagonal should be tested previously
+			if (fabs(diaginv[i]) < SMALLREAL)
+				diaginv[i] = 0.0;
+			else
+				diaginv[i] = 1.0 / diaginv[i];
+		}
+	}
+
+	*diaginvptr = diaginv;
+
+	return 1;
+}
+
+HYPRE_Int
+hypre_blockRelax(hypre_ParCSRMatrix *A,
+				 hypre_ParVector    *f,
+				 hypre_ParVector    *u,
+				 HYPRE_Int          blk_size,
+				 HYPRE_Int          reserved_coarse_size,
+				 hypre_ParVector    *Vtemp,
+				 hypre_ParVector    *Ztemp)
+{
+	MPI_Comm	   comm = hypre_ParCSRMatrixComm(A);
+	hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
+	HYPRE_Real     *A_diag_data  = hypre_CSRMatrixData(A_diag);
+	HYPRE_Int            *A_diag_i     = hypre_CSRMatrixI(A_diag);
+	HYPRE_Int            *A_diag_j     = hypre_CSRMatrixJ(A_diag);
+	HYPRE_Int             n       = hypre_CSRMatrixNumRows(A_diag);
+
+	HYPRE_Int             i, j,k;
+	HYPRE_Int             ii, jj;
+
+	HYPRE_Int             bidx,bidxm1,bidxp1;
+	HYPRE_Int             relax_error = 0;
+
+	HYPRE_Int		   num_procs, my_id;
+
+    const HYPRE_Int     nb2 = blk_size*blk_size;
+	HYPRE_Int           n_block;
+	HYPRE_Int           left_size,inv_size;
+	HYPRE_Real          *diaginv;
+
+	hypre_MPI_Comm_size(comm,&num_procs);
+	hypre_MPI_Comm_rank(comm,&my_id);
+	
+//	HYPRE_Int num_threads = hypre_NumThreads();
+
+	if (my_id == num_procs)
+	{
+		n_block   = (n - reserved_coarse_size) / blk_size;
+		left_size = n - blk_size*n_block;
+	}
+	else
+	{
+		n_block = n / blk_size;
+		left_size = n - blk_size*n_block;
+	}
+
+	inv_size  = nb2*n_block + left_size*left_size;
+
+	diaginv = hypre_CTAlloc(HYPRE_Real, inv_size);
+	/*-----------------------------------------------------------------
+	 * Get all the diagonal sub-blocks
+	 *-----------------------------------------------------------------*/
+	for (i = 0;i < n_block; i++)
+	{
+		bidxm1 = i*blk_size;
+		bidxp1 = (i+1)*blk_size;
+		//printf("bidxm1 = %d,bidxp1 = %d\n",bidxm1,bidxp1);
+
+		for (k = 0;k < blk_size; k++)
+		{
+			for (j = 0;j < blk_size; j++)
+			{
+				bidx = i*nb2 + k*blk_size + j;
+				diaginv[bidx] = 0.0;
+			}
+
+			for (ii = A_diag_i[bidxm1+k]; ii < A_diag_i[bidxm1+k+1]; ii++)
+			{
+
+				jj = A_diag_j[ii];
+
+				if (jj >= bidxm1 && jj < bidxp1 && fabs(A_diag_data[ii]) > SMALLREAL)
+				{
+					bidx = i*nb2 + k*blk_size + jj - bidxm1;
+					//printf("jj = %d,val = %e, bidx = %d\n",jj,A_diag_data[ii],bidx);
+					diaginv[bidx] = A_diag_data[ii];
+				}
+			}
+		}
+
+	}
+
+	for (i = 0;i < left_size; i++)
+	{
+		bidxm1 =n_block*nb2 + i*blk_size;
+		bidxp1 =n_block*nb2 + (i+1)*blk_size;
+		for (j = 0;j < left_size; j++)
+		{
+			bidx = n_block*nb2 + i*blk_size +j;
+			diaginv[bidx] = 0.0;
+		}
+
+		for (ii = A_diag_i[n_block*blk_size + i]; ii < A_diag_i[n_block*blk_size+i+1]; ii++)
+		{
+			jj = A_diag_j[ii];
+			if (jj > n_block*blk_size)
+			{
+				bidx = n_block*nb2 + i*blk_size + jj - n_block*blk_size;
+				diaginv[bidx] = A_diag_data[ii];
+			}
+		}
+	}
+/*
+	for (i = 0;i < n_block; i++)
+	{
+		for (j = 0;j < blk_size; j++)
+		{
+
+			for (k = 0;k < blk_size; k ++)
+			{
+				bidx = i*nb2 + j*blk_size + k;
+				printf("%e\t",diaginv[bidx]);
+			}
+			printf("\n");
+		}
+		printf("\n");
+	}
+*/
+	/*-----------------------------------------------------------------
+	 * compute the inverses of all the diagonal sub-blocks
+	 *-----------------------------------------------------------------*/
+	if (blk_size > 1)
+	{
+		for (i = 0;i < n_block; i++)
+		{
+			hypre_blas_mat_inv(diaginv+i*nb2, blk_size);
+		}
+		hypre_blas_mat_inv(diaginv+(HYPRE_Int)(blk_size*nb2),left_size);
+		/*
+		for (i = 0;i < n_block; i++)
+		{
+			for (j = 0;j < blk_size; j++)
+			{
+
+				for (k = 0;k < blk_size; k ++)
+				{
+					bidx = i*nb2 + j*blk_size + k;
+					printf("%e\t",diaginv[bidx]);
+				}
+				printf("\n");
+			}
+			printf("\n");
+		}
+		*/
+	}
+	else
+	{
+		for (i = 0;i < n; i++)
+		{
+			// FIX-ME: zero-diagonal should be tested previously
+			if (fabs(diaginv[i]) < SMALLREAL)
+				diaginv[i] = 0.0;
+			else
+				diaginv[i] = 1.0 / diaginv[i];
+		}
+
+	}
+
+	hypre_block_jacobi(A,f,u,blk_size,n_block,left_size,diaginv,Vtemp);
+
+	/*-----------------------------------------------------------------
+	 * Free temperary memeory
+	 *-----------------------------------------------------------------*/
+	hypre_TFree(diaginv);
+
+	return(relax_error);
+}
+
+/* set coarse grid solver */
+HYPRE_Int
+hypre_MGRSetCoarseSolver( void  *mgr_vdata,
+							HYPRE_Int  (*coarse_grid_solver_solve)(void*,void*,void*,void*),
+							HYPRE_Int  (*coarse_grid_solver_setup)(void*,void*,void*,void*),
+							void  *coarse_grid_solver )
+{
+   hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+
+   if (!mgr_data)
+   {
+      hypre_error_in_arg(1);
+      return hypre_error_flag;
+   }
+
+   (mgr_data -> coarse_grid_solver_solve) = coarse_grid_solver_solve;
+   (mgr_data -> coarse_grid_solver_setup) = coarse_grid_solver_setup;
+   (mgr_data -> coarse_grid_solver)       = (HYPRE_Solver) coarse_grid_solver;
+
+   (mgr_data -> use_default_cgrid_solver) = 0;
+
+   return hypre_error_flag;
+}
+
+/* Set the maximum number of coarse levels.
+ * maxcoarselevs = 1 yields the default 2-grid scheme.
+*/
+HYPRE_Int
+hypre_MGRSetMaxCoarseLevels( void *mgr_vdata, HYPRE_Int maxcoarselevs )
+{
+   hypre_ParMGRData   *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+   (mgr_data -> max_num_coarse_levels) = maxcoarselevs;
+   return hypre_error_flag;
+}
+/* Set the system block size */
+HYPRE_Int
+hypre_MGRSetBlockSize( void *mgr_vdata, HYPRE_Int bsize )
+{
+   hypre_ParMGRData   *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+   (mgr_data -> block_size) = bsize;
+   return hypre_error_flag;
+}
+/* Set the relaxation type for the fine levels of the reduction.
+ * Currently supports the following flavors of relaxation types
+ * as described in the documentation:
+ * relax_types 0 - 8, 13, 14, 18, 19, 98.
+ * See par_relax.c and par_relax_more.c for more details.
+ *
+*/
+HYPRE_Int
+hypre_MGRSetRelaxType( void *mgr_vdata, HYPRE_Int relax_type )
+{
+   hypre_ParMGRData   *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+   (mgr_data -> relax_type) = relax_type;
+   return hypre_error_flag;
+}
+
+/* Set the number of relaxation sweeps */
+HYPRE_Int
+hypre_MGRSetNumRelaxSweeps( void *mgr_vdata, HYPRE_Int nsweeps )
+{
+   hypre_ParMGRData   *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+   (mgr_data -> num_relax_sweeps) = nsweeps;
+   return hypre_error_flag;
+}
+
+/* Set the F-relaxation strategy: 0=single level, 1=multi level
+*/
+HYPRE_Int
+hypre_MGRSetFRelaxMethod( void *mgr_vdata, HYPRE_Int relax_method )
+{
+   hypre_ParMGRData   *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+   (mgr_data -> Frelax_method) = relax_method;
+   return hypre_error_flag;
+}
+/* Set the type of the restriction type
+ * for computing restriction operator
+*/
+HYPRE_Int
+hypre_MGRSetRestrictType( void *mgr_vdata, HYPRE_Int restrict_type)
+{
+   hypre_ParMGRData   *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+   (mgr_data -> restrict_type) = restrict_type;
+   return hypre_error_flag;
+}
+
+/* Set the type of the interpolation
+ * for computing interpolation operator
+*/
+HYPRE_Int
+hypre_MGRSetInterpType( void *mgr_vdata, HYPRE_Int interpType)
+{
+   hypre_ParMGRData   *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+   (mgr_data -> interp_type) = interpType;
+   return hypre_error_flag;
+}
+/* Set the number of Jacobi interpolation iterations
+ * for computing interpolation operator
+*/
+HYPRE_Int
+hypre_MGRSetNumInterpSweeps( void *mgr_vdata, HYPRE_Int nsweeps )
+{
+   hypre_ParMGRData   *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+   (mgr_data -> num_interp_sweeps) = nsweeps;
+   return hypre_error_flag;
+}
+/* Set print level for mgr solver */
+HYPRE_Int
+hypre_MGRSetPrintLevel( void *mgr_vdata, HYPRE_Int print_level )
+{
+   hypre_ParMGRData   *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+   (mgr_data -> print_level) = print_level;
+   return hypre_error_flag;
+}
+/* Set print level for mgr solver */
+HYPRE_Int
+hypre_MGRSetLogging( void *mgr_vdata, HYPRE_Int logging )
+{
+   hypre_ParMGRData   *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+   (mgr_data -> logging) = logging;
+   return hypre_error_flag;
+}
+/* Set max number of iterations for mgr solver */
+HYPRE_Int
+hypre_MGRSetMaxIter( void *mgr_vdata, HYPRE_Int max_iter )
+{
+   hypre_ParMGRData   *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+   (mgr_data -> max_iter) = max_iter;
+   return hypre_error_flag;
+}
+/* Set convergence tolerance for mgr solver */
+HYPRE_Int
+hypre_MGRSetTol( void *mgr_vdata, HYPRE_Real tol )
+{
+   hypre_ParMGRData   *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+   (mgr_data -> tol) = tol;
+   return hypre_error_flag;
+}
+/* Set max number of iterations for mgr solver */
+HYPRE_Int
+hypre_MGRSetMaxGlobalsmoothIters( void *mgr_vdata, HYPRE_Int max_iter )
+{
+   hypre_ParMGRData   *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+   (mgr_data -> global_smooth_iters) = max_iter;
+   return hypre_error_flag;
+}
+/* Set max number of iterations for mgr solver */
+
+HYPRE_Int
+hypre_MGRSetGlobalsmoothType( void *mgr_vdata, HYPRE_Int iter_type )
+{
+   hypre_ParMGRData   *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+   (mgr_data -> global_smooth_type) = iter_type;
+   return hypre_error_flag;
+}
+
+/* Get number of iterations for MGR solver */
+HYPRE_Int
+hypre_MGRGetNumIterations( void *mgr_vdata, HYPRE_Int *num_iterations )
+{
+   hypre_ParMGRData  *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+
+   if (!mgr_data)
+   {
+      hypre_error_in_arg(1);
+      return hypre_error_flag;
+   }
+   *num_iterations = mgr_data->num_iterations;
+
+   return hypre_error_flag;
+}
+
+/* Get residual norms for MGR solver */
+HYPRE_Int
+hypre_MGRGetFinalRelativeResidualNorm( void *mgr_vdata, HYPRE_Real *res_norm )
+{
+   hypre_ParMGRData  *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+
+   if (!mgr_data)
+   {
+      hypre_error_in_arg(1);
+      return hypre_error_flag;
+   }
+   *res_norm = mgr_data->final_rel_residual_norm;
+
+   return hypre_error_flag;
+}
+
+HYPRE_Int 
+hypre_MGRBuildAff( MPI_Comm comm, HYPRE_Int local_num_variables, HYPRE_Int num_functions, 
+  HYPRE_Int *dof_func, HYPRE_Int *CF_marker, HYPRE_Int **coarse_dof_func_ptr, HYPRE_Int **coarse_pnts_global_ptr,
+  hypre_ParCSRMatrix *A, HYPRE_Int debug_flag, hypre_ParCSRMatrix **P_f_ptr, hypre_ParCSRMatrix **A_ff_ptr ) 
+{
+  HYPRE_Int *CF_marker_copy = hypre_CTAlloc(HYPRE_Int, local_num_variables);
+  HYPRE_Int i;
+  for (i = 0; i < local_num_variables; i++) {
+    CF_marker_copy[i] = -CF_marker[i];
+  }
+
+  hypre_BoomerAMGCoarseParms(comm, local_num_variables, 1, NULL, CF_marker_copy, coarse_dof_func_ptr, coarse_pnts_global_ptr);
+  hypre_MGRBuildP(A, CF_marker_copy, (*coarse_pnts_global_ptr), 0, debug_flag, P_f_ptr);
+  hypre_BoomerAMGBuildCoarseOperator(*P_f_ptr, A, *P_f_ptr, A_ff_ptr);
+
+  hypre_TFree(CF_marker_copy);
+  return 0;
+}
diff -Nru hypre-2.11.2/src/parcsr_ls/par_mgr.h hypre-2.13.0/src/parcsr_ls/par_mgr.h
--- hypre-2.11.2/src/parcsr_ls/par_mgr.h	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/par_mgr.h	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,114 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+#ifndef hypre_ParMGR_DATA_HEADER
+#define hypre_ParMGR_DATA_HEADER
+/*--------------------------------------------------------------------------
+ * hypre_ParMGRData
+ *--------------------------------------------------------------------------*/
+typedef struct
+{
+  // block data
+  HYPRE_Int  block_size;
+  HYPRE_Int  num_coarse_indexes;
+  HYPRE_Int  *block_num_coarse_indexes;
+  HYPRE_Int  **block_cf_marker;
+
+  // initial setup data (user provided)
+  HYPRE_Int num_coarse_levels;
+  HYPRE_Int *num_coarse_per_level;
+  HYPRE_Int **level_coarse_indexes;  
+
+  //general data
+  HYPRE_Int max_num_coarse_levels;
+  hypre_ParCSRMatrix **A_array;
+  hypre_ParCSRMatrix **P_array;
+  hypre_ParCSRMatrix **RT_array;
+  hypre_ParCSRMatrix *RAP;
+  HYPRE_Int **CF_marker_array;
+  HYPRE_Int **coarse_indices_lvls;
+  hypre_ParVector    **F_array;
+  hypre_ParVector    **U_array;
+  hypre_ParVector    *residual;
+  HYPRE_Real    *rel_res_norms;
+
+  HYPRE_Real   max_row_sum;
+  HYPRE_Real	num_interp_sweeps;
+  HYPRE_Int    interp_type;
+  HYPRE_Int   restrict_type;
+  HYPRE_Real   strong_threshold;
+  HYPRE_Real   trunc_factor;
+  HYPRE_Real   S_commpkg_switch;
+  HYPRE_Int	P_max_elmts;
+  HYPRE_Int  	num_iterations;
+
+  HYPRE_Real   **l1_norms;
+  HYPRE_Real	final_rel_residual_norm;
+  HYPRE_Real	tol;
+  HYPRE_Real	relax_weight;
+  HYPRE_Int	relax_type;
+  HYPRE_Int	logging;
+  HYPRE_Int	print_level;
+  HYPRE_Int	max_iter;
+  HYPRE_Int	relax_order;
+  HYPRE_Int	num_relax_sweeps;
+
+  HYPRE_Solver coarse_grid_solver;
+  HYPRE_Int	(*coarse_grid_solver_setup)(void*,void*,void*,void*);
+  HYPRE_Int	(*coarse_grid_solver_solve)(void*,void*,void*,void*);
+
+  HYPRE_Int	use_default_cgrid_solver;
+  HYPRE_Real	omega;
+
+  /* temp vectors for solve phase */
+  hypre_ParVector   *Vtemp;
+  hypre_ParVector   *Ztemp;
+  hypre_ParVector   *Utemp;
+  hypre_ParVector   *Ftemp;
+
+  HYPRE_Real          *diaginv;
+  HYPRE_Int           n_block;
+  HYPRE_Int           left_size;
+  HYPRE_Int           global_smooth_iters;
+  HYPRE_Int           global_smooth_type;
+  HYPRE_Solver global_smoother;
+  /* 
+   Number of points that remain part of the coarse grid throughout the hierarchy.
+   For example, number of well equations
+   */
+  HYPRE_Int reserved_coarse_size;
+  HYPRE_Int *reserved_coarse_indexes;
+  HYPRE_Int *reserved_Cpoint_local_indexes;
+ 
+  HYPRE_Int set_non_Cpoints_to_F;
+  
+  /* F-relaxation method */
+  HYPRE_Int Frelax_method;
+  /* V-cycle F relaxation method */
+  hypre_ParAMGData    **FrelaxVcycleData;
+
+  HYPRE_Int   max_local_lvls;
+  
+} hypre_ParMGRData;
+
+
+#define FMRK  -1
+#define CMRK  1
+#define UMRK  0
+#define S_CMRK  2
+
+#define FPT(i, bsize) (((i) % (bsize)) == FMRK)
+#define CPT(i, bsize) (((i) % (bsize)) == CMRK)
+
+#define SMALLREAL 1e-20
+
+#endif
diff -Nru hypre-2.11.2/src/parcsr_ls/par_mgr_setup.c hypre-2.13.0/src/parcsr_ls/par_mgr_setup.c
--- hypre-2.11.2/src/parcsr_ls/par_mgr_setup.c	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/par_mgr_setup.c	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,932 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+#include "_hypre_parcsr_ls.h"
+#include "par_mgr.h"
+#include "par_amg.h"
+
+/* Setup MGR data */
+HYPRE_Int
+hypre_MGRSetup( void               *mgr_vdata,
+                  hypre_ParCSRMatrix *A,
+                  hypre_ParVector    *f,
+                  hypre_ParVector    *u )
+{
+	MPI_Comm 	         comm = hypre_ParCSRMatrixComm(A);
+	hypre_ParMGRData   *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+
+	HYPRE_Int       cnt,i,j, final_coarse_size, block_size, idx, row, **block_cf_marker;
+	HYPRE_Int	   lev, num_coarsening_levs, last_level, num_c_levels, num_threads,nc,index_i,cflag;
+	HYPRE_Int	   debug_flag = 0;
+
+	hypre_ParCSRMatrix  *RT = NULL;
+	hypre_ParCSRMatrix  *P = NULL;
+	hypre_ParCSRMatrix  *S = NULL;
+	hypre_ParCSRMatrix  *ST = NULL;
+	hypre_ParCSRMatrix  *AT = NULL;
+
+	HYPRE_Int * col_offd_S_to_A = NULL;
+	HYPRE_Int * col_offd_ST_to_AT = NULL;
+	HYPRE_Int * dof_func_buff = NULL;
+	HYPRE_Int * coarse_pnts_global = NULL;
+	HYPRE_Real         **l1_norms = NULL;
+
+	hypre_ParVector     *Ztemp;
+	hypre_ParVector     *Vtemp;
+	hypre_ParVector     *Utemp;
+	hypre_ParVector     *Ftemp;
+
+	/* pointers to mgr data */
+	HYPRE_Int  use_default_cgrid_solver = (mgr_data -> use_default_cgrid_solver);
+	HYPRE_Int  logging = (mgr_data -> logging);
+	HYPRE_Int  print_level = (mgr_data -> print_level);
+	HYPRE_Int  relax_type = (mgr_data -> relax_type);
+	HYPRE_Int  relax_order = (mgr_data -> relax_order);
+	HYPRE_Int  interp_type = (mgr_data -> interp_type);
+  HYPRE_Int  restrict_type = (mgr_data -> restrict_type);
+	HYPRE_Int num_interp_sweeps = (mgr_data -> num_interp_sweeps);
+	HYPRE_Int num_restrict_sweeps = (mgr_data -> num_interp_sweeps);
+	HYPRE_Int	max_elmts = (mgr_data -> P_max_elmts);
+	HYPRE_Real   max_row_sum = (mgr_data -> max_row_sum);
+	HYPRE_Real   strong_threshold = (mgr_data -> strong_threshold);
+	HYPRE_Real   trunc_factor = (mgr_data -> trunc_factor);
+	HYPRE_Real   S_commpkg_switch = (mgr_data -> S_commpkg_switch);
+	HYPRE_Int  old_num_coarse_levels = (mgr_data -> num_coarse_levels);
+	HYPRE_Int  max_num_coarse_levels = (mgr_data -> max_num_coarse_levels);
+	HYPRE_Int * reserved_Cpoint_local_indexes = (mgr_data -> reserved_Cpoint_local_indexes);
+	HYPRE_Int ** CF_marker_array = (mgr_data -> CF_marker_array);
+	hypre_ParCSRMatrix  **A_array = (mgr_data -> A_array);
+	hypre_ParCSRMatrix  **P_array = (mgr_data -> P_array);
+	hypre_ParCSRMatrix  **RT_array = (mgr_data -> RT_array);	
+	hypre_ParCSRMatrix  *RAP_ptr = NULL;
+
+	hypre_ParVector    **F_array = (mgr_data -> F_array);
+	hypre_ParVector    **U_array = (mgr_data -> U_array);
+	hypre_ParVector    *residual = (mgr_data -> residual);
+	HYPRE_Real    *rel_res_norms = (mgr_data -> rel_res_norms);
+
+	HYPRE_Solver    	default_cg_solver;
+	HYPRE_Int	(*coarse_grid_solver_setup)(void*,void*,void*,void*) = (HYPRE_Int (*)(void*, void*, void*, void*)) (mgr_data -> coarse_grid_solver_setup);
+	HYPRE_Int	(*coarse_grid_solver_solve)(void*,void*,void*,void*) = (HYPRE_Int (*)(void*, void*, void*, void*)) (mgr_data -> coarse_grid_solver_solve);
+
+	HYPRE_Int    global_smooth_type =  (mgr_data -> global_smooth_type);
+
+	HYPRE_Int    reserved_coarse_size = (mgr_data -> reserved_coarse_size);
+
+	HYPRE_Int		   num_procs,  my_id;
+	hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
+	HYPRE_Int             n       = hypre_CSRMatrixNumRows(A_diag);
+	HYPRE_Int    blk_size  = (mgr_data -> block_size);
+  
+	hypre_ParAMGData    **FrelaxVcycleData = (mgr_data -> FrelaxVcycleData);  
+	HYPRE_Int Frelax_method = (mgr_data -> Frelax_method);
+
+	/* ----- begin -----*/
+
+	num_threads = hypre_NumThreads();
+
+  block_size = (mgr_data -> block_size);
+  block_cf_marker = (mgr_data -> block_cf_marker);
+  
+    HYPRE_Int **level_coarse_indexes = NULL;
+    HYPRE_Int *level_coarse_size = NULL;
+    HYPRE_Int setNonCpointToF = (mgr_data -> set_non_Cpoints_to_F);
+    HYPRE_Int *reserved_coarse_indexes = (mgr_data -> reserved_coarse_indexes);
+
+   if (print_level > 0)
+   {
+      hypre_printf("Solver info: \n");
+      hypre_printf("Relax type: %d\n", relax_type);
+      hypre_printf("Number of relax sweeps: %d\n", (mgr_data -> num_relax_sweeps));
+      hypre_printf("Interpolation type: %d\n", interp_type);
+      hypre_printf("Number of interpolation sweeps: %d\n", num_interp_sweeps);
+      hypre_printf("Restriction type: %d\n", restrict_type);
+      hypre_printf("Max number of iterations: %d\n", (mgr_data -> max_iter));
+      hypre_printf("Max number of coarse levels: %d\n", (mgr_data -> max_num_coarse_levels));
+      hypre_printf("Tolerance: %e\n", (mgr_data -> tol));
+   }
+
+
+//  HYPRE_Int num_coarse_levels = (mgr_data -> max_num_coarse_levels);
+  
+  HYPRE_Int nloc =  hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A));
+  HYPRE_Int ilower =  hypre_ParCSRMatrixFirstRowIndex(A);
+  HYPRE_Int iupper =  hypre_ParCSRMatrixLastRowIndex(A);
+       
+        /* Trivial case: simply solve the coarse level problem */
+	if( block_size < 2 || (mgr_data -> max_num_coarse_levels) < 1)
+	{
+		hypre_printf("Warning: Block size is < 2 or number of coarse levels is < 1. \n");
+		hypre_printf("Solving scalar problem on fine grid using coarse level solver \n");
+
+		if(use_default_cgrid_solver)
+		{
+			hypre_printf("No coarse grid solver provided. Using default AMG solver ... \n");
+			/* create and set default solver parameters here */
+			/* create and initialize default_cg_solver */
+			default_cg_solver = (HYPRE_Solver) hypre_BoomerAMGCreate();
+			hypre_BoomerAMGSetMaxIter ( default_cg_solver, (mgr_data -> max_iter) );
+
+			hypre_BoomerAMGSetRelaxOrder( default_cg_solver, 1);
+			hypre_BoomerAMGSetPrintLevel(default_cg_solver, 3);
+			/* set setup and solve functions */
+			coarse_grid_solver_setup = (HYPRE_Int (*)(void*, void*, void*, void*)) hypre_BoomerAMGSetup;
+			coarse_grid_solver_solve = (HYPRE_Int (*)(void*, void*, void*, void*)) hypre_BoomerAMGSolve;
+			(mgr_data -> coarse_grid_solver_setup) = coarse_grid_solver_setup;
+			(mgr_data -> coarse_grid_solver_solve) = coarse_grid_solver_solve;
+			(mgr_data -> coarse_grid_solver) = default_cg_solver;
+		}
+
+		// keep reserved coarse indexes to coarsest grid
+  		if((mgr_data -> reserved_Cpoint_local_indexes) != NULL)
+     			hypre_TFree((mgr_data -> reserved_Cpoint_local_indexes));
+  		if (reserved_coarse_size > 0)
+  		{
+			(mgr_data -> reserved_Cpoint_local_indexes) = hypre_CTAlloc(HYPRE_Int, reserved_coarse_size);
+			reserved_Cpoint_local_indexes = (mgr_data -> reserved_Cpoint_local_indexes);
+			cnt=0;
+			for(i=0; i<reserved_coarse_size; i++)
+			{
+				row = reserved_coarse_indexes[i];
+				reserved_Cpoint_local_indexes[cnt++] = row - ilower;
+			}
+		        HYPRE_BoomerAMGSetCpointsToKeep((mgr_data ->coarse_grid_solver), 25,reserved_coarse_size,reserved_Cpoint_local_indexes);	
+   		}
+
+		/* setup coarse grid solver */
+//		hypre_BoomerAMGSetMaxIter ( (mgr_data -> coarse_grid_solver), (mgr_data -> max_iter) );
+//		hypre_BoomerAMGSetPrintLevel((mgr_data -> coarse_grid_solver), 3);
+		coarse_grid_solver_setup((mgr_data -> coarse_grid_solver), A, f, u);
+		(mgr_data -> num_coarse_levels) = 0;
+
+		return hypre_error_flag;
+	}
+	
+/*
+  if ((mgr_data -> level_coarse_indexes) != NULL)
+  {
+     for(i=0; i<max_num_coarse_levels; i++)
+     {
+        if((mgr_data -> level_coarse_indexes)[i] != NULL)
+        {
+           hypre_TFree((mgr_data -> level_coarse_indexes)[i]);
+        }
+     }
+     hypre_TFree((mgr_data -> level_coarse_indexes));
+     (mgr_data -> level_coarse_indexes) = NULL;
+     hypre_TFree((mgr_data -> num_coarse_per_level));
+     (mgr_data -> num_coarse_per_level) = NULL;     
+  }
+*/
+
+   /* Initialize local indexes of coarse sets at different levels */      
+  level_coarse_indexes = hypre_CTAlloc(HYPRE_Int*, max_num_coarse_levels);
+  for (i = 0; i < max_num_coarse_levels; i++) 
+  {
+     level_coarse_indexes[i] = hypre_CTAlloc(HYPRE_Int,nloc);
+  }
+
+  level_coarse_size = hypre_CTAlloc(HYPRE_Int, max_num_coarse_levels);
+
+  // loop over levels
+  for(i=0; i<max_num_coarse_levels; i++)
+  {
+     // loop over rows
+     final_coarse_size = 0;
+     for(row = ilower; row <=iupper; row++)
+     {
+        idx = row % block_size;
+        if(block_cf_marker[i][idx] == CMRK)
+        {
+           level_coarse_indexes[i][final_coarse_size++] = row - ilower;
+        }
+     }
+     level_coarse_size[i] = final_coarse_size;
+  }
+  
+  // Set reserved coarse indexes to be kept to the coarsest level of the MGR solver
+  if((mgr_data -> reserved_Cpoint_local_indexes) != NULL)
+     hypre_TFree((mgr_data -> reserved_Cpoint_local_indexes));
+  if (reserved_coarse_size > 0)
+  {
+	(mgr_data -> reserved_Cpoint_local_indexes) = hypre_CTAlloc(HYPRE_Int, reserved_coarse_size);
+	reserved_Cpoint_local_indexes = (mgr_data -> reserved_Cpoint_local_indexes);
+//	cnt=0;
+	for(i=0; i<reserved_coarse_size; i++)
+	{
+		row = reserved_coarse_indexes[i];
+		idx = row % block_size;
+		for(j=0; j<max_num_coarse_levels; j++)
+		{
+		   if(block_cf_marker[j][idx] != CMRK)
+		   {
+		      level_coarse_indexes[j][level_coarse_size[j]++] = row - ilower;
+		   }
+		}
+		reserved_Cpoint_local_indexes[i] = row - ilower;
+	}
+   }   
+    
+  (mgr_data -> level_coarse_indexes) = level_coarse_indexes;
+  
+  (mgr_data -> num_coarse_per_level) = level_coarse_size;
+
+  /* Free Previously allocated data, if any not destroyed */
+  if (A_array || P_array || RT_array || CF_marker_array)
+  {
+    for (j = 1; j < (old_num_coarse_levels); j++)
+    {
+      if (A_array[j])
+      {
+            hypre_ParCSRMatrixDestroy(A_array[j]);
+            A_array[j] = NULL;
+      }
+    }
+
+    for (j = 0; j < old_num_coarse_levels; j++)
+    {
+         if (P_array[j])
+         {
+            hypre_ParCSRMatrixDestroy(P_array[j]);
+            P_array[j] = NULL;
+         }
+
+         if (RT_array[j])
+         {
+       hypre_ParCSRMatrixDestroy(RT_array[j]);
+       RT_array[j] = NULL;
+         }
+
+         if (CF_marker_array[j])
+         {
+       hypre_TFree(CF_marker_array[j]);
+       CF_marker_array[j] = NULL;
+         }
+    }
+    hypre_TFree(P_array);
+    P_array = NULL;
+    hypre_TFree(RT_array);
+    RT_array = NULL;
+    hypre_TFree(CF_marker_array);
+    CF_marker_array = NULL;   
+  }
+
+   /* Free previously allocated FrelaxVcycleData if not destroyed
+   */
+   if(FrelaxVcycleData)
+   {
+      for (j = 0; j < old_num_coarse_levels; j++)
+      {
+         if (FrelaxVcycleData[j])
+         {
+            hypre_MGRDestroyFrelaxVcycleData(FrelaxVcycleData[j]);
+            FrelaxVcycleData[j] = NULL;
+         }
+      }
+      hypre_TFree(FrelaxVcycleData);
+      FrelaxVcycleData = NULL;   
+   }
+   // reset pointer to NULL
+   (mgr_data -> FrelaxVcycleData) = FrelaxVcycleData;
+
+  /* destroy final coarse grid matrix, if not previously destroyed */
+  if((mgr_data -> RAP))
+  {
+    hypre_ParCSRMatrixDestroy((mgr_data -> RAP));
+    (mgr_data -> RAP) = NULL;
+  }
+
+	/* Setup for global block smoothers*/
+
+	hypre_MPI_Comm_size(comm,&num_procs);
+	hypre_MPI_Comm_rank(comm,&my_id);
+	if (my_id == num_procs)
+	{
+		mgr_data -> n_block   = (n - reserved_coarse_size) / blk_size;
+		mgr_data -> left_size = n - blk_size*(mgr_data -> n_block);
+	}
+	else
+	{
+		mgr_data -> n_block = n / blk_size;
+		mgr_data -> left_size = n - blk_size*(mgr_data -> n_block);
+	}
+	if (global_smooth_type == 0)
+	{
+		hypre_blockRelax_setup(A,blk_size,reserved_coarse_size,&(mgr_data -> diaginv));
+	}
+	else if (global_smooth_type == 8)
+	{
+		HYPRE_EuclidCreate(comm, &(mgr_data -> global_smoother));
+		HYPRE_EuclidSetLevel(mgr_data -> global_smoother, 0);
+		HYPRE_EuclidSetBJ(mgr_data -> global_smoother, 1);
+		HYPRE_EuclidSetup(mgr_data -> global_smoother, A, f, u);
+	}
+
+
+	/* clear old l1_norm data, if created */
+	if((mgr_data -> l1_norms))
+	{
+		for (j = 0; j < (old_num_coarse_levels); j++)
+		{
+			if ((mgr_data -> l1_norms)[j])
+			{
+				hypre_TFree((mgr_data -> l1_norms)[j]);
+				(mgr_data -> l1_norms)[j] = NULL;
+			}
+		}
+		hypre_TFree((mgr_data -> l1_norms));
+	}
+
+	/* setup temporary storage */
+	if ((mgr_data -> Ztemp))
+	{
+		hypre_ParVectorDestroy((mgr_data -> Ztemp));
+		(mgr_data -> Ztemp) = NULL;
+	}
+	if ((mgr_data -> Vtemp))
+	{
+		hypre_ParVectorDestroy((mgr_data -> Vtemp));
+		(mgr_data -> Vtemp) = NULL;
+	}
+	if ((mgr_data -> Utemp))
+	{
+		hypre_ParVectorDestroy((mgr_data -> Utemp));
+		(mgr_data -> Utemp) = NULL;
+	}
+	if ((mgr_data -> Ftemp))
+	{
+		hypre_ParVectorDestroy((mgr_data -> Ftemp));
+		(mgr_data -> Ftemp) = NULL;
+	}
+	if ((mgr_data -> residual))
+	{
+		hypre_ParVectorDestroy((mgr_data -> residual));
+		(mgr_data -> residual) = NULL;
+	}
+	if ((mgr_data -> rel_res_norms))
+	{
+		hypre_TFree((mgr_data -> rel_res_norms));
+		(mgr_data -> rel_res_norms) = NULL;
+	}
+
+	Vtemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A),
+								  hypre_ParCSRMatrixGlobalNumRows(A),
+								  hypre_ParCSRMatrixRowStarts(A));
+	hypre_ParVectorInitialize(Vtemp);
+	hypre_ParVectorSetPartitioningOwner(Vtemp,0);
+	(mgr_data ->Vtemp) = Vtemp;
+
+	Ztemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A),
+								  hypre_ParCSRMatrixGlobalNumRows(A),
+								  hypre_ParCSRMatrixRowStarts(A));
+	hypre_ParVectorInitialize(Ztemp);
+	hypre_ParVectorSetPartitioningOwner(Ztemp,0);
+	(mgr_data -> Ztemp) = Ztemp;
+
+	Utemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A),
+								  hypre_ParCSRMatrixGlobalNumRows(A),
+								  hypre_ParCSRMatrixRowStarts(A));
+	hypre_ParVectorInitialize(Utemp);
+	hypre_ParVectorSetPartitioningOwner(Utemp,0);
+	(mgr_data ->Utemp) = Utemp;
+
+	Ftemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A),
+								  hypre_ParCSRMatrixGlobalNumRows(A),
+								  hypre_ParCSRMatrixRowStarts(A));
+	hypre_ParVectorInitialize(Ftemp);
+	hypre_ParVectorSetPartitioningOwner(Ftemp,0);
+	(mgr_data ->Ftemp) = Ftemp;
+
+  /* Allocate memory for level structure */
+  if (A_array == NULL)
+    A_array = hypre_CTAlloc(hypre_ParCSRMatrix*, max_num_coarse_levels);
+  if (P_array == NULL && max_num_coarse_levels > 0)
+    P_array = hypre_CTAlloc(hypre_ParCSRMatrix*, max_num_coarse_levels);
+  if (RT_array == NULL && max_num_coarse_levels > 0)
+    RT_array = hypre_CTAlloc(hypre_ParCSRMatrix*, max_num_coarse_levels);
+  if (CF_marker_array == NULL)
+    CF_marker_array = hypre_CTAlloc(HYPRE_Int*, max_num_coarse_levels);
+
+  /* set pointers to mgr data */
+  (mgr_data -> A_array) = A_array;
+  (mgr_data -> P_array) = P_array;
+  (mgr_data -> RT_array) = RT_array;
+  (mgr_data -> CF_marker_array) = CF_marker_array;
+
+  /* Set up solution and rhs arrays */
+  if (F_array != NULL || U_array != NULL)
+  {
+    for (j = 1; j < old_num_coarse_levels+1; j++)
+    {
+      if (F_array[j] != NULL)
+      {
+        hypre_ParVectorDestroy(F_array[j]);
+        F_array[j] = NULL;
+      }
+      if (U_array[j] != NULL)
+      {
+        hypre_ParVectorDestroy(U_array[j]);
+        U_array[j] = NULL;
+      }
+    }
+  }
+
+  if (F_array == NULL)
+    F_array = hypre_CTAlloc(hypre_ParVector*, max_num_coarse_levels+1);
+  if (U_array == NULL)
+    U_array = hypre_CTAlloc(hypre_ParVector*, max_num_coarse_levels+1);
+
+  /* set solution and rhs pointers */
+  F_array[0] = f;
+  U_array[0] = u;
+
+  (mgr_data -> F_array) = F_array;
+  (mgr_data -> U_array) = U_array;
+
+  /* begin coarsening loop */
+	num_coarsening_levs = max_num_coarse_levels;
+	/* initialize level data matrix here */
+	RAP_ptr = A;
+	/* loop over levels of coarsening */
+	for(lev = 0; lev < num_coarsening_levs; lev++)
+	{
+		/* check if this is the last level */
+		last_level = ((lev == num_coarsening_levs-1));
+
+		/* initialize A_array */
+		A_array[lev] = RAP_ptr;
+    		nloc = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A_array[lev]));
+
+		/* Compute strength matrix for interpolation operator - use default parameters, to be modified later */
+		hypre_BoomerAMGCreateS(A_array[lev], strong_threshold, max_row_sum, 1, NULL, &S);
+
+		/* use appropriate communication package for Strength matrix */
+		if (strong_threshold > S_commpkg_switch)
+			hypre_BoomerAMGCreateSCommPkg(A_array[lev],S,&col_offd_S_to_A);
+		
+		/* Coarsen: Build CF_marker array based on rows of A */
+                cflag = ((last_level || setNonCpointToF));
+                hypre_MGRCoarsen(S, A_array[lev], level_coarse_size[lev], level_coarse_indexes[lev],debug_flag, &CF_marker_array[lev], cflag);
+
+		/* Get global coarse sizes. Note that we assume num_functions = 1
+		 * so dof_func arrays are NULL */
+		hypre_BoomerAMGCoarseParms(comm, nloc, 1, NULL, CF_marker_array[lev], &dof_func_buff,&coarse_pnts_global);
+		/* Compute Petrov-Galerkin operators */
+		/* Interpolation operator */
+		num_interp_sweeps = (mgr_data -> num_interp_sweeps);
+
+		hypre_MGRBuildInterp(A_array[lev], CF_marker_array[lev], S, coarse_pnts_global, 1, dof_func_buff, 
+                                       debug_flag, trunc_factor, max_elmts, col_offd_S_to_A, &P, 1, interp_type, num_interp_sweeps);
+		
+		P_array[lev] = P;
+
+      		/* Build AT (transpose A) */
+      		hypre_ParCSRMatrixTranspose(A_array[lev], &AT, 1);
+
+      		/* Build new strength matrix */
+      		hypre_BoomerAMGCreateS(AT, strong_threshold, max_row_sum, 1, NULL, &ST);
+      		/* use appropriate communication package for Strength matrix */
+      		if (strong_threshold > S_commpkg_switch)
+         		hypre_BoomerAMGCreateSCommPkg(AT, ST, &col_offd_ST_to_AT);
+
+      		num_restrict_sweeps = 0; /* do injection for restriction */
+      		hypre_MGRBuildInterp(AT, CF_marker_array[lev], ST, coarse_pnts_global, 1, dof_func_buff,
+                         	debug_flag, trunc_factor, max_elmts, col_offd_ST_to_AT, &RT, last_level, 0, num_restrict_sweeps);
+                         	
+      		RT_array[lev] = RT;
+
+      		/* Compute RAP for next level */
+      		hypre_BoomerAMGBuildCoarseOperator(RT, A_array[lev], P, &RAP_ptr);
+      		
+      		/* Update coarse level indexes for next levels */
+      		if (lev < num_coarsening_levs - 1) 
+      		{
+      		   // first mark indexes to be updated
+      		   for(i=0; i<level_coarse_size[lev+1]; i++)
+      		   {
+      		      CF_marker_array[lev][level_coarse_indexes[lev+1][i]] = S_CMRK;
+      		   }
+      		   // next: loop over levels to update indexes
+      		   for(i=lev+1; i<max_num_coarse_levels; i++)
+      		   {
+      		      nc = 0;
+       		      index_i = 0;
+      		      for(j=0; j<nloc; j++)
+      		      {
+      		         if(CF_marker_array[lev][j] == CMRK) nc++;
+      		         if(CF_marker_array[lev][j] == S_CMRK)
+      		         {
+      		            level_coarse_indexes[i][index_i++] = nc++;
+      		         }
+      		         if(index_i == level_coarse_size[i]) break;
+      		      }
+      		   }
+      		   // then: reset previously marked indexes
+      		   for(i=0; i<level_coarse_size[lev]; i++)
+      		   {
+      		      CF_marker_array[lev][level_coarse_indexes[lev][i]] = CMRK;
+      		   }
+      		}
+//      		printf("#### nloc = %d, level_coarse_index = %d \n",nloc, level_coarse_size[lev]);
+      		// update reserved coarse indexes to be kept to coarsest level
+      		// first mark indexes to be updated
+      		for(i=0; i<reserved_coarse_size; i++)
+      		{
+      		   CF_marker_array[lev][reserved_Cpoint_local_indexes[i]] = S_CMRK;
+      		}   
+      		// loop to update reserved Cpoints
+      		nc = 0;
+      		index_i = 0;
+      		for(i=0; i<nloc; i++)
+      		{
+      		   if(CF_marker_array[lev][i] == CMRK) nc++;
+      		   if(CF_marker_array[lev][i] == S_CMRK)
+      		   {
+      		      reserved_Cpoint_local_indexes[index_i++] = nc++;
+      		      // reset modified CF marker array indexes
+      		      CF_marker_array[lev][i] = CMRK;
+      		   }
+      		}      		 
+
+    /* allocate space for solution and rhs arrays */
+    F_array[lev+1] =
+      hypre_ParVectorCreate(hypre_ParCSRMatrixComm(RAP_ptr),
+                            hypre_ParCSRMatrixGlobalNumRows(RAP_ptr),
+                            hypre_ParCSRMatrixRowStarts(RAP_ptr));
+    hypre_ParVectorInitialize(F_array[lev+1]);
+    hypre_ParVectorSetPartitioningOwner(F_array[lev+1],0);
+
+    U_array[lev+1] =
+      hypre_ParVectorCreate(hypre_ParCSRMatrixComm(RAP_ptr),
+                            hypre_ParCSRMatrixGlobalNumRows(RAP_ptr),
+                            hypre_ParCSRMatrixRowStarts(RAP_ptr));
+    hypre_ParVectorInitialize(U_array[lev+1]);
+    hypre_ParVectorSetPartitioningOwner(U_array[lev+1],0);
+
+    /* free memory before starting next level */
+    hypre_ParCSRMatrixDestroy(S);
+    S = NULL;
+    hypre_TFree(col_offd_S_to_A);
+    col_offd_S_to_A = NULL;
+
+    hypre_ParCSRMatrixDestroy(AT);
+    hypre_ParCSRMatrixDestroy(ST);
+    ST = NULL;
+    hypre_TFree(col_offd_ST_to_AT);
+    col_offd_ST_to_AT = NULL;
+
+    /* check if last level */
+    if(last_level) break;
+
+  }
+
+   /* set pointer to last level matrix */
+   num_c_levels = lev+1;
+   (mgr_data->num_coarse_levels) = num_c_levels;
+   (mgr_data->RAP) = RAP_ptr;
+
+   /* setup default coarse grid solver */
+   /* default is BoomerAMG */
+   if(use_default_cgrid_solver)
+   {
+      hypre_printf("No coarse grid solver provided. Using default AMG solver ... \n");
+      /* create and set default solver parameters here */
+      default_cg_solver = (HYPRE_Solver) hypre_BoomerAMGCreate();
+      hypre_BoomerAMGSetMaxIter ( default_cg_solver, 1 );
+      hypre_BoomerAMGSetRelaxOrder( default_cg_solver, 1);
+      hypre_BoomerAMGSetPrintLevel(default_cg_solver, 0);
+      /* set setup and solve functions */
+      coarse_grid_solver_setup =  (HYPRE_Int (*)(void*, void*, void*, void*)) hypre_BoomerAMGSetup;
+      coarse_grid_solver_solve =  (HYPRE_Int (*)(void*, void*, void*, void*)) hypre_BoomerAMGSolve;
+      (mgr_data -> coarse_grid_solver_setup) =   coarse_grid_solver_setup;
+      (mgr_data -> coarse_grid_solver_solve) =   coarse_grid_solver_solve;
+      (mgr_data -> coarse_grid_solver) = default_cg_solver;
+   }
+   // keep reserved coarse indexes to coarsest grid
+   if(reserved_coarse_size > 0)
+      HYPRE_BoomerAMGSetCpointsToKeep((mgr_data ->coarse_grid_solver), 25,reserved_coarse_size,reserved_Cpoint_local_indexes);
+
+   /* setup coarse grid solver */
+   coarse_grid_solver_setup((mgr_data -> coarse_grid_solver), RAP_ptr, F_array[num_c_levels], U_array[num_c_levels]);
+
+   /* Setup smoother for fine grid */
+   if (	relax_type == 8 || relax_type == 13 || relax_type == 14 || relax_type == 18 )
+   {
+      l1_norms = hypre_CTAlloc(HYPRE_Real *, num_c_levels);
+      (mgr_data -> l1_norms) = l1_norms;
+   }
+   for (j = 0; j < num_c_levels; j++)
+   {
+      if (num_threads == 1)
+      {
+         if (relax_type == 8 || relax_type == 13 || relax_type == 14)
+         {
+            if (relax_order)
+               hypre_ParCSRComputeL1Norms(A_array[j], 4, CF_marker_array[j], &l1_norms[j]);
+            else
+               hypre_ParCSRComputeL1Norms(A_array[j], 4, NULL, &l1_norms[j]);
+         }
+         else if (relax_type == 18)
+         {
+            if (relax_order)
+               hypre_ParCSRComputeL1Norms(A_array[j], 1, CF_marker_array[j], &l1_norms[j]);
+            else
+               hypre_ParCSRComputeL1Norms(A_array[j], 1, NULL, &l1_norms[j]);
+         }
+      }
+      else
+      {
+         if (relax_type == 8 || relax_type == 13 || relax_type == 14)
+         {
+            if (relax_order)
+               hypre_ParCSRComputeL1NormsThreads(A_array[j], 4, num_threads, CF_marker_array[j] , &l1_norms[j]);
+            else
+               hypre_ParCSRComputeL1NormsThreads(A_array[j], 4, num_threads, NULL, &l1_norms[j]);
+         }
+         else if (relax_type == 18)
+         {
+            if (relax_order)
+               hypre_ParCSRComputeL1NormsThreads(A_array[j], 1, num_threads, CF_marker_array[j] , &l1_norms[j]);
+            else
+               hypre_ParCSRComputeL1NormsThreads(A_array[j], 1, num_threads, NULL, &l1_norms[j]);
+         }
+      }
+   }
+   
+   /* Setup Vcycle data for Frelax_method > 0 */
+   if(Frelax_method == 1)
+   {
+      /* allocate memory and set pointer to (mgr_data -> FrelaxVcycleData) */
+      if(FrelaxVcycleData == NULL)
+         FrelaxVcycleData = hypre_CTAlloc(hypre_ParAMGData*, max_num_coarse_levels);
+      (mgr_data -> FrelaxVcycleData) = FrelaxVcycleData;  
+      /* loop over levels */
+      for(i=0; i<(mgr_data->num_coarse_levels); i++)
+      { 
+         FrelaxVcycleData[i] = (hypre_ParAMGData*) hypre_MGRCreateFrelaxVcycleData();
+         (FrelaxVcycleData[i] -> Vtemp) = Vtemp;
+         (FrelaxVcycleData[i] -> Ztemp) = Ztemp;
+      
+         // setup variables for the V-cycle in the F-relaxation step //
+         hypre_MGRSetupFrelaxVcycleData(mgr_data, A_array[i], F_array[i], U_array[i], i);
+      }
+   }
+
+   if ( logging > 1 ) {
+
+      residual =
+	hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A_array[0]),
+                              hypre_ParCSRMatrixGlobalNumRows(A_array[0]),
+                              hypre_ParCSRMatrixRowStarts(A_array[0]) );
+      hypre_ParVectorInitialize(residual);
+      hypre_ParVectorSetPartitioningOwner(residual,0);
+      (mgr_data -> residual) = residual;
+   }
+   else{
+      (mgr_data -> residual) = NULL;
+   }
+   rel_res_norms = hypre_CTAlloc(HYPRE_Real,(mgr_data -> max_iter));
+   (mgr_data -> rel_res_norms) = rel_res_norms;
+
+   /* free level_coarse_indexes data */
+   if ( level_coarse_indexes != NULL)
+   {
+      for(i=0; i<max_num_coarse_levels; i++)
+      {
+         hypre_TFree(level_coarse_indexes[i]);
+      }
+      hypre_TFree( level_coarse_indexes);
+      level_coarse_indexes = NULL;
+      hypre_TFree(level_coarse_size);
+      level_coarse_size = NULL;       
+   }   
+
+   return hypre_error_flag;
+}
+
+/* Setup data for Frelax V-cycle */
+HYPRE_Int
+hypre_MGRSetupFrelaxVcycleData( void *mgr_vdata,
+                  hypre_ParCSRMatrix *A,
+                  hypre_ParVector    *f,
+                  hypre_ParVector    *u,
+                  HYPRE_Int	     lev )
+{
+  MPI_Comm           comm = hypre_ParCSRMatrixComm(A);
+  hypre_ParMGRData   *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+  hypre_ParAMGData    **FrelaxVcycleData = mgr_data -> FrelaxVcycleData;
+  
+  HYPRE_Int i, j, num_procs, my_id;
+  
+  HYPRE_Int max_local_lvls = (mgr_data -> max_local_lvls);  
+  HYPRE_Int lev_local;
+  HYPRE_Int not_finished;
+//  HYPRE_Int min_local_coarse_size = 0;
+  HYPRE_Int max_local_coarse_size = 2;
+  HYPRE_Int ge_relax_type = 9;
+  HYPRE_Int            **CF_marker_array = (mgr_data -> CF_marker_array);
+  HYPRE_Int local_size;
+  HYPRE_Int local_coarse_size;
+
+  HYPRE_Int *coarse_pnts_global_lvl = NULL;
+  HYPRE_Int *coarse_dof_func_lvl = NULL;
+
+  hypre_ParCSRMatrix *RAP_local = NULL;
+  hypre_ParCSRMatrix *P_local = NULL;
+  hypre_ParCSRMatrix *S_local = NULL;
+  
+  HYPRE_Int 		smrk_local = -1;
+  
+  HYPRE_Int	      old_num_levels = (FrelaxVcycleData[lev] -> num_levels);
+  HYPRE_Int            **CF_marker_array_local = (FrelaxVcycleData[lev] -> CF_marker_array);
+  HYPRE_Int            *CF_marker_local = NULL;
+  hypre_ParCSRMatrix   **A_array_local = (FrelaxVcycleData[lev] -> A_array);
+  hypre_ParCSRMatrix   **P_array_local = (FrelaxVcycleData[lev] -> P_array);
+  hypre_ParVector      **F_array_local = (FrelaxVcycleData[lev] -> F_array);
+  hypre_ParVector      **U_array_local = (FrelaxVcycleData[lev] -> U_array);
+
+  hypre_MPI_Comm_size(comm, &num_procs);
+  hypre_MPI_Comm_rank(comm,&my_id);
+  
+  local_size = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A));
+
+  /* Free any local data not previously destroyed */
+  if (A_array_local || P_array_local || CF_marker_array_local)
+  {
+    for (j = 1; j < old_num_levels; j++)
+    {
+      if (A_array_local[j])
+      {
+         hypre_ParCSRMatrixDestroy(A_array_local[j]);
+         A_array_local[j] = NULL;
+      }
+    }
+
+    for (j = 0; j < old_num_levels-1; j++)
+    {
+       if (P_array_local[j])
+       {
+          hypre_ParCSRMatrixDestroy(P_array_local[j]);
+          P_array_local[j] = NULL;
+       }
+    }
+
+    for (j = 0; j < old_num_levels-1; j++)
+    {
+      if (CF_marker_array_local[j]) 
+      {
+         hypre_TFree(CF_marker_array_local[j]);
+         CF_marker_array_local[j] = NULL;
+      } 
+    }
+    hypre_TFree(A_array_local);
+    A_array_local = NULL;
+    hypre_TFree(P_array_local);
+    P_array_local = NULL;
+    hypre_TFree(CF_marker_array_local);
+    CF_marker_array_local = NULL;
+  }
+  /* free solution arrays not previously destroyed */
+  if (F_array_local != NULL || U_array_local != NULL)
+  {
+    for (j = 1; j < old_num_levels; j++)
+    {
+       if (F_array_local[j] != NULL)
+       {
+          hypre_ParVectorDestroy(F_array_local[j]);
+          F_array_local[j] = NULL;
+       }
+       if (U_array_local[j] != NULL)
+       {
+          hypre_ParVectorDestroy(U_array_local[j]);
+          U_array_local[j] = NULL;
+       }
+    }
+    hypre_TFree(F_array_local);
+    F_array_local = NULL;
+    hypre_TFree(U_array_local);
+    U_array_local = NULL;    
+ }
+
+  /* Initialize some variables and allocate memory */ 
+  not_finished = 1;
+  lev_local = 0;
+  if(A_array_local == NULL)
+     A_array_local = hypre_CTAlloc(hypre_ParCSRMatrix*, max_local_lvls);
+  if(P_array_local == NULL && max_local_lvls > 1)
+    P_array_local = hypre_CTAlloc(hypre_ParCSRMatrix*, max_local_lvls-1);
+  if(F_array_local == NULL)
+    F_array_local = hypre_CTAlloc(hypre_ParVector*, max_local_lvls);
+  if(U_array_local == NULL)
+    U_array_local = hypre_CTAlloc(hypre_ParVector*, max_local_lvls);
+  if(CF_marker_array_local == NULL)
+    CF_marker_array_local = hypre_CTAlloc(HYPRE_Int*, max_local_lvls);
+
+  A_array_local[0] = A;
+  F_array_local[0] = f;
+  U_array_local[0] = u;
+  
+  /* Special case max_local_lvls == 1 */
+  if (max_local_lvls == 1)
+  {
+     CF_marker_local = hypre_CTAlloc(HYPRE_Int, local_size );
+     for (i=0; i < local_size ; i++)
+        CF_marker_local[i] = 1;
+     CF_marker_array_local[0] = CF_marker_local;
+     lev_local = max_local_lvls;
+     not_finished = 0;
+  }
+
+  while (not_finished) 
+  {
+    local_size = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A_array_local[lev_local]));
+
+    if (lev_local == 0) {
+      /* use the CF_marker from the outer MGR cycle to create the strength connection matrix */
+      hypre_BoomerAMGCreateSFromCFMarker(A_array_local[lev_local], 0.25, 0.9, CF_marker_array[lev], smrk_local, &S_local);
+    } else if (lev_local > 0) {
+      hypre_BoomerAMGCreateS(A_array_local[lev_local], 0.25, 0.9, 1, NULL, &S_local);
+    }
+
+//    hypre_BoomerAMGCoarsenFalgout(S_local, A_array_local[lev_local], 0, 0, &CF_marker_local); 
+    hypre_BoomerAMGCoarsen(S_local, A_array_local[lev_local], 0, 0, &CF_marker_local);     
+    /* For the lev_local=0, the coarsening routine is called on the fine-grid (the whole matrix) 
+     * thus, some C-points of the outer MGR level may have been set to F-points in the coarsening 
+     * routine. We need to reset these back to C-points (before building the interpolation operator.
+    */
+    if (lev_local == 0) {
+      for (i = 0; i < local_size; i++) {
+        if (CF_marker_array[lev][i] == 1) {
+          CF_marker_local[i] = 1;
+        }
+      }
+    }
+
+    hypre_BoomerAMGCoarseParms(comm, local_size,
+                                   1, NULL, CF_marker_local,
+                                   &coarse_dof_func_lvl, &coarse_pnts_global_lvl);
+    hypre_BoomerAMGBuildInterp(A_array_local[lev_local], CF_marker_local, 
+                                   S_local, coarse_pnts_global_lvl, 1, NULL, 
+                                   0, 0.0, 0, NULL, &P_local);
+
+    /* save the CF_marker and interpolation matrix pointers */
+    CF_marker_array_local[lev_local] = CF_marker_local;
+    P_array_local[lev_local] = P_local;
+
+    /* build the coarse grid */
+    hypre_BoomerAMGBuildCoarseOperatorKT(P_local, A_array_local[lev_local], 
+                                    P_local, 0, &RAP_local);
+//    hypre_printf("Coarse size lev %d = %d\n", lev_local+1, hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(RAP_local)));
+
+#ifdef HYPRE_NO_GLOBAL_PARTITION
+        if (my_id == (num_procs -1)) local_coarse_size = coarse_pnts_global_lvl[1];
+        hypre_MPI_Bcast(&local_coarse_size, 1, HYPRE_MPI_INT, num_procs-1, comm);
+#else
+        local_coarse_size = coarse_pnts_global_lvl[num_procs];
+#endif
+
+    lev_local++;
+
+    if (S_local) hypre_ParCSRMatrixDestroy(S_local);
+    S_local = NULL;
+    if ( (lev_local == max_local_lvls-1) || (local_coarse_size <= max_local_coarse_size) )
+    {
+      not_finished = 0;
+    }
+
+    A_array_local[lev_local] = RAP_local;
+    F_array_local[lev_local] = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(RAP_local),
+                                          hypre_ParCSRMatrixGlobalNumRows(RAP_local),
+                                          hypre_ParCSRMatrixRowStarts(RAP_local));
+    hypre_ParVectorInitialize(F_array_local[lev_local]);
+    hypre_ParVectorSetPartitioningOwner(F_array_local[lev_local], 0);
+
+    U_array_local[lev_local] = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(RAP_local),
+                                          hypre_ParCSRMatrixGlobalNumRows(RAP_local),
+                                          hypre_ParCSRMatrixRowStarts(RAP_local));
+    hypre_ParVectorInitialize(U_array_local[lev_local]);
+    hypre_ParVectorSetPartitioningOwner(U_array_local[lev_local], 0);        
+  
+  } // end while loop
+  
+  // setup Vcycle data
+  (FrelaxVcycleData[lev] -> A_array) = A_array_local;
+  (FrelaxVcycleData[lev] -> P_array) = P_array_local;
+  (FrelaxVcycleData[lev] -> F_array) = F_array_local;
+  (FrelaxVcycleData[lev] -> U_array) = U_array_local;
+  (FrelaxVcycleData[lev] -> CF_marker_array) = CF_marker_array_local;
+  (FrelaxVcycleData[lev] -> num_levels) = lev_local+1;
+
+  if(lev_local > 1)
+    hypre_GaussElimSetup(FrelaxVcycleData[lev], lev_local, ge_relax_type);
+
+  return hypre_error_flag;
+}
diff -Nru hypre-2.11.2/src/parcsr_ls/par_mgr_solve.c hypre-2.13.0/src/parcsr_ls/par_mgr_solve.c
--- hypre-2.11.2/src/parcsr_ls/par_mgr_solve.c	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/par_mgr_solve.c	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,581 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+
+
+
+
+/******************************************************************************
+ *
+ * MGR solve routine
+ *
+ *****************************************************************************/
+#include "_hypre_parcsr_ls.h"
+#include "par_mgr.h"
+#include "par_amg.h"
+
+/*--------------------------------------------------------------------
+ * hypre_MGRSolve
+ *--------------------------------------------------------------------*/
+HYPRE_Int
+hypre_MGRSolve( void               *mgr_vdata,
+                  hypre_ParCSRMatrix *A,
+                  hypre_ParVector    *f,
+                  hypre_ParVector    *u )
+{
+
+   MPI_Comm 	         comm = hypre_ParCSRMatrixComm(A);
+   hypre_ParMGRData   *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+
+   hypre_ParCSRMatrix  **A_array = (mgr_data -> A_array);
+   hypre_ParVector    **F_array = (mgr_data -> F_array);
+   hypre_ParVector    **U_array = (mgr_data -> U_array);
+
+   HYPRE_Real		tol = (mgr_data -> tol);
+   HYPRE_Int		logging = (mgr_data -> logging);
+   HYPRE_Int		print_level = (mgr_data -> print_level);
+   HYPRE_Int		max_iter = (mgr_data -> max_iter);
+   HYPRE_Real		*norms = (mgr_data -> rel_res_norms);
+   hypre_ParVector     	*Vtemp = (mgr_data -> Vtemp);
+   hypre_ParVector     	*residual;
+
+   HYPRE_Real           alpha = -1;
+   HYPRE_Real           beta = 1;
+   HYPRE_Real           conv_factor = 0.0;
+   HYPRE_Real   	resnorm = 1.0;
+   HYPRE_Real   	init_resnorm = 0.0;
+   HYPRE_Real   	rel_resnorm;
+   HYPRE_Real   	rhs_norm = 0.0;
+   HYPRE_Real   	old_resnorm;
+   HYPRE_Real   	ieee_check = 0.;
+
+   HYPRE_Int		iter, num_procs, my_id;
+   HYPRE_Int		Solve_err_flag;
+
+/*
+   HYPRE_Real   total_coeffs;
+   HYPRE_Real   total_variables;
+   HYPRE_Real   operat_cmplxty;
+   HYPRE_Real   grid_cmplxty;
+*/
+   HYPRE_Solver    	cg_solver = (mgr_data -> coarse_grid_solver);
+   HYPRE_Int		(*coarse_grid_solver_solve)(void*,void*,void*,void*) = (mgr_data -> coarse_grid_solver_solve);
+
+   HYPRE_Int    blk_size  = (mgr_data -> block_size);
+   HYPRE_Real    *diaginv = (mgr_data -> diaginv);
+   HYPRE_Int      n_block = (mgr_data -> n_block);
+   HYPRE_Int    left_size = (mgr_data -> left_size);
+
+   HYPRE_Int    global_smooth_iters      =  (mgr_data -> global_smooth_iters);
+   HYPRE_Int    global_smooth_type =  (mgr_data -> global_smooth_type);
+
+   HYPRE_Int    i;
+
+   if(logging > 1)
+   {
+      residual = (mgr_data -> residual);
+   }
+
+   (mgr_data -> num_iterations) = 0;
+
+   if((mgr_data -> num_coarse_levels) == 0)
+   {
+      /* Do scalar AMG solve when only one level */
+      coarse_grid_solver_solve(cg_solver, A, f, u);
+      HYPRE_BoomerAMGGetNumIterations(cg_solver, &iter);
+      HYPRE_BoomerAMGGetFinalRelativeResidualNorm(cg_solver, &rel_resnorm);
+      (mgr_data -> num_iterations) = iter;
+      (mgr_data -> final_rel_residual_norm) = rel_resnorm;
+      return hypre_error_flag;
+   }
+
+   U_array[0] = u;
+   F_array[0] = f;
+
+   hypre_MPI_Comm_size(comm, &num_procs);
+   hypre_MPI_Comm_rank(comm,&my_id);
+
+   /*-----------------------------------------------------------------------
+    *    Write the solver parameters
+    *-----------------------------------------------------------------------*/
+//   if (my_id == 0 && print_level > 1)
+//      hypre_MGRWriteSolverParams(mgr_data);
+
+   /*-----------------------------------------------------------------------
+    *    Initialize the solver error flag and assorted bookkeeping variables
+    *-----------------------------------------------------------------------*/
+
+   Solve_err_flag = 0;
+/*
+   total_coeffs = 0;
+   total_variables = 0;
+   operat_cmplxty = 0;
+   grid_cmplxty = 0;
+*/
+   /*-----------------------------------------------------------------------
+    *     write some initial info
+    *-----------------------------------------------------------------------*/
+
+   if (my_id == 0 && print_level > 1 && tol > 0.)
+     hypre_printf("\n\nTWO-GRID SOLVER SOLUTION INFO:\n");
+
+
+   /*-----------------------------------------------------------------------
+    *    Compute initial fine-grid residual and print
+    *-----------------------------------------------------------------------*/
+   if (print_level > 1 || logging > 1 || tol > 0.)
+   {
+     if ( logging > 1 ) {
+        hypre_ParVectorCopy(F_array[0], residual );
+        if (tol > 0)
+	   hypre_ParCSRMatrixMatvec(alpha, A_array[0], U_array[0], beta, residual );
+           resnorm = sqrt(hypre_ParVectorInnerProd( residual, residual ));
+     }
+     else {
+        hypre_ParVectorCopy(F_array[0], Vtemp);
+        if (tol > 0)
+           hypre_ParCSRMatrixMatvec(alpha, A_array[0], U_array[0], beta, Vtemp);
+        resnorm = sqrt(hypre_ParVectorInnerProd(Vtemp, Vtemp));
+     }
+
+     /* Since it is does not diminish performance, attempt to return an error flag
+        and notify users when they supply bad input. */
+     if (resnorm != 0.) ieee_check = resnorm/resnorm; /* INF -> NaN conversion */
+     if (ieee_check != ieee_check)
+     {
+        /* ...INFs or NaNs in input can make ieee_check a NaN.  This test
+           for ieee_check self-equality works on all IEEE-compliant compilers/
+           machines, c.f. page 8 of "Lecture Notes on the Status of IEEE 754"
+           by W. Kahan, May 31, 1996.  Currently (July 2002) this paper may be
+           found at http://HTTP.CS.Berkeley.EDU/~wkahan/ieee754status/IEEE754.PDF */
+        if (print_level > 0)
+        {
+          hypre_printf("\n\nERROR detected by Hypre ...  BEGIN\n");
+          hypre_printf("ERROR -- hypre_MGRSolve: INFs and/or NaNs detected in input.\n");
+          hypre_printf("User probably placed non-numerics in supplied A, x_0, or b.\n");
+          hypre_printf("ERROR detected by Hypre ...  END\n\n\n");
+        }
+        hypre_error(HYPRE_ERROR_GENERIC);
+        return hypre_error_flag;
+     }
+
+     init_resnorm = resnorm;
+     rhs_norm = sqrt(hypre_ParVectorInnerProd(f, f));
+     if (rhs_norm)
+     {
+       rel_resnorm = init_resnorm / rhs_norm;
+     }
+     else
+     {
+       /* rhs is zero, return a zero solution */
+       hypre_ParVectorSetConstantValues(U_array[0], 0.0);
+       if(logging > 0)
+       {
+          rel_resnorm = 0.0;
+          (mgr_data -> final_rel_residual_norm) = rel_resnorm;
+       }
+       return hypre_error_flag;
+     }
+   }
+   else
+   {
+     rel_resnorm = 1.;
+   }
+
+   if (my_id == 0 && print_level > 1)
+   {
+      hypre_printf("                                            relative\n");
+      hypre_printf("               residual        factor       residual\n");
+      hypre_printf("               --------        ------       --------\n");
+      hypre_printf("    Initial    %e                 %e\n",init_resnorm,
+              rel_resnorm);
+   }
+   /************** Main Solver Loop - always do 1 iteration ************/
+   iter = 0;
+   while ((rel_resnorm >= tol || iter < 1)
+          && iter < max_iter)
+   {
+     if (global_smooth_iters)
+     {
+	if (global_smooth_type == 0)//block Jacobi smoother
+	{
+	   for (i = 0;i < global_smooth_iters;i ++)
+	  	hypre_block_jacobi(A_array[0],F_array[0],U_array[0],blk_size,n_block,left_size,diaginv,Vtemp);
+	}
+	else if ((global_smooth_type > 0) && (global_smooth_type < 7))
+	{
+           for (i = 0;i < global_smooth_iters;i ++)
+		hypre_BoomerAMGRelax(A_array[0], F_array[0], NULL, global_smooth_type-1, 0, 1.0, 0.0, NULL, U_array[0], Vtemp, NULL);
+	}
+	else if (global_smooth_type == 8)//ILU smoother
+	{
+	    for (i = 0;i < global_smooth_iters;i ++)
+		HYPRE_EuclidSolve( (mgr_data -> global_smoother),A_array[0],F_array[0],U_array[0]);
+        }
+    }
+
+      /* Do one cycle of reduction solve on Ae=r */
+      hypre_MGRCycle(mgr_data, F_array, U_array);
+
+      /*---------------------------------------------------------------
+       *    Compute  fine-grid residual and residual norm
+       *----------------------------------------------------------------*/
+
+      if (print_level > 1 || logging > 1 || tol > 0.)
+      {
+        old_resnorm = resnorm;
+
+        if ( logging > 1 ) {
+           hypre_ParVectorCopy(F_array[0], residual);
+           hypre_ParCSRMatrixMatvec(alpha, A_array[0], U_array[0], beta, residual );
+           resnorm = sqrt(hypre_ParVectorInnerProd( residual, residual ));
+        }
+        else {
+           hypre_ParVectorCopy(F_array[0], Vtemp);
+           hypre_ParCSRMatrixMatvec(alpha, A_array[0], U_array[0], beta, Vtemp);
+           resnorm = sqrt(hypre_ParVectorInnerProd(Vtemp, Vtemp));
+        }
+
+        if (old_resnorm) conv_factor = resnorm / old_resnorm;
+        else conv_factor = resnorm;
+        if (rhs_norm)
+        {
+           rel_resnorm = resnorm / rhs_norm;
+        }
+        else
+        {
+           rel_resnorm = resnorm;
+        }
+
+        norms[iter] = rel_resnorm;
+      }
+
+      ++iter;
+      (mgr_data -> num_iterations) = iter;
+      (mgr_data -> final_rel_residual_norm) = rel_resnorm;
+
+      if (my_id == 0 && print_level > 1)
+      {
+         hypre_printf("    MGRCycle %2d   %e    %f     %e \n", iter,
+                 resnorm, conv_factor, rel_resnorm);
+      }
+   }
+
+   /* check convergence within max_iter */
+   if (iter == max_iter && tol > 0.)
+   {
+      Solve_err_flag = 1;
+      hypre_error(HYPRE_ERROR_CONV);
+   }
+
+   /*-----------------------------------------------------------------------
+    *    Print closing statistics
+    *	 Add operator and grid complexity stats
+    *-----------------------------------------------------------------------*/
+
+   if (iter > 0 && init_resnorm)
+     conv_factor = pow((resnorm/init_resnorm),(1.0/(HYPRE_Real) iter));
+   else
+     conv_factor = 1.;
+
+   if (print_level > 1)
+   {
+      /*** compute operator and grid complexities here ?? ***/
+      if (my_id == 0)
+      {
+         if (Solve_err_flag == 1)
+         {
+            hypre_printf("\n\n==============================================");
+            hypre_printf("\n NOTE: Convergence tolerance was not achieved\n");
+            hypre_printf("      within the allowed %d iterations\n",max_iter);
+            hypre_printf("==============================================");
+         }
+         hypre_printf("\n\n Average Convergence Factor = %f \n",conv_factor);
+         hypre_printf(" Number of coarse levels = %d \n",(mgr_data -> num_coarse_levels));
+//         hypre_printf("\n\n     Complexity:    grid = %f\n",grid_cmplxty);
+//         hypre_printf("                operator = %f\n",operat_cmplxty);
+//         hypre_printf("                   cycle = %f\n\n\n\n",cycle_cmplxty);
+      }
+   }
+
+   return hypre_error_flag;
+}
+
+HYPRE_Int
+hypre_MGRFrelaxVcycle ( void   *Frelax_vdata )
+{
+  hypre_ParAMGData   *Frelax_data = (hypre_ParAMGData*) Frelax_vdata;
+  
+  HYPRE_Int Not_Finished = 1;
+  HYPRE_Int level = 0;
+  HYPRE_Int cycle_param = 1;
+  HYPRE_Int j, Solve_err_flag, local_size, coarse_grid, fine_grid;
+
+  HYPRE_Int num_sweeps = 1;
+  HYPRE_Int relax_order = 1;
+  HYPRE_Int relax_type = 3;
+  HYPRE_Int relax_weight = 1;
+  HYPRE_Int omega = 1;
+
+  hypre_ParVector    **F_array = (Frelax_data) -> F_array;
+  hypre_ParVector    **U_array = (Frelax_data) -> U_array;
+
+  hypre_ParCSRMatrix **A_array = ((Frelax_data) -> A_array);
+  hypre_ParCSRMatrix **R_array = ((Frelax_data) -> P_array);
+  hypre_ParCSRMatrix **P_array = ((Frelax_data) -> P_array);
+  HYPRE_Int          **CF_marker_array = ((Frelax_data) -> CF_marker_array);
+
+  hypre_ParVector *Vtemp = (Frelax_data) -> Vtemp;
+  hypre_ParVector *Ztemp = (Frelax_data) -> Ztemp;
+
+  HYPRE_Int num_c_levels = (Frelax_data) -> num_levels-1;
+
+  hypre_ParVector *Aux_F = NULL;
+  hypre_ParVector *Aux_U = NULL;
+
+  HYPRE_Real alpha, beta;
+
+  while (Not_Finished)
+  {
+    local_size = hypre_VectorSize(hypre_ParVectorLocalVector(F_array[level]));
+    hypre_VectorSize(hypre_ParVectorLocalVector(Vtemp)) = local_size;
+    Aux_F = F_array[level];
+    Aux_U = U_array[level];
+
+    if (cycle_param == 1) {
+      // visiting coarser grids and relax
+      // This also doubles as the smoother used when no
+      // coarsening is done
+      for (j = 0; j < num_sweeps; j++) {
+        Solve_err_flag = hypre_BoomerAMGRelaxIF(A_array[level], 
+                                              Aux_F,
+                                              CF_marker_array[level],
+                                              relax_type,
+                                              relax_order,
+                                              cycle_param,
+                                              relax_weight,
+                                              omega,
+                                              NULL,
+                                              Aux_U,
+                                              Vtemp, 
+                                              Ztemp);
+      }
+      if ((num_c_levels > 0) && (level != num_c_levels))
+      {
+           fine_grid = level;
+           coarse_grid = level + 1;
+
+           hypre_ParVectorSetConstantValues(U_array[coarse_grid], 0.0); 
+            
+           alpha = -1.0;
+           beta = 1.0;
+
+           // JSP: avoid unnecessary copy using out-of-place version of SpMV
+           hypre_ParCSRMatrixMatvecOutOfPlace(alpha, A_array[fine_grid], U_array[fine_grid],
+                                              beta, F_array[fine_grid], Vtemp);
+
+           alpha = 1.0;
+           beta = 0.0;
+           hypre_ParCSRMatrixMatvecT(alpha,R_array[fine_grid],Vtemp,
+                                        beta,F_array[coarse_grid]);
+
+           ++level;
+           cycle_param = 1;
+           if (level == num_c_levels) cycle_param = 3;
+      }
+    } else if (cycle_param == 3) {
+      // solve the coarsest grid with Gaussian elimination
+      hypre_GaussElimSolve(Frelax_data, level, 9);
+      cycle_param = 2;
+    } else if (cycle_param == 2) {
+        /*---------------------------------------------------------------
+        * Visit finer level next.
+        * Interpolate and add correction using hypre_ParCSRMatrixMatvec.
+        * Reset counters and cycling parameters for finer level.
+        *--------------------------------------------------------------*/
+
+        fine_grid = level - 1;
+        coarse_grid = level;
+        alpha = 1.0;
+        beta = 1.0;
+
+        hypre_ParCSRMatrixMatvec(alpha, P_array[fine_grid], 
+                                 U_array[coarse_grid],
+                                 beta, U_array[fine_grid]);            
+
+        --level;
+        cycle_param = 2;
+        if (level == 0) cycle_param = 99;
+    } else {
+      Not_Finished = 0;
+    }
+  }
+
+  return Solve_err_flag;
+}
+
+HYPRE_Int
+hypre_MGRCycle( void               *mgr_vdata,
+                  hypre_ParVector    **F_array,
+                  hypre_ParVector    **U_array )
+{
+//   MPI_Comm 	         comm;
+   hypre_ParMGRData   *mgr_data = (hypre_ParMGRData*) mgr_vdata;
+
+   HYPRE_Int       Solve_err_flag;
+   HYPRE_Int       level;
+   HYPRE_Int       coarse_grid;
+   HYPRE_Int       fine_grid;
+   HYPRE_Int       Not_Finished;
+   HYPRE_Int	   cycle_type;
+
+   hypre_ParCSRMatrix  	**A_array = (mgr_data -> A_array);
+   hypre_ParCSRMatrix  	**RT_array  = (mgr_data -> RT_array);
+   hypre_ParCSRMatrix  	**P_array   = (mgr_data -> P_array);
+   hypre_ParCSRMatrix  	*RAP = (mgr_data -> RAP);
+   HYPRE_Solver    	cg_solver = (mgr_data -> coarse_grid_solver);
+   HYPRE_Int		(*coarse_grid_solver_solve)(void*, void*, void*, void*) = (mgr_data -> coarse_grid_solver_solve);
+
+   HYPRE_Int           	**CF_marker = (mgr_data -> CF_marker_array);
+   HYPRE_Int            nsweeps = (mgr_data -> num_relax_sweeps);
+   HYPRE_Int            relax_type = (mgr_data -> relax_type);
+   HYPRE_Real           relax_weight = (mgr_data -> relax_weight);
+   HYPRE_Real           omega = (mgr_data -> omega);
+   HYPRE_Real          	**relax_l1_norms = (mgr_data -> l1_norms);
+   hypre_ParVector     	*Vtemp = (mgr_data -> Vtemp);
+   hypre_ParVector     	*Ztemp = (mgr_data -> Ztemp);
+
+   HYPRE_Int            i, relax_points;
+   HYPRE_Int           	num_coarse_levels = (mgr_data -> num_coarse_levels);
+
+   HYPRE_Real    alpha;
+   HYPRE_Real    beta;
+
+  HYPRE_Int            Frelax_method = (mgr_data -> Frelax_method);   
+  hypre_ParAMGData    **FrelaxVcycleData = (mgr_data -> FrelaxVcycleData);   
+
+   /* Initialize */
+//   comm = hypre_ParCSRMatrixComm(A_array[0]);
+   Solve_err_flag = 0;
+   Not_Finished = 1;
+   cycle_type = 1;
+   level = 0;
+
+   /***** Main loop ******/
+   while (Not_Finished)
+   {
+
+	   /* Do coarse grid correction solve */
+	   if(cycle_type == 3)
+	   {
+		   /* call coarse grid solver here */
+	           /* default is BoomerAMG */
+		   coarse_grid_solver_solve(cg_solver, RAP, F_array[level], U_array[level]);
+		   /**** cycle up ***/
+		   cycle_type = 2;
+	   }
+	   /* F-relaxation */
+	   else if(cycle_type == 1)
+	   {
+
+		   fine_grid = level;
+		   coarse_grid = level + 1;
+		   /* Relax solution - F-relaxation */
+		   relax_points = -1;
+
+                   if (Frelax_method == 0) 
+                   { /* (single level) relaxation for A_ff */
+		      if (relax_type == 18)
+		      {
+		         hypre_ParCSRRelax_L1_Jacobi(A_array[fine_grid], F_array[fine_grid], CF_marker[fine_grid],
+						   relax_points, relax_weight, relax_l1_norms[fine_grid],
+		   				   U_array[fine_grid], Vtemp);
+		      }
+		      else if(relax_type == 8 || relax_type == 13 || relax_type == 14)
+		      {
+		         hypre_BoomerAMGRelax(A_array[fine_grid], F_array[fine_grid], CF_marker[fine_grid],
+		   						relax_type, relax_points, relax_weight,
+		   						omega, relax_l1_norms[fine_grid], U_array[fine_grid], Vtemp, Ztemp);
+		      }
+		      else
+		      {
+		        for(i=0; i<nsweeps; i++)
+			   Solve_err_flag = hypre_BoomerAMGRelax(A_array[fine_grid], F_array[fine_grid], CF_marker[fine_grid],
+			                 relax_type,relax_points,relax_weight,omega,NULL, U_array[fine_grid], Vtemp, Ztemp);
+		      }
+                   }
+                   else if (Frelax_method == 1) 
+      		   {
+                      /* v-cycle smoother for A_ff */
+                      hypre_MGRFrelaxVcycle(FrelaxVcycleData[level]);
+                   } 
+                   else 
+                   {
+                      for (i=0; i<nsweeps; i++)
+                            Solve_err_flag = hypre_BoomerAMGRelax(A_array[fine_grid], F_array[fine_grid], CF_marker[fine_grid],
+                            relax_type, relax_points, relax_weight,
+                            omega, NULL, U_array[fine_grid], Vtemp, Ztemp);
+                   }
+		   // Update residual and compute coarse-grid rhs
+		   // hypre_ParVectorCopy(F_array[fine_grid],Vtemp);
+		   alpha = -1.0;
+		   beta = 1.0;
+
+		  // hypre_ParCSRMatrixMatvec(alpha, A_array[fine_grid], U_array[fine_grid],
+		  //							beta, Vtemp);
+		   hypre_ParCSRMatrixMatvecOutOfPlace(alpha, A_array[fine_grid], U_array[fine_grid],
+                                         beta, F_array[fine_grid], Vtemp);
+
+		   alpha = 1.0;
+		   beta = 0.0;
+
+		   hypre_ParCSRMatrixMatvecT(alpha,RT_array[fine_grid],Vtemp,
+									 beta,F_array[coarse_grid]);
+
+		   // initialize coarse grid solution array
+		   hypre_ParVectorSetConstantValues(U_array[coarse_grid], 0.0);
+
+		   ++level;
+
+		   if (level == num_coarse_levels) cycle_type = 3;
+	   }
+	   else if(level != 0)
+	   {
+		   /* Interpolate */
+
+		   fine_grid = level - 1;
+		   coarse_grid = level;
+		   alpha = 1.0;
+		   beta = 1.0;
+
+		   hypre_ParCSRMatrixMatvec(alpha, P_array[fine_grid],
+									U_array[coarse_grid],
+									beta, U_array[fine_grid]);
+
+		   /* post relaxation sweeps */
+//		   for(i=0; i<nsweeps; i++)
+//			Solve_err_flag = hypre_BoomerAMGRelax(A_array[fine_grid], F_array[fine_grid], CF_marker[fine_grid],
+//			                 relax_type, relax_points, relax_weight, omega, NULL, U_array[fine_grid], Vtemp, Ztemp);
+
+		   if (Solve_err_flag != 0)
+			   return(Solve_err_flag);
+
+		   --level;
+
+	   }
+	   else
+	   {
+		   Not_Finished = 0;
+	   }
+   }
+   return Solve_err_flag;
+}
diff -Nru hypre-2.11.2/src/parcsr_ls/par_relax.c hypre-2.13.0/src/parcsr_ls/par_relax.c
--- hypre-2.11.2/src/parcsr_ls/par_relax.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/par_relax.c	2017-10-20 17:42:22.000000000 +0000
@@ -18,20 +18,7 @@
 
 #include "_hypre_parcsr_ls.h"
 #include "Common.h"
-
-#ifdef HYPRE_USING_ESSL
-#include <essl.h>
-#else
-/* RDF: This needs to be integrated with the hypre blas/lapack stuff */
-#ifdef __cplusplus
-extern "C" {
-#endif
-HYPRE_Int hypre_F90_NAME_LAPACK(dgetrf, DGETRF) (HYPRE_Int *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *, HYPRE_Int *);
-HYPRE_Int hypre_F90_NAME_LAPACK(dgetrs, DGETRS) (const char *, HYPRE_Int *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *, HYPRE_Real *b, HYPRE_Int*, HYPRE_Int *);
-#ifdef __cplusplus
-}
-#endif
-#endif
+#include "_hypre_lapack.h"
 
 /*--------------------------------------------------------------------------
  * hypre_BoomerAMGRelax
@@ -2368,27 +2355,31 @@
          /*-----------------------------------------------------------------
           * Copy f into temporary vector.
           *-----------------------------------------------------------------*/
-
+         PUSH_RANGE("RELAX",4);
+#ifdef HYPRE_USE_GPU
+           hypre_SeqVectorPrefetchToDevice(hypre_ParVectorLocalVector(Vtemp));
+           hypre_SeqVectorPrefetchToDevice(hypre_ParVectorLocalVector(f));
+         VecCopy(Vtemp_data,f_data,hypre_VectorSize(hypre_ParVectorLocalVector(Vtemp)),HYPRE_STREAM(4));
+#else
          hypre_ParVectorCopy(f,Vtemp);
-
+#endif 
          /*-----------------------------------------------------------------
           * Perform Matvec Vtemp=f-Au
           *-----------------------------------------------------------------*/
 
-            hypre_ParCSRMatrixMatvec(-1.0,A, u, 1.0, Vtemp);
+            hypre_ParCSRMatrixMatvec(-relax_weight,A, u, relax_weight, Vtemp);
+#ifdef HYPRE_USE_GPU
+         VecScale(u_data,Vtemp_data,l1_norms,n,HYPRE_STREAM(4));
+#else
             for (i = 0; i < n; i++)
             {
-
                /*-----------------------------------------------------------
                 * If diagonal is nonzero, relax point i; otherwise, skip it.
                 *-----------------------------------------------------------*/
-
-               if (A_diag_data[A_diag_i[i]] != zero)
-               {
-                  u_data[i] += relax_weight * Vtemp_data[i]
-                                / A_diag_data[A_diag_i[i]];
-               }
+                  u_data[i] += Vtemp_data[i] / l1_norms[i];
             }
+#endif
+         POP_RANGE;
       }
       break;
 
@@ -4131,28 +4122,13 @@
 
             piv = hypre_CTAlloc(HYPRE_Int, n_global);
 
-           /* write over A with LU */
-#ifdef HYPRE_USING_ESSL
-            dgetrf(n_global, n_global, A_mat, n_global, piv, &info);
-
-#else
-            hypre_F90_NAME_LAPACK(dgetrf, DGETRF)(&n_global, &n_global, 
-                                             A_mat, &n_global, piv, &info);
-#endif
+            /* write over A with LU */
+            hypre_dgetrf(&n_global, &n_global, A_mat, &n_global, piv, &info);
             
-           /*now b_vec = inv(A)*b_vec  */
-#ifdef HYPRE_USING_ESSL
-            dgetrs("N", n_global, &one_i, A_mat, 
-                   n_global, piv, b_vec, 
-                   n_global, &info);
+            /*now b_vec = inv(A)*b_vec  */
+            hypre_dgetrs("N", &n_global, &one_i, A_mat, &n_global, piv, b_vec, &n_global, &info);
 
-#else
-            hypre_F90_NAME_LAPACK(dgetrs, DGETRS)("N", &n_global, &one_i, A_mat, 
-                                             &n_global, piv, b_vec, 
-                                             &n_global, &info);
-#endif
             hypre_TFree(piv);
-            
 
             for (i = 0; i < n; i++)
             {
@@ -4336,25 +4312,11 @@
          piv = hypre_CTAlloc(HYPRE_Int, n_global);
 
          /* write over A with LU */
-#ifdef HYPRE_USING_ESSL
-         dgetrf(n_global, n_global, A_tmp, n_global, piv, &my_info);
-
-#else
-         hypre_F90_NAME_LAPACK(dgetrf, DGETRF)(&n_global, &n_global, 
-                                      A_tmp, &n_global, piv, &my_info);
-#endif
+         hypre_dgetrf(&n_global, &n_global, A_tmp, &n_global, piv, &my_info);
             
          /*now b_vec = inv(A)*b_vec  */
-#ifdef HYPRE_USING_ESSL
-         dgetrs("N", n_global, &one_i, A_tmp, 
-                     n_global, piv, b_vec, 
-                     n_global, &my_info);
+         hypre_dgetrs("N", &n_global, &one_i, A_tmp, &n_global, piv, b_vec, &n_global, &my_info);
 
-#else
-         hypre_F90_NAME_LAPACK(dgetrs, DGETRS)("N", &n_global, &one_i, A_tmp, 
-                                             &n_global, piv, b_vec, 
-                                             &n_global, &my_info);
-#endif
          hypre_TFree(piv);
       }
       for (i = 0; i < n; i++)
@@ -4372,7 +4334,7 @@
    return hypre_error_flag;
 }
 
-
+HYPRE_CUDA_GLOBAL
 HYPRE_Int gselim(HYPRE_Real *A,
                  HYPRE_Real *x,
                  HYPRE_Int n)
diff -Nru hypre-2.11.2/src/parcsr_ls/par_relax_more.c hypre-2.13.0/src/parcsr_ls/par_relax_more.c
--- hypre-2.11.2/src/parcsr_ls/par_relax_more.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/par_relax_more.c	2017-10-20 17:42:22.000000000 +0000
@@ -26,52 +26,61 @@
                               
    HYPRE_Real e_max;
    HYPRE_Real row_sum, max_norm;
-   HYPRE_Real *col_val;
+   HYPRE_Real *A_diag_data;
+   HYPRE_Real *A_offd_data;
    HYPRE_Real temp;
    HYPRE_Real diag_value;
 
    HYPRE_Int   pos_diag, neg_diag;
-   HYPRE_Int   start_row, end_row;
-   HYPRE_Int   row_length;
-   HYPRE_Int *col_ind;
+   HYPRE_Int  A_num_rows;
+   HYPRE_Int *A_diag_i;
+   HYPRE_Int *A_offd_i;
    HYPRE_Int   j;
-   HYPRE_Int i;
+   HYPRE_Int i, start;
    
 
    /* estimate with the inf-norm of A - should be ok for SPD matrices */
 
-   start_row  = hypre_ParCSRMatrixFirstRowIndex(A);
-   end_row    =  hypre_ParCSRMatrixLastRowIndex(A);
+   A_num_rows  =  hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A));
+   A_diag_i    =  hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(A));
+   A_diag_data =  hypre_CSRMatrixData(hypre_ParCSRMatrixDiag(A));
+   A_offd_i    =  hypre_CSRMatrixI(hypre_ParCSRMatrixOffd(A));
+   A_offd_data =  hypre_CSRMatrixData(hypre_ParCSRMatrixOffd(A));
     
    max_norm = 0.0;
 
    pos_diag = neg_diag = 0;
  
-   for ( i = start_row; i <= end_row; i++ )
+   for ( i = 0; i < A_num_rows; i++ )
    {
-      HYPRE_ParCSRMatrixGetRow((HYPRE_ParCSRMatrix) A, i, &row_length, &col_ind, &col_val);
-
-      row_sum = 0.0;
-
-      for (j = 0; j < row_length; j++)
+      start = A_diag_i[i];
+      diag_value = A_diag_data[start];
+      if (diag_value > 0) 
       {
-         if (j==0) diag_value = fabs(col_val[j]);
-     
-         row_sum += fabs(col_val[j]);
+         pos_diag++;
+      }
+      if (diag_value < 0) 
+      {
+         neg_diag++;
+         diag_value = -diag_value;
+      }
+      row_sum = diag_value;
 
-         if ( col_ind[j] == i && col_val[j] > 0.0 ) pos_diag++;
-         if ( col_ind[j] == i && col_val[j] < 0.0 ) neg_diag++;
+      /*for (j = 0; j < row_length; j++)*/
+      for (j = start+1; j < A_diag_i[i+1]; j++)
+      {
+         row_sum += fabs(A_diag_data[j]);
+      }
+      for (j = A_offd_i[i]; j < A_offd_i[i+1]; j++)
+      {
+         row_sum += fabs(A_offd_data[j]);
       }
       if (scale)
       {
          if (diag_value != 0.0)
             row_sum = row_sum/diag_value;
       }
-      
-
       if ( row_sum > max_norm ) max_norm = row_sum;
-
-      HYPRE_ParCSRMatrixRestoreRow((HYPRE_ParCSRMatrix) A, i, &row_length, &col_ind, &col_val);
    }
 
    /* get max across procs */
diff -Nru hypre-2.11.2/src/parcsr_ls/par_stats.c hypre-2.13.0/src/parcsr_ls/par_stats.c
--- hypre-2.11.2/src/parcsr_ls/par_stats.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/par_stats.c	2017-10-20 17:42:22.000000000 +0000
@@ -155,6 +155,7 @@
    HYPRE_Int additive;
    HYPRE_Int mult_additive;
    HYPRE_Int simple;
+   HYPRE_Int add_end;
    HYPRE_Int add_rlx;
    HYPRE_Real add_rlx_wt;
  
@@ -175,10 +176,10 @@
    additive = hypre_ParAMGDataAdditive(amg_data);
    mult_additive = hypre_ParAMGDataMultAdditive(amg_data);
    simple = hypre_ParAMGDataSimple(amg_data);
+   add_end = hypre_ParAMGDataAddLastLvl(amg_data);
    add_rlx = hypre_ParAMGDataAddRelaxType(amg_data);
    add_rlx_wt = hypre_ParAMGDataAddRelaxWt(amg_data);
 
-
    A_block_array = hypre_ParAMGDataABlockArray(amg_data);
    P_block_array = hypre_ParAMGDataPBlockArray(amg_data);
 
@@ -923,19 +924,40 @@
 
       if (additive == 0 || mult_additive == 0 || simple == 0)
       {
+         HYPRE_Int add_lvl = add_end;
+         if (add_end == -1) add_lvl = num_levels-1;
          if (additive > -1)
-            hypre_printf( "  Additive V-cycle starting at level %d  \n", additive);
+            hypre_printf( "  Additive V-cycle 1st level %d last level %d: \n", additive, add_lvl);
          if (mult_additive > -1)
-            hypre_printf( "  Mult-Additive V-cycle starting at level %d  \n", mult_additive);
+            hypre_printf( "  Mult-Additive V-cycle 1st level %d last level %d: \n", mult_additive, add_lvl);
          if (simple > -1)
-            hypre_printf( "  Simplified Mult-Additive V-cycle starting at level %d  \n", simple);
-         hypre_printf( "\n");
+            hypre_printf( "  Simplified Mult-Additive V-cycle 1st level %d: last level %d \n", simple, add_lvl);
          hypre_printf( "  Relaxation Parameters:\n");
-         hypre_printf( "   Visiting Grid:                     down   up  coarse\n");
-         hypre_printf( "            Number of sweeps:         %4d   %2d  %4d \n",
+         if (add_lvl == num_levels-1)
+         {
+            hypre_printf( "   Visiting Grid:                     down   up  coarse\n");
+            hypre_printf( "            Number of sweeps:         %4d   %2d  %4d \n",
               num_grid_sweeps[1],
-              num_grid_sweeps[2],(2*num_grid_sweeps[3]));
-         hypre_printf( "   Type 0=Jac, 3=hGS, 6=hSGS, 9=GE:    %2d   %2d   %2d \n", add_rlx, add_rlx, add_rlx);
+              num_grid_sweeps[1],(2*num_grid_sweeps[1]));
+            hypre_printf( "   Type 0=Jac, 3=hGS, 6=hSGS, 9=GE:    %2d   %2d   %2d \n", add_rlx, add_rlx, add_rlx);
+         }
+         else
+         {
+            hypre_printf( "   Visiting Grid:                     down   up\n");
+            hypre_printf( "            Number of sweeps:         %4d   %2d\n",
+              num_grid_sweeps[1], num_grid_sweeps[1]);
+            hypre_printf( "   Type 0=Jac, 3=hGS, 6=hSGS, 9=GE:    %2d   %2d\n", add_rlx, add_rlx);
+         }
+         if (add_lvl < num_levels -1)
+         {
+            hypre_printf( " \n");
+            hypre_printf( "Multiplicative portion: \n");
+            hypre_printf( "   Visiting Grid:                     down   up  coarse\n");
+            hypre_printf( "            Number of sweeps:         %4d   %2d  %4d\n",
+              num_grid_sweeps[1], num_grid_sweeps[2], num_grid_sweeps[3]);
+            hypre_printf( "   Type 0=Jac, 3=hGS, 6=hSGS, 9=GE:   %4d   %2d  %4d\n",
+              grid_relax_type[1], grid_relax_type[2], grid_relax_type[3]);
+         }
          if (add_rlx == 0) hypre_printf( "   Relaxation Weight:   %e \n", add_rlx_wt);
          hypre_printf( "   Point types, partial sweeps (1=C, -1=F):\n");
          hypre_printf( "                  Pre-CG relaxation (down):");
@@ -953,12 +975,25 @@
       }
       else if (additive > 0 || mult_additive > 0 || simple > 0)
       {
+         HYPRE_Int add_lvl = add_end;
+         if (add_end == -1) add_lvl = num_levels-1;
          hypre_printf( "  Relaxation Parameters:\n");
-         hypre_printf( "   Visiting Grid:                     down   up  \n");
-         hypre_printf( "            Number of sweeps:         %4d   %2d  \n",
+         if (add_lvl < num_levels -1)
+         {
+            hypre_printf( "   Visiting Grid:                     down   up  coarse\n");
+            hypre_printf( "            Number of sweeps:         %4d   %2d  %4d\n",
+              num_grid_sweeps[1], num_grid_sweeps[2], num_grid_sweeps[3]);
+            hypre_printf( "   Type 0=Jac, 3=hGS, 6=hSGS, 9=GE:   %4d   %2d  %4d\n",
+              grid_relax_type[1], grid_relax_type[2], grid_relax_type[3]);
+         }
+         else
+         {
+            hypre_printf( "   Visiting Grid:                     down   up  \n");
+            hypre_printf( "            Number of sweeps:         %4d   %2d  \n",
               num_grid_sweeps[1], num_grid_sweeps[2]);
-         hypre_printf( "   Type 0=Jac, 3=hGS, 6=hSGS, 9=GE:   %4d   %2d  \n",
+            hypre_printf( "   Type 0=Jac, 3=hGS, 6=hSGS, 9=GE:   %4d   %2d  \n",
               grid_relax_type[1], grid_relax_type[2]);
+         }
          hypre_printf( "   Point types, partial sweeps (1=C, -1=F):\n");
          if (grid_relax_points && grid_relax_type[1] != 8)
          {
@@ -995,18 +1030,27 @@
          }
          hypre_printf( "\n\n");
          if (additive > -1)
-            hypre_printf( "  Additive V-cycle starting at level %d  \n", additive);
+            hypre_printf( "  Additive V-cycle 1st level %d last level %d:  \n", additive, add_lvl);
          if (mult_additive > -1)
-            hypre_printf( "  Mult-Additive V-cycle starting at level %d  \n", mult_additive);
+            hypre_printf( "  Mult-Additive V-cycle 1st level %d last level %d: \n", mult_additive, add_lvl);
          if (simple > -1)
-            hypre_printf( "  Simplified Mult-Additive V-cycle starting at level %d  \n", simple);
-         hypre_printf( "\n");
+            hypre_printf( "  Simplified Mult-Additive V-cycle 1st level %d: last level %d  \n", simple, add_lvl);
          hypre_printf( "  Relaxation Parameters:\n");
-         hypre_printf( "   Visiting Grid:                     down   up  coarse\n");
-         hypre_printf( "            Number of sweeps:         %4d   %2d  %4d \n",
+         if (add_lvl == num_levels-1)
+         {
+            hypre_printf( "   Visiting Grid:                     down   up  coarse\n");
+            hypre_printf( "            Number of sweeps:         %4d   %2d  %4d \n",
               num_grid_sweeps[1],
-              num_grid_sweeps[2],(2*num_grid_sweeps[3]));
-         hypre_printf( "   Type 0=Jac, 3=hGS, 6=hSGS, 9=GE:    %2d   %2d   %2d \n", add_rlx, add_rlx, add_rlx);
+              num_grid_sweeps[1],(2*num_grid_sweeps[1]));
+            hypre_printf( "   Type 0=Jac, 3=hGS, 6=hSGS, 9=GE:    %2d   %2d   %2d \n", add_rlx, add_rlx, add_rlx);
+         }
+         else
+         {
+            hypre_printf( "   Visiting Grid:                     down   up\n");
+            hypre_printf( "            Number of sweeps:         %4d   %2d\n",
+              num_grid_sweeps[1], num_grid_sweeps[1]);
+            hypre_printf( "   Type 0=Jac, 3=hGS, 6=hSGS, 9=GE:    %2d   %2d\n", add_rlx, add_rlx);
+         }
          if (add_rlx == 0) hypre_printf( "   Relaxation Weight:   %e \n", add_rlx_wt);
          hypre_printf( "   Point types, partial sweeps (1=C, -1=F):\n");
          hypre_printf( "                  Pre-CG relaxation (down):");
diff -Nru hypre-2.11.2/src/parcsr_ls/par_strength.c hypre-2.13.0/src/parcsr_ls/par_strength.c
--- hypre-2.11.2/src/parcsr_ls/par_strength.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/par_strength.c	2017-10-20 17:42:22.000000000 +0000
@@ -533,6 +533,396 @@
    return (ierr);
 }
 
+/* 
+   Create Strength matrix from CF marker array data. Provides a more 
+   general form to build S for specific nodes of the 'global' matrix
+   (for example, F points or A_FF part), given the entire matrix. 
+   These nodes have the SMRK tag.
+   
+   Currently assumes num_functions == 1, hence separate routine is used 
+   for now. Could possibly be merged with BoomerAMGCreateS() to yield a 
+   more general function.
+ */
+HYPRE_Int
+hypre_BoomerAMGCreateSFromCFMarker(hypre_ParCSRMatrix    *A,
+                          HYPRE_Real            strength_threshold,
+                          HYPRE_Real            max_row_sum,
+                          HYPRE_Int             *CF_marker,
+                          HYPRE_Int		SMRK,
+                          hypre_ParCSRMatrix    **S_ptr)
+{
+#ifdef HYPRE_PROFILE
+   hypre_profile_times[HYPRE_TIMER_ID_CREATES] -= hypre_MPI_Wtime();
+#endif
+
+   MPI_Comm          comm            = hypre_ParCSRMatrixComm(A);
+   hypre_ParCSRCommPkg     *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
+   hypre_CSRMatrix    *A_diag          = hypre_ParCSRMatrixDiag(A);
+   HYPRE_Int          *A_diag_i        = hypre_CSRMatrixI(A_diag);
+   HYPRE_Real         *A_diag_data     = hypre_CSRMatrixData(A_diag);
+
+
+   hypre_CSRMatrix    *A_offd          = hypre_ParCSRMatrixOffd(A);
+   HYPRE_Int                *A_offd_i        = hypre_CSRMatrixI(A_offd);
+   HYPRE_Real         *A_offd_data = NULL;
+   HYPRE_Int                *A_diag_j        = hypre_CSRMatrixJ(A_diag);
+   HYPRE_Int                *A_offd_j        = hypre_CSRMatrixJ(A_offd);
+
+   HYPRE_Int          *row_starts      = hypre_ParCSRMatrixRowStarts(A);
+   HYPRE_Int                 num_variables   = hypre_CSRMatrixNumRows(A_diag);
+   HYPRE_Int                 global_num_vars = hypre_ParCSRMatrixGlobalNumRows(A);
+   HYPRE_Int           num_nonzeros_diag;
+   HYPRE_Int           num_nonzeros_offd = 0;
+   HYPRE_Int           num_cols_offd = 0;
+                  
+   hypre_ParCSRMatrix *S;
+   hypre_CSRMatrix    *S_diag;
+   HYPRE_Int                *S_diag_i;
+   HYPRE_Int                *S_diag_j;
+   /* HYPRE_Real         *S_diag_data; */
+   hypre_CSRMatrix    *S_offd;
+   HYPRE_Int                *S_offd_i = NULL;
+   HYPRE_Int                *S_offd_j = NULL;
+   /* HYPRE_Real         *S_offd_data; */
+                 
+   HYPRE_Real          diag, row_scale, row_sum;
+   HYPRE_Int                 i, jj, jA, jS;
+                      
+   HYPRE_Int                 ierr = 0;
+
+   HYPRE_Int *prefix_sum_workspace;
+   
+   /*--------------------------------------------------------------
+    * Compute a  ParCSR strength matrix, S.
+    *
+    * For now, the "strength" of dependence/influence is defined in
+    * the following way: i depends on j if
+    *     aij > hypre_max (k != i) aik,    aii < 0
+    * or
+    *     aij < hypre_min (k != i) aik,    aii >= 0
+    * Then S_ij = 1, else S_ij = 0.
+    *
+    * NOTE: the entries are negative initially, corresponding
+    * to "unaccounted-for" dependence.
+    *----------------------------------------------------------------*/
+
+   num_nonzeros_diag = A_diag_i[num_variables];
+   num_cols_offd = hypre_CSRMatrixNumCols(A_offd);
+
+   A_offd_i = hypre_CSRMatrixI(A_offd);
+   num_nonzeros_offd = A_offd_i[num_variables];
+
+   S = hypre_ParCSRMatrixCreate(comm, global_num_vars, global_num_vars,
+      row_starts, row_starts,
+      num_cols_offd, num_nonzeros_diag, num_nonzeros_offd);
+/* row_starts is owned by A, col_starts = row_starts */
+   hypre_ParCSRMatrixSetRowStartsOwner(S,0);
+   S_diag = hypre_ParCSRMatrixDiag(S);
+   hypre_CSRMatrixI(S_diag) = hypre_CTAlloc(HYPRE_Int, num_variables+1);
+   hypre_CSRMatrixJ(S_diag) = hypre_CTAlloc(HYPRE_Int, num_nonzeros_diag);
+   S_offd = hypre_ParCSRMatrixOffd(S);
+   hypre_CSRMatrixI(S_offd) = hypre_CTAlloc(HYPRE_Int, num_variables+1);
+
+   S_diag_i = hypre_CSRMatrixI(S_diag);
+   HYPRE_Int *S_temp_diag_j = hypre_CSRMatrixJ(S_diag);
+   S_offd_i = hypre_CSRMatrixI(S_offd);
+
+   S_diag_j = hypre_CTAlloc(HYPRE_Int, num_nonzeros_diag);
+/*
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
+#endif
+   for (i = 0; i < num_nonzeros_diag; i++)
+      S_diag_j[i] = 0;
+*/
+
+   HYPRE_Int *S_temp_offd_j = NULL;
+
+   if (num_cols_offd)
+   {
+        A_offd_data = hypre_CSRMatrixData(A_offd);
+        hypre_CSRMatrixJ(S_offd) = hypre_CTAlloc(HYPRE_Int, num_nonzeros_offd);
+        S_temp_offd_j = hypre_CSRMatrixJ(S_offd);
+        HYPRE_Int *col_map_offd_S = hypre_TAlloc(HYPRE_Int, num_cols_offd);
+        hypre_ParCSRMatrixColMapOffd(S) = col_map_offd_S;
+
+        S_offd_j = hypre_CTAlloc(HYPRE_Int, num_nonzeros_offd);
+/*
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
+#endif
+        for (i = 0; i < num_nonzeros_offd; i++)
+          S_offd_j[i] = 0;
+*/
+
+        HYPRE_Int *col_map_offd_A = hypre_ParCSRMatrixColMapOffd(A);
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
+#endif
+        for (i = 0; i < num_cols_offd; i++)
+           col_map_offd_S[i] = col_map_offd_A[i];
+   }
+
+
+  /*-------------------------------------------------------------------
+    * Get the dof_func data for the off-processor columns
+    *-------------------------------------------------------------------*/
+
+   if (!comm_pkg)
+   {
+  hypre_MatvecCommPkgCreate(A);
+
+  comm_pkg = hypre_ParCSRMatrixCommPkg(A); 
+   }
+
+   /*HYPRE_Int prefix_sum_workspace[2*(hypre_NumThreads() + 1)];*/
+   prefix_sum_workspace = hypre_TAlloc(HYPRE_Int, 2*(hypre_NumThreads() + 1));
+
+   /* give S same nonzero structure as A */
+
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel private(i,diag,row_scale,row_sum,jA,jS)
+#endif
+   {
+   HYPRE_Int start, stop;
+   hypre_GetSimpleThreadPartition(&start, &stop, num_variables);
+   HYPRE_Int jS_diag = 0, jS_offd = 0;
+
+   for (i = start; i < stop; i++)
+   {
+    if (CF_marker[i] == SMRK) {
+      S_diag_i[i] = jS_diag;
+      if (num_cols_offd)
+      {
+         S_offd_i[i] = jS_offd;
+      }
+
+      diag = A_diag_data[A_diag_i[i]];
+
+      /* compute scaling factor and row sum */
+      row_scale = 0.0;
+      row_sum = diag;
+      if (diag < 0)
+      {
+        for (jA = A_diag_i[i]+1; jA < A_diag_i[i+1]; jA++)
+        {
+          jj = A_diag_j[jA];
+          if (CF_marker[jj] == SMRK) {
+            row_scale = hypre_max(row_scale, A_diag_data[jA]);
+            row_sum += A_diag_data[jA];
+          }
+        }
+        for (jA = A_offd_i[i]; jA < A_offd_i[i+1]; jA++)
+        {
+          jj = A_offd_j[jA];
+          if (CF_marker[jj] == SMRK) {
+            row_scale = hypre_max(row_scale, A_offd_data[jA]);
+            row_sum += A_offd_data[jA];
+          }
+        }
+      }
+      else
+      {
+        for (jA = A_diag_i[i]+1; jA < A_diag_i[i+1]; jA++)
+        {
+          jj = A_diag_j[jA];
+          if (CF_marker[jj] == SMRK) {
+            row_scale = hypre_min(row_scale, A_diag_data[jA]);
+            row_sum += A_diag_data[jA];
+          } 
+        }
+        for (jA = A_offd_i[i]; jA < A_offd_i[i+1]; jA++)
+        {
+          jj = A_offd_j[jA];
+          if (CF_marker[jj] == SMRK) {
+            row_scale = hypre_min(row_scale, A_offd_data[jA]);
+            row_sum += A_offd_data[jA];
+          }
+        }
+      } /* diag >= 0*/
+
+      jS_diag += A_diag_i[i + 1] - A_diag_i[i] - 1;
+      jS_offd += A_offd_i[i + 1] - A_offd_i[i];
+
+      /* compute row entries of S */
+      S_temp_diag_j[A_diag_i[i]] = -1;
+      if ((fabs(row_sum) > fabs(diag)*max_row_sum) && (max_row_sum < 1.0))
+      {
+         /* make all dependencies weak */
+         for (jA = A_diag_i[i]+1; jA < A_diag_i[i+1]; jA++)
+         {
+            jj = A_diag_j[jA];
+              S_temp_diag_j[jA] = -1;
+         }
+         jS_diag -= A_diag_i[i + 1] - (A_diag_i[i] + 1);
+
+         for (jA = A_offd_i[i]; jA < A_offd_i[i+1]; jA++)
+         {
+              S_temp_offd_j[jA] = -1;
+         }
+         jS_offd -= A_offd_i[i + 1] - A_offd_i[i];
+      }
+      else
+      {
+        if (diag < 0) 
+        { 
+          for (jA = A_diag_i[i]+1; jA < A_diag_i[i+1]; jA++)
+          {
+            jj = A_diag_j[jA];
+            if (CF_marker[jj] == SMRK) {
+              if (A_diag_data[jA] <= strength_threshold * row_scale)
+              {
+                 S_temp_diag_j[jA] = -1;
+                 --jS_diag;
+              }
+              else
+              {
+                 S_temp_diag_j[jA] = jj;
+              }
+            } else {
+              S_temp_diag_j[jA] = -1;
+            }
+          }
+          for (jA = A_offd_i[i]; jA < A_offd_i[i+1]; jA++)
+          {
+            jj = A_offd_j[jA];
+            if (CF_marker[jj] == SMRK) {
+              if (A_offd_data[jA] <= strength_threshold * row_scale)
+              {
+                 S_temp_offd_j[jA] = -1;
+                 --jS_offd;
+              }
+              else
+              {
+                 S_temp_offd_j[jA] = jj;
+              }
+            } else {
+              S_temp_offd_j[jA] = -1;
+            }
+          }
+        } /* diag < 0 */
+        else
+        {
+          for (jA = A_diag_i[i]+1; jA < A_diag_i[i+1]; jA++)
+          {
+            jj = A_diag_j[jA];
+            if (CF_marker[jj] == SMRK) {
+              if (A_diag_data[jA] >= strength_threshold * row_scale)
+              {
+                 S_temp_diag_j[jA] = -1;
+                 --jS_diag;
+              }
+              else
+              {
+                 S_temp_diag_j[jA] = jj;
+              }
+            } else {
+              S_temp_diag_j[jA] = -1;
+            }
+          }
+          for (jA = A_offd_i[i]; jA < A_offd_i[i+1]; jA++)
+          {
+            jj = A_offd_j[jA];
+            if (CF_marker[jj] == SMRK) {
+              if (A_offd_data[jA] >= strength_threshold * row_scale)
+              {
+                 S_temp_offd_j[jA] = -1;
+                 --jS_offd;
+              }
+              else
+              {
+                 S_temp_offd_j[jA] = jj;
+              }
+            } else {
+              S_temp_offd_j[jA] = -1;
+            }
+          }
+        } /* diag >= 0 */
+      } /* !((row_sum > max_row_sum) && (max_row_sum < 1.0)) */
+    } /* CF_marker == SMRK */
+    else
+    {
+      S_diag_i[i] = jS_diag;
+      if (num_cols_offd)
+      {
+         S_offd_i[i] = jS_offd;
+      }
+      jS_diag += A_diag_i[i + 1] - A_diag_i[i] - 1;
+      jS_offd += A_offd_i[i + 1] - A_offd_i[i];
+
+      for (jA = A_diag_i[i]+1; jA < A_diag_i[i+1]; jA++)
+      {
+        S_temp_diag_j[jA] = -1;
+      }
+      jS_diag -= A_diag_i[i + 1] - (A_diag_i[i] + 1);
+
+      for (jA = A_offd_i[i]; jA < A_offd_i[i+1]; jA++)
+      {
+        S_temp_offd_j[jA] = -1;
+      }
+      jS_offd -= A_offd_i[i + 1] - A_offd_i[i];
+    } /* CF_marker != SMRK */
+   } /* for each variable */
+
+   hypre_prefix_sum_pair(&jS_diag, S_diag_i + num_variables, &jS_offd, S_offd_i + num_variables, prefix_sum_workspace);
+
+   /*--------------------------------------------------------------
+    * "Compress" the strength matrix.
+    *
+    * NOTE: S has *NO DIAGONAL ELEMENT* on any row.  Caveat Emptor!
+    *
+    * NOTE: This "compression" section of code may be removed, and
+    * coarsening will still be done correctly.  However, the routine
+    * that builds interpolation would have to be modified first.
+    *----------------------------------------------------------------*/
+
+   for (i = start; i < stop; i++)
+   {
+      S_diag_i[i] += jS_diag;
+      S_offd_i[i] += jS_offd;
+
+      jS = S_diag_i[i];
+      for (jA = A_diag_i[i]; jA < A_diag_i[i+1]; jA++)
+      {
+         if (S_temp_diag_j[jA] > -1)
+         {
+            S_diag_j[jS]    = S_temp_diag_j[jA];
+            jS++;
+         }
+      }
+
+      jS = S_offd_i[i];
+      for (jA = A_offd_i[i]; jA < A_offd_i[i+1]; jA++)
+      {
+         if (S_temp_offd_j[jA] > -1)
+         {
+            S_offd_j[jS]    = S_temp_offd_j[jA];
+            jS++;
+         }
+      }
+   } /* for each variable */
+
+   } /* omp parallel */
+
+   hypre_CSRMatrixNumNonzeros(S_diag) = S_diag_i[num_variables];
+   hypre_CSRMatrixNumNonzeros(S_offd) = S_offd_i[num_variables];
+   hypre_CSRMatrixJ(S_diag) = S_diag_j;
+   hypre_CSRMatrixJ(S_offd) = S_offd_j;
+
+   hypre_ParCSRMatrixCommPkg(S) = NULL;
+
+   *S_ptr        = S;
+
+   hypre_TFree(prefix_sum_workspace);
+   hypre_TFree(S_temp_diag_j);
+   hypre_TFree(S_temp_offd_j);
+
+#ifdef HYPRE_PROFILE
+   hypre_profile_times[HYPRE_TIMER_ID_CREATES] += hypre_MPI_Wtime();
+#endif
+
+   return (ierr);
+}
 
 /*==========================================================================*/
 /*==========================================================================*/
@@ -1497,6 +1887,7 @@
 
       temp = hypre_UnorderedIntSetCopyToArray(&found_set, &num_cols_offd_C);
       
+      hypre_UnorderedIntSetDestroy(&found_set);
       hypre_TFree(S_ext_i);
       hypre_TFree(S_ext_j);
 
diff -Nru hypre-2.11.2/src/parcsr_ls/par_sv_interp_lsfit.c hypre-2.13.0/src/parcsr_ls/par_sv_interp_lsfit.c
--- hypre-2.11.2/src/parcsr_ls/par_sv_interp_lsfit.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/par_sv_interp_lsfit.c	2017-10-20 17:42:22.000000000 +0000
@@ -1,17 +1,7 @@
 #include "_hypre_parcsr_ls.h"
 #include "Common.h"
-
-
-#ifdef HYPRE_USING_ESSL
-#include <essl.h>
-#else
-HYPRE_Int hypre_F90_NAME_BLAS(dgemm, DGEMM) (char *, char *, HYPRE_Int *, HYPRE_Int *, HYPRE_Int *, HYPRE_Real *, HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Real *, HYPRE_Int *);
-HYPRE_Int hypre_F90_NAME_BLAS(dgemv, DGEMV) (char *, HYPRE_Int * , HYPRE_Int * , HYPRE_Real *, HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Real *, HYPRE_Int *);
-
-HYPRE_Int hypre_F90_NAME_LAPACK(dgetrf, DGETRF) (HYPRE_Int *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *, HYPRE_Int *);
-HYPRE_Int hypre_F90_NAME_LAPACK(dgetrs, DGETRS) (char *, HYPRE_Int *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *, HYPRE_Real *b, HYPRE_Int*, HYPRE_Int *);
-
-#endif
+#include "_hypre_blas.h"
+#include "_hypre_lapack.h"
 
 #define ADJUST(a,b)  (adjust_list[(a)*(num_functions-1)+(b)])
 
@@ -414,36 +404,32 @@
        /* now  B_s <-delta*Beta*Beta^T + B_s */ 
        /* usage: DGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC)
                  C := alpha*op( A )*op( B ) + beta*C */
-       hypre_F90_NAME_BLAS(dgemm,DGEMM)("N", "T", &num_smooth_vecs, 
-                                        &num_smooth_vecs, &k_size, 
-                                        &delta, Beta, &num_smooth_vecs, Beta, 
-                                        &num_smooth_vecs, &one, B_s, &num_smooth_vecs);
+       hypre_dgemm("N", "T", &num_smooth_vecs, 
+                   &num_smooth_vecs, &k_size, 
+                   &delta, Beta, &num_smooth_vecs, Beta, 
+                   &num_smooth_vecs, &one, B_s, &num_smooth_vecs);
        
        /* now do alpha <- (alpha - beta*w)*/
        /* usage: DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
                  y := alpha*A*x + beta*y */
-       hypre_F90_NAME_BLAS(dgemv,DGEMV)("N", &num_smooth_vecs, &k_size, &mone, 
-                                        Beta, &num_smooth_vecs, w_old, &one_i, 
-                                        &one, alpha, &one_i);
-       
+       hypre_dgemv("N", &num_smooth_vecs, &k_size, &mone, 
+                   Beta, &num_smooth_vecs, w_old, &one_i, 
+                   &one, alpha, &one_i);
       
        /* now get alpha <- inv(B_s)*alpha */
-           /*write over B_s with LU */
-       hypre_F90_NAME_LAPACK(dgetrf, DGETRF)(&num_smooth_vecs, &num_smooth_vecs, 
-                                             B_s, &num_smooth_vecs, piv, &info);
-       
-           /*now get alpha  */
-       hypre_F90_NAME_LAPACK(dgetrs, DGETRS)("N", &num_smooth_vecs, &one_i, B_s, 
-                                             &num_smooth_vecs, piv, alpha, 
-                                             &num_smooth_vecs, &info);
+       /*write over B_s with LU */
+       hypre_dgetrf(&num_smooth_vecs, &num_smooth_vecs, 
+                    B_s, &num_smooth_vecs, piv, &info);
        
-
+       /*now get alpha  */
+       hypre_dgetrs("N", &num_smooth_vecs, &one_i, B_s, 
+                    &num_smooth_vecs, piv, alpha, 
+                    &num_smooth_vecs, &info);
 
        /* now w <- w + (delta)*(Beta)^T*(alpha) */
-       hypre_F90_NAME_BLAS(dgemv,DGEMV)("T", &num_smooth_vecs, &k_size, &delta, 
-                                        Beta, &num_smooth_vecs, alpha, &one_i, 
-                                        &one, w, &one_i);
-
+       hypre_dgemv("T", &num_smooth_vecs, &k_size, &delta, 
+                   Beta, &num_smooth_vecs, alpha, &one_i, 
+                   &one, w, &one_i);
        
        /* note:we have w_old still, but we don't need it unless we
         * want to use it in the future for something */
diff -Nru hypre-2.11.2/src/parcsr_ls/par_vardifconv.c hypre-2.13.0/src/parcsr_ls/par_vardifconv.c
--- hypre-2.11.2/src/parcsr_ls/par_vardifconv.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/par_vardifconv.c	2017-10-20 17:42:22.000000000 +0000
@@ -351,10 +351,6 @@
             }
    }
 
-   par_rhs = hypre_ParVectorCreate(comm, grid_size, global_part);
-   rhs = hypre_ParVectorLocalVector(par_rhs);
-   hypre_VectorData(rhs) = rhs_data;
-
 #ifdef HYPRE_NO_GLOBAL_PARTITION
 /* ideally we would use less storage earlier in this function, but this is fine
    for testing */
@@ -369,6 +365,11 @@
    }
 #endif
 
+   par_rhs = hypre_ParVectorCreate(comm, grid_size, global_part);
+   hypre_ParVectorOwnsPartitioning(par_rhs) = 0;
+   rhs = hypre_ParVectorLocalVector(par_rhs);
+   hypre_VectorData(rhs) = rhs_data;
+
    A = hypre_ParCSRMatrixCreate(comm, grid_size, grid_size,
                                 global_part, global_part, num_cols_offd,
                                 diag_i[local_num_rows],
diff -Nru hypre-2.11.2/src/parcsr_ls/par_vardifconv_rs.c hypre-2.13.0/src/parcsr_ls/par_vardifconv_rs.c
--- hypre-2.11.2/src/parcsr_ls/par_vardifconv_rs.c	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/par_vardifconv_rs.c	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,559 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+#include "_hypre_parcsr_ls.h"
+ 
+#ifndef M_PI
+#define M_PI 3.14159265358979
+#endif
+
+/* examples in Ruge & Stuben paper */
+static HYPRE_Int rs_example = 1;
+static HYPRE_Real rs_l = 3.0;
+
+/*--------------------------------------------------------------------------
+ * hypre_GenerateVarDifConv: with the FD discretization and examples 
+ *                           in Ruge-Stuben's paper ``Algebraic Multigrid''
+ *--------------------------------------------------------------------------*/
+
+HYPRE_ParCSRMatrix 
+GenerateRSVarDifConv( MPI_Comm comm,
+                 HYPRE_Int      nx,
+                 HYPRE_Int      ny,
+                 HYPRE_Int      nz, 
+                 HYPRE_Int      P,
+                 HYPRE_Int      Q,
+                 HYPRE_Int      R,
+                 HYPRE_Int      p,
+                 HYPRE_Int      q,
+                 HYPRE_Int      r,
+                 HYPRE_Real eps,
+		 HYPRE_ParVector *rhs_ptr,
+                 HYPRE_Int type)
+{
+   hypre_ParCSRMatrix *A;
+   hypre_CSRMatrix *diag;
+   hypre_CSRMatrix *offd;
+   hypre_ParVector *par_rhs;
+   hypre_Vector *rhs;
+   HYPRE_Real *rhs_data;
+
+   HYPRE_Int    *diag_i;
+   HYPRE_Int    *diag_j;
+   HYPRE_Real *diag_data;
+
+   HYPRE_Int    *offd_i;
+   HYPRE_Int    *offd_j;
+   HYPRE_Real *offd_data;
+
+   HYPRE_Int *global_part;
+   HYPRE_Int ix, iy, iz;
+   HYPRE_Int cnt, o_cnt;
+   HYPRE_Int local_num_rows; 
+   HYPRE_Int *col_map_offd;
+   HYPRE_Int row_index;
+   HYPRE_Int i,j;
+
+   HYPRE_Int nx_local, ny_local, nz_local;
+   HYPRE_Int nx_size, ny_size, nz_size;
+   HYPRE_Int num_cols_offd;
+   HYPRE_Int grid_size;
+
+   HYPRE_Int *nx_part;
+   HYPRE_Int *ny_part;
+   HYPRE_Int *nz_part;
+
+   HYPRE_Int num_procs, my_id;
+   HYPRE_Int P_busy, Q_busy, R_busy;
+
+   HYPRE_Real hhx, hhy, hhz;
+   HYPRE_Real xx, yy, zz;
+   HYPRE_Real afp, afm, bfp, bfm, cfp, cfm, di, ai, mux, ei, bi,
+              muy, fi, ci, muz, dfm, dfp, efm, efp, ffm, ffp, gi;
+
+   hypre_MPI_Comm_size(comm,&num_procs);
+   hypre_MPI_Comm_rank(comm,&my_id);
+
+   if (type >= 1 && type <= 3)
+   {
+     rs_example = type;
+   }
+
+   grid_size = nx*ny*nz;
+
+   hypre_GeneratePartitioning(nx,P,&nx_part);
+   hypre_GeneratePartitioning(ny,Q,&ny_part);
+   hypre_GeneratePartitioning(nz,R,&nz_part);
+
+   global_part = hypre_CTAlloc(HYPRE_Int,P*Q*R+1);
+
+   global_part[0] = 0;
+   cnt = 1;
+   for (iz = 0; iz < R; iz++)
+   {
+      nz_size = nz_part[iz+1]-nz_part[iz];
+      for (iy = 0; iy < Q; iy++)
+      {
+         ny_size = ny_part[iy+1]-ny_part[iy];
+         for (ix = 0; ix < P; ix++)
+         {
+            nx_size = nx_part[ix+1] - nx_part[ix];
+            global_part[cnt] = global_part[cnt-1];
+            global_part[cnt++] += nx_size*ny_size*nz_size;
+         }
+      }
+   }
+
+   nx_local = nx_part[p+1] - nx_part[p];
+   ny_local = ny_part[q+1] - ny_part[q];
+   nz_local = nz_part[r+1] - nz_part[r];
+
+   my_id = r*(P*Q) + q*P + p;
+   num_procs = P*Q*R;
+
+   local_num_rows = nx_local*ny_local*nz_local;
+   diag_i = hypre_CTAlloc(HYPRE_Int, local_num_rows+1);
+   offd_i = hypre_CTAlloc(HYPRE_Int, local_num_rows+1);
+   rhs_data = hypre_CTAlloc(HYPRE_Real, local_num_rows);
+
+   P_busy = hypre_min(nx,P);
+   Q_busy = hypre_min(ny,Q);
+   R_busy = hypre_min(nz,R);
+
+   num_cols_offd = 0;
+   if (p) num_cols_offd += ny_local*nz_local;
+   if (p < P_busy-1) num_cols_offd += ny_local*nz_local;
+   if (q) num_cols_offd += nx_local*nz_local;
+   if (q < Q_busy-1) num_cols_offd += nx_local*nz_local;
+   if (r) num_cols_offd += nx_local*ny_local;
+   if (r < R_busy-1) num_cols_offd += nx_local*ny_local;
+
+   if (!local_num_rows) num_cols_offd = 0;
+
+   col_map_offd = hypre_CTAlloc(HYPRE_Int, num_cols_offd);
+
+   hhx = 1.0/(HYPRE_Real)(nx+1);
+   hhy = 1.0/(HYPRE_Real)(ny+1);
+   hhz = 1.0/(HYPRE_Real)(nz+1);
+
+   cnt = 1;
+   o_cnt = 1;
+   diag_i[0] = 0;
+   offd_i[0] = 0;
+   for (iz = nz_part[r]; iz < nz_part[r+1]; iz++)
+   {
+      for (iy = ny_part[q];  iy < ny_part[q+1]; iy++)
+      {
+         for (ix = nx_part[p]; ix < nx_part[p+1]; ix++)
+         {
+            diag_i[cnt] = diag_i[cnt-1];
+            offd_i[o_cnt] = offd_i[o_cnt-1];
+            diag_i[cnt]++;
+            if (iz > nz_part[r]) 
+               diag_i[cnt]++;
+            else
+            {
+               if (iz) 
+               {
+                  offd_i[o_cnt]++;
+               }
+            }
+            if (iy > ny_part[q]) 
+               diag_i[cnt]++;
+            else
+            {
+               if (iy) 
+               {
+                  offd_i[o_cnt]++;
+               }
+            }
+            if (ix > nx_part[p]) 
+               diag_i[cnt]++;
+            else
+            {
+               if (ix) 
+               {
+                  offd_i[o_cnt]++; 
+               }
+            }
+            if (ix+1 < nx_part[p+1]) 
+               diag_i[cnt]++;
+            else
+            {
+               if (ix+1 < nx) 
+               {
+                  offd_i[o_cnt]++; 
+               }
+            }
+            if (iy+1 < ny_part[q+1]) 
+               diag_i[cnt]++;
+            else
+            {
+               if (iy+1 < ny) 
+               {
+                  offd_i[o_cnt]++;
+               }
+            }
+            if (iz+1 < nz_part[r+1]) 
+               diag_i[cnt]++;
+            else
+            {
+               if (iz+1 < nz) 
+               {
+                  offd_i[o_cnt]++;
+               }
+            }
+            cnt++;
+            o_cnt++;
+         }
+      }
+   }
+
+   diag_j = hypre_CTAlloc(HYPRE_Int, diag_i[local_num_rows]);
+   diag_data = hypre_CTAlloc(HYPRE_Real, diag_i[local_num_rows]);
+
+   if (num_procs > 1)
+   {
+      offd_j = hypre_CTAlloc(HYPRE_Int, offd_i[local_num_rows]);
+      offd_data = hypre_CTAlloc(HYPRE_Real, offd_i[local_num_rows]);
+   }
+
+   row_index = 0;
+   cnt = 0;
+   o_cnt = 0;
+   for (iz = nz_part[r]; iz < nz_part[r+1]; iz++)
+   {
+      zz = (HYPRE_Real)(iz+1)*hhz;
+      for (iy = ny_part[q]; iy < ny_part[q+1]; iy++)
+      {
+         yy = (HYPRE_Real)(iy+1)*hhy;
+         for (ix = nx_part[p]; ix < nx_part[p+1]; ix++)
+         {
+            xx = (HYPRE_Real)(ix+1)*hhx;
+	    afp = -eps*afun_rs(xx+0.5*hhx,yy,zz)/hhx/hhx;
+	    afm = -eps*afun_rs(xx-0.5*hhx,yy,zz)/hhx/hhx;
+	    bfp = -eps*bfun_rs(xx,yy+0.5*hhy,zz)/hhy/hhy;
+	    bfm = -eps*bfun_rs(xx,yy-0.5*hhy,zz)/hhy/hhy;
+	    cfp = -eps*cfun_rs(xx,yy,zz+0.5*hhz)/hhz/hhz;
+	    cfm = -eps*cfun_rs(xx,yy,zz-0.5*hhz)/hhz/hhz;
+            /* first order terms */
+            /* x-direction */
+            di = dfun_rs(xx, yy, zz);
+            ai = afun_rs(xx, yy, zz);
+            if (di * hhx > eps * ai)
+            {
+              mux = eps * ai / (2.0 * di * hhx);
+            }
+            else if (di * hhx < -eps * ai)
+            {
+              mux = 1.0 + eps * ai / (2.0 * di * hhx);
+            }
+            else
+            {
+              mux = 0.5;
+            }
+            /* y-direction */
+            ei = efun_rs(xx, yy, zz);
+            bi = bfun_rs(xx, yy, zz);
+            if (ei * hhy > eps * bi)
+            {
+              muy = eps * bi / (2.0 * ei * hhy);
+            }
+            else if (ei * hhy < -eps * bi)
+            {
+              muy = 1.0 + eps * bi / (2.0 * ei * hhy);
+            }
+            else
+            {
+              muy = 0.5;
+            }
+            /* z-direction */
+            fi = ffun_rs(xx, yy, zz);
+            ci = cfun_rs(xx, yy, zz);
+            if (fi * hhz > eps * ci)
+            {
+              muz = eps * ci / (2.0 * fi * hhz);
+            }
+            else if (fi * hhz < -eps * ci)
+            {
+              muz = 1.0 + eps * ci / (2.0 * fi * hhz);
+            }
+            else
+            {
+              muz = 0.5;
+            }
+
+	    dfm = di * (mux - 1.0) / hhx;
+            dfp = di * mux / hhx;
+	    efm = ei * (muy - 1.0) / hhy;
+            efp = ei * muy / hhy;
+            ffm = fi * (muz - 1.0) / hhz;
+	    ffp = fi * muz / hhz;
+	    gi = gfun_rs(xx, yy, zz);
+            /* stencil: center */
+            diag_j[cnt] = row_index;
+            diag_data[cnt++] = -(afp + afm + bfp + bfm + cfp + cfm  +
+                                 dfp + dfm + efp + efm + ffp + ffm) + gi;
+            /* rhs vector */
+	    rhs_data[row_index] = rfun_rs(xx,yy,zz);
+	    /* apply boundary conditions */
+            if (ix == 0)    rhs_data[row_index] -= (afm+dfm) * bndfun_rs(0,yy,zz);
+	    if (iy == 0)    rhs_data[row_index] -= (bfm+efm) * bndfun_rs(xx,0,zz);
+	    if (iz == 0)    rhs_data[row_index] -= (cfm+ffm) * bndfun_rs(xx,yy,0);
+	    if (ix+1 == nx) rhs_data[row_index] -= (afp+dfp) * bndfun_rs(1.0,yy,zz);
+	    if (iy+1 == ny) rhs_data[row_index] -= (bfp+efp) * bndfun_rs(xx,1.0,zz);
+	    if (iz+1 == nz) rhs_data[row_index] -= (cfp+ffp) * bndfun_rs(xx,yy,1.0);
+            /* stencil: z- */
+            if (iz > nz_part[r]) 
+            {
+               diag_j[cnt] = row_index - nx_local*ny_local;
+               diag_data[cnt++] = cfm + ffm;
+            }
+            else
+            {
+               if (iz) 
+               {
+                  offd_j[o_cnt] = hypre_map(ix,iy,iz-1,p,q,r-1,P,Q,R,
+                                            nx_part,ny_part,nz_part,global_part);
+                  offd_data[o_cnt++] = cfm + ffm;
+               }
+            }
+            /* stencil: y- */
+            if (iy > ny_part[q]) 
+            {
+               diag_j[cnt] = row_index - nx_local;
+               diag_data[cnt++] = bfm + efm;
+            }
+            else
+            {
+               if (iy) 
+               {
+                  offd_j[o_cnt] = hypre_map(ix,iy-1,iz,p,q-1,r,P,Q,R,
+                                            nx_part,ny_part,nz_part,global_part);
+                  offd_data[o_cnt++] = bfm + efm;
+               }
+            }
+            /* stencil: x- */
+            if (ix > nx_part[p]) 
+            {
+               diag_j[cnt] = row_index - 1;
+               diag_data[cnt++] = afm + dfm;
+            }
+            else
+            {
+               if (ix) 
+               {
+                  offd_j[o_cnt] = hypre_map(ix-1,iy,iz,p-1,q,r,P,Q,R,
+                                            nx_part,ny_part,nz_part,global_part);
+                  offd_data[o_cnt++] = afm + dfm;
+               }
+            }
+            /* stencil: x+ */
+            if (ix+1 < nx_part[p+1]) 
+            {
+               diag_j[cnt] = row_index + 1;
+               diag_data[cnt++] = afp + dfp;
+            }
+            else
+            {
+               if (ix+1 < nx) 
+               {
+                  offd_j[o_cnt] = hypre_map(ix+1,iy,iz,p+1,q,r,P,Q,R,
+                                            nx_part,ny_part,nz_part,global_part);
+                  offd_data[o_cnt++] = afp + dfp;
+               }
+            }
+            /* stencil: y+ */
+            if (iy+1 < ny_part[q+1]) 
+            {
+               diag_j[cnt] = row_index + nx_local;
+               diag_data[cnt++] = bfp + efp;
+            }
+            else
+            {
+               if (iy+1 < ny) 
+               {
+                  offd_j[o_cnt] = hypre_map(ix,iy+1,iz,p,q+1,r,P,Q,R,
+                                            nx_part,ny_part,nz_part,global_part);
+                  offd_data[o_cnt++] = bfp + efp;
+               }
+            }
+            /* stencil: z+ */
+            if (iz+1 < nz_part[r+1]) 
+            {
+               diag_j[cnt] = row_index + nx_local*ny_local;
+               diag_data[cnt++] = cfp + ffp;
+            }
+            else
+            {
+               if (iz+1 < nz) 
+               {
+                  offd_j[o_cnt] = hypre_map(ix,iy,iz+1,p,q,r+1,P,Q,R,
+                                            nx_part,ny_part,nz_part,global_part);
+                  offd_data[o_cnt++] = cfp + ffp;
+               }
+            }
+            /* done with this row */
+            row_index++;
+         }
+      }
+   }
+
+   if (num_procs > 1)
+   {
+      for (i=0; i < num_cols_offd; i++)
+         col_map_offd[i] = offd_j[i];
+   	
+      hypre_qsort0(col_map_offd, 0, num_cols_offd-1);
+
+      for (i=0; i < num_cols_offd; i++)
+         for (j=0; j < num_cols_offd; j++)
+            if (offd_j[i] == col_map_offd[j])
+            {
+               offd_j[i] = j;
+               break;
+            }
+   }
+
+#ifdef HYPRE_NO_GLOBAL_PARTITION
+/* ideally we would use less storage earlier in this function, but this is fine
+   for testing */
+   {
+      HYPRE_Int tmp1, tmp2;
+      tmp1 = global_part[my_id];
+      tmp2 = global_part[my_id + 1];
+      hypre_TFree(global_part);
+      global_part = hypre_CTAlloc(HYPRE_Int, 2);
+      global_part[0] = tmp1;
+      global_part[1] = tmp2;
+   }
+#endif
+
+   par_rhs = hypre_ParVectorCreate(comm, grid_size, global_part);
+   hypre_ParVectorOwnsPartitioning(par_rhs) = 0;
+   rhs = hypre_ParVectorLocalVector(par_rhs);
+   hypre_VectorData(rhs) = rhs_data;
+
+   A = hypre_ParCSRMatrixCreate(comm, grid_size, grid_size,
+                                global_part, global_part, num_cols_offd,
+                                diag_i[local_num_rows],
+                                offd_i[local_num_rows]);
+
+   hypre_ParCSRMatrixColMapOffd(A) = col_map_offd;
+
+   diag = hypre_ParCSRMatrixDiag(A);
+   hypre_CSRMatrixI(diag) = diag_i;
+   hypre_CSRMatrixJ(diag) = diag_j;
+   hypre_CSRMatrixData(diag) = diag_data;
+
+   offd = hypre_ParCSRMatrixOffd(A);
+   hypre_CSRMatrixI(offd) = offd_i;
+   if (num_cols_offd)
+   {
+      hypre_CSRMatrixJ(offd) = offd_j;
+      hypre_CSRMatrixData(offd) = offd_data;
+   }
+
+   hypre_TFree(nx_part);
+   hypre_TFree(ny_part);
+   hypre_TFree(nz_part);
+
+   *rhs_ptr = (HYPRE_ParVector) par_rhs;
+
+   return (HYPRE_ParCSRMatrix) A;
+}
+
+HYPRE_Real afun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
+{
+   HYPRE_Real value;
+   value = 1.0;
+   return value;
+}
+
+HYPRE_Real bfun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
+{
+   HYPRE_Real value;
+   value = 1.0;
+   return value;
+}
+
+HYPRE_Real cfun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
+{
+   HYPRE_Real value;
+   value = 1.0;
+   return value;
+}
+
+HYPRE_Real dfun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
+{
+   HYPRE_Real value;
+   if (rs_example == 1)
+   {
+     value = sin(rs_l*M_PI/8.0);
+   }
+   else if (rs_example == 2)
+   {
+     value = (2.0*yy-1.0)*(1.0-xx*xx);
+   }
+   else
+   {
+     value = 4.0*xx*(xx-1.0)*(1.0-2.0*yy);
+   }
+   return value;
+}
+
+HYPRE_Real efun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
+{
+   HYPRE_Real value;
+   if (rs_example == 1)
+   {
+     value = cos(rs_l*M_PI/8.0);
+   }
+   else if (rs_example == 2)
+   {
+     value = 2.0*xx*yy*(yy-1.0);
+   }
+   else
+   {
+     value = -4.0*yy*(yy-1.0)*(1.0-2.0*xx);
+   }
+   return value;
+}
+
+HYPRE_Real ffun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
+{
+   HYPRE_Real value;
+   value = efun_rs(xx, yy, zz);
+   return value;
+}
+
+HYPRE_Real gfun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
+{
+   HYPRE_Real value;
+   value = 0.0;
+   return value;
+}
+
+HYPRE_Real rfun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
+{
+   HYPRE_Real value;
+   value = 1.0;
+   return value;
+}
+
+HYPRE_Real bndfun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
+{
+   HYPRE_Real value;
+   value = 0.0;
+   return value;
+}
+
diff -Nru hypre-2.11.2/src/parcsr_ls/schwarz.c hypre-2.13.0/src/parcsr_ls/schwarz.c
--- hypre-2.11.2/src/parcsr_ls/schwarz.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_ls/schwarz.c	2017-10-20 17:42:22.000000000 +0000
@@ -12,562 +12,472 @@
 
 #include "_hypre_parcsr_ls.h"
 #include "Common.h"
-
+#include "_hypre_lapack.h"
 
 /* -------------------------------------------------------------------------
    dof_domain: for each dof defines neighborhood to build interpolation,
-                               using 
-	       domain_diagmat (for cut--off scaling) and
-	       i_domain_dof, j_dof_domain (for extracting the block of A);
+   using
+   domain_diagmat (for cut--off scaling) and
+   i_domain_dof, j_dof_domain (for extracting the block of A);
 
    domain_matrixinverse: contains the inverse of subdomain matrix;
 
    B can be used to define strength matrix;
    ----------------------------------------------------------------------- */
 
-#ifdef HYPRE_USING_ESSL
-#include <essl.h>
-#else
-#ifdef __cplusplus
-extern "C" {
-#endif 
-HYPRE_Int hypre_F90_NAME_LAPACK(dpotrf, DPOTRF)(char *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *);
-HYPRE_Int hypre_F90_NAME_LAPACK(dpotrs, DPOTRS)(char *, HYPRE_Int *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *);
-
-HYPRE_Int hypre_F90_NAME_LAPACK(dgetrf, DGETRF) (HYPRE_Int *, HYPRE_Int *n, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *, HYPRE_Int *);
-HYPRE_Int hypre_F90_NAME_LAPACK(dgetrs, DGETRS) (char *, HYPRE_Int *, HYPRE_Int *, HYPRE_Real *, HYPRE_Int *, HYPRE_Int *, HYPRE_Real *b, HYPRE_Int*, HYPRE_Int *);
-#ifdef __cplusplus
-}
-#endif
-#endif
-
 /*--------------------------------------------------------------------------
  * hypre_AMGNodalSchwarzSmoother: (Not used currently)
  *--------------------------------------------------------------------------*/
 
-
-
 HYPRE_Int
 hypre_AMGNodalSchwarzSmoother( hypre_CSRMatrix    *A,
-			       HYPRE_Int                 num_functions,
-			       HYPRE_Int		   option,
-			       hypre_CSRMatrix   **domain_structure_pointer)
-
+                               HYPRE_Int                 num_functions,
+                               HYPRE_Int                   option,
+                               hypre_CSRMatrix   **domain_structure_pointer)
 {
+   /*  option =      0: nodal symGS;
+       1: next to nodal symGS (overlapping Schwarz) */
 
-  /*  option =      0: nodal symGS; 
-		    1: next to nodal symGS (overlapping Schwarz) */
-	
-
-  HYPRE_Int *i_domain_dof, *j_domain_dof;
-  HYPRE_Real *domain_matrixinverse;
-  HYPRE_Int num_domains;
-  hypre_CSRMatrix *domain_structure;
-
-  HYPRE_Int *i_dof_node, *j_dof_node;
-  HYPRE_Int *i_node_dof, *j_node_dof;
-
-  HYPRE_Int *i_node_dof_dof, *j_node_dof_dof;
-
-  HYPRE_Int *i_node_node, *j_node_node;
-
-  HYPRE_Int num_nodes;
-
-  HYPRE_Int *i_dof_dof = hypre_CSRMatrixI(A);
-  HYPRE_Int *j_dof_dof = hypre_CSRMatrixJ(A);
-  HYPRE_Real *a_dof_dof = hypre_CSRMatrixData(A);
-  HYPRE_Int num_dofs = hypre_CSRMatrixNumRows(A);
-
-
-  HYPRE_Int ierr = 0;
-  HYPRE_Int i,j,k, l_loc, i_loc, j_loc;
-  HYPRE_Int i_dof, j_dof;
-  HYPRE_Int *i_local_to_global;
-  HYPRE_Int *i_global_to_local;
+   HYPRE_Int *i_domain_dof, *j_domain_dof;
+   HYPRE_Real *domain_matrixinverse;
+   HYPRE_Int num_domains;
+   hypre_CSRMatrix *domain_structure;
 
-  HYPRE_Int *i_int;
-  HYPRE_Int *i_int_to_local;
+   HYPRE_Int *i_dof_node, *j_dof_node;
+   HYPRE_Int *i_node_dof, *j_node_dof;
 
-  HYPRE_Int int_dof_counter, local_dof_counter, max_local_dof_counter=0; 
+   HYPRE_Int *i_node_dof_dof, *j_node_dof_dof;
 
-  HYPRE_Int domain_dof_counter = 0, domain_matrixinverse_counter = 0;
+   HYPRE_Int *i_node_node, *j_node_node;
 
+   HYPRE_Int num_nodes;
 
-  HYPRE_Real *AE;
+   HYPRE_Int *i_dof_dof = hypre_CSRMatrixI(A);
+   HYPRE_Int *j_dof_dof = hypre_CSRMatrixJ(A);
+   HYPRE_Real *a_dof_dof = hypre_CSRMatrixData(A);
+   HYPRE_Int num_dofs = hypre_CSRMatrixNumRows(A);
 
-#ifdef HYPRE_USING_ESSL
-#else
-  char uplo = 'L';
-#endif
 
-  HYPRE_Int cnt;
+   HYPRE_Int ierr = 0;
+   HYPRE_Int i,j,k, l_loc, i_loc, j_loc;
+   HYPRE_Int i_dof, j_dof;
+   HYPRE_Int *i_local_to_global;
+   HYPRE_Int *i_global_to_local;
 
-  /* PCG arrays: --------------------------------------------------- 
-  HYPRE_Real *x, *rhs, *v, *w, *d, *aux;
+   HYPRE_Int *i_int;
+   HYPRE_Int *i_int_to_local;
 
-  HYPRE_Int max_iter;
+   HYPRE_Int int_dof_counter, local_dof_counter, max_local_dof_counter=0;
 
-  ------------------------------------------------------------------ */
+   HYPRE_Int domain_dof_counter = 0, domain_matrixinverse_counter = 0;
 
+   HYPRE_Real *AE;
 
+   char uplo = 'L';
 
+   HYPRE_Int cnt;
 
-  /* build dof_node graph: ----------------------------------------- */
+   /* PCG arrays: ---------------------------------------------------
+      HYPRE_Real *x, *rhs, *v, *w, *d, *aux;
 
-  num_nodes = num_dofs / num_functions;
+      HYPRE_Int max_iter;
 
-  /*hypre_printf("\nnum_nodes: %d, num_dofs: %d = %d x %d\n", num_nodes, num_dofs,
-	 num_nodes, num_functions);*/
+      ------------------------------------------------------------------ */
 
-  i_dof_node = hypre_CTAlloc(HYPRE_Int, num_dofs+1);
-  j_dof_node = hypre_CTAlloc(HYPRE_Int, num_dofs);
+   /* build dof_node graph: ----------------------------------------- */
 
-  for (i=0; i < num_dofs+1; i++)
-    i_dof_node[i] = i;
+   num_nodes = num_dofs / num_functions;
 
-  for (j = 0; j < num_nodes; j++)
-    for (k = 0; k < num_functions; k++) 
-      j_dof_node[j*num_functions+k] = j;
+   /*hypre_printf("\nnum_nodes: %d, num_dofs: %d = %d x %d\n", num_nodes, num_dofs,
+     num_nodes, num_functions);*/
 
-  /* build node_dof graph: ----------------------------------------- */
+   i_dof_node = hypre_CTAlloc(HYPRE_Int, num_dofs+1);
+   j_dof_node = hypre_CTAlloc(HYPRE_Int, num_dofs);
 
-  ierr = transpose_matrix_create(&i_node_dof, &j_node_dof,
-				 i_dof_node, j_dof_node,
-				 
-				 num_dofs, num_nodes);
+   for (i=0; i < num_dofs+1; i++)
+      i_dof_node[i] = i;
 
+   for (j = 0; j < num_nodes; j++)
+      for (k = 0; k < num_functions; k++)
+         j_dof_node[j*num_functions+k] = j;
 
-  /* build node_node graph: ----------------------------------------- */
+   /* build node_dof graph: ----------------------------------------- */
 
-  ierr = matrix_matrix_product(&i_node_dof_dof,
-			       &j_node_dof_dof,
+   ierr = transpose_matrix_create(&i_node_dof, &j_node_dof,
+                                  i_dof_node, j_dof_node,
+                                  num_dofs, num_nodes);
 
-			       i_node_dof, j_node_dof,
-			       i_dof_dof, j_dof_dof,
-			       
-			       num_nodes, num_dofs, num_dofs);
 
+   /* build node_node graph: ----------------------------------------- */
 
-  ierr = matrix_matrix_product(&i_node_node,
-			       &j_node_node,
+   ierr = matrix_matrix_product(&i_node_dof_dof,
+                                &j_node_dof_dof,
+                                i_node_dof, j_node_dof,
+                                i_dof_dof, j_dof_dof,
+                                num_nodes, num_dofs, num_dofs);
 
-			       i_node_dof_dof,
-			       j_node_dof_dof,
+   ierr = matrix_matrix_product(&i_node_node,
+                                &j_node_node,
+                                i_node_dof_dof,
+                                j_node_dof_dof,
+                                i_dof_node, j_dof_node,
+                                num_nodes, num_dofs, num_nodes);
 
-			       i_dof_node, j_dof_node,
-			       
-			       num_nodes, num_dofs, num_nodes);
+   hypre_TFree(i_node_dof_dof);
+   hypre_TFree(j_node_dof_dof);
 
-  hypre_TFree(i_node_dof_dof);
-  hypre_TFree(j_node_dof_dof);
+   /* compute for each node the local information: -------------------- */
 
+   i_global_to_local = i_dof_node;
 
+   for (i_dof =0; i_dof < num_dofs; i_dof++)
+      i_global_to_local[i_dof] = -1;
 
-  /* compute for each node the local information: -------------------- */
-
-  i_global_to_local = i_dof_node; 
-
-  for (i_dof =0; i_dof < num_dofs; i_dof++)
-     i_global_to_local[i_dof] = -1;
-
-  domain_matrixinverse_counter = 0;
-  domain_dof_counter = 0;
-  for (i=0; i < num_nodes; i++)
-    {
+   domain_matrixinverse_counter = 0;
+   domain_dof_counter = 0;
+   for (i=0; i < num_nodes; i++)
+   {
       local_dof_counter = 0;
 
       for (j=i_node_node[i]; j < i_node_node[i+1]; j++)
-	for (k=i_node_dof[j_node_node[j]];
-	     k<i_node_dof[j_node_node[j]+1]; k++)
-	  {
-	    j_dof = j_node_dof[k];
-
-	    if (i_global_to_local[j_dof] < 0)
-	      {
-		i_global_to_local[j_dof] = local_dof_counter;
-		local_dof_counter++;
-	      }
-
-	  }
-#ifdef HYPRE_USING_ESSL
-      if (i == num_nodes-1)
-	 domain_matrixinverse_counter += local_dof_counter*local_dof_counter;
-      else
-	 domain_matrixinverse_counter += 
-		local_dof_counter*(local_dof_counter+1)/2;
-#else
+         for (k=i_node_dof[j_node_node[j]];
+              k<i_node_dof[j_node_node[j]+1]; k++)
+         {
+            j_dof = j_node_dof[k];
+
+            if (i_global_to_local[j_dof] < 0)
+            {
+               i_global_to_local[j_dof] = local_dof_counter;
+               local_dof_counter++;
+            }
+
+         }
       domain_matrixinverse_counter += local_dof_counter*local_dof_counter;
-#endif
       domain_dof_counter += local_dof_counter;
 
       if (local_dof_counter > max_local_dof_counter)
-	max_local_dof_counter = local_dof_counter;
+         max_local_dof_counter = local_dof_counter;
 
       for (j=i_node_node[i]; j < i_node_node[i+1]; j++)
-	for (k=i_node_dof[j_node_node[j]];
-	     k<i_node_dof[j_node_node[j]+1]; k++)
-	  {
-	    j_dof = j_node_dof[k];
-	    i_global_to_local[j_dof] = -1;
-	  }	       
-    }
-
-
-  num_domains = num_nodes;
-
-  
-  i_domain_dof = hypre_CTAlloc(HYPRE_Int, num_domains+1);
-
-  if (option == 1)
-    j_domain_dof = hypre_CTAlloc(HYPRE_Int, domain_dof_counter);
-  else
-    j_domain_dof = hypre_CTAlloc(HYPRE_Int, num_dofs);
-
-
-  if (option == 1)
-    domain_matrixinverse = hypre_CTAlloc(HYPRE_Real, domain_matrixinverse_counter);
-  else
-    domain_matrixinverse = hypre_CTAlloc(HYPRE_Real, num_dofs * num_functions);
-
-
+         for (k=i_node_dof[j_node_node[j]];
+              k<i_node_dof[j_node_node[j]+1]; k++)
+         {
+            j_dof = j_node_dof[k];
+            i_global_to_local[j_dof] = -1;
+         }
+   }
 
-  i_local_to_global = hypre_CTAlloc(HYPRE_Int, max_local_dof_counter);
+   num_domains = num_nodes;
 
+   i_domain_dof = hypre_CTAlloc(HYPRE_Int, num_domains+1);
 
-  i_int_to_local = hypre_CTAlloc(HYPRE_Int, max_local_dof_counter);
-  i_int          = hypre_CTAlloc(HYPRE_Int, max_local_dof_counter);
+   if (option == 1)
+      j_domain_dof = hypre_CTAlloc(HYPRE_Int, domain_dof_counter);
+   else
+      j_domain_dof = hypre_CTAlloc(HYPRE_Int, num_dofs);
 
+   if (option == 1)
+      domain_matrixinverse = hypre_CTAlloc(HYPRE_Real, domain_matrixinverse_counter);
+   else
+      domain_matrixinverse = hypre_CTAlloc(HYPRE_Real, num_dofs * num_functions);
 
+   i_local_to_global = hypre_CTAlloc(HYPRE_Int, max_local_dof_counter);
 
-  for (l_loc=0; l_loc < max_local_dof_counter; l_loc++)
-    i_int[l_loc] = -1;
+   i_int_to_local = hypre_CTAlloc(HYPRE_Int, max_local_dof_counter);
+   i_int          = hypre_CTAlloc(HYPRE_Int, max_local_dof_counter);
 
+   for (l_loc=0; l_loc < max_local_dof_counter; l_loc++)
+      i_int[l_loc] = -1;
 
-  domain_dof_counter = 0;
-  domain_matrixinverse_counter = 0;
-  for (i=0; i < num_nodes; i++)
-    {
+   domain_dof_counter = 0;
+   domain_matrixinverse_counter = 0;
+   for (i=0; i < num_nodes; i++)
+   {
       i_domain_dof[i] = domain_dof_counter;
       local_dof_counter = 0;
 
       for (j=i_node_node[i]; j < i_node_node[i+1]; j++)
-	for (k=i_node_dof[j_node_node[j]];
-	     k<i_node_dof[j_node_node[j]+1]; k++)
-	  {
-	    j_dof = j_node_dof[k];
-
-	    if (i_global_to_local[j_dof] < 0)
-	      {
-		i_global_to_local[j_dof] = local_dof_counter;
-		i_local_to_global[local_dof_counter] = j_dof;
-		local_dof_counter++;
-	      }
+         for (k=i_node_dof[j_node_node[j]];
+              k<i_node_dof[j_node_node[j]+1]; k++)
+         {
+            j_dof = j_node_dof[k];
 
-	  }
+            if (i_global_to_local[j_dof] < 0)
+            {
+               i_global_to_local[j_dof] = local_dof_counter;
+               i_local_to_global[local_dof_counter] = j_dof;
+               local_dof_counter++;
+            }
 
-      for (j=i_node_dof[i]; j < i_node_dof[i+1]; j++)
-	for (k=i_dof_dof[j_node_dof[j]]; k < i_dof_dof[j_node_dof[j]+1]; k++)
-	  if (i_global_to_local[j_dof_dof[k]] < 0)
-            hypre_error_w_msg(HYPRE_ERROR_GENERIC,"WRONG local indexing: ====================== \n");
+         }
 
+      for (j=i_node_dof[i]; j < i_node_dof[i+1]; j++)
+         for (k=i_dof_dof[j_node_dof[j]]; k < i_dof_dof[j_node_dof[j]+1]; k++)
+            if (i_global_to_local[j_dof_dof[k]] < 0)
+               hypre_error_w_msg(HYPRE_ERROR_GENERIC,"WRONG local indexing: ====================== \n");
 
       int_dof_counter = 0;
       for (k=i_node_dof[i]; k < i_node_dof[i+1]; k++)
-	{
-	  i_dof = j_node_dof[k];
-	  i_loc = i_global_to_local[i_dof];
-	  i_int[i_loc] = int_dof_counter;
-	  i_int_to_local[int_dof_counter] = i_loc;
-	  int_dof_counter++;
-	}
+      {
+         i_dof = j_node_dof[k];
+         i_loc = i_global_to_local[i_dof];
+         i_int[i_loc] = int_dof_counter;
+         i_int_to_local[int_dof_counter] = i_loc;
+         int_dof_counter++;
+      }
 
       /* get local matrix AE: ======================================== */
 
       if (option == 1)
-	{
-	  AE = &domain_matrixinverse[domain_matrixinverse_counter];
-	  cnt = 0;
-	  for (i_loc=0; i_loc < local_dof_counter; i_loc++)
-	    for (j_loc=0; j_loc < local_dof_counter; j_loc++)
-	      AE[cnt++] = 0.e0;
-	  
-	  for (i_loc=0; i_loc < local_dof_counter; i_loc++)
-	    {
-	      i_dof = i_local_to_global[i_loc];
-	      for (j=i_dof_dof[i_dof]; j < i_dof_dof[i_dof+1]; j++)
-		{
-		  j_loc = i_global_to_local[j_dof_dof[j]];
-		  if (j_loc >=0)
-		    AE[i_loc + j_loc * local_dof_counter] = a_dof_dof[j];
-		}
-	    }
-
-
-	  /* get block for Schwarz smoother: ============================= */
-	  /* ierr = hypre_matinv(XE, AE, local_dof_counter); */
-	  /* hypre_printf("ierr_AE_inv: %d\n", ierr); */
-#ifdef HYPRE_USING_ESSL
- 	cnt = local_dof_counter;
-        for (j_loc=1; j_loc < local_dof_counter; j_loc++)
-	   for (i_loc=j_loc; i_loc < local_dof_counter; i_loc++)
-	      AE[cnt++] = AE[i_loc + j_loc * local_dof_counter];
-	ierr = dppf(AE, local_dof_counter, 1);
- 	if (ierr == 1) hypre_error_w_msg(HYPRE_ERROR_GENERIC,"Error! Matrix not SPD\n");
-#else  
-	hypre_F90_NAME_LAPACK(dpotrf,DPOTRF)(&uplo, &local_dof_counter, AE,
-		&local_dof_counter, &ierr); 
- 	if (ierr == 1) hypre_error_w_msg(HYPRE_ERROR_GENERIC,"Error! Matrix not SPD\n");
-#endif  
-
-	for (i_loc=0; i_loc < local_dof_counter; i_loc++)
-	      j_domain_dof[domain_dof_counter+i_loc] 
-		= i_local_to_global[i_loc]; 
-      }
+      {
+         AE = &domain_matrixinverse[domain_matrixinverse_counter];
+         cnt = 0;
+         for (i_loc=0; i_loc < local_dof_counter; i_loc++)
+            for (j_loc=0; j_loc < local_dof_counter; j_loc++)
+               AE[cnt++] = 0.e0;
+
+         for (i_loc=0; i_loc < local_dof_counter; i_loc++)
+         {
+            i_dof = i_local_to_global[i_loc];
+            for (j=i_dof_dof[i_dof]; j < i_dof_dof[i_dof+1]; j++)
+            {
+               j_loc = i_global_to_local[j_dof_dof[j]];
+               if (j_loc >=0)
+                  AE[i_loc + j_loc * local_dof_counter] = a_dof_dof[j];
+            }
+         }
 
+         /* get block for Schwarz smoother: ============================= */
+         /* ierr = hypre_matinv(XE, AE, local_dof_counter); */
+         /* hypre_printf("ierr_AE_inv: %d\n", ierr); */
+         hypre_dpotrf(&uplo, &local_dof_counter, AE,
+                      &local_dof_counter, &ierr);
+         if (ierr == 1) hypre_error_w_msg(HYPRE_ERROR_GENERIC,"Error! Matrix not SPD\n");
+
+         for (i_loc=0; i_loc < local_dof_counter; i_loc++)
+            j_domain_dof[domain_dof_counter+i_loc]
+               = i_local_to_global[i_loc];
+      }
 
       if (option == 0)
-	{
+      {
+         AE = &domain_matrixinverse[domain_matrixinverse_counter];
+         for (i_loc=0; i_loc < int_dof_counter; i_loc++)
+            for (j_loc=0; j_loc < int_dof_counter; j_loc++)
+               AE[i_loc + j_loc * int_dof_counter] = 0.e0;
 
-	  AE = &domain_matrixinverse[domain_matrixinverse_counter];
-	  for (i_loc=0; i_loc < int_dof_counter; i_loc++)
-	    for (j_loc=0; j_loc < int_dof_counter; j_loc++)
-	      AE[i_loc + j_loc * int_dof_counter] = 0.e0;
-
-      
-
-	  for (l_loc=0; l_loc < int_dof_counter; l_loc++)
-	    {
-	      i_loc = i_int_to_local[l_loc];
-	      i_dof = i_local_to_global[i_loc];
-	      for (j=i_dof_dof[i_dof]; j < i_dof_dof[i_dof+1]; j++)
-		{
-		  j_loc = i_global_to_local[j_dof_dof[j]];
-		  if (j_loc >=0)
-		    if (i_int[j_loc] >=0)
-		      AE[i_loc + i_int[j_loc] * int_dof_counter]
-			= a_dof_dof[j];
-		}
-	    }
-
-	  /* ierr = hypre_matinv(XE, AE, int_dof_counter); */
-#ifdef HYPRE_USING_ESSL
- 	cnt = local_dof_counter;
-        for (j_loc=1; j_loc < local_dof_counter; j_loc++)
-	   for (i_loc=j_loc; i_loc < local_dof_counter; i_loc++)
-	      AE[cnt++] = AE[i_loc + j_loc * local_dof_counter];
-	dppf(AE, local_dof_counter, 1);
-#else
-	hypre_F90_NAME_LAPACK(dpotrf,DPOTRF)(&uplo, &local_dof_counter, AE, 
-		&local_dof_counter, &ierr);
-/*	dpotrf_(&uplo, &local_dof_counter, AE, &local_dof_counter, &ierr);*/
-#endif 
-
-      if (ierr) hypre_error_w_msg(HYPRE_ERROR_GENERIC," error in dpotrf !!!\n");
-
-	  for (i_loc=0; i_loc < int_dof_counter; i_loc++)
-	    {
-	      j_domain_dof[domain_dof_counter + i_loc] =
-		i_local_to_global[i_int_to_local[i_loc]];
-
-	      for (j_loc=0; j_loc < int_dof_counter; j_loc++)
-		domain_matrixinverse[domain_matrixinverse_counter
-				    + i_loc + j_loc * int_dof_counter]
-		  = AE[i_loc + j_loc * int_dof_counter];
-	    }
-
-	  domain_dof_counter+=int_dof_counter;
-	  domain_matrixinverse_counter+=int_dof_counter*int_dof_counter;
-	}
-      else
-	{
-	  domain_dof_counter+=local_dof_counter;
-#ifdef HYPRE_USING_ESSL
-	  domain_matrixinverse_counter+=local_dof_counter*(local_dof_counter+1)/2;
-#else
-	  domain_matrixinverse_counter+=local_dof_counter*local_dof_counter;
-#endif
-	}
+         for (l_loc=0; l_loc < int_dof_counter; l_loc++)
+         {
+            i_loc = i_int_to_local[l_loc];
+            i_dof = i_local_to_global[i_loc];
+            for (j=i_dof_dof[i_dof]; j < i_dof_dof[i_dof+1]; j++)
+            {
+               j_loc = i_global_to_local[j_dof_dof[j]];
+               if (j_loc >=0)
+                  if (i_int[j_loc] >=0)
+                     AE[i_loc + i_int[j_loc] * int_dof_counter]
+                        = a_dof_dof[j];
+            }
+         }
 
+         /* ierr = hypre_matinv(XE, AE, int_dof_counter); */
+         hypre_dpotrf(&uplo, &local_dof_counter, AE,
+                      &local_dof_counter, &ierr);
 
-      for (l_loc=0; l_loc < local_dof_counter; l_loc++)
-	{
-	  i_int[l_loc] = -1;
-	  i_global_to_local[i_local_to_global[l_loc]] = -1;
-	}
+         if (ierr) hypre_error_w_msg(HYPRE_ERROR_GENERIC," error in dpotrf !!!\n");
 
+         for (i_loc=0; i_loc < int_dof_counter; i_loc++)
+         {
+            j_domain_dof[domain_dof_counter + i_loc] =
+               i_local_to_global[i_int_to_local[i_loc]];
 
-    }
+            for (j_loc=0; j_loc < int_dof_counter; j_loc++)
+               domain_matrixinverse[domain_matrixinverse_counter
+                                    + i_loc + j_loc * int_dof_counter]
+                  = AE[i_loc + j_loc * int_dof_counter];
+         }
 
-  i_domain_dof[num_nodes] = domain_dof_counter;
+         domain_dof_counter+=int_dof_counter;
+         domain_matrixinverse_counter+=int_dof_counter*int_dof_counter;
+      }
+      else
+      {
+         domain_dof_counter+=local_dof_counter;
+         domain_matrixinverse_counter+=local_dof_counter*local_dof_counter;
+      }
 
+      for (l_loc=0; l_loc < local_dof_counter; l_loc++)
+      {
+         i_int[l_loc] = -1;
+         i_global_to_local[i_local_to_global[l_loc]] = -1;
+      }
+   }
 
-  hypre_TFree(i_dof_node);
-  hypre_TFree(j_dof_node);  
+   i_domain_dof[num_nodes] = domain_dof_counter;
 
-  hypre_TFree(i_node_dof);
-  hypre_TFree(j_node_dof);
-  hypre_TFree(i_node_node);
-  hypre_TFree(j_node_node);
+   hypre_TFree(i_dof_node);
+   hypre_TFree(j_dof_node);
 
-  hypre_TFree(i_int);
-  hypre_TFree(i_int_to_local);
+   hypre_TFree(i_node_dof);
+   hypre_TFree(j_node_dof);
+   hypre_TFree(i_node_node);
+   hypre_TFree(j_node_node);
 
-  hypre_TFree(i_local_to_global);
+   hypre_TFree(i_int);
+   hypre_TFree(i_int_to_local);
 
-  domain_structure = hypre_CSRMatrixCreate(num_domains, max_local_dof_counter, 
-		i_domain_dof[num_domains]);
-  hypre_CSRMatrixI(domain_structure) = i_domain_dof;
-  hypre_CSRMatrixJ(domain_structure) = j_domain_dof;
-  hypre_CSRMatrixData(domain_structure) = domain_matrixinverse;
+   hypre_TFree(i_local_to_global);
 
-  *domain_structure_pointer = domain_structure;
+   domain_structure = hypre_CSRMatrixCreate(num_domains, max_local_dof_counter,
+                                            i_domain_dof[num_domains]);
+   hypre_CSRMatrixI(domain_structure) = i_domain_dof;
+   hypre_CSRMatrixJ(domain_structure) = j_domain_dof;
+   hypre_CSRMatrixData(domain_structure) = domain_matrixinverse;
 
+   *domain_structure_pointer = domain_structure;
 
    return hypre_error_flag;
-
 }
 
 HYPRE_Int hypre_ParMPSchwarzSolve(hypre_ParCSRMatrix *par_A,
-                            hypre_CSRMatrix *A_boundary,
-                            hypre_ParVector *rhs_vector,
-                            hypre_CSRMatrix *domain_structure,
-                            hypre_ParVector *par_x,
-                            HYPRE_Real relax_wt,
-                            HYPRE_Real *scale,
-                            hypre_ParVector *Vtemp, HYPRE_Int *pivots, 
-                            HYPRE_Int use_nonsymm)
-
+                                  hypre_CSRMatrix *A_boundary,
+                                  hypre_ParVector *rhs_vector,
+                                  hypre_CSRMatrix *domain_structure,
+                                  hypre_ParVector *par_x,
+                                  HYPRE_Real relax_wt,
+                                  HYPRE_Real *scale,
+                                  hypre_ParVector *Vtemp, HYPRE_Int *pivots,
+                                  HYPRE_Int use_nonsymm)
 {
-  hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRMatrixCommPkg(par_A);
-  HYPRE_Int num_sends = 0;
-  HYPRE_Int *send_map_starts;
-  HYPRE_Int *send_map_elmts;
-
-  hypre_ParCSRCommHandle *comm_handle;
-
-  HYPRE_Int ierr = 0;
-  /* HYPRE_Int num_dofs; */
-  hypre_CSRMatrix *A_diag;
-  HYPRE_Int *A_diag_i;
-  HYPRE_Int *A_diag_j;
-  HYPRE_Real *A_diag_data;
-  hypre_CSRMatrix *A_offd;
-  HYPRE_Int *A_offd_i;
-  HYPRE_Int *A_offd_j;
-  HYPRE_Real *A_offd_data;
-  HYPRE_Real *x;
-  HYPRE_Real *x_ext;
-  HYPRE_Real *x_ext_old;
-  HYPRE_Real *rhs;
-  HYPRE_Real *rhs_ext;
-  HYPRE_Real *vtemp_data;
-  HYPRE_Real *aux;
-  HYPRE_Real *buf_data;
+   hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRMatrixCommPkg(par_A);
+   HYPRE_Int num_sends = 0;
+   HYPRE_Int *send_map_starts;
+   HYPRE_Int *send_map_elmts;
+
+   hypre_ParCSRCommHandle *comm_handle;
+
+   HYPRE_Int ierr = 0;
+   /* HYPRE_Int num_dofs; */
+   hypre_CSRMatrix *A_diag;
+   HYPRE_Int *A_diag_i;
+   HYPRE_Int *A_diag_j;
+   HYPRE_Real *A_diag_data;
+   hypre_CSRMatrix *A_offd;
+   HYPRE_Int *A_offd_i;
+   HYPRE_Int *A_offd_j;
+   HYPRE_Real *A_offd_data;
+   HYPRE_Real *x;
+   HYPRE_Real *x_ext;
+   HYPRE_Real *x_ext_old;
+   HYPRE_Real *rhs;
+   HYPRE_Real *rhs_ext;
+   HYPRE_Real *vtemp_data;
+   HYPRE_Real *aux;
+   HYPRE_Real *buf_data;
 /*hypre_Vector *x_vector;*/
-  MPI_Comm comm = hypre_ParCSRMatrixComm(par_A);
-  HYPRE_Int num_domains = hypre_CSRMatrixNumRows(domain_structure);
-  HYPRE_Int max_domain_size = hypre_CSRMatrixNumCols(domain_structure);
-  HYPRE_Int *i_domain_dof = hypre_CSRMatrixI(domain_structure);
-  HYPRE_Int *j_domain_dof = hypre_CSRMatrixJ(domain_structure);
-  HYPRE_Real *domain_matrixinverse = hypre_CSRMatrixData(domain_structure);
-  HYPRE_Int *A_boundary_i;
-  HYPRE_Int *A_boundary_j;
-  HYPRE_Real *A_boundary_data;
-  HYPRE_Int num_variables;
-  HYPRE_Int num_cols_offd;
-
-
-  HYPRE_Int piv_counter = 0;
-  HYPRE_Int one = 1;
-  char uplo = 'L';
-
-  HYPRE_Int jj,i,j,k, j_loc, k_loc;
-  HYPRE_Int index;
-
-  HYPRE_Int matrix_size, matrix_size_counter = 0;
-
-  HYPRE_Int num_procs;
-
-  hypre_MPI_Comm_size(comm,&num_procs);
-
-  /* initiate:      ----------------------------------------------- */
-  /* num_dofs = hypre_CSRMatrixNumRows(A); */
-
-  A_diag = hypre_ParCSRMatrixDiag(par_A);
-  A_offd = hypre_ParCSRMatrixOffd(par_A);
-  num_variables = hypre_CSRMatrixNumRows(A_diag);
-  num_cols_offd = hypre_CSRMatrixNumCols(A_offd);
-  x = hypre_VectorData(hypre_ParVectorLocalVector(par_x));
-  vtemp_data = hypre_VectorData(hypre_ParVectorLocalVector(Vtemp));
-  rhs = hypre_VectorData(hypre_ParVectorLocalVector(rhs_vector));
+   MPI_Comm comm = hypre_ParCSRMatrixComm(par_A);
+   HYPRE_Int num_domains = hypre_CSRMatrixNumRows(domain_structure);
+   HYPRE_Int max_domain_size = hypre_CSRMatrixNumCols(domain_structure);
+   HYPRE_Int *i_domain_dof = hypre_CSRMatrixI(domain_structure);
+   HYPRE_Int *j_domain_dof = hypre_CSRMatrixJ(domain_structure);
+   HYPRE_Real *domain_matrixinverse = hypre_CSRMatrixData(domain_structure);
+   HYPRE_Int *A_boundary_i;
+   HYPRE_Int *A_boundary_j;
+   HYPRE_Real *A_boundary_data;
+   HYPRE_Int num_variables;
+   HYPRE_Int num_cols_offd;
+
+   HYPRE_Int piv_counter = 0;
+   HYPRE_Int one = 1;
+   char uplo = 'L';
+
+   HYPRE_Int jj,i,j,k, j_loc, k_loc;
+   HYPRE_Int index;
+
+   HYPRE_Int matrix_size, matrix_size_counter = 0;
+
+   HYPRE_Int num_procs;
+
+   hypre_MPI_Comm_size(comm,&num_procs);
 
-  if (use_nonsymm)
-     uplo = 'N';
+   /* initiate:      ----------------------------------------------- */
+   /* num_dofs = hypre_CSRMatrixNumRows(A); */
+
+   A_diag = hypre_ParCSRMatrixDiag(par_A);
+   A_offd = hypre_ParCSRMatrixOffd(par_A);
+   num_variables = hypre_CSRMatrixNumRows(A_diag);
+   num_cols_offd = hypre_CSRMatrixNumCols(A_offd);
+   x = hypre_VectorData(hypre_ParVectorLocalVector(par_x));
+   vtemp_data = hypre_VectorData(hypre_ParVectorLocalVector(Vtemp));
+   rhs = hypre_VectorData(hypre_ParVectorLocalVector(rhs_vector));
+
+   if (use_nonsymm)
+      uplo = 'N';
 
 /*x_vector = hypre_ParVectorLocalVector(par_x);*/
-  A_diag_i = hypre_CSRMatrixI(A_diag);
-  A_diag_j = hypre_CSRMatrixJ(A_diag);
-  A_diag_data = hypre_CSRMatrixData(A_diag);
-  A_offd_i = hypre_CSRMatrixI(A_offd);
-  if (num_cols_offd)
-  {
-     A_offd_j = hypre_CSRMatrixJ(A_offd);
-     A_offd_data = hypre_CSRMatrixData(A_offd);
-     A_boundary_i = hypre_CSRMatrixI(A_boundary);
-     A_boundary_j = hypre_CSRMatrixJ(A_boundary);
-     A_boundary_data = hypre_CSRMatrixData(A_boundary);
-  }
-  aux = hypre_CTAlloc(HYPRE_Real, max_domain_size);
+   A_diag_i = hypre_CSRMatrixI(A_diag);
+   A_diag_j = hypre_CSRMatrixJ(A_diag);
+   A_diag_data = hypre_CSRMatrixData(A_diag);
+   A_offd_i = hypre_CSRMatrixI(A_offd);
+   if (num_cols_offd)
+   {
+      A_offd_j = hypre_CSRMatrixJ(A_offd);
+      A_offd_data = hypre_CSRMatrixData(A_offd);
+      A_boundary_i = hypre_CSRMatrixI(A_boundary);
+      A_boundary_j = hypre_CSRMatrixJ(A_boundary);
+      A_boundary_data = hypre_CSRMatrixData(A_boundary);
+   }
+   aux = hypre_CTAlloc(HYPRE_Real, max_domain_size);
 
-  hypre_ParVectorCopy(rhs_vector,Vtemp);
-  hypre_ParCSRMatrixMatvec(-1.0,par_A,par_x,1.0,Vtemp);
+   hypre_ParVectorCopy(rhs_vector,Vtemp);
+   hypre_ParCSRMatrixMatvec(-1.0,par_A,par_x,1.0,Vtemp);
 
-  if (comm_pkg)
-  {
-     num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
-     send_map_starts = hypre_ParCSRCommPkgSendMapStarts(comm_pkg);
-     send_map_elmts = hypre_ParCSRCommPkgSendMapElmts(comm_pkg);
-
-     buf_data = hypre_CTAlloc(HYPRE_Real, send_map_starts[num_sends]);
-     x_ext = hypre_CTAlloc(HYPRE_Real, num_cols_offd);
-     x_ext_old = hypre_CTAlloc(HYPRE_Real, num_cols_offd);
-     rhs_ext = hypre_CTAlloc(HYPRE_Real, num_cols_offd);
+   if (comm_pkg)
+   {
+      num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
+      send_map_starts = hypre_ParCSRCommPkgSendMapStarts(comm_pkg);
+      send_map_elmts = hypre_ParCSRCommPkgSendMapElmts(comm_pkg);
 
-     index = 0;
-     for (i=0; i < num_sends; i++)
-     {
-        for (j = send_map_starts[i]; j < send_map_starts[i+1]; j++)
-           buf_data[index++] = vtemp_data[send_map_elmts[j]];
-     }
+      buf_data = hypre_CTAlloc(HYPRE_Real, send_map_starts[num_sends]);
+      x_ext = hypre_CTAlloc(HYPRE_Real, num_cols_offd);
+      x_ext_old = hypre_CTAlloc(HYPRE_Real, num_cols_offd);
+      rhs_ext = hypre_CTAlloc(HYPRE_Real, num_cols_offd);
 
-     comm_handle = hypre_ParCSRCommHandleCreate(1,comm_pkg,buf_data,
-                rhs_ext);
-     hypre_ParCSRCommHandleDestroy(comm_handle);
-     comm_handle = NULL;
+      index = 0;
+      for (i=0; i < num_sends; i++)
+      {
+         for (j = send_map_starts[i]; j < send_map_starts[i+1]; j++)
+            buf_data[index++] = vtemp_data[send_map_elmts[j]];
+      }
 
-     index = 0;
-     for (i=0; i < num_sends; i++)
-     {
-        for (j = send_map_starts[i]; j < send_map_starts[i+1]; j++)
-           buf_data[index++] = x[send_map_elmts[j]];
-     }
+      comm_handle = hypre_ParCSRCommHandleCreate(1,comm_pkg,buf_data,
+                                                 rhs_ext);
+      hypre_ParCSRCommHandleDestroy(comm_handle);
+      comm_handle = NULL;
 
-     comm_handle = hypre_ParCSRCommHandleCreate(1,comm_pkg,buf_data,x_ext);
-     hypre_ParCSRCommHandleDestroy(comm_handle);
-     comm_handle = NULL;
-  }
+      index = 0;
+      for (i=0; i < num_sends; i++)
+      {
+         for (j = send_map_starts[i]; j < send_map_starts[i+1]; j++)
+            buf_data[index++] = x[send_map_elmts[j]];
+      }
+
+      comm_handle = hypre_ParCSRCommHandleCreate(1,comm_pkg,buf_data,x_ext);
+      hypre_ParCSRCommHandleDestroy(comm_handle);
+      comm_handle = NULL;
+   }
 
-  /* correction of residual for exterior points to be updated locally */
+   /* correction of residual for exterior points to be updated locally */
    for (i=0; i < num_cols_offd; i++)
    {
       x_ext_old[i] = x_ext[i];
       for (j = A_boundary_i[i]; j < A_boundary_i[i+1]; j++)
       {
          k_loc = A_boundary_j[j];
-	 if (k_loc < num_variables)
-           rhs_ext[i] += A_boundary_data[j]*x[k_loc]; 
-	 else
-           rhs_ext[i] += A_boundary_data[j]*x_ext[k_loc-num_variables]; 
+         if (k_loc < num_variables)
+            rhs_ext[i] += A_boundary_data[j]*x[k_loc];
+         else
+            rhs_ext[i] += A_boundary_data[j]*x_ext[k_loc-num_variables];
       }
    }
    /* forward solve: ----------------------------------------------- */
@@ -581,227 +491,187 @@
 
       jj = 0;
       for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
-	{
-	  j_loc = j_domain_dof[j];
-	  if (j_loc < num_variables) 
-	  {
-	     aux[jj] = rhs[j_loc];
-	     for (k=A_diag_i[j_loc]; k<A_diag_i[j_loc+1]; k++)
-	        aux[jj] -= A_diag_data[k] * x[A_diag_j[k]];
-	     for (k=A_offd_i[j_loc]; k<A_offd_i[j_loc+1]; k++)
-	        aux[jj] -= A_offd_data[k] * x_ext[A_offd_j[k]];
-	  }
-	  else
-	  {
-	     j_loc -= num_variables;
-	     aux[jj] = rhs_ext[j_loc];
-	     for (k=A_boundary_i[j_loc]; k<A_boundary_i[j_loc+1]; k++)
-	        k_loc = A_boundary_j[k];
-		if (k_loc < num_variables)
-	           aux[jj] -= A_boundary_data[k] * x[k_loc];
-		else
-	           aux[jj] -= A_boundary_data[k] * x_ext[k_loc-num_variables];
-	  }
-	  jj++;
-	}
+      {
+         j_loc = j_domain_dof[j];
+         if (j_loc < num_variables)
+         {
+            aux[jj] = rhs[j_loc];
+            for (k=A_diag_i[j_loc]; k<A_diag_i[j_loc+1]; k++)
+               aux[jj] -= A_diag_data[k] * x[A_diag_j[k]];
+            for (k=A_offd_i[j_loc]; k<A_offd_i[j_loc+1]; k++)
+               aux[jj] -= A_offd_data[k] * x_ext[A_offd_j[k]];
+         }
+         else
+         {
+            j_loc -= num_variables;
+            aux[jj] = rhs_ext[j_loc];
+            for (k=A_boundary_i[j_loc]; k<A_boundary_i[j_loc+1]; k++)
+               k_loc = A_boundary_j[k];
+            if (k_loc < num_variables)
+               aux[jj] -= A_boundary_data[k] * x[k_loc];
+            else
+               aux[jj] -= A_boundary_data[k] * x_ext[k_loc-num_variables];
+         }
+         jj++;
+      }
       /* solve for correction: ------------------------------------- */
-/* OLD - HYPRE_USING_ESSL
-	dpps(&domain_matrixinverse[matrix_size_counter], matrix_size, aux, 1);
-*/
       if (use_nonsymm)
       {
-#ifdef HYPRE_USING_ESSL
-         dgetrs(&uplo, matrix_size, one, 
-                &domain_matrixinverse[matrix_size_counter], 
-                matrix_size, &pivots[piv_counter], aux,
-                matrix_size, &ierr); 
-#else
-          hypre_F90_NAME_LAPACK(dgetrs, DGETRS)(&uplo, &matrix_size, &one, 
-                                                &domain_matrixinverse[matrix_size_counter], 
-                                                &matrix_size, &pivots[piv_counter], aux,
-                                                &matrix_size, &ierr); 
-#endif
+         hypre_dgetrs(&uplo, &matrix_size, &one,
+                      &domain_matrixinverse[matrix_size_counter],
+                      &matrix_size, &pivots[piv_counter], aux,
+                      &matrix_size, &ierr);
       }
       else
       {
-#ifdef HYPRE_USING_ESSL
-            dpotrs(&uplo, matrix_size, one, 
-                   &domain_matrixinverse[matrix_size_counter], 
-                   matrix_size, aux,
-                   matrix_size, &ierr); 
-#else
-            hypre_F90_NAME_LAPACK(dpotrs, DPOTRS)(&uplo, &matrix_size, &one, 
-                                                  &domain_matrixinverse[matrix_size_counter], 
-                                                  &matrix_size, aux,
-                                                  &matrix_size, &ierr); 
-#endif
+         hypre_dpotrs(&uplo, &matrix_size, &one,
+                      &domain_matrixinverse[matrix_size_counter],
+                      &matrix_size, aux,
+                      &matrix_size, &ierr);
       }
       if (ierr)  hypre_error(HYPRE_ERROR_GENERIC);
       jj = 0;
       for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
-	{
-	  j_loc = j_domain_dof[j];
-	  if (j_loc < num_variables) 
-	     x[j_loc] +=  relax_wt*aux[jj++];
-	  else
-	     x_ext[j_loc-num_variables] +=  relax_wt*aux[jj++];
-	}
-/* OLD - HYPRE_USING_ESSL
-      matrix_size_counter += matrix_size * (matrix_size+1)/2;  
-*/
+      {
+         j_loc = j_domain_dof[j];
+         if (j_loc < num_variables)
+            x[j_loc] +=  relax_wt*aux[jj++];
+         else
+            x_ext[j_loc-num_variables] +=  relax_wt*aux[jj++];
+      }
       matrix_size_counter += matrix_size * matrix_size;
       piv_counter += matrix_size;
 
 
    }
 /*
-   for (i=0; i < num_cols_offd; i++)
-      x_ext[i] -= x_ext_old[i];  
+  for (i=0; i < num_cols_offd; i++)
+  x_ext[i] -= x_ext_old[i];
 
-   if (comm_pkg)
-   {
-      comm_handle=hypre_ParCSRCommHandleCreate (2,comm_pkg,x_ext,buf_data);
+  if (comm_pkg)
+  {
+  comm_handle=hypre_ParCSRCommHandleCreate (2,comm_pkg,x_ext,buf_data);
 
-      hypre_ParCSRCommHandleDestroy(comm_handle);
-      comm_handle = NULL;
+  hypre_ParCSRCommHandleDestroy(comm_handle);
+  comm_handle = NULL;
 
-      index = 0;
-      for (i=0; i < num_sends; i++)
-      {
-         for (j = send_map_starts[i]; j < send_map_starts[i+1]; j++)
-            x[send_map_elmts[j]] += buf_data[index++];
-      }
-   }
-   for (i=0; i < num_variables; i++)
-      x[i] *= scale[i];
+  index = 0;
+  for (i=0; i < num_sends; i++)
+  {
+  for (j = send_map_starts[i]; j < send_map_starts[i+1]; j++)
+  x[send_map_elmts[j]] += buf_data[index++];
+  }
+  }
+  for (i=0; i < num_variables; i++)
+  x[i] *= scale[i];
 
-   hypre_ParVectorCopy(rhs_vector,Vtemp);
-   hypre_ParCSRMatrixMatvec(-1.0,par_A,par_x,1.0,Vtemp);
+  hypre_ParVectorCopy(rhs_vector,Vtemp);
+  hypre_ParCSRMatrixMatvec(-1.0,par_A,par_x,1.0,Vtemp);
 
-   if (comm_pkg)
-   {
-      index = 0;
-      for (i=0; i < num_sends; i++)
-      {
-         for (j = send_map_starts[i]; j < send_map_starts[i+1]; j++)
-            buf_data[index++] = vtemp_data[send_map_elmts[j]];
-      }
+  if (comm_pkg)
+  {
+  index = 0;
+  for (i=0; i < num_sends; i++)
+  {
+  for (j = send_map_starts[i]; j < send_map_starts[i+1]; j++)
+  buf_data[index++] = vtemp_data[send_map_elmts[j]];
+  }
 
-      comm_handle = hypre_ParCSRCommHandleCreate(1,comm_pkg,buf_data,
-                rhs_ext);
-      hypre_ParCSRCommHandleDestroy(comm_handle);
-      comm_handle = NULL;
+  comm_handle = hypre_ParCSRCommHandleCreate(1,comm_pkg,buf_data,
+  rhs_ext);
+  hypre_ParCSRCommHandleDestroy(comm_handle);
+  comm_handle = NULL;
 
-      index = 0;
-      for (i=0; i < num_sends; i++)
-      {
-         for (j = send_map_starts[i]; j < send_map_starts[i+1]; j++)
-            buf_data[index++] = x[send_map_elmts[j]];
-      }
+  index = 0;
+  for (i=0; i < num_sends; i++)
+  {
+  for (j = send_map_starts[i]; j < send_map_starts[i+1]; j++)
+  buf_data[index++] = x[send_map_elmts[j]];
+  }
 
-      comm_handle = hypre_ParCSRCommHandleCreate(1,comm_pkg,buf_data,x_ext);
-      hypre_ParCSRCommHandleDestroy(comm_handle);
-      comm_handle = NULL;
-   }
+  comm_handle = hypre_ParCSRCommHandleCreate(1,comm_pkg,buf_data,x_ext);
+  hypre_ParCSRCommHandleDestroy(comm_handle);
+  comm_handle = NULL;
+  }
 */
    /* correction of residual for exterior points to be updated locally */
 /*   for (i=0; i < num_cols_offd; i++)
-   {
-      x_ext_old[i] = x_ext[i];
-      for (j = A_boundary_i[i]; j < A_boundary_i[i+1]; j++)
-      {
-         k_loc = A_boundary_j[j];
-	 if (k_loc < num_variables)
-           rhs_ext[i] += A_boundary_i[k]*x[k_loc]; 
-	 else
-           rhs_ext[i] += A_boundary_i[k]*x_ext[k_loc-num_variables]; 
-      }
-   }
+     {
+     x_ext_old[i] = x_ext[i];
+     for (j = A_boundary_i[i]; j < A_boundary_i[i+1]; j++)
+     {
+     k_loc = A_boundary_j[j];
+     if (k_loc < num_variables)
+     rhs_ext[i] += A_boundary_i[k]*x[k_loc];
+     else
+     rhs_ext[i] += A_boundary_i[k]*x_ext[k_loc-num_variables];
+     }
+     }
 */
    /* backward solve: ------------------------------------------------ */
    for (i=num_domains-1; i > -1; i--)
    {
       matrix_size = i_domain_dof[i+1] - i_domain_dof[i];
-/* OLD - HYPRE_USING_ESSL
-      matrix_size_counter -= matrix_size * (matrix_size+1)/2;  
-*/
       matrix_size_counter -= matrix_size * matrix_size;
       piv_counter -= matrix_size;
- 
+
       /* compute residual: ---------------------------------------- */
       jj = 0;
       for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
-	{
-	  j_loc = j_domain_dof[j];
-	  if (j_loc < num_variables) 
-	  {
-	     aux[jj] = rhs[j_loc];
-	     for (k=A_diag_i[j_loc]; k<A_diag_i[j_loc+1]; k++)
-	        aux[jj] -= A_diag_data[k] * x[A_diag_j[k]];
-	     for (k=A_offd_i[j_loc]; k<A_offd_i[j_loc+1]; k++)
-	        aux[jj] -= A_offd_data[k] * x_ext[A_offd_j[k]];
-	  }
-	  else
-	  {
-	     j_loc -= num_variables;
-	     aux[jj] = rhs_ext[j_loc];
-	     for (k=A_boundary_i[j_loc]; k<A_boundary_i[j_loc+1]; k++)
-	        k_loc = A_boundary_j[k];
-		if (k_loc < num_variables)
-	           aux[jj] -= A_boundary_data[k] * x[k_loc];
-		else
-	           aux[jj] -= A_boundary_data[k] * x_ext[k_loc-num_variables];
-	  }
-	  jj++;
-	}
+      {
+         j_loc = j_domain_dof[j];
+         if (j_loc < num_variables)
+         {
+            aux[jj] = rhs[j_loc];
+            for (k=A_diag_i[j_loc]; k<A_diag_i[j_loc+1]; k++)
+               aux[jj] -= A_diag_data[k] * x[A_diag_j[k]];
+            for (k=A_offd_i[j_loc]; k<A_offd_i[j_loc+1]; k++)
+               aux[jj] -= A_offd_data[k] * x_ext[A_offd_j[k]];
+         }
+         else
+         {
+            j_loc -= num_variables;
+            aux[jj] = rhs_ext[j_loc];
+            for (k=A_boundary_i[j_loc]; k<A_boundary_i[j_loc+1]; k++)
+               k_loc = A_boundary_j[k];
+            if (k_loc < num_variables)
+               aux[jj] -= A_boundary_data[k] * x[k_loc];
+            else
+               aux[jj] -= A_boundary_data[k] * x_ext[k_loc-num_variables];
+         }
+         jj++;
+      }
 
       /* solve for correction: ------------------------------------- */
-/* OLD - HYPRE_USING_ESSL
-	dpps(&domain_matrixinverse[matrix_size_counter], matrix_size, aux, 1);
-*/ 
       if (use_nonsymm)
       {
-#ifdef HYPRE_USING_ESSL
-         dgetrs(&uplo, matrix_size, one, 
-                &domain_matrixinverse[matrix_size_counter], 
-                matrix_size, &pivots[piv_counter], aux,
-                matrix_size, &ierr);
-#else
-          hypre_F90_NAME_LAPACK(dgetrs, DGETRS)(&uplo, &matrix_size, &one, 
-                                                &domain_matrixinverse[matrix_size_counter], 
-                                                &matrix_size, &pivots[piv_counter], aux,
-                                                &matrix_size, &ierr); 
-#endif 
+         hypre_dgetrs(&uplo, &matrix_size, &one,
+                      &domain_matrixinverse[matrix_size_counter],
+                      &matrix_size, &pivots[piv_counter], aux,
+                      &matrix_size, &ierr);
       }
       else
       {
-#ifdef HYPRE_USING_ESSL
-           dpotrs(&uplo, matrix_size, one, 
-                  &domain_matrixinverse[matrix_size_counter], 
-                  matrix_size, aux,
-                  matrix_size, &ierr); 
-#else
-           hypre_F90_NAME_LAPACK(dpotrs, DPOTRS)(&uplo, &matrix_size, &one, 
-                                                 &domain_matrixinverse[matrix_size_counter], 
-                                                 &matrix_size, aux,
-                                                 &matrix_size, &ierr); 
-#endif
+         hypre_dpotrs(&uplo, &matrix_size, &one,
+                      &domain_matrixinverse[matrix_size_counter],
+                      &matrix_size, aux,
+                      &matrix_size, &ierr);
       }
-      
-   if (ierr)  hypre_error(HYPRE_ERROR_GENERIC);
+
+      if (ierr)  hypre_error(HYPRE_ERROR_GENERIC);
       jj = 0;
       for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
-	{
-	  j_loc = j_domain_dof[j];
-	  if (j_loc < num_variables) 
-	     x[j_loc] +=  relax_wt*aux[jj++];
-	  else
-	     x_ext[j_loc-num_variables] +=  relax_wt*aux[jj++];
-	}
-    }			      
+      {
+         j_loc = j_domain_dof[j];
+         if (j_loc < num_variables)
+            x[j_loc] +=  relax_wt*aux[jj++];
+         else
+            x_ext[j_loc-num_variables] +=  relax_wt*aux[jj++];
+      }
+   }
 
    for (i=0; i < num_cols_offd; i++)
-      x_ext[i] -= x_ext_old[i];  
+      x_ext[i] -= x_ext_old[i];
 
    if (comm_pkg)
    {
@@ -816,7 +686,7 @@
          for (j = send_map_starts[i]; j < send_map_starts[i+1]; j++)
             x[send_map_elmts[j]] += buf_data[index++];
       }
-      
+
       hypre_TFree(buf_data);
       hypre_TFree(x_ext);
       hypre_TFree(x_ext_old);
@@ -825,910 +695,749 @@
    for (i=0; i < num_variables; i++)
       x[i] *= scale[i];
 
-  hypre_TFree(aux);
+   hypre_TFree(aux);
 
-  return hypre_error_flag;
+   return hypre_error_flag;
 
 }
 
 HYPRE_Int hypre_MPSchwarzSolve(hypre_ParCSRMatrix *par_A,
-                         hypre_Vector *rhs_vector,
-                         hypre_CSRMatrix *domain_structure,
-                         hypre_ParVector *par_x,
-                         HYPRE_Real relax_wt,
-                         hypre_Vector *aux_vector, HYPRE_Int *pivots, 
-                         HYPRE_Int use_nonsymm)
+                               hypre_Vector *rhs_vector,
+                               hypre_CSRMatrix *domain_structure,
+                               hypre_ParVector *par_x,
+                               HYPRE_Real relax_wt,
+                               hypre_Vector *aux_vector, HYPRE_Int *pivots,
+                               HYPRE_Int use_nonsymm)
+{
+   HYPRE_Int ierr = 0;
+   /* HYPRE_Int num_dofs; */
+   HYPRE_Int *i_dof_dof;
+   HYPRE_Int *j_dof_dof;
+   HYPRE_Real *a_dof_dof;
+   HYPRE_Real *x;
+   HYPRE_Real *rhs;
+   HYPRE_Real *aux;
+   hypre_CSRMatrix *A;
+   hypre_Vector *x_vector;
+   MPI_Comm comm = hypre_ParCSRMatrixComm(par_A);
+   HYPRE_Int num_domains = hypre_CSRMatrixNumRows(domain_structure);
+   HYPRE_Int *i_domain_dof = hypre_CSRMatrixI(domain_structure);
+   HYPRE_Int *j_domain_dof = hypre_CSRMatrixJ(domain_structure);
+   HYPRE_Real *domain_matrixinverse = hypre_CSRMatrixData(domain_structure);
 
-{
-  HYPRE_Int ierr = 0;
-  /* HYPRE_Int num_dofs; */
-  HYPRE_Int *i_dof_dof;
-  HYPRE_Int *j_dof_dof;
-  HYPRE_Real *a_dof_dof;
-  HYPRE_Real *x;
-  HYPRE_Real *rhs;
-  HYPRE_Real *aux;
-  hypre_CSRMatrix *A;
-  hypre_Vector *x_vector;
-  MPI_Comm comm = hypre_ParCSRMatrixComm(par_A);
-  HYPRE_Int num_domains = hypre_CSRMatrixNumRows(domain_structure);
-  HYPRE_Int *i_domain_dof = hypre_CSRMatrixI(domain_structure);
-  HYPRE_Int *j_domain_dof = hypre_CSRMatrixJ(domain_structure);
-  HYPRE_Real *domain_matrixinverse = hypre_CSRMatrixData(domain_structure);
-
-  HYPRE_Int piv_counter = 0;
-  HYPRE_Int one = 1;
-  char uplo = 'L';
-
-  HYPRE_Int jj,i,j,k; /*, j_loc, k_loc;*/
-
-
-  HYPRE_Int matrix_size, matrix_size_counter = 0;
-   
-  HYPRE_Int num_procs;
-  
-  hypre_MPI_Comm_size(comm,&num_procs);
-
-  /* initiate:      ----------------------------------------------- */
-  /* num_dofs = hypre_CSRMatrixNumRows(A); */
-  x_vector = hypre_ParVectorLocalVector(par_x);
-  A = hypre_ParCSRMatrixDiag(par_A);
-  i_dof_dof = hypre_CSRMatrixI(A);
-  j_dof_dof = hypre_CSRMatrixJ(A);
-  a_dof_dof = hypre_CSRMatrixData(A);
-  x = hypre_VectorData(x_vector);
-  aux = hypre_VectorData(aux_vector);
-  /* for (i=0; i < num_dofs; i++)
-    x[i] = 0.e0; */
-
-  if (use_nonsymm)
-     uplo = 'N';
-
-
-  if (num_procs > 1)
-     hypre_parCorrRes(par_A,par_x,rhs_vector,&rhs);
-  else 
-     rhs = hypre_VectorData(rhs_vector);
+   HYPRE_Int piv_counter = 0;
+   HYPRE_Int one = 1;
+   char uplo = 'L';
 
-  /* forward solve: ----------------------------------------------- */
+   HYPRE_Int jj,i,j,k; /*, j_loc, k_loc;*/
 
-  matrix_size_counter = 0;
-  for (i=0; i < num_domains; i++)
-    {
+
+   HYPRE_Int matrix_size, matrix_size_counter = 0;
+
+   HYPRE_Int num_procs;
+
+   hypre_MPI_Comm_size(comm,&num_procs);
+
+   /* initiate:      ----------------------------------------------- */
+   /* num_dofs = hypre_CSRMatrixNumRows(A); */
+   x_vector = hypre_ParVectorLocalVector(par_x);
+   A = hypre_ParCSRMatrixDiag(par_A);
+   i_dof_dof = hypre_CSRMatrixI(A);
+   j_dof_dof = hypre_CSRMatrixJ(A);
+   a_dof_dof = hypre_CSRMatrixData(A);
+   x = hypre_VectorData(x_vector);
+   aux = hypre_VectorData(aux_vector);
+   /* for (i=0; i < num_dofs; i++)
+      x[i] = 0.e0; */
+
+   if (use_nonsymm)
+      uplo = 'N';
+
+
+   if (num_procs > 1)
+      hypre_parCorrRes(par_A,par_x,rhs_vector,&rhs);
+   else
+      rhs = hypre_VectorData(rhs_vector);
+
+   /* forward solve: ----------------------------------------------- */
+
+   matrix_size_counter = 0;
+   for (i=0; i < num_domains; i++)
+   {
       matrix_size = i_domain_dof[i+1] - i_domain_dof[i];
 
       /* compute residual: ---------------------------------------- */
 
       jj = 0;
       for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
-	{
-	  aux[jj] = rhs[j_domain_dof[j]]; 
-	  for (k=i_dof_dof[j_domain_dof[j]];
-	       k<i_dof_dof[j_domain_dof[j]+1]; k++)
-	    aux[jj] -= a_dof_dof[k] * x[j_dof_dof[k]];
-	  jj++;
-	}
+      {
+         aux[jj] = rhs[j_domain_dof[j]];
+         for (k=i_dof_dof[j_domain_dof[j]];
+              k<i_dof_dof[j_domain_dof[j]+1]; k++)
+            aux[jj] -= a_dof_dof[k] * x[j_dof_dof[k]];
+         jj++;
+      }
       /* solve for correction: ------------------------------------- */
-/* OLD - HYPRE_USING_ESSL
-	dpps(&domain_matrixinverse[matrix_size_counter], matrix_size, aux, 1);
-*/
-     
       if (use_nonsymm)
       {
-         
-#ifdef HYPRE_USING_ESSL
-         dgetrs(&uplo, matrix_size, one, 
-                &domain_matrixinverse[matrix_size_counter], 
-                matrix_size, &pivots[piv_counter], aux,
-                matrix_size, &ierr); 
-#else
-         hypre_F90_NAME_LAPACK(dgetrs, DGETRS)(&uplo, &matrix_size, &one, 
-                                               &domain_matrixinverse[matrix_size_counter], 
-                                               &matrix_size, &pivots[piv_counter], aux,
-                                               &matrix_size, &ierr); 
-#endif
+         hypre_dgetrs(&uplo, &matrix_size, &one,
+                      &domain_matrixinverse[matrix_size_counter],
+                      &matrix_size, &pivots[piv_counter], aux,
+                      &matrix_size, &ierr);
       }
       else
       {
-         
-#ifdef HYPRE_USING_ESSL
-            dpotrs(&uplo, matrix_size, one, 
-                   &domain_matrixinverse[matrix_size_counter], 
-                   matrix_size, aux,
-                   matrix_size, &ierr); 
-#else
-            hypre_F90_NAME_LAPACK(dpotrs, DPOTRS)(&uplo, &matrix_size, &one, 
-                                               &domain_matrixinverse[matrix_size_counter], 
-                                               &matrix_size, aux,
-                                               &matrix_size, &ierr); 
-#endif
+         hypre_dpotrs(&uplo, &matrix_size, &one,
+                      &domain_matrixinverse[matrix_size_counter],
+                      &matrix_size, aux,
+                      &matrix_size, &ierr);
       }
-      
+
       if (ierr) hypre_error(HYPRE_ERROR_GENERIC);
       jj = 0;
       for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
-	{
-	   x[j_domain_dof[j]]+=  relax_wt * aux[jj++];
-	}
-/* OLD - HYPRE_USING_ESSL
-      matrix_size_counter += matrix_size * (matrix_size+1)/2;  
-*/
+      {
+         x[j_domain_dof[j]]+=  relax_wt * aux[jj++];
+      }
       matrix_size_counter += matrix_size * matrix_size;
       piv_counter += matrix_size;
-
-
-    }
+   }
 
 /*  if (num_procs > 1)
-  {
-     hypre_TFree(rhs);
-     hypre_parCorrRes(par_A,par_x,rhs_vector,&rhs);
-  }
-  else 
-     rhs = hypre_VectorData(rhs_vector);
-*/
-  /* backward solve: ------------------------------------------------ */
-    for (i=num_domains-1; i > -1; i--)
     {
-      matrix_size = i_domain_dof[i+1] - i_domain_dof[i];
-/* OLD - HYPRE_USING_ESSL
-      matrix_size_counter -= matrix_size * (matrix_size+1)/2;  
+    hypre_TFree(rhs);
+    hypre_parCorrRes(par_A,par_x,rhs_vector,&rhs);
+    }
+    else
+    rhs = hypre_VectorData(rhs_vector);
 */
+   /* backward solve: ------------------------------------------------ */
+   for (i=num_domains-1; i > -1; i--)
+   {
+      matrix_size = i_domain_dof[i+1] - i_domain_dof[i];
       matrix_size_counter -= matrix_size * matrix_size;
       piv_counter -= matrix_size;
 
- 
       /* compute residual: ---------------------------------------- */
+
       jj = 0;
       for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
-	{
-	  aux[jj] = rhs[j_domain_dof[j]]; 
-	  for (k=i_dof_dof[j_domain_dof[j]];
-	       k<i_dof_dof[j_domain_dof[j]+1]; k++)
-	    aux[jj] -= a_dof_dof[k] * x[j_dof_dof[k]];
-	  jj++;
-	}
+      {
+         aux[jj] = rhs[j_domain_dof[j]];
+         for (k=i_dof_dof[j_domain_dof[j]];
+              k<i_dof_dof[j_domain_dof[j]+1]; k++)
+            aux[jj] -= a_dof_dof[k] * x[j_dof_dof[k]];
+         jj++;
+      }
 
       /* solve for correction: ------------------------------------- */
-/* OLD - HYPRE_USING_ESSL
-	dpps(&domain_matrixinverse[matrix_size_counter], matrix_size, aux, 1);
-*/
 
       if (use_nonsymm)
       {
-         
-#ifdef HYPRE_USING_ESSL
-           dgetrs(&uplo, matrix_size, one, 
-                  &domain_matrixinverse[matrix_size_counter], 
-                  matrix_size, &pivots[piv_counter], aux,
-                  matrix_size, &ierr);
-#else
-        hypre_F90_NAME_LAPACK(dgetrs, DGETRS)(&uplo, &matrix_size, &one, 
-                                                &domain_matrixinverse[matrix_size_counter], 
-                                                &matrix_size, &pivots[piv_counter], aux,
-                                                &matrix_size, &ierr); 
-#endif 
+         hypre_dgetrs(&uplo, &matrix_size, &one,
+                      &domain_matrixinverse[matrix_size_counter],
+                      &matrix_size, &pivots[piv_counter], aux,
+                      &matrix_size, &ierr);
       }
       else
       {
-         
-#ifdef HYPRE_USING_ESSL
-           dpotrs(&uplo, matrix_size, one, 
-                  &domain_matrixinverse[matrix_size_counter], 
-                  matrix_size, aux,
-                  matrix_size, &ierr); 
-#else
-           hypre_F90_NAME_LAPACK(dpotrs, DPOTRS)(&uplo, &matrix_size, &one, 
-                                                 &domain_matrixinverse[matrix_size_counter], 
-                                                 &matrix_size, aux,
-                                                 &matrix_size, &ierr); 
-#endif
+         hypre_dpotrs(&uplo, &matrix_size, &one,
+                      &domain_matrixinverse[matrix_size_counter],
+                      &matrix_size, aux,
+                      &matrix_size, &ierr);
       }
-      
+
       if (ierr)  hypre_error(HYPRE_ERROR_GENERIC);
       jj = 0;
       for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
-	{
-	   x[j_domain_dof[j]]+=  relax_wt*aux[jj++];
-	}
-    }			      
+      {
+         x[j_domain_dof[j]]+=  relax_wt*aux[jj++];
+      }
+   }
 
-			      
-  if (num_procs > 1) hypre_TFree(rhs);
+   if (num_procs > 1) hypre_TFree(rhs);
 
-  return hypre_error_flag;
+   return hypre_error_flag;
 }
 
 HYPRE_Int hypre_MPSchwarzCFSolve(hypre_ParCSRMatrix *par_A,
-                           hypre_Vector *rhs_vector,
-                           hypre_CSRMatrix *domain_structure,
-                           hypre_ParVector *par_x,
-                           HYPRE_Real relax_wt,
-                           hypre_Vector *aux_vector,
-                           HYPRE_Int *CF_marker,
-                           HYPRE_Int rlx_pt, HYPRE_Int *pivots, 
-                           HYPRE_Int use_nonsymm)
+                                 hypre_Vector *rhs_vector,
+                                 hypre_CSRMatrix *domain_structure,
+                                 hypre_ParVector *par_x,
+                                 HYPRE_Real relax_wt,
+                                 hypre_Vector *aux_vector,
+                                 HYPRE_Int *CF_marker,
+                                 HYPRE_Int rlx_pt, HYPRE_Int *pivots,
+                                 HYPRE_Int use_nonsymm)
+{
+   HYPRE_Int ierr = 0;
+   /* HYPRE_Int num_dofs; */
+   HYPRE_Int *i_dof_dof;
+   HYPRE_Int *j_dof_dof;
+   HYPRE_Real *a_dof_dof;
+   HYPRE_Real *x;
+   HYPRE_Real *rhs;
+   HYPRE_Real *aux;
+   hypre_CSRMatrix *A;
+   hypre_Vector *x_vector;
+   MPI_Comm comm = hypre_ParCSRMatrixComm(par_A);
+   HYPRE_Int num_domains = hypre_CSRMatrixNumRows(domain_structure);
+   HYPRE_Int *i_domain_dof = hypre_CSRMatrixI(domain_structure);
+   HYPRE_Int *j_domain_dof = hypre_CSRMatrixJ(domain_structure);
+   HYPRE_Real *domain_matrixinverse = hypre_CSRMatrixData(domain_structure);
 
-{
-  HYPRE_Int ierr = 0;
-  /* HYPRE_Int num_dofs; */
-  HYPRE_Int *i_dof_dof;
-  HYPRE_Int *j_dof_dof;
-  HYPRE_Real *a_dof_dof;
-  HYPRE_Real *x;
-  HYPRE_Real *rhs;
-  HYPRE_Real *aux;
-  hypre_CSRMatrix *A;
-  hypre_Vector *x_vector;
-  MPI_Comm comm = hypre_ParCSRMatrixComm(par_A);
-  HYPRE_Int num_domains = hypre_CSRMatrixNumRows(domain_structure);
-  HYPRE_Int *i_domain_dof = hypre_CSRMatrixI(domain_structure);
-  HYPRE_Int *j_domain_dof = hypre_CSRMatrixJ(domain_structure);
-  HYPRE_Real *domain_matrixinverse = hypre_CSRMatrixData(domain_structure);
-
-
-  HYPRE_Int piv_counter = 0;
-  HYPRE_Int one = 1;
-  char uplo = 'L';
-  HYPRE_Int jj,i,j,k; /*, j_loc, k_loc;*/
-
-
-  HYPRE_Int matrix_size, matrix_size_counter = 0;
-
-  HYPRE_Int num_procs;
-
-  hypre_MPI_Comm_size(comm,&num_procs);
-
-  /* initiate:      ----------------------------------------------- */
-  /* num_dofs = hypre_CSRMatrixNumRows(A); */
-  x_vector = hypre_ParVectorLocalVector(par_x);
-  A = hypre_ParCSRMatrixDiag(par_A);
-  i_dof_dof = hypre_CSRMatrixI(A);
-  j_dof_dof = hypre_CSRMatrixJ(A);
-  a_dof_dof = hypre_CSRMatrixData(A);
-  x = hypre_VectorData(x_vector);
-  aux = hypre_VectorData(aux_vector);
-  /* for (i=0; i < num_dofs; i++)
-    x[i] = 0.e0; */
-
-  if (use_nonsymm)
-     uplo = 'N';
-  
-  if (num_procs > 1)
-     hypre_parCorrRes(par_A,par_x,rhs_vector,&rhs);
-  else 
-     rhs = hypre_VectorData(rhs_vector);
+   HYPRE_Int piv_counter = 0;
+   HYPRE_Int one = 1;
+   char uplo = 'L';
+   HYPRE_Int jj,i,j,k; /*, j_loc, k_loc;*/
+
+   HYPRE_Int matrix_size, matrix_size_counter = 0;
+
+   HYPRE_Int num_procs;
+
+   hypre_MPI_Comm_size(comm,&num_procs);
+
+   /* initiate:      ----------------------------------------------- */
+   /* num_dofs = hypre_CSRMatrixNumRows(A); */
+   x_vector = hypre_ParVectorLocalVector(par_x);
+   A = hypre_ParCSRMatrixDiag(par_A);
+   i_dof_dof = hypre_CSRMatrixI(A);
+   j_dof_dof = hypre_CSRMatrixJ(A);
+   a_dof_dof = hypre_CSRMatrixData(A);
+   x = hypre_VectorData(x_vector);
+   aux = hypre_VectorData(aux_vector);
+   /* for (i=0; i < num_dofs; i++)
+      x[i] = 0.e0; */
 
-  /* forward solve: ----------------------------------------------- */
+   if (use_nonsymm)
+      uplo = 'N';
 
-  matrix_size_counter = 0;
-  for (i=0; i < num_domains; i++)
-    {
-      if (CF_marker[i] == rlx_pt)
-      {
-      matrix_size = i_domain_dof[i+1] - i_domain_dof[i];
+   if (num_procs > 1)
+      hypre_parCorrRes(par_A,par_x,rhs_vector,&rhs);
+   else
+      rhs = hypre_VectorData(rhs_vector);
 
-      /* compute residual: ---------------------------------------- */
+   /* forward solve: ----------------------------------------------- */
 
-      jj = 0;
-      for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
-	{
-	  aux[jj] = rhs[j_domain_dof[j]]; 
-	  if (CF_marker[j_domain_dof[j]] == rlx_pt)
-	  {
-	  for (k=i_dof_dof[j_domain_dof[j]];
-	       k<i_dof_dof[j_domain_dof[j]+1]; k++)
-	    aux[jj] -= a_dof_dof[k] * x[j_dof_dof[k]];
-	  }
-	  jj++;
-	}
-      /* solve for correction: ------------------------------------- */
-/* OLD HYPRE_USING_ESSL
-	dpps(&domain_matrixinverse[matrix_size_counter], matrix_size, aux, 1);
-*/
-      if (use_nonsymm)
-      {
-         
-#ifdef HYPRE_USING_ESSL
-         dgetrs(&uplo, matrix_size, one, 
-                &domain_matrixinverse[matrix_size_counter], 
-                matrix_size, &pivots[piv_counter], aux,
-                matrix_size, &ierr); 
-#else
-         hypre_F90_NAME_LAPACK(dgetrs, DGETRS)(&uplo, &matrix_size, &one, 
-                                               &domain_matrixinverse[matrix_size_counter], 
-                                               &matrix_size, &pivots[piv_counter], aux,
-                                               &matrix_size, &ierr); 
-#endif
-      }
-      else
+   matrix_size_counter = 0;
+   for (i=0; i < num_domains; i++)
+   {
+      if (CF_marker[i] == rlx_pt)
       {
-         
-#ifdef HYPRE_USING_ESSL
-            dpotrs(&uplo, matrix_size, one, 
-                   &domain_matrixinverse[matrix_size_counter], 
-                   matrix_size, aux,
-                   matrix_size, &ierr); 
-#else
-            hypre_F90_NAME_LAPACK(dpotrs, DPOTRS)(&uplo, &matrix_size, &one, 
-                                               &domain_matrixinverse[matrix_size_counter], 
-                                               &matrix_size, aux,
-                                               &matrix_size, &ierr); 
-#endif
-      }
-      
+         matrix_size = i_domain_dof[i+1] - i_domain_dof[i];
 
-      if (ierr)  hypre_error(HYPRE_ERROR_GENERIC);
-      jj = 0;
-      for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
-	{
-	   x[j_domain_dof[j]]+=  relax_wt * aux[jj++];
-	}
-/* OLD - HYPRE_USING_ESSL
-      matrix_size_counter += matrix_size * (matrix_size+1)/2;  
-*/
-      matrix_size_counter += matrix_size * matrix_size;
-      piv_counter += matrix_size;
+         /* compute residual: ---------------------------------------- */
 
+         jj = 0;
+         for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
+         {
+            aux[jj] = rhs[j_domain_dof[j]];
+            if (CF_marker[j_domain_dof[j]] == rlx_pt)
+            {
+               for (k=i_dof_dof[j_domain_dof[j]];
+                    k<i_dof_dof[j_domain_dof[j]+1]; k++)
+                  aux[jj] -= a_dof_dof[k] * x[j_dof_dof[k]];
+            }
+            jj++;
+         }
+         /* solve for correction: ------------------------------------- */
+         if (use_nonsymm)
+         {
+            hypre_dgetrs(&uplo, &matrix_size, &one,
+                         &domain_matrixinverse[matrix_size_counter],
+                         &matrix_size, &pivots[piv_counter], aux,
+                         &matrix_size, &ierr);
+         }
+         else
+         {
+            hypre_dpotrs(&uplo, &matrix_size, &one,
+                         &domain_matrixinverse[matrix_size_counter],
+                         &matrix_size, aux,
+                         &matrix_size, &ierr);
+         }
+
+         if (ierr)  hypre_error(HYPRE_ERROR_GENERIC);
+         jj = 0;
+         for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
+         {
+            x[j_domain_dof[j]]+=  relax_wt * aux[jj++];
+         }
+         matrix_size_counter += matrix_size * matrix_size;
+         piv_counter += matrix_size;
       }
-    }
+   }
 
 /*  if (num_procs > 1)
-  {
-     hypre_TFree(rhs);
-     hypre_parCorrRes(par_A,par_x,rhs_vector,&rhs);
-  }
-  else 
-     rhs = hypre_VectorData(rhs_vector);
-*/
-  /* backward solve: ------------------------------------------------ */
-    for (i=num_domains-1; i > -1; i--)
     {
+    hypre_TFree(rhs);
+    hypre_parCorrRes(par_A,par_x,rhs_vector,&rhs);
+    }
+    else
+    rhs = hypre_VectorData(rhs_vector);
+*/
+   /* backward solve: ------------------------------------------------ */
+   for (i=num_domains-1; i > -1; i--)
+   {
       if (CF_marker[i] == rlx_pt)
       {
-      matrix_size = i_domain_dof[i+1] - i_domain_dof[i];
-/* OLD -  HYPRE_USING_ESSL
-      matrix_size_counter -= matrix_size * (matrix_size+1)/2;  
-*/
-      matrix_size_counter -= matrix_size * matrix_size;
-      piv_counter -= matrix_size;
+         matrix_size = i_domain_dof[i+1] - i_domain_dof[i];
+         matrix_size_counter -= matrix_size * matrix_size;
+         piv_counter -= matrix_size;
+
+         /* compute residual: ---------------------------------------- */
+         jj = 0;
+         for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
+         {
+            aux[jj] = rhs[j_domain_dof[j]];
+            if (CF_marker[j_domain_dof[j]] == rlx_pt)
+            {
+               for (k=i_dof_dof[j_domain_dof[j]];
+                    k<i_dof_dof[j_domain_dof[j]+1]; k++)
+                  aux[jj] -= a_dof_dof[k] * x[j_dof_dof[k]];
+            }
+            jj++;
+         }
 
- 
-      /* compute residual: ---------------------------------------- */
-      jj = 0;
-      for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
-	{
-	  aux[jj] = rhs[j_domain_dof[j]]; 
-	  if (CF_marker[j_domain_dof[j]] == rlx_pt)
-	  {
-	  for (k=i_dof_dof[j_domain_dof[j]];
-	       k<i_dof_dof[j_domain_dof[j]+1]; k++)
-	    aux[jj] -= a_dof_dof[k] * x[j_dof_dof[k]];
-	  }
-	  jj++;
-	}
+         /* solve for correction: ------------------------------------- */
+         if (use_nonsymm)
+         {
+            hypre_dgetrs(&uplo, &matrix_size, &one,
+                         &domain_matrixinverse[matrix_size_counter],
+                         &matrix_size, &pivots[piv_counter], aux,
+                         &matrix_size, &ierr);
+         }
+         else
+         {
+            hypre_dpotrs(&uplo, &matrix_size, &one,
+                         &domain_matrixinverse[matrix_size_counter],
+                         &matrix_size, aux,
+                         &matrix_size, &ierr);
+         }
 
-      /* solve for correction: ------------------------------------- */
-/* OLD -  HYPRE_USING_ESSL
-	dpps(&domain_matrixinverse[matrix_size_counter], matrix_size, aux, 1);
-*/
-      if (use_nonsymm)
-      {
-         
-#ifdef HYPRE_USING_ESSL
-           dgetrs(&uplo, matrix_size, one, 
-                  &domain_matrixinverse[matrix_size_counter], 
-                  matrix_size, &pivots[piv_counter], aux,
-                  matrix_size, &ierr);
-#else
-        hypre_F90_NAME_LAPACK(dgetrs, DGETRS)(&uplo, &matrix_size, &one, 
-                                                &domain_matrixinverse[matrix_size_counter], 
-                                                &matrix_size, &pivots[piv_counter], aux,
-                                                &matrix_size, &ierr); 
-#endif 
-      }
-      else
-      {
-         
-#ifdef HYPRE_USING_ESSL
-           dpotrs(&uplo, matrix_size, one, 
-                  &domain_matrixinverse[matrix_size_counter], 
-                  matrix_size, aux,
-                  matrix_size, &ierr); 
-#else
-           hypre_F90_NAME_LAPACK(dpotrs, DPOTRS)(&uplo, &matrix_size, &one, 
-                                                 &domain_matrixinverse[matrix_size_counter], 
-                                                 &matrix_size, aux,
-                                                 &matrix_size, &ierr); 
-#endif   
-      }
-      
-      if (ierr)  hypre_error(HYPRE_ERROR_GENERIC);
-      jj = 0;
-      for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
-	{
-	   x[j_domain_dof[j]]+=  relax_wt*aux[jj++];
-	}
+         if (ierr)  hypre_error(HYPRE_ERROR_GENERIC);
+         jj = 0;
+         for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
+         {
+            x[j_domain_dof[j]]+=  relax_wt*aux[jj++];
+         }
       }
-    }			      
+   }
 
-			      
-  if (num_procs > 1) hypre_TFree(rhs);
+   if (num_procs > 1) hypre_TFree(rhs);
 
-  return hypre_error_flag;
+   return hypre_error_flag;
 }
 
 HYPRE_Int hypre_MPSchwarzFWSolve(hypre_ParCSRMatrix *par_A,
-                           hypre_Vector *rhs_vector,
-                           hypre_CSRMatrix *domain_structure,
-                           hypre_ParVector *par_x,
-                           HYPRE_Real relax_wt,
-                           hypre_Vector *aux_vector, HYPRE_Int *pivots, 
-                           HYPRE_Int use_nonsymm)
+                                 hypre_Vector *rhs_vector,
+                                 hypre_CSRMatrix *domain_structure,
+                                 hypre_ParVector *par_x,
+                                 HYPRE_Real relax_wt,
+                                 hypre_Vector *aux_vector, HYPRE_Int *pivots,
+                                 HYPRE_Int use_nonsymm)
+{
+   HYPRE_Int ierr = 0;
+   /* HYPRE_Int num_dofs; */
+   HYPRE_Int *i_dof_dof;
+   HYPRE_Int *j_dof_dof;
+   HYPRE_Real *a_dof_dof;
+   HYPRE_Real *x;
+   HYPRE_Real *rhs;
+   HYPRE_Real *aux;
+   hypre_CSRMatrix *A;
+   hypre_Vector *x_vector;
+   MPI_Comm comm = hypre_ParCSRMatrixComm(par_A);
+   HYPRE_Int num_domains = hypre_CSRMatrixNumRows(domain_structure);
+   HYPRE_Int *i_domain_dof = hypre_CSRMatrixI(domain_structure);
+   HYPRE_Int *j_domain_dof = hypre_CSRMatrixJ(domain_structure);
+   HYPRE_Real *domain_matrixinverse = hypre_CSRMatrixData(domain_structure);
 
-{
-  HYPRE_Int ierr = 0;
-  /* HYPRE_Int num_dofs; */
-  HYPRE_Int *i_dof_dof;
-  HYPRE_Int *j_dof_dof;
-  HYPRE_Real *a_dof_dof;
-  HYPRE_Real *x;
-  HYPRE_Real *rhs;
-  HYPRE_Real *aux;
-  hypre_CSRMatrix *A;
-  hypre_Vector *x_vector;
-  MPI_Comm comm = hypre_ParCSRMatrixComm(par_A);
-  HYPRE_Int num_domains = hypre_CSRMatrixNumRows(domain_structure);
-  HYPRE_Int *i_domain_dof = hypre_CSRMatrixI(domain_structure);
-  HYPRE_Int *j_domain_dof = hypre_CSRMatrixJ(domain_structure);
-  HYPRE_Real *domain_matrixinverse = hypre_CSRMatrixData(domain_structure);
-
-
-  HYPRE_Int piv_counter = 0;
-  HYPRE_Int one = 1;
-  char uplo = 'L';
-  HYPRE_Int jj,i,j,k; /*, j_loc, k_loc;*/
-
-
-  HYPRE_Int matrix_size, matrix_size_counter = 0;
-
-  HYPRE_Int num_procs;
-
-  hypre_MPI_Comm_size(comm,&num_procs);
-
-  /* initiate:      ----------------------------------------------- */
-  /* num_dofs = hypre_CSRMatrixNumRows(A); */
-  x_vector = hypre_ParVectorLocalVector(par_x);
-  A = hypre_ParCSRMatrixDiag(par_A);
-  i_dof_dof = hypre_CSRMatrixI(A);
-  j_dof_dof = hypre_CSRMatrixJ(A);
-  a_dof_dof = hypre_CSRMatrixData(A);
-  x = hypre_VectorData(x_vector);
-  aux = hypre_VectorData(aux_vector);
-  /* for (i=0; i < num_dofs; i++)
-    x[i] = 0.e0; */
-
-  if (num_procs > 1)
-     hypre_parCorrRes(par_A,par_x,rhs_vector,&rhs);
-  else 
-     rhs = hypre_VectorData(rhs_vector);
 
-  /* forward solve: ----------------------------------------------- */
+   HYPRE_Int piv_counter = 0;
+   HYPRE_Int one = 1;
+   char uplo = 'L';
+   HYPRE_Int jj,i,j,k; /*, j_loc, k_loc;*/
 
-  matrix_size_counter = 0;
-  for (i=0; i < num_domains; i++)
-    {
+
+   HYPRE_Int matrix_size, matrix_size_counter = 0;
+
+   HYPRE_Int num_procs;
+
+   hypre_MPI_Comm_size(comm,&num_procs);
+
+   /* initiate:      ----------------------------------------------- */
+   /* num_dofs = hypre_CSRMatrixNumRows(A); */
+   x_vector = hypre_ParVectorLocalVector(par_x);
+   A = hypre_ParCSRMatrixDiag(par_A);
+   i_dof_dof = hypre_CSRMatrixI(A);
+   j_dof_dof = hypre_CSRMatrixJ(A);
+   a_dof_dof = hypre_CSRMatrixData(A);
+   x = hypre_VectorData(x_vector);
+   aux = hypre_VectorData(aux_vector);
+   /* for (i=0; i < num_dofs; i++)
+      x[i] = 0.e0; */
+
+   if (num_procs > 1)
+      hypre_parCorrRes(par_A,par_x,rhs_vector,&rhs);
+   else
+      rhs = hypre_VectorData(rhs_vector);
+
+   /* forward solve: ----------------------------------------------- */
+
+   matrix_size_counter = 0;
+   for (i=0; i < num_domains; i++)
+   {
       matrix_size = i_domain_dof[i+1] - i_domain_dof[i];
 
       /* compute residual: ---------------------------------------- */
 
       jj = 0;
       for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
-	{
-	  aux[jj] = rhs[j_domain_dof[j]]; 
-	  for (k=i_dof_dof[j_domain_dof[j]];
-	       k<i_dof_dof[j_domain_dof[j]+1]; k++)
-	     aux[jj] -= a_dof_dof[k] * x[j_dof_dof[k]];
-	  jj++;
-	}
+      {
+         aux[jj] = rhs[j_domain_dof[j]];
+         for (k=i_dof_dof[j_domain_dof[j]];
+              k<i_dof_dof[j_domain_dof[j]+1]; k++)
+            aux[jj] -= a_dof_dof[k] * x[j_dof_dof[k]];
+         jj++;
+      }
       /* solve for correction: ------------------------------------- */
-/* OLD - HYPRE_USING_ESSL
-	dpps(&domain_matrixinverse[matrix_size_counter], matrix_size, aux, 1);
-*/
       if (use_nonsymm)
       {
-         
-#ifdef HYPRE_USING_ESSL
-         dgetrs(&uplo, matrix_size, one, 
-                &domain_matrixinverse[matrix_size_counter], 
-                matrix_size, &pivots[piv_counter], aux,
-                matrix_size, &ierr); 
-#else
-         hypre_F90_NAME_LAPACK(dgetrs, DGETRS)(&uplo, &matrix_size, &one, 
-                                               &domain_matrixinverse[matrix_size_counter], 
-                                               &matrix_size, &pivots[piv_counter], aux,
-                                               &matrix_size, &ierr); 
-#endif
+         hypre_dgetrs(&uplo, &matrix_size, &one,
+                      &domain_matrixinverse[matrix_size_counter],
+                      &matrix_size, &pivots[piv_counter], aux,
+                      &matrix_size, &ierr);
       }
       else
       {
-         
-#ifdef HYPRE_USING_ESSL
-            dpotrs(&uplo, matrix_size, one, 
-                   &domain_matrixinverse[matrix_size_counter], 
-                   matrix_size, aux,
-                   matrix_size, &ierr); 
-#else
-            hypre_F90_NAME_LAPACK(dpotrs, DPOTRS)(&uplo, &matrix_size, &one, 
-                                               &domain_matrixinverse[matrix_size_counter], 
-                                               &matrix_size, aux,
-                                               &matrix_size, &ierr); 
-#endif
+         hypre_dpotrs(&uplo, &matrix_size, &one,
+                      &domain_matrixinverse[matrix_size_counter],
+                      &matrix_size, aux,
+                      &matrix_size, &ierr);
       }
-      
+
 
       if (ierr)   hypre_error(HYPRE_ERROR_GENERIC);
       jj = 0;
       for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
-	{
-	   x[j_domain_dof[j]]+=  relax_wt * aux[jj++];
-	}
-/* OLD - HYPRE_USING_ESSL
-      matrix_size_counter += matrix_size * (matrix_size+1)/2;  
-*/
+      {
+         x[j_domain_dof[j]]+=  relax_wt * aux[jj++];
+      }
       matrix_size_counter += matrix_size * matrix_size;
       piv_counter += matrix_size;
+   }
 
-    }
-
-  if (num_procs > 1) hypre_TFree(rhs);
-
-  return hypre_error_flag;
+   if (num_procs > 1) hypre_TFree(rhs);
 
+   return hypre_error_flag;
 }
 
 HYPRE_Int hypre_MPSchwarzCFFWSolve(hypre_ParCSRMatrix *par_A,
-                             hypre_Vector *rhs_vector,
-                             hypre_CSRMatrix *domain_structure,
-                             hypre_ParVector *par_x,
-                             HYPRE_Real relax_wt,
-                             hypre_Vector *aux_vector,
-                             HYPRE_Int *CF_marker,
-                             HYPRE_Int rlx_pt, HYPRE_Int *pivots, 
-                             HYPRE_Int use_nonsymm)
+                                   hypre_Vector *rhs_vector,
+                                   hypre_CSRMatrix *domain_structure,
+                                   hypre_ParVector *par_x,
+                                   HYPRE_Real relax_wt,
+                                   hypre_Vector *aux_vector,
+                                   HYPRE_Int *CF_marker,
+                                   HYPRE_Int rlx_pt, HYPRE_Int *pivots,
+                                   HYPRE_Int use_nonsymm)
+{
+   HYPRE_Int ierr = 0;
+   /* HYPRE_Int num_dofs; */
+   HYPRE_Int *i_dof_dof;
+   HYPRE_Int *j_dof_dof;
+   HYPRE_Real *a_dof_dof;
+   HYPRE_Real *x;
+   HYPRE_Real *rhs;
+   HYPRE_Real *aux;
+   hypre_CSRMatrix *A;
+   hypre_Vector *x_vector;
+   MPI_Comm comm = hypre_ParCSRMatrixComm(par_A);
+   HYPRE_Int num_domains = hypre_CSRMatrixNumRows(domain_structure);
+   HYPRE_Int *i_domain_dof = hypre_CSRMatrixI(domain_structure);
+   HYPRE_Int *j_domain_dof = hypre_CSRMatrixJ(domain_structure);
+   HYPRE_Real *domain_matrixinverse = hypre_CSRMatrixData(domain_structure);
 
-{
-  HYPRE_Int ierr = 0;
-  /* HYPRE_Int num_dofs; */
-  HYPRE_Int *i_dof_dof;
-  HYPRE_Int *j_dof_dof;
-  HYPRE_Real *a_dof_dof;
-  HYPRE_Real *x;
-  HYPRE_Real *rhs;
-  HYPRE_Real *aux;
-  hypre_CSRMatrix *A;
-  hypre_Vector *x_vector;
-  MPI_Comm comm = hypre_ParCSRMatrixComm(par_A);
-  HYPRE_Int num_domains = hypre_CSRMatrixNumRows(domain_structure);
-  HYPRE_Int *i_domain_dof = hypre_CSRMatrixI(domain_structure);
-  HYPRE_Int *j_domain_dof = hypre_CSRMatrixJ(domain_structure);
-  HYPRE_Real *domain_matrixinverse = hypre_CSRMatrixData(domain_structure);
-
-  HYPRE_Int piv_counter = 0;
-  HYPRE_Int one = 1;
-
-  char uplo = 'L';
-  HYPRE_Int jj,i,j,k; /*, j_loc, k_loc;*/
-
-
-  HYPRE_Int matrix_size, matrix_size_counter = 0;
-
-  HYPRE_Int num_procs;
-
-  hypre_MPI_Comm_size(comm,&num_procs);
-
-  /* initiate:      ----------------------------------------------- */
-  /* num_dofs = hypre_CSRMatrixNumRows(A); */
-  x_vector = hypre_ParVectorLocalVector(par_x);
-  A = hypre_ParCSRMatrixDiag(par_A);
-  i_dof_dof = hypre_CSRMatrixI(A);
-  j_dof_dof = hypre_CSRMatrixJ(A);
-  a_dof_dof = hypre_CSRMatrixData(A);
-  x = hypre_VectorData(x_vector);
-  aux = hypre_VectorData(aux_vector);
-  /* for (i=0; i < num_dofs; i++)
-    x[i] = 0.e0; */
-
-  if (use_nonsymm)
-     uplo = 'N';
-
-  if (num_procs > 1)
-     hypre_parCorrRes(par_A,par_x,rhs_vector,&rhs);
-  else 
-     rhs = hypre_VectorData(rhs_vector);
+   HYPRE_Int piv_counter = 0;
+   HYPRE_Int one = 1;
 
-  /* forward solve: ----------------------------------------------- */
+   char uplo = 'L';
+   HYPRE_Int jj,i,j,k; /*, j_loc, k_loc;*/
 
-  matrix_size_counter = 0;
-  for (i=0; i < num_domains; i++)
-    {
-      if (CF_marker[i] == rlx_pt)
-      {
-      matrix_size = i_domain_dof[i+1] - i_domain_dof[i];
 
-      /* compute residual: ---------------------------------------- */
+   HYPRE_Int matrix_size, matrix_size_counter = 0;
 
-      jj = 0;
-      for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
-	{
-	  aux[jj] = rhs[j_domain_dof[j]]; 
-          if (CF_marker[j_domain_dof[j]] == rlx_pt)
-          {
-	  for (k=i_dof_dof[j_domain_dof[j]];
-	       k<i_dof_dof[j_domain_dof[j]+1]; k++)
-	    aux[jj] -= a_dof_dof[k] * x[j_dof_dof[k]];
-          }
-	  jj++;
-	}
-      /* solve for correction: ------------------------------------- */
-/* OLD -  HYPRE_USING_ESSL
-	dpps(&domain_matrixinverse[matrix_size_counter], matrix_size, aux, 1);
-*/
-      if (use_nonsymm)
-      {
-         
-#ifdef HYPRE_USING_ESSL
-         dgetrs(&uplo, matrix_size, one, 
-                &domain_matrixinverse[matrix_size_counter], 
-                matrix_size, &pivots[piv_counter], aux,
-                matrix_size, &ierr); 
-#else
-         hypre_F90_NAME_LAPACK(dgetrs, DGETRS)(&uplo, &matrix_size, &one, 
-                                               &domain_matrixinverse[matrix_size_counter], 
-                                               &matrix_size, &pivots[piv_counter], aux,
-                                               &matrix_size, &ierr); 
-#endif
-      }
-      
-      else
+   HYPRE_Int num_procs;
+
+   hypre_MPI_Comm_size(comm,&num_procs);
+
+   /* initiate:      ----------------------------------------------- */
+   /* num_dofs = hypre_CSRMatrixNumRows(A); */
+   x_vector = hypre_ParVectorLocalVector(par_x);
+   A = hypre_ParCSRMatrixDiag(par_A);
+   i_dof_dof = hypre_CSRMatrixI(A);
+   j_dof_dof = hypre_CSRMatrixJ(A);
+   a_dof_dof = hypre_CSRMatrixData(A);
+   x = hypre_VectorData(x_vector);
+   aux = hypre_VectorData(aux_vector);
+   /* for (i=0; i < num_dofs; i++)
+      x[i] = 0.e0; */
+
+   if (use_nonsymm)
+      uplo = 'N';
+
+   if (num_procs > 1)
+      hypre_parCorrRes(par_A,par_x,rhs_vector,&rhs);
+   else
+      rhs = hypre_VectorData(rhs_vector);
+
+   /* forward solve: ----------------------------------------------- */
+
+   matrix_size_counter = 0;
+   for (i=0; i < num_domains; i++)
+   {
+      if (CF_marker[i] == rlx_pt)
       {
-         
-#ifdef HYPRE_USING_ESSL
-            dpotrs(&uplo, matrix_size, one, 
-                   &domain_matrixinverse[matrix_size_counter], 
-                   matrix_size, aux,
-                   matrix_size, &ierr); 
-#else
-            hypre_F90_NAME_LAPACK(dpotrs, DPOTRS)(&uplo, &matrix_size, &one, 
-                                               &domain_matrixinverse[matrix_size_counter], 
-                                               &matrix_size, aux,
-                                               &matrix_size, &ierr); 
-#endif
-      }
-      
-      if (ierr)  hypre_error(HYPRE_ERROR_GENERIC);
-      jj = 0;
-      for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
-	{
-	   x[j_domain_dof[j]]+=  relax_wt * aux[jj++];
-	}
-/* OLD -  HYPRE_USING_ESSL
-      matrix_size_counter += matrix_size * (matrix_size+1)/2;  
-*/
-      matrix_size_counter += matrix_size * matrix_size;
-      piv_counter += matrix_size;
+         matrix_size = i_domain_dof[i+1] - i_domain_dof[i];
 
+         /* compute residual: ---------------------------------------- */
 
-      }
-    }
+         jj = 0;
+         for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
+         {
+            aux[jj] = rhs[j_domain_dof[j]];
+            if (CF_marker[j_domain_dof[j]] == rlx_pt)
+            {
+               for (k=i_dof_dof[j_domain_dof[j]];
+                    k<i_dof_dof[j_domain_dof[j]+1]; k++)
+                  aux[jj] -= a_dof_dof[k] * x[j_dof_dof[k]];
+            }
+            jj++;
+         }
+         /* solve for correction: ------------------------------------- */
+         if (use_nonsymm)
+         {
+            hypre_dgetrs(&uplo, &matrix_size, &one,
+                         &domain_matrixinverse[matrix_size_counter],
+                         &matrix_size, &pivots[piv_counter], aux,
+                         &matrix_size, &ierr);
+         }
 
-  if (num_procs > 1) hypre_TFree(rhs);
+         else
+         {
+            hypre_dpotrs(&uplo, &matrix_size, &one,
+                         &domain_matrixinverse[matrix_size_counter],
+                         &matrix_size, aux,
+                         &matrix_size, &ierr);
+         }
+
+         if (ierr)  hypre_error(HYPRE_ERROR_GENERIC);
+         jj = 0;
+         for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
+         {
+            x[j_domain_dof[j]]+=  relax_wt * aux[jj++];
+         }
+         matrix_size_counter += matrix_size * matrix_size;
+         piv_counter += matrix_size;
+      }
+   }
 
-  return hypre_error_flag;
+   if (num_procs > 1) hypre_TFree(rhs);
 
+   return hypre_error_flag;
 }
 
-HYPRE_Int 
+HYPRE_Int
 transpose_matrix_create(  HYPRE_Int **i_face_element_pointer,
-			  HYPRE_Int **j_face_element_pointer,
-
-			  HYPRE_Int *i_element_face, HYPRE_Int *j_element_face,
-
-			  HYPRE_Int num_elements, HYPRE_Int num_faces)
-
+                          HYPRE_Int **j_face_element_pointer,
+                          HYPRE_Int *i_element_face, HYPRE_Int *j_element_face,
+                          HYPRE_Int num_elements, HYPRE_Int num_faces)
 {
-  /* FILE *f; */
-  HYPRE_Int i, j;
+   /* FILE *f; */
+   HYPRE_Int i, j;
 
-  HYPRE_Int *i_face_element, *j_face_element;
+   HYPRE_Int *i_face_element, *j_face_element;
 
-  /* ======================================================================
-     first create face_element graph: -------------------------------------
-     ====================================================================== */
+   /* ======================================================================
+      first create face_element graph: -------------------------------------
+      ====================================================================== */
 
-  i_face_element = (HYPRE_Int *) malloc((num_faces+1) * sizeof(HYPRE_Int));
-  j_face_element = (HYPRE_Int *) malloc(i_element_face[num_elements] * sizeof(HYPRE_Int));
+   i_face_element = (HYPRE_Int *) malloc((num_faces+1) * sizeof(HYPRE_Int));
+   j_face_element = (HYPRE_Int *) malloc(i_element_face[num_elements] * sizeof(HYPRE_Int));
 
 
-  for (i=0; i < num_faces; i++)
-    i_face_element[i] = 0;
+   for (i=0; i < num_faces; i++)
+      i_face_element[i] = 0;
 
-  for (i=0; i < num_elements; i++)
-    for (j=i_element_face[i]; j < i_element_face[i+1]; j++)
-      i_face_element[j_element_face[j]]++;
+   for (i=0; i < num_elements; i++)
+      for (j=i_element_face[i]; j < i_element_face[i+1]; j++)
+         i_face_element[j_element_face[j]]++;
 
-  i_face_element[num_faces] = i_element_face[num_elements];
+   i_face_element[num_faces] = i_element_face[num_elements];
 
-  for (i=num_faces-1; i > -1; i--)
-    i_face_element[i] = i_face_element[i+1] - i_face_element[i];
+   for (i=num_faces-1; i > -1; i--)
+      i_face_element[i] = i_face_element[i+1] - i_face_element[i];
 
-  for (i=0; i < num_elements; i++)
-    for (j=i_element_face[i]; j < i_element_face[i+1]; j++)
+   for (i=0; i < num_elements; i++)
+      for (j=i_element_face[i]; j < i_element_face[i+1]; j++)
       {
-	j_face_element[i_face_element[j_element_face[j]]] = i;
-	i_face_element[j_element_face[j]]++;
+         j_face_element[i_face_element[j_element_face[j]]] = i;
+         i_face_element[j_element_face[j]]++;
       }
 
-  for (i=num_faces-1; i > -1; i--)
-    i_face_element[i+1] = i_face_element[i];
+   for (i=num_faces-1; i > -1; i--)
+      i_face_element[i+1] = i_face_element[i];
 
-  i_face_element[0] = 0;
+   i_face_element[0] = 0;
 
-  /* hypre_printf("end building face--element graph: ++++++++++++++++++\n"); */
+   /* hypre_printf("end building face--element graph: ++++++++++++++++++\n"); */
 
-  /* END building face_element graph; ================================ */
+   /* END building face_element graph; ================================ */
 
-  *i_face_element_pointer = i_face_element;
-  *j_face_element_pointer = j_face_element;
+   *i_face_element_pointer = i_face_element;
+   *j_face_element_pointer = j_face_element;
 
-  return 0;
+   return 0;
 
 }
-HYPRE_Int 
-matrix_matrix_product(    HYPRE_Int **i_element_edge_pointer, 
-			  HYPRE_Int **j_element_edge_pointer,
-
-			  HYPRE_Int *i_element_face, HYPRE_Int *j_element_face,
-			  HYPRE_Int *i_face_edge, HYPRE_Int *j_face_edge,
-
-			  HYPRE_Int num_elements, HYPRE_Int num_faces, HYPRE_Int num_edges)
-
+HYPRE_Int
+matrix_matrix_product(    HYPRE_Int **i_element_edge_pointer,
+                          HYPRE_Int **j_element_edge_pointer,
+                          HYPRE_Int *i_element_face, HYPRE_Int *j_element_face,
+                          HYPRE_Int *i_face_edge, HYPRE_Int *j_face_edge,
+                          HYPRE_Int num_elements, HYPRE_Int num_faces, HYPRE_Int num_edges)
 {
-  /* FILE *f; */
-  HYPRE_Int i, j, k, l, m;
-
-  HYPRE_Int i_edge_on_local_list, i_edge_on_list;
-  HYPRE_Int local_element_edge_counter = 0, element_edge_counter = 0;
-  HYPRE_Int *j_local_element_edge;
+   /* FILE *f; */
+   HYPRE_Int i, j, k, l, m;
 
-  
-  HYPRE_Int *i_element_edge, *j_element_edge;
+   HYPRE_Int i_edge_on_local_list, i_edge_on_list;
+   HYPRE_Int local_element_edge_counter = 0, element_edge_counter = 0;
+   HYPRE_Int *j_local_element_edge;
 
+   HYPRE_Int *i_element_edge, *j_element_edge;
 
-  j_local_element_edge = (HYPRE_Int *) malloc((num_edges+1) * sizeof(HYPRE_Int));
+   j_local_element_edge = (HYPRE_Int *) malloc((num_edges+1) * sizeof(HYPRE_Int));
 
-  i_element_edge = (HYPRE_Int *) malloc((num_elements+1) * sizeof(HYPRE_Int));
+   i_element_edge = (HYPRE_Int *) malloc((num_elements+1) * sizeof(HYPRE_Int));
 
-  for (i=0; i < num_elements+1; i++)
-    i_element_edge[i] = 0;
+   for (i=0; i < num_elements+1; i++)
+      i_element_edge[i] = 0;
 
-  for (i=0; i < num_elements; i++)
-    {
+   for (i=0; i < num_elements; i++)
+   {
       local_element_edge_counter = 0;
       for (j=i_element_face[i]; j < i_element_face[i+1]; j++)
-	{
-	  k = j_element_face[j];
+      {
+         k = j_element_face[j];
 
-	  for (l=i_face_edge[k]; l < i_face_edge[k+1]; l++)
-	    {
-	      /* element i  and edge j_face_edge[l] are connected */
-	    
-	      /* hypre_printf("element %d  contains edge %d;\n",
-		     i, j_face_edge[l]);  */
-
-	      i_edge_on_local_list = -1;
-	      for (m=0; m < local_element_edge_counter; m++)
-		if (j_local_element_edge[m] == j_face_edge[l])
-		  {
-		    i_edge_on_local_list++;
-		    break;
-		  }
-
-	      if (i_edge_on_local_list == -1)
-		{
-		  i_element_edge[i]++;
-		  j_local_element_edge[local_element_edge_counter]=
-		    j_face_edge[l];
-		  local_element_edge_counter++;
-		}
-	    }
-	}
-    }
+         for (l=i_face_edge[k]; l < i_face_edge[k+1]; l++)
+         {
+            /* element i  and edge j_face_edge[l] are connected */
+
+            /* hypre_printf("element %d  contains edge %d;\n",
+               i, j_face_edge[l]);  */
+
+            i_edge_on_local_list = -1;
+            for (m=0; m < local_element_edge_counter; m++)
+               if (j_local_element_edge[m] == j_face_edge[l])
+               {
+                  i_edge_on_local_list++;
+                  break;
+               }
+
+            if (i_edge_on_local_list == -1)
+            {
+               i_element_edge[i]++;
+               j_local_element_edge[local_element_edge_counter]=
+                  j_face_edge[l];
+               local_element_edge_counter++;
+            }
+         }
+      }
+   }
 
-  free(j_local_element_edge);
+   free(j_local_element_edge);
 
-  for (i=0; i < num_elements; i++)
-    i_element_edge[i+1] += i_element_edge[i];
+   for (i=0; i < num_elements; i++)
+      i_element_edge[i+1] += i_element_edge[i];
 
-  for (i=num_elements; i>0; i--)
-    i_element_edge[i] = i_element_edge[i-1];
+   for (i=num_elements; i>0; i--)
+      i_element_edge[i] = i_element_edge[i-1];
 
-  i_element_edge[0] = 0;
+   i_element_edge[0] = 0;
 
-  j_element_edge = (HYPRE_Int *) malloc(i_element_edge[num_elements]
-				     * sizeof(HYPRE_Int));
+   j_element_edge = (HYPRE_Int *) malloc(i_element_edge[num_elements]
+                                         * sizeof(HYPRE_Int));
 
-  /* fill--in the actual j_element_edge array: --------------------- */
+   /* fill--in the actual j_element_edge array: --------------------- */
 
-  element_edge_counter = 0;
-  for (i=0; i < num_elements; i++)
-    {
+   element_edge_counter = 0;
+   for (i=0; i < num_elements; i++)
+   {
       i_element_edge[i] = element_edge_counter;
       for (j=i_element_face[i]; j < i_element_face[i+1]; j++)
-	{
-	  for (k=i_face_edge[j_element_face[j]];
-	       k<i_face_edge[j_element_face[j]+1];k++)
-	    {
-	      /* check if edge j_face_edge[k] is already on list ***/
-
-	      i_edge_on_list = -1;
-	      for (l=i_element_edge[i];
-		   l<element_edge_counter; l++)
-		if (j_element_edge[l] == j_face_edge[k])
-		  {
-		    i_edge_on_list++;
-		    break;
-		  }
-
-	      if (i_edge_on_list == -1) 
-		{
-		  if (element_edge_counter >= 
-		      i_element_edge[num_elements])
-		    {
-                      hypre_error_w_msg(HYPRE_ERROR_GENERIC,"error in j_element_edge size: \n");
-		      break;
-		    }
-
-		  j_element_edge[element_edge_counter] =
-		    j_face_edge[k];
-		  element_edge_counter++;
-		}
-	    }
-	}
-		
-    }
+      {
+         for (k=i_face_edge[j_element_face[j]];
+              k<i_face_edge[j_element_face[j]+1];k++)
+         {
+            /* check if edge j_face_edge[k] is already on list ***/
 
-  i_element_edge[num_elements] = element_edge_counter;
+            i_edge_on_list = -1;
+            for (l=i_element_edge[i];
+                 l<element_edge_counter; l++)
+               if (j_element_edge[l] == j_face_edge[k])
+               {
+                  i_edge_on_list++;
+                  break;
+               }
 
-  /*------------------------------------------------------------------
-  f = fopen("element_edge", "w");
-  for (i=0; i < num_elements; i++)
-    {   
-      hypre_printf("\nelement: %d has edges:\n", i);  
-      for (j=i_element_edge[i]; j < i_element_edge[i+1]; j++)
-	{
-	  hypre_printf("%d ", j_element_edge[j]); 
-	  hypre_fprintf(f, "%d %d\n", i, j_element_edge[j]);
-	}
-	  
-      hypre_printf("\n"); 
-    }
+            if (i_edge_on_list == -1)
+            {
+               if (element_edge_counter >=
+                   i_element_edge[num_elements])
+               {
+                  hypre_error_w_msg(HYPRE_ERROR_GENERIC,"error in j_element_edge size: \n");
+                  break;
+               }
+
+               j_element_edge[element_edge_counter] =
+                  j_face_edge[k];
+               element_edge_counter++;
+            }
+         }
+      }
+
+   }
+
+   i_element_edge[num_elements] = element_edge_counter;
 
-  fclose(f);
-  */
+   /*------------------------------------------------------------------
+     f = fopen("element_edge", "w");
+     for (i=0; i < num_elements; i++)
+     {
+     hypre_printf("\nelement: %d has edges:\n", i);
+     for (j=i_element_edge[i]; j < i_element_edge[i+1]; j++)
+     {
+     hypre_printf("%d ", j_element_edge[j]);
+     hypre_fprintf(f, "%d %d\n", i, j_element_edge[j]);
+     }
+
+     hypre_printf("\n");
+     }
 
-  /* hypre_printf("end element_edge computation: ++++++++++++++++++++++++ \n");*/
+     fclose(f);
+   */
 
-  *i_element_edge_pointer = i_element_edge;
-  *j_element_edge_pointer = j_element_edge;
+   /* hypre_printf("end element_edge computation: ++++++++++++++++++++++++ \n");*/
 
-  return hypre_error_flag;
+   *i_element_edge_pointer = i_element_edge;
+   *j_element_edge_pointer = j_element_edge;
 
+   return hypre_error_flag;
 }
 
-
 /*--------------------------------------------------------------------------
- * hypre_AMGCreateDomainDof:  
+ * hypre_AMGCreateDomainDof:
  *--------------------------------------------------------------------------*/
 
 /*****************************************************************************
@@ -1736,1159 +1445,1032 @@
  * Routine for constructing graph domain_dof with minimal overlap
  *             and computing the respective matrix inverses to be
  *             used in an overlapping Schwarz procedure (like smoother
- *             in AMG); 
+ *             in AMG);
  *
  *****************************************************************************/
 HYPRE_Int
 hypre_AMGCreateDomainDof(hypre_CSRMatrix     *A,
-			 HYPRE_Int domain_type, HYPRE_Int overlap,
-			 HYPRE_Int num_functions, HYPRE_Int *dof_func,
-			 hypre_CSRMatrix    **domain_structure_pointer, 
+                         HYPRE_Int domain_type, HYPRE_Int overlap,
+                         HYPRE_Int num_functions, HYPRE_Int *dof_func,
+                         hypre_CSRMatrix    **domain_structure_pointer,
                          HYPRE_Int **piv_pointer, HYPRE_Int use_nonsymm)
 {
 
-  HYPRE_Int *i_domain_dof, *j_domain_dof;
-  HYPRE_Real *domain_matrixinverse;
-  HYPRE_Int num_domains;
-  hypre_CSRMatrix *domain_structure = NULL;  
-
-  HYPRE_Int *i_dof_dof = hypre_CSRMatrixI(A);
-  HYPRE_Int *j_dof_dof = hypre_CSRMatrixJ(A);
-  HYPRE_Real *a_dof_dof = hypre_CSRMatrixData(A);
-  HYPRE_Int num_dofs = hypre_CSRMatrixNumRows(A);
+   HYPRE_Int *i_domain_dof, *j_domain_dof;
+   HYPRE_Real *domain_matrixinverse;
+   HYPRE_Int num_domains;
+   hypre_CSRMatrix *domain_structure = NULL;
 
-  /* HYPRE_Int *i_dof_to_accept_weight; */
-  HYPRE_Int *i_dof_to_prefer_weight,
-    *w_dof_dof, *i_dof_weight;
-  HYPRE_Int *i_dof_to_aggregate, *i_aggregate_dof, *j_aggregate_dof;
-  
-  HYPRE_Int *i_dof_index;
+   HYPRE_Int *i_dof_dof = hypre_CSRMatrixI(A);
+   HYPRE_Int *j_dof_dof = hypre_CSRMatrixJ(A);
+   HYPRE_Real *a_dof_dof = hypre_CSRMatrixData(A);
+   HYPRE_Int num_dofs = hypre_CSRMatrixNumRows(A);
 
-  HYPRE_Int ierr = 0;
-  HYPRE_Int i,j,k,  l_loc, i_loc, j_loc;
-  HYPRE_Int i_dof;
-  HYPRE_Int *i_local_to_global;
-  HYPRE_Int *i_global_to_local;
+   /* HYPRE_Int *i_dof_to_accept_weight; */
+   HYPRE_Int *i_dof_to_prefer_weight,
+      *w_dof_dof, *i_dof_weight;
+   HYPRE_Int *i_dof_to_aggregate, *i_aggregate_dof, *j_aggregate_dof;
 
+   HYPRE_Int *i_dof_index;
 
-  HYPRE_Int local_dof_counter, max_local_dof_counter=0; 
+   HYPRE_Int ierr = 0;
+   HYPRE_Int i,j,k,  l_loc, i_loc, j_loc;
+   HYPRE_Int i_dof;
+   HYPRE_Int *i_local_to_global;
+   HYPRE_Int *i_global_to_local;
 
-  HYPRE_Int domain_dof_counter = 0, domain_matrixinverse_counter = 0;
-  HYPRE_Int nf;
+   HYPRE_Int local_dof_counter, max_local_dof_counter=0;
 
-  HYPRE_Real *AE;
+   HYPRE_Int domain_dof_counter = 0, domain_matrixinverse_counter = 0;
+   HYPRE_Int nf;
 
-  HYPRE_Int piv_counter = 0;
-  HYPRE_Int *ipiv;
-  HYPRE_Int *piv = NULL;
-  char uplo = 'L';
-  HYPRE_Int cnt;
+   HYPRE_Real *AE;
 
+   HYPRE_Int piv_counter = 0;
+   HYPRE_Int *ipiv;
+   HYPRE_Int *piv = NULL;
+   char uplo = 'L';
+   HYPRE_Int cnt;
 
+   /* --------------------------------------------------------------------- */
 
-  /* --------------------------------------------------------------------- */
+   /*=======================================================================*/
+   /*    create artificial domains by agglomeration;                        */
+   /*=======================================================================*/
 
-  /*=======================================================================*/
-  /*    create artificial domains by agglomeration;                        */
-  /*=======================================================================*/
+   /*hypre_printf("----------- create artificials domain by agglomeration;  ======\n");
+    */
 
-  /*hypre_printf("----------- create artificials domain by agglomeration;  ======\n");
-*/
+   if (num_dofs == 0)
+   {
+      *domain_structure_pointer = domain_structure;
 
-  if (num_dofs == 0)
-  {
-     *domain_structure_pointer = domain_structure;
+      *piv_pointer = piv;
 
-     *piv_pointer = piv;
+      return hypre_error_flag;
+   }
 
-     return hypre_error_flag;
-  }
+   i_aggregate_dof = hypre_CTAlloc(HYPRE_Int,num_dofs+1);
+   j_aggregate_dof= hypre_CTAlloc(HYPRE_Int,num_dofs);
 
-  i_aggregate_dof = hypre_CTAlloc(HYPRE_Int,num_dofs+1);
-  j_aggregate_dof= hypre_CTAlloc(HYPRE_Int,num_dofs);
+   if (domain_type == 2)
+   {
+      i_dof_to_prefer_weight = hypre_CTAlloc(HYPRE_Int,num_dofs);
+      w_dof_dof = hypre_CTAlloc(HYPRE_Int,i_dof_dof[num_dofs]);
+      i_dof_weight = hypre_CTAlloc(HYPRE_Int,num_dofs);
 
-  if (domain_type == 2)
-  {
-     i_dof_to_prefer_weight = hypre_CTAlloc(HYPRE_Int,num_dofs);
-     w_dof_dof = hypre_CTAlloc(HYPRE_Int,i_dof_dof[num_dofs]);
-     i_dof_weight = hypre_CTAlloc(HYPRE_Int,num_dofs);
+      for (i=0; i<num_dofs; i++)
+         for (j=i_dof_dof[i]; j< i_dof_dof[i+1]; j++)
+         {
+            if (j_dof_dof[j] == i)
+               w_dof_dof[j]=0;
+            else
+               w_dof_dof[j]=1;
+         }
 
-     for (i=0; i<num_dofs; i++)
-        for (j=i_dof_dof[i]; j< i_dof_dof[i+1]; j++)
-        {
-	   if (j_dof_dof[j] == i) 
-	      w_dof_dof[j]=0;
-	   else
-	      w_dof_dof[j]=1;
-        }
+      /*hypre_printf("end computing weights for agglomeration procedure: --------\n");
+       */
+      hypre_AMGeAgglomerate(i_aggregate_dof, j_aggregate_dof,
+                            i_dof_dof, j_dof_dof, w_dof_dof,
+                            i_dof_dof, j_dof_dof,
+                            i_dof_dof, j_dof_dof,
+                            i_dof_to_prefer_weight,
+                            i_dof_weight,
+                            num_dofs, num_dofs,
+                            &num_domains);
+
+      hypre_TFree(i_dof_to_prefer_weight);
+      hypre_TFree(i_dof_weight);
+      hypre_TFree(w_dof_dof);
+   }
+   else
+   {
+      nf = 1;
+      if (domain_type == 1) nf = num_functions;
 
+      num_domains = num_dofs/nf;
+      for (i=0; i < num_domains+1; i++)
+      {
+         i_aggregate_dof[i] = nf*i;
+      }
+      for (i=0; i < num_dofs; i++)
+      {
+         j_aggregate_dof[i] = i;
+      }
+   }
+   /*hypre_printf("num_dofs: %d, num_domains: %d\n", num_dofs, num_domains);*/
 
-     /*hypre_printf("end computing weights for agglomeration procedure: --------\n");
-*/
-     hypre_AMGeAgglomerate(i_aggregate_dof, j_aggregate_dof,
 
-			       i_dof_dof, j_dof_dof, w_dof_dof,
-		     
-			       i_dof_dof, j_dof_dof,
-			       i_dof_dof, j_dof_dof,
-
-			       i_dof_to_prefer_weight,
-			       i_dof_weight,
-
-			       num_dofs, num_dofs,
-			       &num_domains);
-
-     hypre_TFree(i_dof_to_prefer_weight);
-     hypre_TFree(i_dof_weight);
-     hypre_TFree(w_dof_dof);
+   /*
+     hypre_printf("========================================================\n");
+     hypre_printf("== artificial non--overlapping domains (aggregates): ===\n");
+     hypre_printf("========================================================\n");
 
-  }
-  else 
-  {
-     nf = 1;
-     if (domain_type == 1) nf = num_functions;
 
-     num_domains = num_dofs/nf;
-     for (i=0; i < num_domains+1; i++)
-     {
-        i_aggregate_dof[i] = nf*i;
-     }
-     for (i=0; i < num_dofs; i++)
+     for (i=0; i < num_domains; i++)
      {
-        j_aggregate_dof[i] = i;
-     }
-  }
-  /*hypre_printf("num_dofs: %d, num_domains: %d\n", num_dofs, num_domains);*/
-
-
-  /*
-  hypre_printf("========================================================\n");
-  hypre_printf("== artificial non--overlapping domains (aggregates): ===\n");
-  hypre_printf("========================================================\n");
-
-
-  for (i=0; i < num_domains; i++)
-    {
-      hypre_printf("\n aggregate %d:\n", i);
-      for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
-	hypre_printf("%d, ", j_aggregate_dof[j]);
-
-      hypre_printf("\n");
-    }
-    */
-
+     hypre_printf("\n aggregate %d:\n", i);
+     for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
+     hypre_printf("%d, ", j_aggregate_dof[j]);
 
+     hypre_printf("\n");
+     }
+   */
 
-  /* make domains from aggregates: *********************************/
+   /* make domains from aggregates: *********************************/
 
+   if (overlap == 1)
+   {
+      i_domain_dof = hypre_CTAlloc(HYPRE_Int, num_domains+1);
 
-  if (overlap == 1)
-  {
-     i_domain_dof = hypre_CTAlloc(HYPRE_Int, num_domains+1);
+      i_dof_index = hypre_CTAlloc(HYPRE_Int, num_dofs);
 
-     i_dof_index = hypre_CTAlloc(HYPRE_Int, num_dofs);
+      for (i=0; i < num_dofs; i++)
+         i_dof_index[i] = -1;
 
-     for (i=0; i < num_dofs; i++)
-        i_dof_index[i] = -1;
+      i_dof_to_aggregate = hypre_CTAlloc(HYPRE_Int, num_dofs);
+      for (i=0; i < num_domains; i++)
+         for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
+            i_dof_to_aggregate[j_aggregate_dof[j]] = i;
 
-     i_dof_to_aggregate = hypre_CTAlloc(HYPRE_Int, num_dofs);
-     for (i=0; i < num_domains; i++)
-        for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
-           i_dof_to_aggregate[j_aggregate_dof[j]] = i;
+      domain_dof_counter=0;
+      for (i=0; i < num_domains; i++)
+      {
+         i_domain_dof[i] =  domain_dof_counter;
+         for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
+            i_dof_index[j_aggregate_dof[j]]=-1;
 
-     domain_dof_counter=0;
-     for (i=0; i < num_domains; i++)
-     {
-        i_domain_dof[i] =  domain_dof_counter;
-        for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
-	   i_dof_index[j_aggregate_dof[j]]=-1;
-
-        for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
-           for (k=i_dof_dof[j_aggregate_dof[j]];
-	       k<i_dof_dof[j_aggregate_dof[j]+1]; k++)
-	      if (i_dof_to_aggregate[j_dof_dof[k]] >= i 
-	          && i_dof_index[j_dof_dof[k]]==-1) 
-	      {
-	         i_dof_index[j_dof_dof[k]]++;
-	         domain_dof_counter++;
-	      }
+         for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
+            for (k=i_dof_dof[j_aggregate_dof[j]];
+                 k<i_dof_dof[j_aggregate_dof[j]+1]; k++)
+               if (i_dof_to_aggregate[j_dof_dof[k]] >= i
+                   && i_dof_index[j_dof_dof[k]]==-1)
+               {
+                  i_dof_index[j_dof_dof[k]]++;
+                  domain_dof_counter++;
+               }
+      }
 
-     }
+      i_domain_dof[num_domains] =  domain_dof_counter;
+      j_domain_dof = hypre_CTAlloc(HYPRE_Int,domain_dof_counter);
 
-     i_domain_dof[num_domains] =  domain_dof_counter;
-     j_domain_dof = hypre_CTAlloc(HYPRE_Int,domain_dof_counter);
+      for (i=0; i < num_dofs; i++)
+         i_dof_index[i] = -1;
 
-     for (i=0; i < num_dofs; i++)
-        i_dof_index[i] = -1;
+      domain_dof_counter=0;
+      for (i=0; i < num_domains; i++)
+      {
+         for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
+            i_dof_index[j_aggregate_dof[j]]=-1;
 
-     domain_dof_counter=0;
-     for (i=0; i < num_domains; i++)
-     { 
-        for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
-	   i_dof_index[j_aggregate_dof[j]]=-1;
-
-        for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
-	   for (k=i_dof_dof[j_aggregate_dof[j]];
-	        k<i_dof_dof[j_aggregate_dof[j]+1]; k++)
-	      if (i_dof_to_aggregate[j_dof_dof[k]] >= i
-	          && i_dof_index[j_dof_dof[k]]==-1) 
-	      {
-	         i_dof_index[j_dof_dof[k]]++;
-	         j_domain_dof[domain_dof_counter] = j_dof_dof[k];
-	         domain_dof_counter++;
-	      }
+         for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
+            for (k=i_dof_dof[j_aggregate_dof[j]];
+                 k<i_dof_dof[j_aggregate_dof[j]+1]; k++)
+               if (i_dof_to_aggregate[j_dof_dof[k]] >= i
+                   && i_dof_index[j_dof_dof[k]]==-1)
+               {
+                  i_dof_index[j_dof_dof[k]]++;
+                  j_domain_dof[domain_dof_counter] = j_dof_dof[k];
+                  domain_dof_counter++;
+               }
+      }
 
-     }
+      hypre_TFree(i_aggregate_dof);
+      hypre_TFree(j_aggregate_dof);
+      hypre_TFree(i_dof_to_aggregate);
+      hypre_TFree(i_dof_index);
+   }
+   else if (overlap == 2)
+   {
+      i_domain_dof = hypre_CTAlloc(HYPRE_Int, num_domains+1);
 
-     hypre_TFree(i_aggregate_dof);
-     hypre_TFree(j_aggregate_dof);
-     hypre_TFree(i_dof_to_aggregate);
-     hypre_TFree(i_dof_index);
-  }
-  else if (overlap == 2)
-  {	
-     i_domain_dof = hypre_CTAlloc(HYPRE_Int, num_domains+1);
+      i_dof_index = hypre_CTAlloc(HYPRE_Int, num_dofs);
 
-     i_dof_index = hypre_CTAlloc(HYPRE_Int, num_dofs);
+      for (i=0; i < num_dofs; i++)
+         i_dof_index[i] = -1;
 
-     for (i=0; i < num_dofs; i++)
-        i_dof_index[i] = -1;
+      domain_dof_counter=0;
+      for (i=0; i < num_domains; i++)
+      {
+         i_domain_dof[i] =  domain_dof_counter;
+         for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
+            for (k=i_dof_dof[j_aggregate_dof[j]];
+                 k<i_dof_dof[j_aggregate_dof[j]+1]; k++)
+               if (i_dof_index[j_dof_dof[k]]==-1)
+               {
+                  i_dof_index[j_dof_dof[k]]++;
+                  domain_dof_counter++;
+               }
 
-     domain_dof_counter=0;
-     for (i=0; i < num_domains; i++)
-     {
-        i_domain_dof[i] =  domain_dof_counter;
-        for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
-           for (k=i_dof_dof[j_aggregate_dof[j]];
-	       k<i_dof_dof[j_aggregate_dof[j]+1]; k++)
-	      if (i_dof_index[j_dof_dof[k]]==-1) 
-	      {
-	         i_dof_index[j_dof_dof[k]]++;
-	         domain_dof_counter++;
-	      }
-
-        for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
-           for (k=i_dof_dof[j_aggregate_dof[j]];
-	        k<i_dof_dof[j_aggregate_dof[j]+1]; k++)
-	      i_dof_index[j_dof_dof[k]]=-1;
+         for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
+            for (k=i_dof_dof[j_aggregate_dof[j]];
+                 k<i_dof_dof[j_aggregate_dof[j]+1]; k++)
+               i_dof_index[j_dof_dof[k]]=-1;
 
-     }
+      }
 
-     for (i=0; i < num_dofs; i++)
-        i_dof_index[i] = -1;
+      for (i=0; i < num_dofs; i++)
+         i_dof_index[i] = -1;
 
-     i_domain_dof[num_domains] =  domain_dof_counter;
-     j_domain_dof = hypre_CTAlloc(HYPRE_Int,domain_dof_counter);
+      i_domain_dof[num_domains] =  domain_dof_counter;
+      j_domain_dof = hypre_CTAlloc(HYPRE_Int,domain_dof_counter);
 
-     domain_dof_counter=0;
-     for (i=0; i < num_domains; i++)
-     {
-        for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
-	   for (k=i_dof_dof[j_aggregate_dof[j]];
-	        k<i_dof_dof[j_aggregate_dof[j]+1]; k++)
-	      if ( i_dof_index[j_dof_dof[k]]==-1) 
-	      {
-	         i_dof_index[j_dof_dof[k]]++;
-	         j_domain_dof[domain_dof_counter] = j_dof_dof[k];
-	         domain_dof_counter++;
-	      }
-
-        for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
-	   for (k=i_dof_dof[j_aggregate_dof[j]];
-	        k<i_dof_dof[j_aggregate_dof[j]+1]; k++)
-	      i_dof_index[j_dof_dof[k]]=-1;
+      domain_dof_counter=0;
+      for (i=0; i < num_domains; i++)
+      {
+         for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
+            for (k=i_dof_dof[j_aggregate_dof[j]];
+                 k<i_dof_dof[j_aggregate_dof[j]+1]; k++)
+               if ( i_dof_index[j_dof_dof[k]]==-1)
+               {
+                  i_dof_index[j_dof_dof[k]]++;
+                  j_domain_dof[domain_dof_counter] = j_dof_dof[k];
+                  domain_dof_counter++;
+               }
 
-     }
+         for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
+            for (k=i_dof_dof[j_aggregate_dof[j]];
+                 k<i_dof_dof[j_aggregate_dof[j]+1]; k++)
+               i_dof_index[j_dof_dof[k]]=-1;
+      }
 
-     hypre_TFree(i_aggregate_dof);
-     hypre_TFree(j_aggregate_dof);
-     hypre_TFree(i_dof_index);
-  }
-  else
-  {
-     i_domain_dof = i_aggregate_dof;
-     j_domain_dof = j_aggregate_dof;
-  }
+      hypre_TFree(i_aggregate_dof);
+      hypre_TFree(j_aggregate_dof);
+      hypre_TFree(i_dof_index);
+   }
+   else
+   {
+      i_domain_dof = i_aggregate_dof;
+      j_domain_dof = j_aggregate_dof;
+   }
 
-  /*hypre_printf("END domain_dof computations: =================================\n");
-*/
-  domain_matrixinverse_counter = 0;
-  local_dof_counter = 0;
-  piv_counter = 0;
+   /*hypre_printf("END domain_dof computations: =================================\n");
+    */
+   domain_matrixinverse_counter = 0;
+   local_dof_counter = 0;
+   piv_counter = 0;
 
- 
-  for (i=0; i < num_domains; i++)
-    {
+   for (i=0; i < num_domains; i++)
+   {
       local_dof_counter = i_domain_dof[i+1]-i_domain_dof[i];
-      /*  OLD - HYPRE_USING_ESSL
-          domain_matrixinverse_counter+= local_dof_counter*(local_dof_counter+1)/2; */
       domain_matrixinverse_counter+= local_dof_counter * local_dof_counter;
       piv_counter += local_dof_counter;
 
-
       if (local_dof_counter > max_local_dof_counter)
-	max_local_dof_counter = local_dof_counter;
-    }
-
-  domain_matrixinverse = hypre_CTAlloc(HYPRE_Real, domain_matrixinverse_counter);
-  if (use_nonsymm)
-     piv = hypre_CTAlloc(HYPRE_Int, piv_counter);
-
-
-
-
-  /* OLD-HYPRE_USING_ESSL
-  AE = hypre_CTAlloc(HYPRE_Real, max_local_dof_counter*max_local_dof_counter);
-  */
+         max_local_dof_counter = local_dof_counter;
+   }
 
-  i_local_to_global = hypre_CTAlloc(HYPRE_Int, max_local_dof_counter);
+   domain_matrixinverse = hypre_CTAlloc(HYPRE_Real, domain_matrixinverse_counter);
+   if (use_nonsymm)
+      piv = hypre_CTAlloc(HYPRE_Int, piv_counter);
 
+   i_local_to_global = hypre_CTAlloc(HYPRE_Int, max_local_dof_counter);
 
-  i_global_to_local = hypre_CTAlloc(HYPRE_Int,num_dofs);
+   i_global_to_local = hypre_CTAlloc(HYPRE_Int,num_dofs);
 
-  for (i=0; i < num_dofs; i++)
-    i_global_to_local[i] = -1;
+   for (i=0; i < num_dofs; i++)
+      i_global_to_local[i] = -1;
 
-  piv_counter = 0;
-  domain_matrixinverse_counter = 0;
-  for (i=0; i < num_domains; i++)
-  {
+   piv_counter = 0;
+   domain_matrixinverse_counter = 0;
+   for (i=0; i < num_domains; i++)
+   {
       local_dof_counter = 0;
       for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
       {
-	  i_global_to_local[j_domain_dof[j]] = local_dof_counter;
-	  i_local_to_global[local_dof_counter] = j_domain_dof[j];
-	  local_dof_counter++;
+         i_global_to_local[j_domain_dof[j]] = local_dof_counter;
+         i_local_to_global[local_dof_counter] = j_domain_dof[j];
+         local_dof_counter++;
       }
 
-
       /* get local matrix in AE: ======================================== */
-      cnt = 0;  
+      cnt = 0;
 
-      /* OLD - HYPRE_USING_ESSL
-      for (i_loc=0; i_loc < local_dof_counter; i_loc++)
-	for (j_loc=0; j_loc < local_dof_counter; j_loc++)
-	  AE[cnt++] = 0.e0;
-      for (i_loc=0; i_loc < local_dof_counter; i_loc++)
-      {
-  	 i_dof = i_local_to_global[i_loc];
-	 for (j=i_dof_dof[i_dof]; j < i_dof_dof[i_dof+1]; j++)
-	 {
-	    j_loc = i_global_to_local[j_dof_dof[j]];
-	    if (j_loc >=0)
-		AE[i_loc + j_loc * local_dof_counter] = a_dof_dof[j];
-	 }
-      }
-      cnt = local_dof_counter;
-      for (j_loc=1; j_loc < local_dof_counter; j_loc++)
-         for (i_loc=j_loc; i_loc < local_dof_counter; i_loc++)
-	    AE[cnt++] = AE[i_loc + j_loc * local_dof_counter];
-      dppf(AE, local_dof_counter, 1);
-      cnt = domain_matrixinverse_counter;
-      size = local_dof_counter*(local_dof_counter+1)/2;
-      for (i_loc = 0; i_loc < size ; i_loc++)
-         domain_matrixinverse[cnt++] = AE[i_loc];
-      domain_matrixinverse_counter += size;
-      */
       AE = &domain_matrixinverse[domain_matrixinverse_counter];
       ipiv = &piv[piv_counter];
       for (i_loc=0; i_loc < local_dof_counter; i_loc++)
-	for (j_loc=0; j_loc < local_dof_counter; j_loc++)
-	  AE[cnt++] = 0.e0;
+         for (j_loc=0; j_loc < local_dof_counter; j_loc++)
+            AE[cnt++] = 0.e0;
 
       for (i_loc=0; i_loc < local_dof_counter; i_loc++)
       {
-	 i_dof = i_local_to_global[i_loc];
-	 for (j=i_dof_dof[i_dof]; j < i_dof_dof[i_dof+1]; j++)
-	 {
-	    j_loc = i_global_to_local[j_dof_dof[j]];
-	    if (j_loc >=0)
-	       AE[i_loc + j_loc * local_dof_counter] = a_dof_dof[j];
-	 }
+         i_dof = i_local_to_global[i_loc];
+         for (j=i_dof_dof[i_dof]; j < i_dof_dof[i_dof+1]; j++)
+         {
+            j_loc = i_global_to_local[j_dof_dof[j]];
+            if (j_loc >=0)
+               AE[i_loc + j_loc * local_dof_counter] = a_dof_dof[j];
+         }
       }
 
       if (use_nonsymm)
       {
-#ifdef HYPRE_USING_ESSL
-         dgetrf(local_dof_counter,local_dof_counter, AE,
-                local_dof_counter, ipiv, &ierr);
-#else
-         hypre_F90_NAME_LAPACK(dgetrf,DGETRF)(&local_dof_counter,
-                                              &local_dof_counter, AE,
-                                              &local_dof_counter, ipiv, &ierr);
-#endif
+         hypre_dgetrf(&local_dof_counter,
+                      &local_dof_counter, AE,
+                      &local_dof_counter, ipiv, &ierr);
          piv_counter +=local_dof_counter;
       }
-      
+
       else
       {
-#ifdef HYPRE_USING_ESSL         
-         dpotrf(&uplo,local_dof_counter, AE,
-                local_dof_counter, &ierr);
-#else
-         hypre_F90_NAME_LAPACK(dpotrf,DPOTRF)(&uplo,&local_dof_counter, AE,
-                                              &local_dof_counter, &ierr);
-#endif
-        
+         hypre_dpotrf(&uplo,&local_dof_counter, AE,
+                      &local_dof_counter, &ierr);
       }
-      
-      domain_matrixinverse_counter+=local_dof_counter*local_dof_counter;	  
+
+      domain_matrixinverse_counter+=local_dof_counter*local_dof_counter;
 
       for (l_loc=0; l_loc < local_dof_counter; l_loc++)
-	i_global_to_local[i_local_to_global[l_loc]] = -1;
-	  
-    }
+         i_global_to_local[i_local_to_global[l_loc]] = -1;
 
-  hypre_TFree(i_local_to_global);
-  hypre_TFree(i_global_to_local);
-/* OLD - HYPRE_USING_ESSL
-  hypre_TFree(AE);
-*/ 
-  
-  domain_structure = hypre_CSRMatrixCreate(num_domains, max_local_dof_counter,
-			i_domain_dof[num_domains]);
-
-  hypre_CSRMatrixI(domain_structure) = i_domain_dof;
-  hypre_CSRMatrixJ(domain_structure) = j_domain_dof;
-  hypre_CSRMatrixData(domain_structure) = domain_matrixinverse;
+   }
 
-  *domain_structure_pointer = domain_structure;
+   hypre_TFree(i_local_to_global);
+   hypre_TFree(i_global_to_local);
 
-  *piv_pointer = piv;
+   domain_structure = hypre_CSRMatrixCreate(num_domains, max_local_dof_counter,
+                                            i_domain_dof[num_domains]);
 
-  return hypre_error_flag;
+   hypre_CSRMatrixI(domain_structure) = i_domain_dof;
+   hypre_CSRMatrixJ(domain_structure) = j_domain_dof;
+   hypre_CSRMatrixData(domain_structure) = domain_matrixinverse;
 
-}
+   *domain_structure_pointer = domain_structure;
 
+   *piv_pointer = piv;
 
-/* unacceptable faces: i_face_to_prefer_weight[] = -1; ------------------*/
+   return hypre_error_flag;
+}
 
+/* unacceptable faces: i_face_to_prefer_weight[] = -1; ------------------*/
 
 HYPRE_Int hypre_AMGeAgglomerate(HYPRE_Int *i_AE_element, HYPRE_Int *j_AE_element,
-
-			  HYPRE_Int *i_face_face, HYPRE_Int *j_face_face, HYPRE_Int *w_face_face,
-		     
-			  HYPRE_Int *i_face_element, HYPRE_Int *j_face_element,
-			  HYPRE_Int *i_element_face, HYPRE_Int *j_element_face,
-
-			  HYPRE_Int *i_face_to_prefer_weight,
-			  HYPRE_Int *i_face_weight,
-
-			  HYPRE_Int num_faces, HYPRE_Int num_elements,
-			  HYPRE_Int *num_AEs_pointer)
+                                HYPRE_Int *i_face_face, HYPRE_Int *j_face_face, HYPRE_Int *w_face_face,
+                                HYPRE_Int *i_face_element, HYPRE_Int *j_face_element,
+                                HYPRE_Int *i_element_face, HYPRE_Int *j_element_face,
+                                HYPRE_Int *i_face_to_prefer_weight,
+                                HYPRE_Int *i_face_weight,
+                                HYPRE_Int num_faces, HYPRE_Int num_elements,
+                                HYPRE_Int *num_AEs_pointer)
 {
+   HYPRE_Int ierr = 0;
+   HYPRE_Int i, j, k, l;
 
-  HYPRE_Int ierr = 0;
-  HYPRE_Int i, j, k, l;
+   HYPRE_Int face_to_eliminate;
+   HYPRE_Int max_weight_old, max_weight;
 
-  HYPRE_Int face_to_eliminate;
-  HYPRE_Int max_weight_old, max_weight;
+   HYPRE_Int AE_counter=0, AE_element_counter=0;
 
-  HYPRE_Int AE_counter=0, AE_element_counter=0;
+   /* HYPRE_Int i_element_face_counter; */
 
-  /* HYPRE_Int i_element_face_counter; */
+   HYPRE_Int *i_element_to_AE;
 
-  HYPRE_Int *i_element_to_AE;
+   HYPRE_Int *previous, *next, *first;
+   HYPRE_Int head, tail, last;
 
-  HYPRE_Int *previous, *next, *first;
-  HYPRE_Int head, tail, last;
+   HYPRE_Int face_max_weight, face_local_max_weight, preferred_weight;
 
-  HYPRE_Int face_max_weight, face_local_max_weight, preferred_weight;
+   HYPRE_Int weight, weight_max;
 
-  HYPRE_Int weight, weight_max;
-
-  max_weight = 1;
-  for (i=0; i < num_faces; i++)
-    {
+   max_weight = 1;
+   for (i=0; i < num_faces; i++)
+   {
       weight = 1;
       for (j=i_face_face[i]; j < i_face_face[i+1]; j++)
-	weight+= w_face_face[j];
+         weight+= w_face_face[j];
       if (max_weight < weight) max_weight = weight;
-    }
-
-  first = hypre_CTAlloc(HYPRE_Int, max_weight+1);
-
-
-
-  next = hypre_CTAlloc(HYPRE_Int, num_faces);
+   }
 
+   first = hypre_CTAlloc(HYPRE_Int, max_weight+1);
 
-  previous = hypre_CTAlloc(HYPRE_Int, num_faces+1);
+   next = hypre_CTAlloc(HYPRE_Int, num_faces);
 
+   previous = hypre_CTAlloc(HYPRE_Int, num_faces+1);
 
-  tail = num_faces;
-  head = -1;
+   tail = num_faces;
+   head = -1;
 
-  for (i=0; i < num_faces; i++)
-    {
+   for (i=0; i < num_faces; i++)
+   {
       next[i] = i+1;
       previous[i] = i-1;
-    }
-  
-  last = num_faces-1;
-  previous[tail] = last;
-
-  for (weight=1; weight <= max_weight; weight++)
-    first[weight] = tail;
+   }
 
-  i_element_to_AE = hypre_CTAlloc(HYPRE_Int, num_elements);
+   last = num_faces-1;
+   previous[tail] = last;
 
-  /*=======================================================================
-                     AGGLOMERATION PROCEDURE:
-    ======================================================================= */
+   for (weight=1; weight <= max_weight; weight++)
+      first[weight] = tail;
 
-  for (k=0; k < num_elements; k++) 
-    i_element_to_AE[k] = -1;
+   i_element_to_AE = hypre_CTAlloc(HYPRE_Int, num_elements);
 
-  for (k=0; k < num_faces; k++) 
-    i_face_weight[k] = 1;
+   /*=======================================================================
+     AGGLOMERATION PROCEDURE:
+     ======================================================================= */
 
+   for (k=0; k < num_elements; k++)
+      i_element_to_AE[k] = -1;
 
-  first[0] = 0;
-  first[1] = 0;
+   for (k=0; k < num_faces; k++)
+      i_face_weight[k] = 1;
 
-  last = previous[tail];
-  weight_max = i_face_weight[last];
+   first[0] = 0;
+   first[1] = 0;
 
+   last = previous[tail];
+   weight_max = i_face_weight[last];
 
-  k = last;
-  face_max_weight = -1;
-  while (k!= head)
-    {
+   k = last;
+   face_max_weight = -1;
+   while (k!= head)
+   {
       if (i_face_to_prefer_weight[k] > -1)
-	face_max_weight = k;
-	  
+         face_max_weight = k;
+
       if (face_max_weight > -1) break;
-	  
-      k=previous[k];
-    }
 
+      k=previous[k];
+   }
 
-  /* this will be used if the faces have been sorted: *****************
-  k = last;
-  face_max_weight = -1;
-  while (k != head)
-    {
+   /* this will be used if the faces have been sorted: *****************
+      k = last;
+      face_max_weight = -1;
+      while (k != head)
+      {
       if (i_face_to_prefer_weight[k] > -1)
-	face_max_weight = k;
-
+      face_max_weight = k;
 
-      if (face_max_weight > -1) 
-	{
-	  max_weight = i_face_weight[face_max_weight];
-	  l = face_max_weight;
-
-	  while (previous[l] != head)
-	    {
-
-	      if (i_face_weight[previous[l]] < max_weight) 
-		break;
-	      else
-		if (i_face_to_prefer_weight[previous[l]] > 
-		    i_face_to_prefer_weight[face_max_weight])
-		  {
-		    l = previous[l];
-		    face_max_weight = l;
-		  }
-		else
-		  l = previous[l];
-	    }
 
-	  break; 
-	}
+      if (face_max_weight > -1)
+      {
+      max_weight = i_face_weight[face_max_weight];
+      l = face_max_weight;
 
+      while (previous[l] != head)
+      {
 
-      l =previous[k];
+      if (i_face_weight[previous[l]] < max_weight)
+      break;
+      else
+      if (i_face_to_prefer_weight[previous[l]] >
+      i_face_to_prefer_weight[face_max_weight])
+      {
+      l = previous[l];
+      face_max_weight = l;
+      }
+      else
+      l = previous[l];
+      }
 
+      break;
+      }
 
+      l =previous[k];
 
       weight = i_face_weight[k];
       last = previous[tail];
-      if (last == head) 
-	weight_max = 0;
+      if (last == head)
+      weight_max = 0;
       else
-	weight_max = i_face_weight[last];
-
-
-      ierr = hypre_remove_entry(weight, &weight_max, 
-			  previous, next, first, &last,
-			  head, tail, 
-			  k);
-
-			  
-
+      weight_max = i_face_weight[last];
 
+      ierr = hypre_remove_entry(weight, &weight_max,
+      previous, next, first, &last,
+      head, tail,
+      k);
 
       k=l;
-    }
-    */
+      }
+   */
 
-  if (face_max_weight == -1)
-    {
+   if (face_max_weight == -1)
+   {
       hypre_error_w_msg(HYPRE_ERROR_GENERIC,"all faces are unacceptable, i.e., no faces to eliminate !\n");
 
       *num_AEs_pointer = 1;
 
       i_AE_element[0] = 0;
       for (i=0; i < num_elements; i++)
-	{
-	  i_element_to_AE[i] = 0;
-	  j_AE_element[i] = i;
-	}
+      {
+         i_element_to_AE[i] = 0;
+         j_AE_element[i] = i;
+      }
 
       i_AE_element[1] = num_elements;
 
       return hypre_error_flag;
-    }
-
-  for (k=0; k < num_faces; k++)
-    if (i_face_to_prefer_weight[k] > i_face_to_prefer_weight[face_max_weight])
-      face_max_weight = k;
-
-  max_weight = i_face_weight[face_max_weight];
+   }
 
-  AE_counter=0;
-  AE_element_counter=0;
-   
+   for (k=0; k < num_faces; k++)
+      if (i_face_to_prefer_weight[k] > i_face_to_prefer_weight[face_max_weight])
+         face_max_weight = k;
 
-  i_AE_element[AE_counter] = AE_element_counter;
+   max_weight = i_face_weight[face_max_weight];
 
-  max_weight_old = -1;
+   AE_counter=0;
+   AE_element_counter=0;
 
-  face_local_max_weight = face_max_weight;
+   i_AE_element[AE_counter] = AE_element_counter;
 
-eliminate_face:
+   max_weight_old = -1;
 
-  face_to_eliminate = face_local_max_weight;
+   face_local_max_weight = face_max_weight;
 
-  max_weight = i_face_weight[face_to_eliminate]; 
+  eliminate_face:
 
-  last = previous[tail];
-  if (last == head) 
-    weight_max = 0;
-  else
-    weight_max = i_face_weight[last];
+   face_to_eliminate = face_local_max_weight;
 
-		   
-  ierr = hypre_remove_entry(max_weight, &weight_max, 
-		      previous, next, first, &last,
-		      head, tail, 
-		      face_to_eliminate);
+   max_weight = i_face_weight[face_to_eliminate];
 
-  i_face_weight[face_to_eliminate] = 0;
+   last = previous[tail];
+   if (last == head)
+      weight_max = 0;
+   else
+      weight_max = i_face_weight[last];
 
-  /*----------------------------------------------------------
-   *  agglomeration step: 
-   *
-   *  put on AE_element -- list all elements 
-   *  that share face "face_to_eliminate";
-   *----------------------------------------------------------*/
+   ierr = hypre_remove_entry(max_weight, &weight_max,
+                             previous, next, first, &last,
+                             head, tail,
+                             face_to_eliminate);
+
+   i_face_weight[face_to_eliminate] = 0;
+
+   /*----------------------------------------------------------
+    *  agglomeration step:
+    *
+    *  put on AE_element -- list all elements
+    *  that share face "face_to_eliminate";
+    *----------------------------------------------------------*/
 
-  for (k = i_face_element[face_to_eliminate];
-       k < i_face_element[face_to_eliminate+1]; k++)
-    {
+   for (k = i_face_element[face_to_eliminate];
+        k < i_face_element[face_to_eliminate+1]; k++)
+   {
       /* check if element j_face_element[k] is already on the list: */
 
       if (j_face_element[k] < num_elements)
-	{
-	  if (i_element_to_AE[j_face_element[k]] == -1)
-	    {
-	      j_AE_element[AE_element_counter] = j_face_element[k];
-	      i_element_to_AE[j_face_element[k]] = AE_counter;
-	      AE_element_counter++;
-	    }
-	}
-    }	  
-
+      {
+         if (i_element_to_AE[j_face_element[k]] == -1)
+         {
+            j_AE_element[AE_element_counter] = j_face_element[k];
+            i_element_to_AE[j_face_element[k]] = AE_counter;
+            AE_element_counter++;
+         }
+      }
+   }
 
-  /* local update & search:==================================== */
+   /* local update & search:==================================== */
 
-  for (j=i_face_face[face_to_eliminate];
-       j<i_face_face[face_to_eliminate+1]; j++)
-    if (i_face_weight[j_face_face[j]] > 0)
+   for (j=i_face_face[face_to_eliminate];
+        j<i_face_face[face_to_eliminate+1]; j++)
+      if (i_face_weight[j_face_face[j]] > 0)
       {
-	weight = i_face_weight[j_face_face[j]];
+         weight = i_face_weight[j_face_face[j]];
 
 
-	last = previous[tail];
-	if (last == head) 
-	  weight_max = 0;
-	else
-	  weight_max = i_face_weight[last];
+         last = previous[tail];
+         if (last == head)
+            weight_max = 0;
+         else
+            weight_max = i_face_weight[last];
+
+         ierr = hypre_move_entry(weight, &weight_max,
+                                 previous, next, first, &last,
+                                 head, tail,
+                                 j_face_face[j]);
+
+         i_face_weight[j_face_face[j]]+=w_face_face[j];
+
+         weight = i_face_weight[j_face_face[j]];
+
+         /* hypre_printf("update entry: %d\n", j_face_face[j]);  */
+
+         last = previous[tail];
+         if (last == head)
+            weight_max = 0;
+         else
+            weight_max = i_face_weight[last];
+
+         ierr = hypre_update_entry(weight, &weight_max,
+                                   previous, next, first, &last,
+                                   head, tail,
+                                   j_face_face[j]);
+
+         last = previous[tail];
+         if (last == head)
+            weight_max = 0;
+         else
+            weight_max = i_face_weight[last];
+      }
 
-	ierr = hypre_move_entry(weight, &weight_max, 
-			  previous, next, first, &last,
-			  head, tail, 
-			  j_face_face[j]);
+   /* find a face of the elements that have already been agglomerated
+      with a maximal weight: ====================================== */
 
-	i_face_weight[j_face_face[j]]+=w_face_face[j];
+   max_weight_old = max_weight;
 
-	weight = i_face_weight[j_face_face[j]];
+   face_local_max_weight = -1;
+   preferred_weight = -1;
+
+   for (l = i_AE_element[AE_counter];
+        l < AE_element_counter; l++)
+   {
+      for (j=i_element_face[j_AE_element[l]];
+           j<i_element_face[j_AE_element[l]+1]; j++)
+      {
+         i = j_element_face[j];
 
-	/* hypre_printf("update entry: %d\n", j_face_face[j]);  */
+         if (max_weight_old > 1 && i_face_weight[i] > 0 &&
+             i_face_to_prefer_weight[i] > -1)
+         {
+            if ( max_weight < i_face_weight[i])
+            {
+               face_local_max_weight = i;
+               max_weight = i_face_weight[i];
+               preferred_weight = i_face_to_prefer_weight[i];
+            }
 
-	last = previous[tail];
-	if (last == head) 
-	  weight_max = 0;
-	else
-	  weight_max = i_face_weight[last];
+            if ( max_weight == i_face_weight[i]
+                 && i_face_to_prefer_weight[i] > preferred_weight)
+            {
+               face_local_max_weight = i;
+               preferred_weight = i_face_to_prefer_weight[i];
+            }
+         }
+      }
+   }
 
-	ierr = hypre_update_entry(weight, &weight_max, 
-			    previous, next, first, &last,
-			    head, tail, 
-			    j_face_face[j]);
+   if (face_local_max_weight > -1) goto eliminate_face;
 
-	last = previous[tail];
-	if (last == head) 
-	  weight_max = 0;
-	else
-	  weight_max = i_face_weight[last];
-		
-      }
+   /* ----------------------------------------------------------------
+    * eliminate and label with i_face_weight[ ] = -1
+    * "boundary faces of agglomerated elements";
+    * those faces will be preferred for the next coarse spaces
+    * in case multiple coarse spaces are to be built;
+    * ---------------------------------------------------------------*/
 
-  /* find a face of the elements that have already been agglomerated
-     with a maximal weight: ====================================== */
-	  
-  max_weight_old = max_weight;
+   for (k = i_AE_element[AE_counter]; k < AE_element_counter; k++)
+   {
+      for (j = i_element_face[j_AE_element[k]];
+           j < i_element_face[j_AE_element[k]+1]; j++)
+      {
+         if (i_face_weight[j_element_face[j]] > 0)
+         {
+            weight = i_face_weight[j_element_face[j]];
+            last = previous[tail];
+            if (last == head)
+               weight_max = 0;
+            else
+               weight_max = i_face_weight[last];
 
-  face_local_max_weight = -1; 
-  preferred_weight = -1;
 
-  for (l = i_AE_element[AE_counter];
-       l < AE_element_counter; l++)
-    {
-      for (j=i_element_face[j_AE_element[l]];
-	   j<i_element_face[j_AE_element[l]+1]; j++)
-	{
-	  i = j_element_face[j];
-
-	  if (max_weight_old > 1 && i_face_weight[i] > 0 &&
-	      i_face_to_prefer_weight[i] > -1)
-	    {
-	      if ( max_weight < i_face_weight[i])
-		{
-		  face_local_max_weight = i;
-		  max_weight = i_face_weight[i];
-		  preferred_weight = i_face_to_prefer_weight[i];
-		}
-
-	      if ( max_weight == i_face_weight[i]
-		   && i_face_to_prefer_weight[i] > preferred_weight)
-		{
-		  face_local_max_weight = i;
-		  preferred_weight = i_face_to_prefer_weight[i];
-		}
-
-	    }		
-	}
-    }  
-
-  if (face_local_max_weight > -1) goto eliminate_face;
-
-  /* ----------------------------------------------------------------
-   * eliminate and label with i_face_weight[ ] = -1
-   * "boundary faces of agglomerated elements";
-   * those faces will be preferred for the next coarse spaces 
-   * in case multiple coarse spaces are to be built;    
-   * ---------------------------------------------------------------*/
+            ierr = hypre_remove_entry(weight, &weight_max,
+                                      previous, next, first, &last,
+                                      head, tail,
+                                      j_element_face[j]);
 
-  for (k = i_AE_element[AE_counter]; k < AE_element_counter; k++)
-    {
-      for (j = i_element_face[j_AE_element[k]];
-	   j < i_element_face[j_AE_element[k]+1]; j++)
-	{
-	  if (i_face_weight[j_element_face[j]] > 0)
-	    {
-	      weight = i_face_weight[j_element_face[j]];
-	      last = previous[tail];
-	      if (last == head) 
-		weight_max = 0;
-	      else
-		weight_max = i_face_weight[last];
-
-
-	      ierr = hypre_remove_entry(weight, &weight_max, 
-				  previous, next, first, &last,
-				  head, tail, 
-				  j_element_face[j]);
+            i_face_weight[j_element_face[j]] = -1;
 
-	      i_face_weight[j_element_face[j]] = -1;
+         }
+      }
+   }
 
-	    }
-	}
-    }
-      
-  if (AE_element_counter > i_AE_element[AE_counter]) 
-    {
-      /* hypre_printf("completing agglomerated element: %d\n", 
-		  AE_counter);   */ 
+   if (AE_element_counter > i_AE_element[AE_counter])
+   {
+      /* hypre_printf("completing agglomerated element: %d\n",
+         AE_counter);   */
       AE_counter++;
-    }
-
-  i_AE_element[AE_counter] = AE_element_counter;
-      
+   }
 
-  /* find a face with maximal weight: ---------------------------*/
+   i_AE_element[AE_counter] = AE_element_counter;
 
+   /* find a face with maximal weight: ---------------------------*/
 
-  last = previous[tail];
-  if (last == head) goto end_agglomerate;
+   last = previous[tail];
+   if (last == head) goto end_agglomerate;
 
-  weight_max = i_face_weight[last];
+   weight_max = i_face_weight[last];
 
-      
-  /* hypre_printf("global search: ======================================\n"); */
+   /* hypre_printf("global search: ======================================\n"); */
 
-  face_max_weight = -1;
+   face_max_weight = -1;
 
-  k = last;
-  while (k != head)
-    {
+   k = last;
+   while (k != head)
+   {
       if (i_face_to_prefer_weight[k] > -1)
-	face_max_weight = k;
+         face_max_weight = k;
 
+      if (face_max_weight > -1)
+      {
+         max_weight = i_face_weight[face_max_weight];
+         l = face_max_weight;
 
-      if (face_max_weight > -1) 
-	{
-	  max_weight = i_face_weight[face_max_weight];
-	  l = face_max_weight;
-
-	  while (previous[l] != head)
-	    {
-
-	      if (i_face_weight[previous[l]] < max_weight) 
-		break;
-	      else
-		if (i_face_to_prefer_weight[previous[l]] > 
-		    i_face_to_prefer_weight[face_max_weight])
-		  {
-		    l = previous[l];
-		    face_max_weight = l;
-		  }
-		else
-		  l = previous[l];
-	    }
+         while (previous[l] != head)
+         {
 
-	  break; 
-	}
+            if (i_face_weight[previous[l]] < max_weight)
+               break;
+            else
+               if (i_face_to_prefer_weight[previous[l]] >
+                   i_face_to_prefer_weight[face_max_weight])
+               {
+                  l = previous[l];
+                  face_max_weight = l;
+               }
+               else
+                  l = previous[l];
+         }
 
+         break;
+      }
 
       l =previous[k];
       /* remove face k: ---------------------------------------*/
 
-
       weight = i_face_weight[k];
       last = previous[tail];
-      if (last == head) 
-	weight_max = 0;
+      if (last == head)
+         weight_max = 0;
       else
-	weight_max = i_face_weight[last];
+         weight_max = i_face_weight[last];
 
 
-      ierr = hypre_remove_entry(weight, &weight_max, 
-			  previous, next, first, &last,
-			  head, tail, 
-			  k);
+      ierr = hypre_remove_entry(weight, &weight_max,
+                                previous, next, first, &last,
+                                head, tail,
+                                k);
 
-			  
       /* i_face_weight[k] = -1; */
 
-
       k=l;
-    }
-
-  if (face_max_weight == -1) goto end_agglomerate;
+   }
 
-  max_weight = i_face_weight[face_max_weight];
+   if (face_max_weight == -1) goto end_agglomerate;
 
-  face_local_max_weight = face_max_weight;
+   max_weight = i_face_weight[face_max_weight];
 
-  goto eliminate_face;
+   face_local_max_weight = face_max_weight;
 
-end_agglomerate:
+   goto eliminate_face;
 
+  end_agglomerate:
 
-  /* eliminate isolated elements: ----------------------------------*/
+   /* eliminate isolated elements: ----------------------------------*/
 
-  for (i=0; i<num_elements; i++)
-    {
+   for (i=0; i<num_elements; i++)
+   {
 
       if (i_element_to_AE[i] == -1)
-	{
-	  for (j=i_element_face[i]; j < i_element_face[i+1]
-		 && i_element_to_AE[i] == -1; j++)
-	    if (i_face_to_prefer_weight[j_element_face[j]] > -1)
-	      for (k=i_face_element[j_element_face[j]];
-		   k<i_face_element[j_element_face[j]+1]
-		     && i_element_to_AE[i] == -1; k++)
-		if (i_element_to_AE[j_face_element[k]] != -1)
-		  i_element_to_AE[i] = i_element_to_AE[j_face_element[k]];
-	}
+      {
+         for (j=i_element_face[i]; j < i_element_face[i+1]
+                 && i_element_to_AE[i] == -1; j++)
+            if (i_face_to_prefer_weight[j_element_face[j]] > -1)
+               for (k=i_face_element[j_element_face[j]];
+                    k<i_face_element[j_element_face[j]+1]
+                       && i_element_to_AE[i] == -1; k++)
+                  if (i_element_to_AE[j_face_element[k]] != -1)
+                     i_element_to_AE[i] = i_element_to_AE[j_face_element[k]];
+      }
 
       /*
-      if (i_element_to_AE[i] == -1)
-	{
-	  i_element_face_counter = 0;
-	  for (j=i_element_face[i]; j < i_element_face[i+1]; j++)
-	    if (i_face_to_prefer_weight[j_element_face[j]] > -1)
-	      i_element_face_counter++;
-
-	  if (i_element_face_counter == 1)
-	    {
-	      for (j=i_element_face[i]; j < i_element_face[i+1]; j++)
-		if (i_face_to_prefer_weight[j_element_face[j]] > -1)
-		  for (k=i_face_element[j_element_face[j]];
-		       k<i_face_element[j_element_face[j]+1]; k++)
-		    if (i_element_to_AE[j_face_element[k]] != -1)
-		      i_element_to_AE[i] = i_element_to_AE[j_face_element[k]];
-	    }
-	}
-	*/
+        if (i_element_to_AE[i] == -1)
+        {
+        i_element_face_counter = 0;
+        for (j=i_element_face[i]; j < i_element_face[i+1]; j++)
+        if (i_face_to_prefer_weight[j_element_face[j]] > -1)
+        i_element_face_counter++;
+
+        if (i_element_face_counter == 1)
+        {
+        for (j=i_element_face[i]; j < i_element_face[i+1]; j++)
+        if (i_face_to_prefer_weight[j_element_face[j]] > -1)
+        for (k=i_face_element[j_element_face[j]];
+        k<i_face_element[j_element_face[j]+1]; k++)
+        if (i_element_to_AE[j_face_element[k]] != -1)
+        i_element_to_AE[i] = i_element_to_AE[j_face_element[k]];
+        }
+        }
+      */
 
       if (i_element_to_AE[i] == -1)
-	{
-	  i_element_to_AE[i] = AE_counter;
-	  AE_counter++;
-	}
-    }
-	  
-  num_AEs_pointer[0] = AE_counter;
+      {
+         i_element_to_AE[i] = AE_counter;
+         AE_counter++;
+      }
+   }
 
+   num_AEs_pointer[0] = AE_counter;
 
-  /* compute adjoint graph: -------------------------------------------*/
+   /* compute adjoint graph: -------------------------------------------*/
 
-  for (i=0; i < AE_counter; i++)
-    i_AE_element[i] = 0;
+   for (i=0; i < AE_counter; i++)
+      i_AE_element[i] = 0;
 
-  for (i=0; i < num_elements; i++)
-    i_AE_element[i_element_to_AE[i]]++;
+   for (i=0; i < num_elements; i++)
+      i_AE_element[i_element_to_AE[i]]++;
 
-  i_AE_element[AE_counter] = num_elements;
+   i_AE_element[AE_counter] = num_elements;
 
-  for (i=AE_counter-1; i > -1; i--)
-    i_AE_element[i] = i_AE_element[i+1] - i_AE_element[i];
+   for (i=AE_counter-1; i > -1; i--)
+      i_AE_element[i] = i_AE_element[i+1] - i_AE_element[i];
 
-  for (i=0; i < num_elements; i++)
-    {
+   for (i=0; i < num_elements; i++)
+   {
       j_AE_element[i_AE_element[i_element_to_AE[i]]] = i;
       i_AE_element[i_element_to_AE[i]]++;
-    }
+   }
 
-  for (i=AE_counter-1; i > -1; i--)
-    i_AE_element[i+1] = i_AE_element[i];
+   for (i=AE_counter-1; i > -1; i--)
+      i_AE_element[i+1] = i_AE_element[i];
 
-  i_AE_element[0] = 0;
+   i_AE_element[0] = 0;
 
-  /*--------------------------------------------------------------------*/
-  for (i=0; i < num_faces; i++)
-    if (i_face_to_prefer_weight[i] == -1) i_face_weight[i] = -1;
+   /*--------------------------------------------------------------------*/
+   for (i=0; i < num_faces; i++)
+      if (i_face_to_prefer_weight[i] == -1) i_face_weight[i] = -1;
 
 
-  hypre_TFree(i_element_to_AE);
+   hypre_TFree(i_element_to_AE);
 
-  hypre_TFree(previous);
-  hypre_TFree(next);
-  hypre_TFree(first);
+   hypre_TFree(previous);
+   hypre_TFree(next);
+   hypre_TFree(first);
 
-  return ierr;
+   return ierr;
 }
 
-HYPRE_Int hypre_update_entry(HYPRE_Int weight, HYPRE_Int *weight_max, 
-		 HYPRE_Int *previous, HYPRE_Int *next, HYPRE_Int *first, HYPRE_Int *last,
-		 HYPRE_Int head, HYPRE_Int tail, 
-		 HYPRE_Int i)
+HYPRE_Int hypre_update_entry(HYPRE_Int weight, HYPRE_Int *weight_max,
+                             HYPRE_Int *previous, HYPRE_Int *next, HYPRE_Int *first, HYPRE_Int *last,
+                             HYPRE_Int head, HYPRE_Int tail,
+                             HYPRE_Int i)
 
 {
-  HYPRE_Int weight0;
-
-  if (previous[i] != head) next[previous[i]] = next[i];
-  previous[next[i]] = previous[i];
+   HYPRE_Int weight0;
 
+   if (previous[i] != head) next[previous[i]] = next[i];
+   previous[next[i]] = previous[i];
 
-  if (first[weight] == tail)
-    {
-      if (weight <= weight_max[0]) 
-	{
-	  hypre_printf("ERROR IN UPDATE_ENTRY: ===================\n");
-	  hypre_printf("weight: %d, weight_max: %d\n",
-		 weight, weight_max[0]);
-	  return -1;
-	}
+   if (first[weight] == tail)
+   {
+      if (weight <= weight_max[0])
+      {
+         hypre_printf("ERROR IN UPDATE_ENTRY: ===================\n");
+         hypre_printf("weight: %d, weight_max: %d\n",
+                      weight, weight_max[0]);
+         return -1;
+      }
       for (weight0=weight_max[0]+1; weight0 <= weight; weight0++)
-	{
-	  first[weight0] = i;
-	  /* hypre_printf("create first[%d] = %d\n", weight0, i); */
-	}
-
-	  previous[i] = previous[tail];
-	  next[i] = tail;
-	  if (previous[tail] > head)
-	     next[previous[tail]] = i;
-	  previous[tail] = i;
+      {
+         first[weight0] = i;
+         /* hypre_printf("create first[%d] = %d\n", weight0, i); */
+      }
 
-    }
-  else
-    /* first[weight] already exists: =====================*/
-    {
+      previous[i] = previous[tail];
+      next[i] = tail;
+      if (previous[tail] > head)
+         next[previous[tail]] = i;
+      previous[tail] = i;
+
+   }
+   else
+      /* first[weight] already exists: =====================*/
+   {
       previous[i] = previous[first[weight]];
       next[i] = first[weight];
-      
+
       if (previous[first[weight]] != head)
-	next[previous[first[weight]]] = i;
+         next[previous[first[weight]]] = i;
 
       previous[first[weight]] = i;
 
       for (weight0=1; weight0 <= weight; weight0++)
-	if (first[weight0] == first[weight])
-	  first[weight0] = i;
-
-    }
+         if (first[weight0] == first[weight])
+            first[weight0] = i;
 
+   }
 
-  return 0;
-    
+   return 0;
 }
 
-HYPRE_Int hypre_remove_entry(HYPRE_Int weight, HYPRE_Int *weight_max, 
-		 HYPRE_Int *previous, HYPRE_Int *next, HYPRE_Int *first, HYPRE_Int *last,
-		 HYPRE_Int head, HYPRE_Int tail, 
-		 HYPRE_Int i)
+HYPRE_Int hypre_remove_entry(HYPRE_Int weight, HYPRE_Int *weight_max,
+                             HYPRE_Int *previous, HYPRE_Int *next, HYPRE_Int *first, HYPRE_Int *last,
+                             HYPRE_Int head, HYPRE_Int tail,
+                             HYPRE_Int i)
 {
-  HYPRE_Int weight0;
+   HYPRE_Int weight0;
 
-  if (previous[i] != head) next[previous[i]] = next[i];
-  previous[next[i]] = previous[i];
+   if (previous[i] != head) next[previous[i]] = next[i];
+   previous[next[i]] = previous[i];
 
-  for (weight0=1; weight0 <= weight_max[0]; weight0++)
-    {
+   for (weight0=1; weight0 <= weight_max[0]; weight0++)
+   {
       /* hypre_printf("first[%d}: %d\n", weight0,  first[weight0]); */
       if (first[weight0] == i)
-	{
-	  first[weight0] = next[i];
-	  /* hypre_printf("shift: first[%d]= %d to %d\n",
-		 weight0, i, next[i]);
-	  if (i == last[0]) 
-	    hypre_printf("i= last[0]: %d\n", i); */
-	}
-    }
-
-  next[i] = i;
-  previous[i] = i;
+      {
+         first[weight0] = next[i];
+         /* hypre_printf("shift: first[%d]= %d to %d\n",
+            weight0, i, next[i]);
+            if (i == last[0])
+            hypre_printf("i= last[0]: %d\n", i); */
+      }
+   }
 
-  return 0;
+   next[i] = i;
+   previous[i] = i;
 
+   return 0;
 }
 
-HYPRE_Int hypre_move_entry(HYPRE_Int weight, HYPRE_Int *weight_max, 
-	       HYPRE_Int *previous, HYPRE_Int *next, HYPRE_Int *first, HYPRE_Int *last,
-	       HYPRE_Int head, HYPRE_Int tail, 
-	       HYPRE_Int i)
+HYPRE_Int hypre_move_entry(HYPRE_Int weight, HYPRE_Int *weight_max,
+                           HYPRE_Int *previous, HYPRE_Int *next, HYPRE_Int *first, HYPRE_Int *last,
+                           HYPRE_Int head, HYPRE_Int tail,
+                           HYPRE_Int i)
 {
-  HYPRE_Int  weight0;
+   HYPRE_Int  weight0;
 
-  if (previous[i] != head) next[previous[i]] = next[i];
-  previous[next[i]] = previous[i];
+   if (previous[i] != head) next[previous[i]] = next[i];
+   previous[next[i]] = previous[i];
 
-  for (weight0=1; weight0 <= weight_max[0]; weight0++)
-    {
+   for (weight0=1; weight0 <= weight_max[0]; weight0++)
+   {
       if (first[weight0] == i)
-	first[weight0] = next[i];
-    }
-
-  return 0;
+         first[weight0] = next[i];
+   }
 
+   return 0;
 }
 
-
-
 /*---------------------------------------------------------------------
- hypre_matinv:  X <--  A**(-1) ;  A IS POSITIVE DEFINITE (non--symmetric);
- ---------------------------------------------------------------------*/
-      
+  hypre_matinv:  X <--  A**(-1) ;  A IS POSITIVE DEFINITE (non--symmetric);
+  ---------------------------------------------------------------------*/
+
 HYPRE_Int hypre_matinv(HYPRE_Real *x, HYPRE_Real *a, HYPRE_Int k)
 {
-  HYPRE_Int i,j,l, ierr =0;
-
-
+   HYPRE_Int i,j,l, ierr =0;
 
-  for (i=0; i < k; i++)
-    {
+   for (i=0; i < k; i++)
+   {
       if (a[i+i*k] <= 0.e0)
-        {
-	  if (i < k-1)
-	    {
-	      /*********
-	      hypre_printf("indefinite singular matrix in *** matinv ***:\n");
-	      hypre_printf("i:%d;  diagonal entry: %e\n", i, a[i+k*i]);
-	      */
-	      ierr = -1;
-	    }
-
-            a[i+i*k] = 0.e0;
-        }
-         else
-            a[i+k*i] = 1.0 / a[i+i*k];
-
+      {
+         if (i < k-1)
+         {
+            /*********
+                      hypre_printf("indefinite singular matrix in *** matinv ***:\n");
+                      hypre_printf("i:%d;  diagonal entry: %e\n", i, a[i+k*i]);
+            */
+            ierr = -1;
+         }
 
+         a[i+i*k] = 0.e0;
+      }
+      else
+         a[i+k*i] = 1.0 / a[i+i*k];
 
       for (j=1; j < k-i; j++)
-        {
-          for (l=1; l < k-i; l++)
-            {
-              a[i+l+k*(i+j)] -= a[i+l+k*i] * a[i+k*i] * a[i+k*(i+j)];
-            }
-        }
-      
-      for (j=1; j < k-i; j++)
-        {
-          a[i+j+k*i] = a[i+j+k*i] * a[i+k*i];
-          a[i+k*(i+j)] = a[i+k*(i+j)] * a[i+k*i];
-        }
-    }
-
-
-
-  /* FULL INVERSION: --------------------------------------------*/
-  
-
-
+      {
+         for (l=1; l < k-i; l++)
+         {
+            a[i+l+k*(i+j)] -= a[i+l+k*i] * a[i+k*i] * a[i+k*(i+j)];
+         }
+      }
 
-  x[k*k-1] = a[k*k-1];
-  for (i=k-1; i > -1; i--)
-    {
       for (j=1; j < k-i; j++)
-        {
-          x[i+j+k*i] =0;
-          x[i+k*(i+j)] =0;
-
-
+      {
+         a[i+j+k*i] = a[i+j+k*i] * a[i+k*i];
+         a[i+k*(i+j)] = a[i+k*(i+j)] * a[i+k*i];
+      }
+   }
 
-          for (l=1; l< k-i; l++)
-            {
-              x[i+j+k*i] -= x[i+j+k*(i+l)] * a[i+l+k*i];
-              x[i+k*(i+j)] -= a[i+k*(i+l)] * x[i+l+k*(i+j)];
-            }
-        }
+   /* FULL INVERSION: --------------------------------------------*/
 
+   x[k*k-1] = a[k*k-1];
+   for (i=k-1; i > -1; i--)
+   {
+      for (j=1; j < k-i; j++)
+      {
+         x[i+j+k*i] =0;
+         x[i+k*(i+j)] =0;
 
+         for (l=1; l< k-i; l++)
+         {
+            x[i+j+k*i] -= x[i+j+k*(i+l)] * a[i+l+k*i];
+            x[i+k*(i+j)] -= a[i+k*(i+l)] * x[i+l+k*(i+j)];
+         }
+      }
 
       x[i+k*i] = a[i+k*i];
       for (j=1; j<k-i; j++)
-        {
-          x[i+k*i] -= x[i+k*(i+j)] * a[i+j+k*i];
-        }
-    }
-
-
+      {
+         x[i+k*i] -= x[i+k*(i+j)] * a[i+j+k*i];
+      }
+   }
 
-  return ierr;
+   return ierr;
 }
 
 HYPRE_Int
 hypre_parCorrRes( hypre_ParCSRMatrix *A,
-		  hypre_ParVector *x,
-		  hypre_Vector *rhs,
-		  HYPRE_Real **tmp_ptr)
+                  hypre_ParVector *x,
+                  hypre_Vector *rhs,
+                  HYPRE_Real **tmp_ptr)
 {
    HYPRE_Int i, j, index, start;
    HYPRE_Int num_sends, num_cols_offd;
@@ -2911,8 +2493,8 @@
    if (num_cols_offd)
    {
       num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
-      x_buf_data = hypre_CTAlloc(HYPRE_Real, 
-			hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends));
+      x_buf_data = hypre_CTAlloc(HYPRE_Real,
+                                 hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends));
 
       index = 0;
       for (i = 0; i < num_sends; i++)
@@ -2920,7 +2502,7 @@
          start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
          for (j=start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
             x_buf_data[index++]
-                 = x_local_data[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
+               = x_local_data[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
       }
 
       x_tmp = hypre_SeqVectorCreate(num_cols_offd);
@@ -2928,12 +2510,12 @@
       x_tmp_data = hypre_VectorData(x_tmp);
 
       comm_handle = hypre_ParCSRCommHandleCreate( 1, comm_pkg, x_buf_data,
-        		x_tmp_data);
+                                                  x_tmp_data);
 
       tmp_vector = hypre_SeqVectorCreate(local_size);
       hypre_SeqVectorInitialize(tmp_vector);
       hypre_SeqVectorCopy(rhs,tmp_vector);
-   
+
       hypre_ParCSRCommHandleDestroy(comm_handle);
       comm_handle = NULL;
 
@@ -2958,270 +2540,212 @@
 
 
 HYPRE_Int hypre_AdSchwarzSolve(hypre_ParCSRMatrix *par_A,
-                         hypre_ParVector *par_rhs,
-                         hypre_CSRMatrix *domain_structure,
-                         HYPRE_Real *scale,
-                         hypre_ParVector *par_x,
-                         hypre_ParVector *par_aux, HYPRE_Int *pivots, 
-                         HYPRE_Int use_nonsymm)
-
-{
-  HYPRE_Int ierr = 0;
-  HYPRE_Real *x;
-  HYPRE_Real *aux;
-  HYPRE_Real *tmp;
-  hypre_Vector *x_vector;
-  hypre_Vector *aux_vector;
-  MPI_Comm comm = hypre_ParCSRMatrixComm(par_A);
-  HYPRE_Int num_domains;
-  HYPRE_Int max_domain_size;
-  HYPRE_Int *i_domain_dof;
-  HYPRE_Int *j_domain_dof;
-  HYPRE_Real *domain_matrixinverse;
-
-  HYPRE_Int piv_counter = 0;
-  HYPRE_Int one = 1;
-  char uplo = 'L';
-
-  HYPRE_Int jj,i,j; /*, j_loc, k_loc;*/
-
-
-  HYPRE_Int matrix_size, matrix_size_counter = 0;
-
-  HYPRE_Int num_procs;
-
-  hypre_MPI_Comm_size(comm,&num_procs);
-
-  /* initiate:      ----------------------------------------------- */
-  x_vector = hypre_ParVectorLocalVector(par_x);
-  aux_vector = hypre_ParVectorLocalVector(par_aux);
-  x = hypre_VectorData(x_vector);
-  aux = hypre_VectorData(aux_vector);
-  num_domains = hypre_CSRMatrixNumRows(domain_structure);
-  max_domain_size = hypre_CSRMatrixNumCols(domain_structure);
-  i_domain_dof = hypre_CSRMatrixI(domain_structure);
-  j_domain_dof = hypre_CSRMatrixJ(domain_structure);
-  domain_matrixinverse = hypre_CSRMatrixData(domain_structure);
-
-  if (use_nonsymm)
-     uplo = 'N';
-
-  hypre_ParVectorCopy(par_rhs,par_aux);
-  hypre_ParCSRMatrixMatvec(-1.0,par_A,par_x,1.0,par_aux);
-  tmp = hypre_CTAlloc(HYPRE_Real,max_domain_size);
-
-  /* forward solve: ----------------------------------------------- */
-
-  matrix_size_counter = 0;
-  for (i=0; i < num_domains; i++)
-  {
-     matrix_size = i_domain_dof[i+1] - i_domain_dof[i];
-
-     /* compute residual: ---------------------------------------- */
-     
-     jj = 0;
-     for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
-     {
-        tmp[jj] = aux[j_domain_dof[j]]; 
-        jj++;
-     }
-     /* solve for correction: ------------------------------------- */
-/* OLD -  HYPRE_USING_ESSL
-     dpps(&domain_matrixinverse[matrix_size_counter], matrix_size, tmp, 1);
-*/
-     if (use_nonsymm)
-     {
-        
-#ifdef HYPRE_USING_ESSL
-         dgetrs(&uplo, matrix_size, one, 
-                &domain_matrixinverse[matrix_size_counter], 
-                matrix_size, &pivots[piv_counter], tmp,
-                matrix_size, &ierr); 
-#else
-         hypre_F90_NAME_LAPACK(dgetrs, DGETRS)(&uplo, &matrix_size, &one, 
-                                               &domain_matrixinverse[matrix_size_counter], 
-                                               &matrix_size, &pivots[piv_counter], tmp,
-                                               &matrix_size, &ierr); 
-#endif
-     }
-     else
-     {
-        
-#ifdef HYPRE_USING_ESSL
-            dpotrs(&uplo, matrix_size, one, 
-                   &domain_matrixinverse[matrix_size_counter], 
-                   matrix_size, tmp,
-                   matrix_size, &ierr); 
-#else
-            hypre_F90_NAME_LAPACK(dpotrs, DPOTRS)(&uplo, &matrix_size, &one, 
-                                               &domain_matrixinverse[matrix_size_counter], 
-                                               &matrix_size, tmp,
-                                               &matrix_size, &ierr); 
-#endif
-     }
-     
+                               hypre_ParVector *par_rhs,
+                               hypre_CSRMatrix *domain_structure,
+                               HYPRE_Real *scale,
+                               hypre_ParVector *par_x,
+                               hypre_ParVector *par_aux, HYPRE_Int *pivots,
+                               HYPRE_Int use_nonsymm)
+{
+   HYPRE_Int ierr = 0;
+   HYPRE_Real *x;
+   HYPRE_Real *aux;
+   HYPRE_Real *tmp;
+   hypre_Vector *x_vector;
+   hypre_Vector *aux_vector;
+   MPI_Comm comm = hypre_ParCSRMatrixComm(par_A);
+   HYPRE_Int num_domains;
+   HYPRE_Int max_domain_size;
+   HYPRE_Int *i_domain_dof;
+   HYPRE_Int *j_domain_dof;
+   HYPRE_Real *domain_matrixinverse;
+
+   HYPRE_Int piv_counter = 0;
+   HYPRE_Int one = 1;
+   char uplo = 'L';
+
+   HYPRE_Int jj,i,j; /*, j_loc, k_loc;*/
+
+
+   HYPRE_Int matrix_size, matrix_size_counter = 0;
+
+   HYPRE_Int num_procs;
+
+   hypre_MPI_Comm_size(comm,&num_procs);
+
+   /* initiate:      ----------------------------------------------- */
+   x_vector = hypre_ParVectorLocalVector(par_x);
+   aux_vector = hypre_ParVectorLocalVector(par_aux);
+   x = hypre_VectorData(x_vector);
+   aux = hypre_VectorData(aux_vector);
+   num_domains = hypre_CSRMatrixNumRows(domain_structure);
+   max_domain_size = hypre_CSRMatrixNumCols(domain_structure);
+   i_domain_dof = hypre_CSRMatrixI(domain_structure);
+   j_domain_dof = hypre_CSRMatrixJ(domain_structure);
+   domain_matrixinverse = hypre_CSRMatrixData(domain_structure);
+
+   if (use_nonsymm)
+      uplo = 'N';
+
+   hypre_ParVectorCopy(par_rhs,par_aux);
+   hypre_ParCSRMatrixMatvec(-1.0,par_A,par_x,1.0,par_aux);
+   tmp = hypre_CTAlloc(HYPRE_Real,max_domain_size);
 
+   /* forward solve: ----------------------------------------------- */
 
-
-     if (ierr)  hypre_error(HYPRE_ERROR_GENERIC);
-     jj = 0;
-     for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
-     {
-        x[j_domain_dof[j]]+=  scale[j_domain_dof[j]]*tmp[jj++];
-     }
-/* OLD -  HYPRE_USING_ESSL
-     matrix_size_counter += matrix_size * (matrix_size+1)/2;  
-*/
-     matrix_size_counter += matrix_size * matrix_size;
-     piv_counter += matrix_size;
-     
-  }
-  
-  hypre_TFree(tmp);
-
-  return hypre_error_flag;
-
-}
-
-HYPRE_Int hypre_AdSchwarzCFSolve(hypre_ParCSRMatrix *par_A,
-		       hypre_ParVector *par_rhs,
-		       hypre_CSRMatrix *domain_structure,
-		       HYPRE_Real *scale,
-		       hypre_ParVector *par_x,
-		       hypre_ParVector *par_aux,
-		       HYPRE_Int *CF_marker,
-		       HYPRE_Int rlx_pt, HYPRE_Int *pivots, 
-                         HYPRE_Int use_nonsymm)
-
-{
-  HYPRE_Int ierr = 0;
-  HYPRE_Real *x;
-  HYPRE_Real *aux;
-  HYPRE_Real *tmp;
-  hypre_Vector *x_vector;
-  hypre_Vector *aux_vector;
-  MPI_Comm comm = hypre_ParCSRMatrixComm(par_A);
-  HYPRE_Int num_domains;
-  HYPRE_Int max_domain_size;
-  HYPRE_Int *i_domain_dof;
-  HYPRE_Int *j_domain_dof;
-  HYPRE_Real *domain_matrixinverse;
-
-  HYPRE_Int piv_counter = 0;
-  HYPRE_Int one = 1;
-
-  char uplo = 'L';
-  HYPRE_Int jj,i,j; /*, j_loc, k_loc;*/
-
-
-  HYPRE_Int matrix_size, matrix_size_counter = 0;
-
-  HYPRE_Int num_procs;
-
-  hypre_MPI_Comm_size(comm,&num_procs);
-
-  /* initiate:      ----------------------------------------------- */
-  x_vector = hypre_ParVectorLocalVector(par_x);
-  aux_vector = hypre_ParVectorLocalVector(par_aux);
-  x = hypre_VectorData(x_vector);
-  aux = hypre_VectorData(aux_vector);
-  num_domains = hypre_CSRMatrixNumRows(domain_structure);
-  max_domain_size = hypre_CSRMatrixNumCols(domain_structure);
-  i_domain_dof = hypre_CSRMatrixI(domain_structure);
-  j_domain_dof = hypre_CSRMatrixJ(domain_structure);
-  domain_matrixinverse = hypre_CSRMatrixData(domain_structure);
-
-  if (use_nonsymm)
-     uplo = 'N';
-  
-
-  hypre_ParVectorCopy(par_rhs,par_aux);
-  hypre_ParCSRMatrixMatvec(-1.0,par_A,par_x,1.0,par_aux);
-  tmp = hypre_CTAlloc(HYPRE_Real,max_domain_size);
-
-  /* forward solve: ----------------------------------------------- */
-
-  matrix_size_counter = 0;
-  for (i=0; i < num_domains; i++)
-    {
-      if (CF_marker[i] == rlx_pt)
-      {
+   matrix_size_counter = 0;
+   for (i=0; i < num_domains; i++)
+   {
       matrix_size = i_domain_dof[i+1] - i_domain_dof[i];
 
       /* compute residual: ---------------------------------------- */
 
       jj = 0;
       for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
-	{
-	  tmp[jj] = aux[j_domain_dof[j]]; 
-	  jj++;
-	}
+      {
+         tmp[jj] = aux[j_domain_dof[j]];
+         jj++;
+      }
       /* solve for correction: ------------------------------------- */
-/* OLD -  HYPRE_USING_ESSL
-	dpps(&domain_matrixinverse[matrix_size_counter], matrix_size, tmp, 1);
-*/
       if (use_nonsymm)
       {
-         
-#ifdef HYPRE_USING_ESSL
-         dgetrs(&uplo, matrix_size, one, 
-                &domain_matrixinverse[matrix_size_counter], 
-                matrix_size, &pivots[piv_counter], tmp,
-                matrix_size, &ierr); 
-#else
-         hypre_F90_NAME_LAPACK(dgetrs, DGETRS)(&uplo, &matrix_size, &one, 
-                                               &domain_matrixinverse[matrix_size_counter], 
-                                               &matrix_size, &pivots[piv_counter], tmp,
-                                               &matrix_size, &ierr); 
-#endif
+         hypre_dgetrs(&uplo, &matrix_size, &one,
+                      &domain_matrixinverse[matrix_size_counter],
+                      &matrix_size, &pivots[piv_counter], tmp,
+                      &matrix_size, &ierr);
       }
-      
       else
       {
-         
-#ifdef HYPRE_USING_ESSL
-            dpotrs(&uplo, matrix_size, one, 
-                   &domain_matrixinverse[matrix_size_counter], 
-                   matrix_size, tmp,
-                   matrix_size, &ierr); 
-#else
-            hypre_F90_NAME_LAPACK(dpotrs, DPOTRS)(&uplo, &matrix_size, &one, 
-                                               &domain_matrixinverse[matrix_size_counter], 
-                                               &matrix_size, tmp,
-                                               &matrix_size, &ierr); 
-#endif 
+         hypre_dpotrs(&uplo, &matrix_size, &one,
+                      &domain_matrixinverse[matrix_size_counter],
+                      &matrix_size, tmp,
+                      &matrix_size, &ierr);
       }
-      
-
 
       if (ierr)  hypre_error(HYPRE_ERROR_GENERIC);
       jj = 0;
       for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
-	{
-	   x[j_domain_dof[j]]+=  scale[j_domain_dof[j]]*tmp[jj++];
-	}
-/* OLD -  HYPRE_USING_ESSL
-      matrix_size_counter += matrix_size * (matrix_size+1)/2;  
-*/
+      {
+         x[j_domain_dof[j]]+=  scale[j_domain_dof[j]]*tmp[jj++];
+      }
       matrix_size_counter += matrix_size * matrix_size;
       piv_counter += matrix_size;
 
+   }
 
-      }
-    }
+   hypre_TFree(tmp);
+
+   return hypre_error_flag;
+}
+
+HYPRE_Int hypre_AdSchwarzCFSolve(hypre_ParCSRMatrix *par_A,
+                                 hypre_ParVector *par_rhs,
+                                 hypre_CSRMatrix *domain_structure,
+                                 HYPRE_Real *scale,
+                                 hypre_ParVector *par_x,
+                                 hypre_ParVector *par_aux,
+                                 HYPRE_Int *CF_marker,
+                                 HYPRE_Int rlx_pt, HYPRE_Int *pivots,
+                                 HYPRE_Int use_nonsymm)
+{
+   HYPRE_Int ierr = 0;
+   HYPRE_Real *x;
+   HYPRE_Real *aux;
+   HYPRE_Real *tmp;
+   hypre_Vector *x_vector;
+   hypre_Vector *aux_vector;
+   MPI_Comm comm = hypre_ParCSRMatrixComm(par_A);
+   HYPRE_Int num_domains;
+   HYPRE_Int max_domain_size;
+   HYPRE_Int *i_domain_dof;
+   HYPRE_Int *j_domain_dof;
+   HYPRE_Real *domain_matrixinverse;
+
+   HYPRE_Int piv_counter = 0;
+   HYPRE_Int one = 1;
+
+   char uplo = 'L';
+   HYPRE_Int jj,i,j; /*, j_loc, k_loc;*/
+
+   HYPRE_Int matrix_size, matrix_size_counter = 0;
+
+   HYPRE_Int num_procs;
+
+   hypre_MPI_Comm_size(comm,&num_procs);
+
+   /* initiate:      ----------------------------------------------- */
+   x_vector = hypre_ParVectorLocalVector(par_x);
+   aux_vector = hypre_ParVectorLocalVector(par_aux);
+   x = hypre_VectorData(x_vector);
+   aux = hypre_VectorData(aux_vector);
+   num_domains = hypre_CSRMatrixNumRows(domain_structure);
+   max_domain_size = hypre_CSRMatrixNumCols(domain_structure);
+   i_domain_dof = hypre_CSRMatrixI(domain_structure);
+   j_domain_dof = hypre_CSRMatrixJ(domain_structure);
+   domain_matrixinverse = hypre_CSRMatrixData(domain_structure);
+
+   if (use_nonsymm)
+      uplo = 'N';
+
+   hypre_ParVectorCopy(par_rhs,par_aux);
+   hypre_ParCSRMatrixMatvec(-1.0,par_A,par_x,1.0,par_aux);
+   tmp = hypre_CTAlloc(HYPRE_Real,max_domain_size);
+
+   /* forward solve: ----------------------------------------------- */
+
+   matrix_size_counter = 0;
+   for (i=0; i < num_domains; i++)
+   {
+      if (CF_marker[i] == rlx_pt)
+      {
+         matrix_size = i_domain_dof[i+1] - i_domain_dof[i];
+
+         /* compute residual: ---------------------------------------- */
+
+         jj = 0;
+         for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
+         {
+            tmp[jj] = aux[j_domain_dof[j]];
+            jj++;
+         }
+         /* solve for correction: ------------------------------------- */
+         if (use_nonsymm)
+         {
+            hypre_dgetrs(&uplo, &matrix_size, &one,
+                         &domain_matrixinverse[matrix_size_counter],
+                         &matrix_size, &pivots[piv_counter], tmp,
+                         &matrix_size, &ierr);
+         }
+
+         else
+         {
+            hypre_dpotrs(&uplo, &matrix_size, &one,
+                         &domain_matrixinverse[matrix_size_counter],
+                         &matrix_size, tmp,
+                         &matrix_size, &ierr);
+         }
 
-			      
-  hypre_TFree(tmp);
+         if (ierr)  hypre_error(HYPRE_ERROR_GENERIC);
+         jj = 0;
+         for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
+         {
+            x[j_domain_dof[j]]+=  scale[j_domain_dof[j]]*tmp[jj++];
+         }
+         matrix_size_counter += matrix_size * matrix_size;
+         piv_counter += matrix_size;
+      }
+   }
 
-  return hypre_error_flag;
+   hypre_TFree(tmp);
 
+   return hypre_error_flag;
 }
 
-HYPRE_Int 
+HYPRE_Int
 hypre_GenerateScale(hypre_CSRMatrix *domain_structure,
-		    HYPRE_Int              num_variables,
-		    HYPRE_Real 	     relaxation_weight,
-		    HYPRE_Real     **scale_pointer)
+                    HYPRE_Int              num_variables,
+                    HYPRE_Real       relaxation_weight,
+                    HYPRE_Real     **scale_pointer)
 {
    HYPRE_Int num_domains = hypre_CSRMatrixNumRows(domain_structure);
    HYPRE_Int *i_domain_dof = hypre_CSRMatrixI(domain_structure);
@@ -3233,7 +2757,7 @@
 
    for (i=0; i < num_domains; i++)
       for (j = i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
-	 scale[j_domain_dof[j]] += 1.0;
+         scale[j_domain_dof[j]] += 1.0;
 
    for (i=0; i < num_variables; i++)
       scale[i] = relaxation_weight/scale[i];
@@ -3243,212 +2767,182 @@
    return hypre_error_flag;
 }
 
-
 HYPRE_Int hypre_ParAdSchwarzSolve(hypre_ParCSRMatrix *A,
-                            hypre_ParVector *F,
-                            hypre_CSRMatrix *domain_structure,
-                            HYPRE_Real *scale,
-                            hypre_ParVector *X,
-                            hypre_ParVector *Vtemp, 
-                            HYPRE_Int *pivots, 
-                            HYPRE_Int use_nonsymm)
-
+                                  hypre_ParVector *F,
+                                  hypre_CSRMatrix *domain_structure,
+                                  HYPRE_Real *scale,
+                                  hypre_ParVector *X,
+                                  hypre_ParVector *Vtemp,
+                                  HYPRE_Int *pivots,
+                                  HYPRE_Int use_nonsymm)
 {
-  hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
-  HYPRE_Int num_sends = 0;
-  HYPRE_Int *send_map_starts;
-  HYPRE_Int *send_map_elmts;
-
-  hypre_ParCSRCommHandle *comm_handle;
-
-  HYPRE_Int ierr = 0;
-  HYPRE_Real *x_data;
-  HYPRE_Real *x_ext_data;
-  HYPRE_Real *aux;
-  HYPRE_Real *vtemp_data;
-  HYPRE_Real *vtemp_ext_data;
-  HYPRE_Int num_domains, max_domain_size;
-  HYPRE_Int *i_domain_dof;
-  HYPRE_Int *j_domain_dof;
-  HYPRE_Real *domain_matrixinverse;
-  hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
-  HYPRE_Int num_variables;
-  HYPRE_Int num_cols_offd;
-  HYPRE_Real *scale_ext;
-  HYPRE_Real *buf_data;
-  HYPRE_Int index; 
- 
-  HYPRE_Int piv_counter = 0;
-  HYPRE_Int one = 1;
-
-  char uplo = 'L';
-  HYPRE_Int jj,i,j, j_loc; /*, j_loc, k_loc;*/
-
-
-  HYPRE_Int matrix_size, matrix_size_counter = 0;
-
-  /* initiate:      ----------------------------------------------- */
-  num_variables = hypre_CSRMatrixNumRows(A_diag); 
-  num_cols_offd = hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(A));
-  x_data = hypre_VectorData(hypre_ParVectorLocalVector(X));
-  vtemp_data = hypre_VectorData(hypre_ParVectorLocalVector(Vtemp));
-
-  if (use_nonsymm)
-     uplo = 'N';
-  
-  hypre_ParVectorCopy(F,Vtemp);
-  hypre_ParCSRMatrixMatvec(-1.0,A,X,1.0,Vtemp); 
-
-  /* forward solve: ----------------------------------------------- */
-
-  num_domains = hypre_CSRMatrixNumRows(domain_structure);
-  max_domain_size = hypre_CSRMatrixNumCols(domain_structure);
-  i_domain_dof = hypre_CSRMatrixI(domain_structure);
-  j_domain_dof = hypre_CSRMatrixJ(domain_structure);
-  domain_matrixinverse = hypre_CSRMatrixData(domain_structure);
-  aux = hypre_CTAlloc(HYPRE_Real, max_domain_size);
+   hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
+   HYPRE_Int num_sends = 0;
+   HYPRE_Int *send_map_starts;
+   HYPRE_Int *send_map_elmts;
 
-  if (comm_pkg)
-  {
-     num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
-     send_map_starts = hypre_ParCSRCommPkgSendMapStarts(comm_pkg);
-     send_map_elmts = hypre_ParCSRCommPkgSendMapElmts(comm_pkg);
-
-     buf_data = hypre_CTAlloc(HYPRE_Real, send_map_starts[num_sends]);
-     x_ext_data = hypre_CTAlloc(HYPRE_Real, num_cols_offd);
-     vtemp_ext_data = hypre_CTAlloc(HYPRE_Real, num_cols_offd);
-     scale_ext = hypre_CTAlloc(HYPRE_Real, num_cols_offd);
+   hypre_ParCSRCommHandle *comm_handle;
 
-     index = 0;
-     for (i=0; i < num_sends; i++)
-     {
-        for (j = send_map_starts[i]; j < send_map_starts[i+1]; j++)
-           buf_data[index++] = vtemp_data[send_map_elmts[j]];
-     }
+   HYPRE_Int ierr = 0;
+   HYPRE_Real *x_data;
+   HYPRE_Real *x_ext_data;
+   HYPRE_Real *aux;
+   HYPRE_Real *vtemp_data;
+   HYPRE_Real *vtemp_ext_data;
+   HYPRE_Int num_domains, max_domain_size;
+   HYPRE_Int *i_domain_dof;
+   HYPRE_Int *j_domain_dof;
+   HYPRE_Real *domain_matrixinverse;
+   hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
+   HYPRE_Int num_variables;
+   HYPRE_Int num_cols_offd;
+   HYPRE_Real *scale_ext;
+   HYPRE_Real *buf_data;
+   HYPRE_Int index;
+
+   HYPRE_Int piv_counter = 0;
+   HYPRE_Int one = 1;
+
+   char uplo = 'L';
+   HYPRE_Int jj,i,j, j_loc; /*, j_loc, k_loc;*/
+
+   HYPRE_Int matrix_size, matrix_size_counter = 0;
+
+   /* initiate:      ----------------------------------------------- */
+   num_variables = hypre_CSRMatrixNumRows(A_diag);
+   num_cols_offd = hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(A));
+   x_data = hypre_VectorData(hypre_ParVectorLocalVector(X));
+   vtemp_data = hypre_VectorData(hypre_ParVectorLocalVector(Vtemp));
+
+   if (use_nonsymm)
+      uplo = 'N';
+
+   hypre_ParVectorCopy(F,Vtemp);
+   hypre_ParCSRMatrixMatvec(-1.0,A,X,1.0,Vtemp);
+
+   /* forward solve: ----------------------------------------------- */
+
+   num_domains = hypre_CSRMatrixNumRows(domain_structure);
+   max_domain_size = hypre_CSRMatrixNumCols(domain_structure);
+   i_domain_dof = hypre_CSRMatrixI(domain_structure);
+   j_domain_dof = hypre_CSRMatrixJ(domain_structure);
+   domain_matrixinverse = hypre_CSRMatrixData(domain_structure);
+   aux = hypre_CTAlloc(HYPRE_Real, max_domain_size);
+
+   if (comm_pkg)
+   {
+      num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
+      send_map_starts = hypre_ParCSRCommPkgSendMapStarts(comm_pkg);
+      send_map_elmts = hypre_ParCSRCommPkgSendMapElmts(comm_pkg);
+
+      buf_data = hypre_CTAlloc(HYPRE_Real, send_map_starts[num_sends]);
+      x_ext_data = hypre_CTAlloc(HYPRE_Real, num_cols_offd);
+      vtemp_ext_data = hypre_CTAlloc(HYPRE_Real, num_cols_offd);
+      scale_ext = hypre_CTAlloc(HYPRE_Real, num_cols_offd);
+
+      index = 0;
+      for (i=0; i < num_sends; i++)
+      {
+         for (j = send_map_starts[i]; j < send_map_starts[i+1]; j++)
+            buf_data[index++] = vtemp_data[send_map_elmts[j]];
+      }
 
-     comm_handle = hypre_ParCSRCommHandleCreate(1,comm_pkg,buf_data,
-		vtemp_ext_data);
-     hypre_ParCSRCommHandleDestroy(comm_handle);
-     comm_handle = NULL;
+      comm_handle = hypre_ParCSRCommHandleCreate(1,comm_pkg,buf_data,
+                                                 vtemp_ext_data);
+      hypre_ParCSRCommHandleDestroy(comm_handle);
+      comm_handle = NULL;
 
-     index = 0;
-     for (i=0; i < num_sends; i++)
-     {
-        for (j = send_map_starts[i]; j < send_map_starts[i+1]; j++)
-           buf_data[index++] = scale[send_map_elmts[j]];
-     }
+      index = 0;
+      for (i=0; i < num_sends; i++)
+      {
+         for (j = send_map_starts[i]; j < send_map_starts[i+1]; j++)
+            buf_data[index++] = scale[send_map_elmts[j]];
+      }
 
+      comm_handle = hypre_ParCSRCommHandleCreate(1,comm_pkg,buf_data,scale_ext);
+      hypre_ParCSRCommHandleDestroy(comm_handle);
+      comm_handle = NULL;
+   }
 
-     comm_handle = hypre_ParCSRCommHandleCreate(1,comm_pkg,buf_data,scale_ext);
-     hypre_ParCSRCommHandleDestroy(comm_handle);
-     comm_handle = NULL;
-  }
+   for (i=0; i < num_cols_offd; i++)
+      x_ext_data[i] = 0.0;
 
-  for (i=0; i < num_cols_offd; i++)
-     x_ext_data[i] = 0.0;
+   matrix_size_counter = 0;
+   for (i=0; i < num_domains; i++)
+   {
+      matrix_size = i_domain_dof[i+1] - i_domain_dof[i];
 
-  matrix_size_counter = 0;
-  for (i=0; i < num_domains; i++)
-  {
-     matrix_size = i_domain_dof[i+1] - i_domain_dof[i];
-     
-     /* copy data contiguously into aux  --------------------------- */
-     
-     jj = 0;
-     for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
-     {
-        j_loc = j_domain_dof[j]; 
-        if (j_loc < num_variables)
-           aux[jj] = vtemp_data[j_loc];
-        else
-           aux[jj] = vtemp_ext_data[j_loc-num_variables];
-        jj++;
-     }
-     /* solve for correction: ------------------------------------- */
-/* OLD -  HYPRE_USING_ESSL
-     dpps(&domain_matrixinverse[matrix_size_counter], matrix_size, aux, 1);
-*/
-     if (use_nonsymm)
-     {
-        
-#ifdef HYPRE_USING_ESSL
-         dgetrs(&uplo, matrix_size, one, 
-                &domain_matrixinverse[matrix_size_counter], 
-                matrix_size, &pivots[piv_counter], aux,
-                matrix_size, &ierr); 
-#else
-         hypre_F90_NAME_LAPACK(dgetrs, DGETRS)(&uplo, &matrix_size, &one, 
-                                               &domain_matrixinverse[matrix_size_counter], 
-                                               &matrix_size, &pivots[piv_counter], aux,
-                                               &matrix_size, &ierr); 
-#endif
-     }
-     
-     else
-     {
-        
-#ifdef HYPRE_USING_ESSL
-            dpotrs(&uplo, matrix_size, one, 
-                   &domain_matrixinverse[matrix_size_counter], 
-                   matrix_size, aux,
-                   matrix_size, &ierr); 
-#else
-            hypre_F90_NAME_LAPACK(dpotrs, DPOTRS)(&uplo, &matrix_size, &one, 
-                                               &domain_matrixinverse[matrix_size_counter], 
-                                               &matrix_size, aux,
-                                               &matrix_size, &ierr); 
-#endif
-     }
-     
-     if (ierr)  hypre_error(HYPRE_ERROR_GENERIC);
-     jj = 0;
-     for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
-     {
-        j_loc = j_domain_dof[j]; 
-        if (j_loc < num_variables)
-           x_data[j_loc]+= scale[j_loc] * aux[jj++];
-        else
-        {
-           j_loc -= num_variables;
-           x_ext_data[j_loc] += scale_ext[j_loc] * aux[jj++];
-        }
-     }
-/* OLD -  HYPRE_USING_ESSL
-     matrix_size_counter += matrix_size * (matrix_size+1)/2;  
-*/
-     matrix_size_counter += matrix_size * matrix_size;
-     piv_counter += matrix_size;
+      /* copy data contiguously into aux  --------------------------- */
 
-     
-  }
+      jj = 0;
+      for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
+      {
+         j_loc = j_domain_dof[j];
+         if (j_loc < num_variables)
+            aux[jj] = vtemp_data[j_loc];
+         else
+            aux[jj] = vtemp_ext_data[j_loc-num_variables];
+         jj++;
+      }
+      /* solve for correction: ------------------------------------- */
+      if (use_nonsymm)
+      {
+         hypre_dgetrs(&uplo, &matrix_size, &one,
+                      &domain_matrixinverse[matrix_size_counter],
+                      &matrix_size, &pivots[piv_counter], aux,
+                      &matrix_size, &ierr);
+      }
+      else
+      {
+         hypre_dpotrs(&uplo, &matrix_size, &one,
+                      &domain_matrixinverse[matrix_size_counter],
+                      &matrix_size, aux,
+                      &matrix_size, &ierr);
+      }
 
-  if (comm_pkg)
-  {
-     comm_handle=hypre_ParCSRCommHandleCreate (2,comm_pkg,x_ext_data,buf_data); 
+      if (ierr)  hypre_error(HYPRE_ERROR_GENERIC);
+      jj = 0;
+      for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
+      {
+         j_loc = j_domain_dof[j];
+         if (j_loc < num_variables)
+            x_data[j_loc]+= scale[j_loc] * aux[jj++];
+         else
+         {
+            j_loc -= num_variables;
+            x_ext_data[j_loc] += scale_ext[j_loc] * aux[jj++];
+         }
+      }
+      matrix_size_counter += matrix_size * matrix_size;
+      piv_counter += matrix_size;
+   }
 
-     hypre_ParCSRCommHandleDestroy(comm_handle);
-     comm_handle = NULL;
+   if (comm_pkg)
+   {
+      comm_handle=hypre_ParCSRCommHandleCreate (2,comm_pkg,x_ext_data,buf_data);
 
-     index = 0;
-     for (i=0; i < num_sends; i++)
-     {
-        for (j = send_map_starts[i]; j < send_map_starts[i+1]; j++)
-	   x_data[send_map_elmts[j]] += buf_data[index++];
-     }
+      hypre_ParCSRCommHandleDestroy(comm_handle);
+      comm_handle = NULL;
 
-     hypre_TFree(buf_data);
-     hypre_TFree(x_ext_data);
-     hypre_TFree(vtemp_ext_data);
-     hypre_TFree(scale_ext);
-  }
-  hypre_TFree(aux);
+      index = 0;
+      for (i=0; i < num_sends; i++)
+      {
+         for (j = send_map_starts[i]; j < send_map_starts[i+1]; j++)
+            x_data[send_map_elmts[j]] += buf_data[index++];
+      }
 
-  return hypre_error_flag;
+      hypre_TFree(buf_data);
+      hypre_TFree(x_ext_data);
+      hypre_TFree(vtemp_ext_data);
+      hypre_TFree(scale_ext);
+   }
+   hypre_TFree(aux);
 
+   return hypre_error_flag;
 }
 
 /*--------------------------------------------------------------------------
- * hypre_ParAMGCreateDomainDof:  
+ * hypre_ParAMGCreateDomainDof:
  *--------------------------------------------------------------------------*/
 
 /*****************************************************************************
@@ -3456,7 +2950,7 @@
  * Routine for constructing graph domain_dof with minimal overlap
  *             and computing the respective matrix inverses to be
  *             used in an overlapping additive Schwarz procedure (smoother
- *             in AMG); 
+ *             in AMG);
  *
  *****************************************************************************/
 HYPRE_Int
@@ -3467,556 +2961,512 @@
                             HYPRE_Int **piv_pointer, HYPRE_Int use_nonsymm)
 
 {
-  hypre_CSRMatrix *domain_structure = NULL;
-  HYPRE_Int *i_domain_dof, *j_domain_dof;
-  HYPRE_Real *domain_matrixinverse;
-  HYPRE_Int num_domains;
-
-  hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
-  HYPRE_Int *a_diag_i = hypre_CSRMatrixI(A_diag);
-  HYPRE_Int *a_diag_j = hypre_CSRMatrixJ(A_diag);
-  HYPRE_Real *a_diag_data = hypre_CSRMatrixData(A_diag);
-  HYPRE_Int num_variables = hypre_CSRMatrixNumRows(A_diag);
-  HYPRE_Int first_col_diag = hypre_ParCSRMatrixFirstColDiag(A);
-  HYPRE_Int col_0 = first_col_diag -1 ;
-  HYPRE_Int col_n = first_col_diag + num_variables;
-
-  hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
-  HYPRE_Int *a_offd_i = hypre_CSRMatrixI(A_offd);
-  HYPRE_Int *a_offd_j = hypre_CSRMatrixJ(A_offd);
-  HYPRE_Real *a_offd_data = hypre_CSRMatrixData(A_offd);
-  HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(A_offd);
-  HYPRE_Int *col_map_offd = hypre_ParCSRMatrixColMapOffd(A);
-
-  hypre_CSRMatrix *A_ext;
-  HYPRE_Int *a_ext_i;
-  HYPRE_Int *a_ext_j;
-  HYPRE_Real *a_ext_data;
-
-  /* HYPRE_Int *i_dof_to_accept_weight; */
-  HYPRE_Int *i_dof_to_prefer_weight,
-    *w_dof_dof, *i_dof_weight;
-  HYPRE_Int *i_dof_to_aggregate, *i_aggregate_dof, *j_aggregate_dof;
-  
-  HYPRE_Int *i_dof_index;
-  HYPRE_Int *i_dof_index_offd;
-  HYPRE_Int *i_proc;
-  /* HYPRE_Int *row_starts = hypre_ParCSRMatrixRowStarts(A);*/
-  hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
-  HYPRE_Int num_recvs = 0;
-  HYPRE_Int *recv_vec_starts = NULL;
-
-  HYPRE_Int ierr = 0;
-  HYPRE_Int i,j,k, jj,  l_loc, i_loc, j_loc;
-  HYPRE_Int i_dof;
-  HYPRE_Int nf;
-  HYPRE_Int *i_local_to_global;
-  HYPRE_Int *i_global_to_local;
-
-  HYPRE_Int local_dof_counter, max_local_dof_counter=0; 
-
-  HYPRE_Int domain_dof_counter = 0, domain_matrixinverse_counter = 0;
-
-  HYPRE_Real *AE;
-
- 
-  HYPRE_Int *ipiv;
-  char uplo = 'L';
-  HYPRE_Int piv_counter;
-  HYPRE_Int *piv = NULL;
+   hypre_CSRMatrix *domain_structure = NULL;
+   HYPRE_Int *i_domain_dof, *j_domain_dof;
+   HYPRE_Real *domain_matrixinverse;
+   HYPRE_Int num_domains;
+
+   hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
+   HYPRE_Int *a_diag_i = hypre_CSRMatrixI(A_diag);
+   HYPRE_Int *a_diag_j = hypre_CSRMatrixJ(A_diag);
+   HYPRE_Real *a_diag_data = hypre_CSRMatrixData(A_diag);
+   HYPRE_Int num_variables = hypre_CSRMatrixNumRows(A_diag);
+   HYPRE_Int first_col_diag = hypre_ParCSRMatrixFirstColDiag(A);
+   HYPRE_Int col_0 = first_col_diag -1 ;
+   HYPRE_Int col_n = first_col_diag + num_variables;
+
+   hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
+   HYPRE_Int *a_offd_i = hypre_CSRMatrixI(A_offd);
+   HYPRE_Int *a_offd_j = hypre_CSRMatrixJ(A_offd);
+   HYPRE_Real *a_offd_data = hypre_CSRMatrixData(A_offd);
+   HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(A_offd);
+   HYPRE_Int *col_map_offd = hypre_ParCSRMatrixColMapOffd(A);
 
+   hypre_CSRMatrix *A_ext;
+   HYPRE_Int *a_ext_i;
+   HYPRE_Int *a_ext_j;
+   HYPRE_Real *a_ext_data;
+
+   /* HYPRE_Int *i_dof_to_accept_weight; */
+   HYPRE_Int *i_dof_to_prefer_weight,
+      *w_dof_dof, *i_dof_weight;
+   HYPRE_Int *i_dof_to_aggregate, *i_aggregate_dof, *j_aggregate_dof;
+
+   HYPRE_Int *i_dof_index;
+   HYPRE_Int *i_dof_index_offd;
+   HYPRE_Int *i_proc;
+   /* HYPRE_Int *row_starts = hypre_ParCSRMatrixRowStarts(A);*/
+   hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
+   HYPRE_Int num_recvs = 0;
+   HYPRE_Int *recv_vec_starts = NULL;
 
-  HYPRE_Int cnt, indx;
-  HYPRE_Int num_procs, my_id;
+   HYPRE_Int ierr = 0;
+   HYPRE_Int i,j,k, jj,  l_loc, i_loc, j_loc;
+   HYPRE_Int i_dof;
+   HYPRE_Int nf;
+   HYPRE_Int *i_local_to_global;
+   HYPRE_Int *i_global_to_local;
 
+   HYPRE_Int local_dof_counter, max_local_dof_counter=0;
 
+   HYPRE_Int domain_dof_counter = 0, domain_matrixinverse_counter = 0;
 
-  if (num_variables == 0)
-  {
-     *domain_structure_pointer = domain_structure;
+   HYPRE_Real *AE;
 
-     *piv_pointer = piv;
 
-     return hypre_error_flag;
-  }
+   HYPRE_Int *ipiv;
+   char uplo = 'L';
+   HYPRE_Int piv_counter;
+   HYPRE_Int *piv = NULL;
 
+   HYPRE_Int cnt, indx;
+   HYPRE_Int num_procs, my_id;
 
-  hypre_MPI_Comm_size(hypre_ParCSRMatrixComm(A),&num_procs);
-  hypre_MPI_Comm_size(hypre_ParCSRMatrixComm(A),&my_id);
+   if (num_variables == 0)
+   {
+      *domain_structure_pointer = domain_structure;
 
-  /* --------------------------------------------------------------------- */
+      *piv_pointer = piv;
 
-  /*=======================================================================*/
-  /*    create artificial domains by agglomeration;                        */
-  /*=======================================================================*/
+      return hypre_error_flag;
+   }
 
-  /*hypre_printf("----------- create artificials domain by agglomeration;  ======\n");
-*/
-  i_aggregate_dof = hypre_CTAlloc(HYPRE_Int,num_variables+1);
-  j_aggregate_dof = hypre_CTAlloc (HYPRE_Int,num_variables);
+   hypre_MPI_Comm_size(hypre_ParCSRMatrixComm(A),&num_procs);
+   hypre_MPI_Comm_size(hypre_ParCSRMatrixComm(A),&my_id);
 
-  if (domain_type == 2)
-  {
-     i_dof_to_prefer_weight = hypre_CTAlloc (HYPRE_Int, num_variables);
-     w_dof_dof = hypre_CTAlloc (HYPRE_Int, a_diag_i[num_variables]);
+   /* --------------------------------------------------------------------- */
 
-     for (i=0; i<num_variables; i++)
-        for (j=a_diag_i[i]; j< a_diag_i[i+1]; j++)
-        {
-	   if (a_diag_j[j] == i) 
-	      w_dof_dof[j]=0;
-	   else
-	      w_dof_dof[j]=1;
-        }
+   /*=======================================================================*/
+   /*    create artificial domains by agglomeration;                        */
+   /*=======================================================================*/
 
+   /*hypre_printf("----------- create artificials domain by agglomeration;  ======\n");
+    */
+   i_aggregate_dof = hypre_CTAlloc(HYPRE_Int,num_variables+1);
+   j_aggregate_dof = hypre_CTAlloc (HYPRE_Int,num_variables);
 
-     /*hypre_printf("end computing weights for agglomeration procedure: --------\n");
-*/
+   if (domain_type == 2)
+   {
+      i_dof_to_prefer_weight = hypre_CTAlloc (HYPRE_Int, num_variables);
+      w_dof_dof = hypre_CTAlloc (HYPRE_Int, a_diag_i[num_variables]);
 
-     i_dof_weight = hypre_CTAlloc (HYPRE_Int, num_variables);
-     hypre_AMGeAgglomerate(i_aggregate_dof, j_aggregate_dof,
-			       a_diag_i, a_diag_j, w_dof_dof,
-			       a_diag_i, a_diag_j,
-			       a_diag_i, a_diag_j,
-			       i_dof_to_prefer_weight,
-			       i_dof_weight,
-			       num_variables, num_variables,
-			       &num_domains);
-
-     hypre_TFree(i_dof_to_prefer_weight);
-     hypre_TFree(i_dof_weight);
-     hypre_TFree(w_dof_dof);
+      for (i=0; i<num_variables; i++)
+         for (j=a_diag_i[i]; j< a_diag_i[i+1]; j++)
+         {
+            if (a_diag_j[j] == i)
+               w_dof_dof[j]=0;
+            else
+               w_dof_dof[j]=1;
+         }
 
-  }
-  else
-  {
-     nf = 1;
-     if (domain_type == 1) nf = num_functions;
+      /*hypre_printf("end computing weights for agglomeration procedure: --------\n");
+       */
 
-     num_domains = num_variables/nf;
-     for (i=0; i < num_domains+1; i++)
-     {
-        i_aggregate_dof[i] = nf*i;
-     }
-     for (i=0; i < num_variables; i++)
-     {
-        j_aggregate_dof[i] = i;
-     }
-  }
-  /*hypre_printf("num_variables: %d, num_domains: %d\n", num_variables, num_domains);
-*/
-  if (overlap == 1)
-  {
-     i_domain_dof = hypre_CTAlloc(HYPRE_Int, num_domains+1);
+      i_dof_weight = hypre_CTAlloc (HYPRE_Int, num_variables);
+      hypre_AMGeAgglomerate(i_aggregate_dof, j_aggregate_dof,
+                            a_diag_i, a_diag_j, w_dof_dof,
+                            a_diag_i, a_diag_j,
+                            a_diag_i, a_diag_j,
+                            i_dof_to_prefer_weight,
+                            i_dof_weight,
+                            num_variables, num_variables,
+                            &num_domains);
+
+      hypre_TFree(i_dof_to_prefer_weight);
+      hypre_TFree(i_dof_weight);
+      hypre_TFree(w_dof_dof);
+   }
+   else
+   {
+      nf = 1;
+      if (domain_type == 1) nf = num_functions;
 
-     i_dof_to_aggregate = hypre_CTAlloc (HYPRE_Int, num_variables);
-     for (i=0; i < num_domains; i++)
-        for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
-           i_dof_to_aggregate[j_aggregate_dof[j]] = i;
+      num_domains = num_variables/nf;
+      for (i=0; i < num_domains+1; i++)
+      {
+         i_aggregate_dof[i] = nf*i;
+      }
+      for (i=0; i < num_variables; i++)
+      {
+         j_aggregate_dof[i] = i;
+      }
+   }
+   /*hypre_printf("num_variables: %d, num_domains: %d\n", num_variables, num_domains);
+    */
+   if (overlap == 1)
+   {
+      i_domain_dof = hypre_CTAlloc(HYPRE_Int, num_domains+1);
 
-     i_proc = hypre_CTAlloc (HYPRE_Int, num_cols_offd);
+      i_dof_to_aggregate = hypre_CTAlloc (HYPRE_Int, num_variables);
+      for (i=0; i < num_domains; i++)
+         for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
+            i_dof_to_aggregate[j_aggregate_dof[j]] = i;
 
-     for (i=0; i < num_cols_offd; i++)
-	i_proc[i] = 0;
+      i_proc = hypre_CTAlloc (HYPRE_Int, num_cols_offd);
 
-     if (comm_pkg)
-     {
-	num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg);    
-	recv_vec_starts = hypre_ParCSRCommPkgRecvVecStarts(comm_pkg);    
-     }
-     else if (num_procs > 1)
-     {
+      for (i=0; i < num_cols_offd; i++)
+         i_proc[i] = 0;
 
-        hypre_MatvecCommPkgCreate(A);
+      if (comm_pkg)
+      {
+         num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg);
+         recv_vec_starts = hypre_ParCSRCommPkgRecvVecStarts(comm_pkg);
+      }
+      else if (num_procs > 1)
+      {
 
-        comm_pkg = hypre_ParCSRMatrixCommPkg(A);
-	num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg);    
-	recv_vec_starts = hypre_ParCSRCommPkgRecvVecStarts(comm_pkg);    
-     }
+         hypre_MatvecCommPkgCreate(A);
 
-     for (i=0; i < num_recvs; i++)
-	for (indx=recv_vec_starts[i]; indx < recv_vec_starts[i+1]; indx++)
-              i_proc[indx] = i;
+         comm_pkg = hypre_ParCSRMatrixCommPkg(A);
+         num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg);
+         recv_vec_starts = hypre_ParCSRCommPkgRecvVecStarts(comm_pkg);
+      }
 
-     /* make domains from aggregates: *********************************/
+      for (i=0; i < num_recvs; i++)
+         for (indx=recv_vec_starts[i]; indx < recv_vec_starts[i+1]; indx++)
+            i_proc[indx] = i;
 
-     i_dof_index = hypre_CTAlloc (HYPRE_Int, num_variables);
+      /* make domains from aggregates: *********************************/
 
-     i_dof_index_offd = hypre_CTAlloc (HYPRE_Int, num_cols_offd);
+      i_dof_index = hypre_CTAlloc (HYPRE_Int, num_variables);
 
-     for (i=0; i < num_variables; i++)
-        i_dof_index[i] = -1;
+      i_dof_index_offd = hypre_CTAlloc (HYPRE_Int, num_cols_offd);
 
-     for (i=0; i < num_cols_offd; i++)
-        i_dof_index_offd[i] = -1;
+      for (i=0; i < num_variables; i++)
+         i_dof_index[i] = -1;
 
-     domain_dof_counter=0;
-     for (i=0; i < num_domains; i++)
-     {
-        i_domain_dof[i] =  domain_dof_counter;
-        for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
-        {
-	   i_dof_index[j_aggregate_dof[j]]=-1;
-     	}
-        for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
-        {
-           for (k=a_diag_i[j_aggregate_dof[j]];
-	          	k<a_diag_i[j_aggregate_dof[j]+1]; k++)
-	      if (i_dof_to_aggregate[a_diag_j[k]] >= i 
-	     	&& i_dof_index[a_diag_j[k]]==-1)
-	      {
-	         i_dof_index[a_diag_j[k]]++;
-	         domain_dof_counter++;
-	      }
-           for (k=a_offd_i[j_aggregate_dof[j]];
-	    		k<a_offd_i[j_aggregate_dof[j]+1]; k++)
-	      if (i_proc[a_offd_j[k]] > my_id
-	     	&& i_dof_index_offd[a_offd_j[k]]==-1)
-	      {
-	         i_dof_index_offd[a_offd_j[k]]++;
-	         domain_dof_counter++;
-	      }
-        }
-     }
+      for (i=0; i < num_cols_offd; i++)
+         i_dof_index_offd[i] = -1;
 
-     for (i=0; i < num_variables; i++)
-        i_dof_index[i] = -1;
+      domain_dof_counter=0;
+      for (i=0; i < num_domains; i++)
+      {
+         i_domain_dof[i] =  domain_dof_counter;
+         for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
+         {
+            i_dof_index[j_aggregate_dof[j]]=-1;
+         }
+         for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
+         {
+            for (k=a_diag_i[j_aggregate_dof[j]];
+                 k<a_diag_i[j_aggregate_dof[j]+1]; k++)
+               if (i_dof_to_aggregate[a_diag_j[k]] >= i
+                   && i_dof_index[a_diag_j[k]]==-1)
+               {
+                  i_dof_index[a_diag_j[k]]++;
+                  domain_dof_counter++;
+               }
+            for (k=a_offd_i[j_aggregate_dof[j]];
+                 k<a_offd_i[j_aggregate_dof[j]+1]; k++)
+               if (i_proc[a_offd_j[k]] > my_id
+                   && i_dof_index_offd[a_offd_j[k]]==-1)
+               {
+                  i_dof_index_offd[a_offd_j[k]]++;
+                  domain_dof_counter++;
+               }
+         }
+      }
 
-     for (i=0; i < num_cols_offd; i++)
-        i_dof_index_offd[i] = -1;
+      for (i=0; i < num_variables; i++)
+         i_dof_index[i] = -1;
 
-     i_domain_dof[num_domains] =  domain_dof_counter;
-     j_domain_dof = hypre_CTAlloc (HYPRE_Int, domain_dof_counter);
+      for (i=0; i < num_cols_offd; i++)
+         i_dof_index_offd[i] = -1;
 
-     domain_dof_counter=0;
-     for (i=0; i < num_domains; i++)
-     {
-        for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
-        {
-	   i_dof_index[j_aggregate_dof[j]]=-1;
-     	}
-        for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
-        {
-	   for (k=a_diag_i[j_aggregate_dof[j]];
-	     		k<a_diag_i[j_aggregate_dof[j]+1]; k++)
-	      if (i_dof_to_aggregate[a_diag_j[k]] >= i
-	      		&& i_dof_index[a_diag_j[k]]==-1)
-	      {
-	         i_dof_index[a_diag_j[k]]++;
-	         j_domain_dof[domain_dof_counter] = a_diag_j[k];
-	         domain_dof_counter++;
-	      }
-	   for (k=a_offd_i[j_aggregate_dof[j]];
-	     		k<a_offd_i[j_aggregate_dof[j]+1]; k++)
-	      if (i_proc[a_offd_j[k]] > my_id
-	      		&& i_dof_index_offd[a_offd_j[k]]==-1)
-	      {
-	         i_dof_index_offd[a_offd_j[k]]++;
-	         j_domain_dof[domain_dof_counter] = a_offd_j[k]+num_variables;
-	         domain_dof_counter++;
-	      }
-         }
-     }
+      i_domain_dof[num_domains] =  domain_dof_counter;
+      j_domain_dof = hypre_CTAlloc (HYPRE_Int, domain_dof_counter);
 
-     hypre_TFree(i_aggregate_dof);
-     hypre_TFree(j_aggregate_dof);
-     hypre_TFree(i_dof_to_aggregate);
-     hypre_TFree(i_dof_index);
-     hypre_TFree(i_dof_index_offd);
-     hypre_TFree(i_proc);
-  }
-  else if (overlap == 2)
-  {
-     i_domain_dof = hypre_CTAlloc(HYPRE_Int, num_domains+1);
+      domain_dof_counter=0;
+      for (i=0; i < num_domains; i++)
+      {
+         for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
+         {
+            i_dof_index[j_aggregate_dof[j]]=-1;
+         }
+         for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
+         {
+            for (k=a_diag_i[j_aggregate_dof[j]];
+                 k<a_diag_i[j_aggregate_dof[j]+1]; k++)
+               if (i_dof_to_aggregate[a_diag_j[k]] >= i
+                   && i_dof_index[a_diag_j[k]]==-1)
+               {
+                  i_dof_index[a_diag_j[k]]++;
+                  j_domain_dof[domain_dof_counter] = a_diag_j[k];
+                  domain_dof_counter++;
+               }
+            for (k=a_offd_i[j_aggregate_dof[j]];
+                 k<a_offd_i[j_aggregate_dof[j]+1]; k++)
+               if (i_proc[a_offd_j[k]] > my_id
+                   && i_dof_index_offd[a_offd_j[k]]==-1)
+               {
+                  i_dof_index_offd[a_offd_j[k]]++;
+                  j_domain_dof[domain_dof_counter] = a_offd_j[k]+num_variables;
+                  domain_dof_counter++;
+               }
+         }
+      }
 
-     i_dof_to_aggregate = hypre_CTAlloc (HYPRE_Int, num_variables);
-     for (i=0; i < num_domains; i++)
-        for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
-           i_dof_to_aggregate[j_aggregate_dof[j]] = i;
+      hypre_TFree(i_aggregate_dof);
+      hypre_TFree(j_aggregate_dof);
+      hypre_TFree(i_dof_to_aggregate);
+      hypre_TFree(i_dof_index);
+      hypre_TFree(i_dof_index_offd);
+      hypre_TFree(i_proc);
+   }
+   else if (overlap == 2)
+   {
+      i_domain_dof = hypre_CTAlloc(HYPRE_Int, num_domains+1);
 
-     /* make domains from aggregates: *********************************/
+      i_dof_to_aggregate = hypre_CTAlloc (HYPRE_Int, num_variables);
+      for (i=0; i < num_domains; i++)
+         for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
+            i_dof_to_aggregate[j_aggregate_dof[j]] = i;
 
-     i_dof_index = hypre_CTAlloc (HYPRE_Int, num_variables);
+      /* make domains from aggregates: *********************************/
 
-     i_dof_index_offd = hypre_CTAlloc (HYPRE_Int, num_cols_offd);
+      i_dof_index = hypre_CTAlloc (HYPRE_Int, num_variables);
 
-     for (i=0; i < num_variables; i++)
-        i_dof_index[i] = -1;
+      i_dof_index_offd = hypre_CTAlloc (HYPRE_Int, num_cols_offd);
 
-     for (i=0; i < num_cols_offd; i++)
-        i_dof_index_offd[i] = -1;
+      for (i=0; i < num_variables; i++)
+         i_dof_index[i] = -1;
 
-     domain_dof_counter=0;
-     for (i=0; i < num_domains; i++)
-     {
-        i_domain_dof[i] =  domain_dof_counter;
-        for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
-        {
-           for (k=a_diag_i[j_aggregate_dof[j]];
-	          	k<a_diag_i[j_aggregate_dof[j]+1]; k++)
-	      if ( i_dof_index[a_diag_j[k]]==-1)
-	      {
-	         i_dof_index[a_diag_j[k]]++;
-	         domain_dof_counter++;
-	      }
-           for (k=a_offd_i[j_aggregate_dof[j]];
-	    		k<a_offd_i[j_aggregate_dof[j]+1]; k++)
-	      if ( i_dof_index_offd[a_offd_j[k]]==-1)
-	      {
-	         i_dof_index_offd[a_offd_j[k]]++;
-	         domain_dof_counter++;
-	      }
-        }
-        for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
-        {
-           for (k=a_diag_i[j_aggregate_dof[j]];
-	    		k<a_diag_i[j_aggregate_dof[j]+1]; k++)
-	      i_dof_index[a_diag_j[k]]=-1;
-           for (k=a_offd_i[j_aggregate_dof[j]];
-	    		k<a_offd_i[j_aggregate_dof[j]+1]; k++)
-	      i_dof_index_offd[a_offd_j[k]]=-1;
-     	}
+      for (i=0; i < num_cols_offd; i++)
+         i_dof_index_offd[i] = -1;
 
-     }
+      domain_dof_counter=0;
+      for (i=0; i < num_domains; i++)
+      {
+         i_domain_dof[i] =  domain_dof_counter;
+         for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
+         {
+            for (k=a_diag_i[j_aggregate_dof[j]];
+                 k<a_diag_i[j_aggregate_dof[j]+1]; k++)
+               if ( i_dof_index[a_diag_j[k]]==-1)
+               {
+                  i_dof_index[a_diag_j[k]]++;
+                  domain_dof_counter++;
+               }
+            for (k=a_offd_i[j_aggregate_dof[j]];
+                 k<a_offd_i[j_aggregate_dof[j]+1]; k++)
+               if ( i_dof_index_offd[a_offd_j[k]]==-1)
+               {
+                  i_dof_index_offd[a_offd_j[k]]++;
+                  domain_dof_counter++;
+               }
+         }
+         for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
+         {
+            for (k=a_diag_i[j_aggregate_dof[j]];
+                 k<a_diag_i[j_aggregate_dof[j]+1]; k++)
+               i_dof_index[a_diag_j[k]]=-1;
+            for (k=a_offd_i[j_aggregate_dof[j]];
+                 k<a_offd_i[j_aggregate_dof[j]+1]; k++)
+               i_dof_index_offd[a_offd_j[k]]=-1;
+         }
+      }
 
-     for (i=0; i < num_variables; i++)
-        i_dof_index[i] = -1;
+      for (i=0; i < num_variables; i++)
+         i_dof_index[i] = -1;
 
-     for (i=0; i < num_cols_offd; i++)
-        i_dof_index_offd[i] = -1;
+      for (i=0; i < num_cols_offd; i++)
+         i_dof_index_offd[i] = -1;
 
-     i_domain_dof[num_domains] =  domain_dof_counter;
-     j_domain_dof = hypre_CTAlloc (HYPRE_Int, domain_dof_counter);
+      i_domain_dof[num_domains] =  domain_dof_counter;
+      j_domain_dof = hypre_CTAlloc (HYPRE_Int, domain_dof_counter);
 
-     domain_dof_counter=0;
-     for (i=0; i < num_domains; i++)
-     {
-        for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
-        {
-	   for (k=a_diag_i[j_aggregate_dof[j]];
-	     		k<a_diag_i[j_aggregate_dof[j]+1]; k++)
-	      if ( i_dof_index[a_diag_j[k]]==-1)
-	      {
-	         i_dof_index[a_diag_j[k]]++;
-	         j_domain_dof[domain_dof_counter] = a_diag_j[k];
-	         domain_dof_counter++;
-	      }
-	   for (k=a_offd_i[j_aggregate_dof[j]];
-	     		k<a_offd_i[j_aggregate_dof[j]+1]; k++)
-	      if ( i_dof_index_offd[a_offd_j[k]]==-1)
-	      {
-	         i_dof_index_offd[a_offd_j[k]]++;
-	         j_domain_dof[domain_dof_counter] = a_offd_j[k]+num_variables;
-	         domain_dof_counter++;
-	      }
+      domain_dof_counter=0;
+      for (i=0; i < num_domains; i++)
+      {
+         for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
+         {
+            for (k=a_diag_i[j_aggregate_dof[j]];
+                 k<a_diag_i[j_aggregate_dof[j]+1]; k++)
+               if ( i_dof_index[a_diag_j[k]]==-1)
+               {
+                  i_dof_index[a_diag_j[k]]++;
+                  j_domain_dof[domain_dof_counter] = a_diag_j[k];
+                  domain_dof_counter++;
+               }
+            for (k=a_offd_i[j_aggregate_dof[j]];
+                 k<a_offd_i[j_aggregate_dof[j]+1]; k++)
+               if ( i_dof_index_offd[a_offd_j[k]]==-1)
+               {
+                  i_dof_index_offd[a_offd_j[k]]++;
+                  j_domain_dof[domain_dof_counter] = a_offd_j[k]+num_variables;
+                  domain_dof_counter++;
+               }
          }
 
          for (j=i_aggregate_dof[i]; j < i_aggregate_dof[i+1]; j++)
          {
-	    for (k=a_diag_i[j_aggregate_dof[j]];
-	     	k<a_diag_i[j_aggregate_dof[j]+1]; k++)
-	    i_dof_index[a_diag_j[k]]=-1;
-	    for (k=a_offd_i[j_aggregate_dof[j]];
-	     	k<a_offd_i[j_aggregate_dof[j]+1]; k++)
-	       i_dof_index_offd[a_offd_j[k]]=-1;
+            for (k=a_diag_i[j_aggregate_dof[j]];
+                 k<a_diag_i[j_aggregate_dof[j]+1]; k++)
+               i_dof_index[a_diag_j[k]]=-1;
+            for (k=a_offd_i[j_aggregate_dof[j]];
+                 k<a_offd_i[j_aggregate_dof[j]+1]; k++)
+               i_dof_index_offd[a_offd_j[k]]=-1;
          }
+      }
 
-     }
-
-     hypre_TFree(i_aggregate_dof);
-     hypre_TFree(j_aggregate_dof);
-     hypre_TFree(i_dof_to_aggregate);
-     hypre_TFree(i_dof_index);
-     hypre_TFree(i_dof_index_offd);
-  }
-  else
-  {
-     i_domain_dof = i_aggregate_dof;
-     j_domain_dof = j_aggregate_dof;
-  }
+      hypre_TFree(i_aggregate_dof);
+      hypre_TFree(j_aggregate_dof);
+      hypre_TFree(i_dof_to_aggregate);
+      hypre_TFree(i_dof_index);
+      hypre_TFree(i_dof_index_offd);
+   }
+   else
+   {
+      i_domain_dof = i_aggregate_dof;
+      j_domain_dof = j_aggregate_dof;
+   }
 
-  /*hypre_printf("END domain_dof computations: =================================\n");
-*/
-  domain_matrixinverse_counter = 0;
-  local_dof_counter = 0;
-  piv_counter = 0;
+   /*hypre_printf("END domain_dof computations: =================================\n");
+    */
+   domain_matrixinverse_counter = 0;
+   local_dof_counter = 0;
+   piv_counter = 0;
 
-  for (i=0; i < num_domains; i++)
-  {
-     local_dof_counter = i_domain_dof[i+1]-i_domain_dof[i];
-/* OLD- HYPRE_USING_ESSL
-      domain_matrixinverse_counter+= local_dof_counter*(local_dof_counter+1)/2;
-*/
+   for (i=0; i < num_domains; i++)
+   {
+      local_dof_counter = i_domain_dof[i+1]-i_domain_dof[i];
       domain_matrixinverse_counter+= local_dof_counter * local_dof_counter;
       piv_counter += local_dof_counter;
 
       if (local_dof_counter > max_local_dof_counter)
-	max_local_dof_counter = local_dof_counter;
-    }
-
-  domain_matrixinverse = hypre_CTAlloc(HYPRE_Real, domain_matrixinverse_counter);
-  if (use_nonsymm)
-     piv = hypre_CTAlloc(HYPRE_Int, piv_counter);
+         max_local_dof_counter = local_dof_counter;
+   }
 
-/* OLD- HYPRE_USING_ESSL
-  AE = hypre_CTAlloc(HYPRE_Real, max_local_dof_counter*max_local_dof_counter);
-*/
+   domain_matrixinverse = hypre_CTAlloc(HYPRE_Real, domain_matrixinverse_counter);
+   if (use_nonsymm)
+      piv = hypre_CTAlloc(HYPRE_Int, piv_counter);
 
-  if (num_procs > 1)
-  {
-     A_ext = hypre_ParCSRMatrixExtractBExt(A,A,1);
-     a_ext_i = hypre_CSRMatrixI(A_ext);
-     a_ext_j = hypre_CSRMatrixJ(A_ext);
-     a_ext_data = hypre_CSRMatrixData(A_ext);
-  }
-  else 
-     A_ext = NULL;
+   if (num_procs > 1)
+   {
+      A_ext = hypre_ParCSRMatrixExtractBExt(A,A,1);
+      a_ext_i = hypre_CSRMatrixI(A_ext);
+      a_ext_j = hypre_CSRMatrixJ(A_ext);
+      a_ext_data = hypre_CSRMatrixData(A_ext);
+   }
+   else
+      A_ext = NULL;
 
-  i_local_to_global = hypre_CTAlloc(HYPRE_Int, max_local_dof_counter);
+   i_local_to_global = hypre_CTAlloc(HYPRE_Int, max_local_dof_counter);
 
-  i_global_to_local = hypre_CTAlloc(HYPRE_Int, num_variables+num_cols_offd);
+   i_global_to_local = hypre_CTAlloc(HYPRE_Int, num_variables+num_cols_offd);
 
-  for (i=0; i < num_variables+num_cols_offd; i++)
-    i_global_to_local[i] = -1;
+   for (i=0; i < num_variables+num_cols_offd; i++)
+      i_global_to_local[i] = -1;
 
-  piv_counter = 0;
-  domain_matrixinverse_counter = 0;
-  for (i=0; i < num_domains; i++)
-    {
+   piv_counter = 0;
+   domain_matrixinverse_counter = 0;
+   for (i=0; i < num_domains; i++)
+   {
       local_dof_counter = 0;
       for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
-	{
-	  i_global_to_local[j_domain_dof[j]] = local_dof_counter;
-	  i_local_to_global[local_dof_counter] = j_domain_dof[j];
-	  local_dof_counter++;
-	}
+      {
+         i_global_to_local[j_domain_dof[j]] = local_dof_counter;
+         i_local_to_global[local_dof_counter] = j_domain_dof[j];
+         local_dof_counter++;
+      }
 
       /* get local matrix in AE: ======================================== */
 
       AE = &domain_matrixinverse[domain_matrixinverse_counter];
       ipiv = &piv[piv_counter];
 
-      cnt = 0;  
+      cnt = 0;
       for (i_loc=0; i_loc < local_dof_counter; i_loc++)
-	for (j_loc=0; j_loc < local_dof_counter; j_loc++)
-	  AE[cnt++] = 0.e0;
+         for (j_loc=0; j_loc < local_dof_counter; j_loc++)
+            AE[cnt++] = 0.e0;
 
       for (i_loc=0; i_loc < local_dof_counter; i_loc++)
-	{
-	  i_dof = i_local_to_global[i_loc];
-	  if (i_dof < num_variables)
-	  {
-	    for (j=a_diag_i[i_dof]; j < a_diag_i[i_dof+1]; j++)
-	    {
-	      j_loc = i_global_to_local[a_diag_j[j]];
-	      if (j_loc >=0)
-		AE[i_loc + j_loc * local_dof_counter] = a_diag_data[j];
-	    }
-	    for (j=a_offd_i[i_dof]; j < a_offd_i[i_dof+1]; j++)
-	    {
-	      j_loc = i_global_to_local[a_offd_j[j]+num_variables];
-	      if (j_loc >=0)
-		AE[i_loc + j_loc * local_dof_counter] = a_offd_data[j];
-	    }
-	  }
-	  else
-	  {
-	    i_dof -= num_variables;
-	    for (j=a_ext_i[i_dof]; j < a_ext_i[i_dof+1]; j++)
-	    {
-	      jj = a_ext_j[j];
-	      if (jj > col_0 && jj < col_n)
-	      {
-	         jj = jj - first_col_diag;
-	      }
-	      else
-	      {
-		 jj = hypre_BinarySearch(col_map_offd,jj,num_cols_offd);
-		 if (jj > -1) jj += num_variables;
-	      }
-	      if (jj > -1)
-	      {
-	         j_loc = i_global_to_local[jj];
-	         if (j_loc >=0)
-		    AE[i_loc + j_loc * local_dof_counter] = a_ext_data[j];
-	      }
-	    }
-	  }
-	}
-
-/* OLD- HYPRE_USING_ESSL
- 	cnt = local_dof_counter;
-        for (j_loc=1; j_loc < local_dof_counter; j_loc++)
-	   for (i_loc=j_loc; i_loc < local_dof_counter; i_loc++)
-	      AE[cnt++] = AE[i_loc + j_loc * local_dof_counter];
-   	dppf(AE, local_dof_counter, 1);
-        size = local_dof_counter*(local_dof_counter+1)/2;
- 	cnt = domain_matrixinverse_counter;
-	for (i_loc = 0; i_loc < size; i_loc++)
-	   domain_matrixinverse[cnt++] = AE[i_loc];
-        domain_matrixinverse_counter += size;
-*/
-       
+      {
+         i_dof = i_local_to_global[i_loc];
+         if (i_dof < num_variables)
+         {
+            for (j=a_diag_i[i_dof]; j < a_diag_i[i_dof+1]; j++)
+            {
+               j_loc = i_global_to_local[a_diag_j[j]];
+               if (j_loc >=0)
+                  AE[i_loc + j_loc * local_dof_counter] = a_diag_data[j];
+            }
+            for (j=a_offd_i[i_dof]; j < a_offd_i[i_dof+1]; j++)
+            {
+               j_loc = i_global_to_local[a_offd_j[j]+num_variables];
+               if (j_loc >=0)
+                  AE[i_loc + j_loc * local_dof_counter] = a_offd_data[j];
+            }
+         }
+         else
+         {
+            i_dof -= num_variables;
+            for (j=a_ext_i[i_dof]; j < a_ext_i[i_dof+1]; j++)
+            {
+               jj = a_ext_j[j];
+               if (jj > col_0 && jj < col_n)
+               {
+                  jj = jj - first_col_diag;
+               }
+               else
+               {
+                  jj = hypre_BinarySearch(col_map_offd,jj,num_cols_offd);
+                  if (jj > -1) jj += num_variables;
+               }
+               if (jj > -1)
+               {
+                  j_loc = i_global_to_local[jj];
+                  if (j_loc >=0)
+                     AE[i_loc + j_loc * local_dof_counter] = a_ext_data[j];
+               }
+            }
+         }
+      }
+
       if (use_nonsymm)
       {
-#ifdef HYPRE_USING_ESSL
-         dgetrf(local_dof_counter,local_dof_counter, AE,
-                local_dof_counter, ipiv, &ierr);
-#else
-         hypre_F90_NAME_LAPACK(dgetrf,DGETRF)(&local_dof_counter,
-                                              &local_dof_counter, AE,
-                                              &local_dof_counter, ipiv, &ierr);
-#endif
+         hypre_dgetrf(&local_dof_counter,
+                      &local_dof_counter, AE,
+                      &local_dof_counter, ipiv, &ierr);
          piv_counter +=local_dof_counter;
       }
-      
+
       else
       {
-#ifdef HYPRE_USING_ESSL         
-         dpotrf(&uplo,local_dof_counter, AE,
-                local_dof_counter, &ierr);
-#else
-         hypre_F90_NAME_LAPACK(dpotrf,DPOTRF)(&uplo,&local_dof_counter, AE,
-                                              &local_dof_counter, &ierr);
-#endif
-        
+         hypre_dpotrf(&uplo,&local_dof_counter, AE,
+                      &local_dof_counter, &ierr);
       }
-      
+
       domain_matrixinverse_counter+=local_dof_counter*local_dof_counter;
 
 
 
       for (l_loc=0; l_loc < local_dof_counter; l_loc++)
-	i_global_to_local[i_local_to_global[l_loc]] = -1;
-	  
-    }
+         i_global_to_local[i_local_to_global[l_loc]] = -1;
 
-  hypre_TFree(i_local_to_global);
-  hypre_TFree(i_global_to_local);
-  hypre_CSRMatrixDestroy(A_ext);
-
-/* OLD -HYPRE_USING_ESSL
-  hypre_TFree(AE);
-*/  
-
-  domain_structure = hypre_CSRMatrixCreate(num_domains, max_local_dof_counter,
-			i_domain_dof[num_domains]);
-
-  hypre_CSRMatrixI(domain_structure) = i_domain_dof;
-  hypre_CSRMatrixJ(domain_structure) = j_domain_dof;
-  hypre_CSRMatrixData(domain_structure) = domain_matrixinverse;
-
-  *domain_structure_pointer = domain_structure;
-  *piv_pointer = piv;
-  return hypre_error_flag;
+   }
+
+   hypre_TFree(i_local_to_global);
+   hypre_TFree(i_global_to_local);
+   hypre_CSRMatrixDestroy(A_ext);
+
+   domain_structure = hypre_CSRMatrixCreate(num_domains, max_local_dof_counter,
+                                            i_domain_dof[num_domains]);
+
+   hypre_CSRMatrixI(domain_structure) = i_domain_dof;
+   hypre_CSRMatrixJ(domain_structure) = j_domain_dof;
+   hypre_CSRMatrixData(domain_structure) = domain_matrixinverse;
 
+   *domain_structure_pointer = domain_structure;
+   *piv_pointer = piv;
+   return hypre_error_flag;
 }
 
 HYPRE_Int
 hypre_ParGenerateScale(hypre_ParCSRMatrix *A,
-		    hypre_CSRMatrix *domain_structure,
-                    HYPRE_Real       relaxation_weight,
-                    HYPRE_Real     **scale_pointer)
+                       hypre_CSRMatrix *domain_structure,
+                       HYPRE_Real       relaxation_weight,
+                       HYPRE_Real     **scale_pointer)
 {
    HYPRE_Int num_domains = hypre_CSRMatrixNumRows(domain_structure);
    HYPRE_Int *i_domain_dof = hypre_CSRMatrixI(domain_structure);
@@ -4034,7 +3484,7 @@
    HYPRE_Int num_variables = hypre_ParCSRMatrixNumRows(A);
    HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(A));
    HYPRE_Int j_loc, index, start;
-  
+
    hypre_ParCSRCommHandle *comm_handle;
 
    if (comm_pkg)
@@ -4048,20 +3498,20 @@
    if (num_cols_offd) scale_ext = hypre_CTAlloc(HYPRE_Real,num_cols_offd);
 
    for (i=0; i < num_domains; i++)
-   {  
+   {
       for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
       {
-	 j_loc = j_domain_dof[j];
-	 if (j_loc < num_variables)
-	    scale[j_loc] += 1.0;
-	 else
-	    scale_ext[j_loc-num_variables] += 1.0;
+         j_loc = j_domain_dof[j];
+         if (j_loc < num_variables)
+            scale[j_loc] += 1.0;
+         else
+            scale_ext[j_loc-num_variables] += 1.0;
       }
    }
    if (comm_pkg)
    {
       scale_int = hypre_CTAlloc(HYPRE_Real, send_map_starts[num_sends]);
-      comm_handle = hypre_ParCSRCommHandleCreate (2,comm_pkg,scale_ext,scale_int); 
+      comm_handle = hypre_ParCSRCommHandleCreate (2,comm_pkg,scale_ext,scale_int);
 
       hypre_ParCSRCommHandleDestroy(comm_handle);
       comm_handle = NULL;
@@ -4072,7 +3522,7 @@
    {
       start = send_map_starts[i];
       for (j=start; j < send_map_starts[i+1]; j++)
-	 scale[send_map_elmts[j]] += scale_int[index++];
+         scale[send_map_elmts[j]] += scale_int[index++];
    }
 
    if (comm_pkg) hypre_TFree(scale_int);
@@ -4088,9 +3538,9 @@
 
 HYPRE_Int
 hypre_ParGenerateHybridScale(hypre_ParCSRMatrix *A,
-		    hypre_CSRMatrix  *domain_structure,
-                    hypre_CSRMatrix **A_boundary_pointer,
-                    HYPRE_Real      **scale_pointer)
+                             hypre_CSRMatrix  *domain_structure,
+                             hypre_CSRMatrix **A_boundary_pointer,
+                             HYPRE_Real      **scale_pointer)
 {
    hypre_CSRMatrix *A_ext;
    HYPRE_Int *A_ext_i;
@@ -4101,7 +3551,7 @@
    HYPRE_Int *A_boundary_i;
    HYPRE_Int *A_boundary_j;
    HYPRE_Real *A_boundary_data;
- 
+
    HYPRE_Int num_domains = hypre_CSRMatrixNumRows(domain_structure);
    HYPRE_Int *i_domain_dof = hypre_CSRMatrixI(domain_structure);
    HYPRE_Int *j_domain_dof = hypre_CSRMatrixJ(domain_structure);
@@ -4121,7 +3571,7 @@
    HYPRE_Int jj, j_loc, index, start;
    HYPRE_Int col_0, col_n;
    HYPRE_Int *col_map_offd = hypre_ParCSRMatrixColMapOffd(A);
-  
+
    hypre_ParCSRCommHandle *comm_handle;
 
    col_0 = hypre_ParCSRMatrixFirstColDiag(A)-1;
@@ -4137,7 +3587,7 @@
    }
 
    scale = hypre_CTAlloc(HYPRE_Real, num_variables);
-   if (num_cols_offd) 
+   if (num_cols_offd)
    {
       scale_ext = hypre_CTAlloc(HYPRE_Real,num_cols_offd);
       index_ext = hypre_CTAlloc(HYPRE_Int,num_cols_offd);
@@ -4150,25 +3600,25 @@
       index_ext[i] = -1;
 
    for (i=0; i < num_domains; i++)
-   {  
+   {
       for (j=i_domain_dof[i]; j < i_domain_dof[i+1]; j++)
       {
-	 j_loc = j_domain_dof[j];
-	 if (j_loc >= num_variables)
-	 {
-	    j_loc -= num_variables;
-	    if (index_ext[j_loc] == -1)
-	    {
-	       scale_ext[j_loc] += 1.0;
-	       index_ext[j_loc] ++;
-	    }
-	 }
+         j_loc = j_domain_dof[j];
+         if (j_loc >= num_variables)
+         {
+            j_loc -= num_variables;
+            if (index_ext[j_loc] == -1)
+            {
+               scale_ext[j_loc] += 1.0;
+               index_ext[j_loc] ++;
+            }
+         }
       }
    }
    if (comm_pkg)
    {
       scale_int = hypre_CTAlloc(HYPRE_Real, send_map_starts[num_sends]);
-      comm_handle=hypre_ParCSRCommHandleCreate(2,comm_pkg,scale_ext,scale_int); 
+      comm_handle=hypre_ParCSRCommHandleCreate(2,comm_pkg,scale_ext,scale_int);
 
       hypre_ParCSRCommHandleDestroy(comm_handle);
       comm_handle = NULL;
@@ -4177,34 +3627,34 @@
       A_boundary_i = hypre_CTAlloc(HYPRE_Int,num_cols_offd+1);
       A_ext_j = hypre_CSRMatrixJ(A_ext);
       A_ext_data = hypre_CSRMatrixData(A_ext);
-      /* compress A_ext to contain only local data and 
+      /* compress A_ext to contain only local data and
          necessary boundary points*/
       index = 0;
       for (i=0; i < num_cols_offd; i++)
       {
-	 A_boundary_i[i] = index;
+         A_boundary_i[i] = index;
          for (j = A_ext_i[i]; j < A_ext_i[i+1]; j++)
          {
-	    j_col = A_ext_j[j];
-	    if (j_col > col_0 && j_col < col_n)
-	    {
-	       A_ext_j[j] = j_col-col_0;
-	       index++;
-	    }
-	    else  
-	    {
-	       jj = hypre_BinarySearch(col_map_offd,j_col,num_cols_offd);
-	       if (jj > -1 && (scale_ext[jj] > 0))
-	       {
-		  A_ext_j[j] = num_variables+jj;
-	          index++;
-	       }
-	       else
-	       {
-		  A_ext_j[j] = -1;
-	       }
-	    }
-	 }
+            j_col = A_ext_j[j];
+            if (j_col > col_0 && j_col < col_n)
+            {
+               A_ext_j[j] = j_col-col_0;
+               index++;
+            }
+            else
+            {
+               jj = hypre_BinarySearch(col_map_offd,j_col,num_cols_offd);
+               if (jj > -1 && (scale_ext[jj] > 0))
+               {
+                  A_ext_j[j] = num_variables+jj;
+                  index++;
+               }
+               else
+               {
+                  A_ext_j[j] = -1;
+               }
+            }
+         }
       }
       A_boundary_i[num_cols_offd] = index;
 
@@ -4220,12 +3670,12 @@
       index = 0;
       for (i=0; i < A_ext_i[num_cols_offd]; i++)
       {
-	 if (A_ext_j[i] > -1)
-	 {
-	    A_boundary_j[index] = A_ext_j[i];
-	    A_boundary_data[index] = A_ext_data[i];
-	    index++;
-	 }
+         if (A_ext_j[i] > -1)
+         {
+            A_boundary_j[index] = A_ext_j[i];
+            A_boundary_data[index] = A_ext_data[i];
+            index++;
+         }
       }
       A_boundary = hypre_CSRMatrixCreate(num_cols_offd,num_variables, index);
       hypre_CSRMatrixI(A_boundary) = A_boundary_i;
@@ -4239,11 +3689,11 @@
    {
       start = send_map_starts[i];
       for (j=start; j < send_map_starts[i+1]; j++)
-	 scale[send_map_elmts[j]] += scale_int[index++];
+         scale[send_map_elmts[j]] += scale_int[index++];
    }
 
    if (comm_pkg) hypre_TFree(scale_int);
-   if (num_cols_offd) 
+   if (num_cols_offd)
    {
       hypre_TFree(scale_ext);
       hypre_TFree(index_ext);
diff -Nru hypre-2.11.2/src/parcsr_mv/_hypre_parcsr_mv.h hypre-2.13.0/src/parcsr_mv/_hypre_parcsr_mv.h
--- hypre-2.11.2/src/parcsr_mv/_hypre_parcsr_mv.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_mv/_hypre_parcsr_mv.h	2017-10-20 17:42:22.000000000 +0000
@@ -831,6 +831,9 @@
 HYPRE_Int hypre_FillResponseParToCSRMatrix ( void *p_recv_contact_buf , HYPRE_Int contact_size , HYPRE_Int contact_proc , void *ro , MPI_Comm comm , void **p_send_response_buf , HYPRE_Int *response_message_size );
 hypre_ParCSRMatrix *hypre_ParCSRMatrixCompleteClone ( hypre_ParCSRMatrix *A );
 hypre_ParCSRMatrix *hypre_ParCSRMatrixUnion ( hypre_ParCSRMatrix *A , hypre_ParCSRMatrix *B );
+#ifdef HYPRE_USE_GPU
+hypre_int hypre_ParCSRMatrixIsManaged(hypre_ParCSRMatrix *a);
+#endif
 
 /* parcsr_matrix.c */
 
@@ -869,6 +872,9 @@
 HYPRE_Int hypre_ParVectorReadIJ ( MPI_Comm comm , const char *filename , HYPRE_Int *base_j_ptr , hypre_ParVector **vector_ptr );
 HYPRE_Int hypre_FillResponseParToVectorAll ( void *p_recv_contact_buf , HYPRE_Int contact_size , HYPRE_Int contact_proc , void *ro , MPI_Comm comm , void **p_send_response_buf , HYPRE_Int *response_message_size );
 HYPRE_Complex hypre_ParVectorLocalSumElts ( hypre_ParVector *vector );
+#ifdef HYPRE_USE_GPU
+hypre_int hypre_ParVectorIsManaged(hypre_ParVector *vector);
+#endif
 
 #ifdef __cplusplus
 }
diff -Nru hypre-2.11.2/src/parcsr_mv/par_csr_communication.c hypre-2.13.0/src/parcsr_mv/par_csr_communication.c
--- hypre-2.11.2/src/parcsr_mv/par_csr_communication.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_mv/par_csr_communication.c	2017-10-20 17:42:22.000000000 +0000
@@ -71,7 +71,7 @@
       case HYPRE_COMM_PKG_JOB_COMPLEX:
          if (!send_data)
          {
-            send_data = hypre_TAlloc(HYPRE_Complex, hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends));  
+            send_data = hypre_PinnedTAlloc(HYPRE_Complex, hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends));
          }
          if (!recv_data)
          {
diff -Nru hypre-2.11.2/src/parcsr_mv/par_csr_matrix.c hypre-2.13.0/src/parcsr_mv/par_csr_matrix.c
--- hypre-2.11.2/src/parcsr_mv/par_csr_matrix.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_mv/par_csr_matrix.c	2017-10-20 17:42:22.000000000 +0000
@@ -2153,3 +2153,11 @@
 
    return C;
 }
+#ifdef HYPRE_USE_GPU
+hypre_int hypre_ParCSRMatrixIsManaged(hypre_ParCSRMatrix *a){
+  if (hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(a)))
+    return ((hypre_CSRMatrixIsManaged(hypre_ParCSRMatrixDiag(a))) && (hypre_CSRMatrixIsManaged(hypre_ParCSRMatrixOffd(a))));
+  else
+    return hypre_CSRMatrixIsManaged(hypre_ParCSRMatrixDiag(a)); 
+}
+#endif
diff -Nru hypre-2.11.2/src/parcsr_mv/par_csr_matvec.c hypre-2.13.0/src/parcsr_mv/par_csr_matvec.c
--- hypre-2.11.2/src/parcsr_mv/par_csr_matvec.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_mv/par_csr_matvec.c	2017-10-20 17:42:22.000000000 +0000
@@ -18,6 +18,12 @@
 
 #include "_hypre_parcsr_mv.h"
 #include <assert.h>
+//#ifdef HYPRE_USE_GPU
+//extern "C"
+//{
+//void PackOnDevice(HYPRE_Complex *send_data,HYPRE_Complex *x_local_data, HYPRE_Int *send_map, HYPRE_Int begin,HYPRE_Int end,cudaStream_t s);
+//}
+//#endif
 
 /*--------------------------------------------------------------------------
  * hypre_ParCSRMatrixMatvec
@@ -66,7 +72,7 @@
     *  these conditions terminates processing, and the ierr flag
     *  is informational only.
     *--------------------------------------------------------------------*/
- 
+   PUSH_RANGE_PAYLOAD("PAR_CSR_MATVEC",5,x_size);
    hypre_assert( idxstride>0 );
 
    if (num_cols != x_size)
@@ -102,7 +108,7 @@
 #ifdef HYPRE_PROFILE
    hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] -= hypre_MPI_Wtime();
 #endif
-
+   PUSH_RANGE("MPI_PACK",3);
    HYPRE_Int use_persistent_comm = 0;
 #ifdef HYPRE_USING_PERSISTENT_COMM
    use_persistent_comm = num_vectors == 1;
@@ -115,6 +121,7 @@
    if ( use_persistent_comm )
    {
 #ifdef HYPRE_USING_PERSISTENT_COMM
+     PUSH_RANGE("PERCOMM1",0);
       persistent_comm_handle = hypre_ParCSRCommPkgGetPersistentCommHandle(1, comm_pkg);
 
       HYPRE_Int num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg);
@@ -122,6 +129,7 @@
 
       hypre_VectorData(x_tmp) = (HYPRE_Complex *)persistent_comm_handle->recv_data;
       hypre_SeqVectorSetDataOwner(x_tmp, 0);
+      POP_RANGE;
 #endif
    }
    else
@@ -144,6 +152,20 @@
    {
       HYPRE_Int begin = hypre_ParCSRCommPkgSendMapStart(comm_pkg, 0);
       HYPRE_Int end   = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends);
+#ifdef HYPRE_USE_GPU
+      PUSH_RANGE("PERCOMM2DEVICE",4);
+#ifdef HYPRE_USING_PERSISTENT_COMM
+      PackOnDevice((HYPRE_Complex*)persistent_comm_handle->send_data,x_local_data,hypre_ParCSRCommPkgSendMapElmts(comm_pkg),begin,end,HYPRE_STREAM(4));
+      //PrintPointerAttributes(persistent_comm_handle->send_data);
+#else
+      PackOnDevice((HYPRE_Complex*)x_buf_data[0],x_local_data,hypre_ParCSRCommPkgSendMapElmts(comm_pkg),begin,end,HYPRE_STREAM(4));
+#endif
+      POP_RANGE;
+      SetAsyncMode(1);
+      hypre_CSRMatrixMatvecOutOfPlace( alpha, diag, x_local, beta, b_local, y_local, 0);
+      SetAsyncMode(0);
+      //gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(7)));
+#else
 #ifdef HYPRE_USING_OPENMP
 #pragma omp parallel for HYPRE_SMP_SCHEDULE
 #endif
@@ -156,6 +178,7 @@
 #endif
             = x_local_data[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,i)];
       }
+#endif
    }
    else
       for ( jv=0; jv<num_vectors; ++jv )
@@ -185,7 +208,8 @@
    hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] += hypre_MPI_Wtime();
    hypre_profile_times[HYPRE_TIMER_ID_HALO_EXCHANGE] -= hypre_MPI_Wtime();
 #endif
-
+   POP_RANGE;
+   PUSH_RANGE("MPI_HALO_EXC_SEND",4);
    if (use_persistent_comm)
    {
 #ifdef HYPRE_USING_PERSISTENT_COMM
@@ -204,13 +228,14 @@
 #ifdef HYPRE_PROFILE
    hypre_profile_times[HYPRE_TIMER_ID_HALO_EXCHANGE] += hypre_MPI_Wtime();
 #endif
-
+   POP_RANGE;
+#ifndef HYPRE_USE_GPU
    hypre_CSRMatrixMatvecOutOfPlace( alpha, diag, x_local, beta, b_local, y_local, 0);
-
+#endif
 #ifdef HYPRE_PROFILE
    hypre_profile_times[HYPRE_TIMER_ID_HALO_EXCHANGE] -= hypre_MPI_Wtime();
 #endif
-   
+   PUSH_RANGE("MPI_HALO_EXC_RECV",6);
    if (use_persistent_comm)
    {
 #ifdef HYPRE_USING_PERSISTENT_COMM
@@ -226,7 +251,7 @@
       }
       hypre_TFree(comm_handle);
    }
-
+   POP_RANGE;
 #ifdef HYPRE_PROFILE
    hypre_profile_times[HYPRE_TIMER_ID_HALO_EXCHANGE] += hypre_MPI_Wtime();
 #endif
@@ -236,7 +261,7 @@
 #ifdef HYPRE_PROFILE
    hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] -= hypre_MPI_Wtime();
 #endif
-
+   PUSH_RANGE("MPI_UNPACK",5);
    hypre_SeqVectorDestroy(x_tmp);
    x_tmp = NULL;
    if (!use_persistent_comm)
@@ -248,7 +273,11 @@
 #ifdef HYPRE_PROFILE
    hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] += hypre_MPI_Wtime();
 #endif
-
+   POP_RANGE;
+#ifdef HYPRE_USE_GPU
+   gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(4)));
+#endif
+   POP_RANGE; // PAR_CSR
    return ierr;
 }
 
diff -Nru hypre-2.11.2/src/parcsr_mv/par_vector.c hypre-2.13.0/src/parcsr_mv/par_vector.c
--- hypre-2.11.2/src/parcsr_mv/par_vector.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/parcsr_mv/par_vector.c	2017-10-20 17:42:22.000000000 +0000
@@ -1076,3 +1076,9 @@
 {
    return hypre_VectorSumElts( hypre_ParVectorLocalVector(vector) );
 }
+#ifdef HYPRE_USE_GPU
+hypre_int hypre_ParVectorIsManaged(hypre_ParVector *vector){
+  if (vector==NULL) return 1;
+  return hypre_SeqVectorIsManaged(hypre_ParVectorLocalVector(vector));
+}
+#endif
diff -Nru hypre-2.11.2/src/seq_mv/csr_matrix.c hypre-2.13.0/src/seq_mv/csr_matrix.c
--- hypre-2.11.2/src/seq_mv/csr_matrix.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/seq_mv/csr_matrix.c	2017-10-20 17:42:22.000000000 +0000
@@ -17,7 +17,9 @@
  *****************************************************************************/
 
 #include "seq_mv.h"
-
+#ifdef HYPRE_USE_GPU
+#include "gpukernels.h"
+#endif
 #ifdef HYPRE_PROFILE
 HYPRE_Real hypre_profile_times[HYPRE_TIMER_ID_COUNT] = { 0 };
 #endif
@@ -33,7 +35,7 @@
 {
    hypre_CSRMatrix  *matrix;
 
-   matrix = hypre_CTAlloc(hypre_CSRMatrix, 1);
+   matrix = hypre_HostCTAlloc(hypre_CSRMatrix, 1);
 
    hypre_CSRMatrixData(matrix) = NULL;
    hypre_CSRMatrixI(matrix)    = NULL;
@@ -47,7 +49,9 @@
    hypre_CSRMatrixOwnsData(matrix) = 1;
    hypre_CSRMatrixNumRownnz(matrix) = num_rows;
 
-
+#ifdef HYPRE_USE_GPU
+   matrix->on_device=0;
+#endif
    return matrix;
 }
 /*--------------------------------------------------------------------------
@@ -72,7 +76,7 @@
          hypre_CSRMatrixData(matrix) = NULL;
          hypre_CSRMatrixJ(matrix)    = NULL;
       }
-      hypre_TFree(matrix);
+      hypre_HostTFree(matrix);
       matrix = NULL;
    }
 
@@ -670,3 +674,36 @@
 {
    return hypre_CSRMatrixGetLoadBalancedPartitionBoundary(A, hypre_GetThreadNum() + 1);
 }
+#ifdef HYPRE_USE_GPU
+void hypre_CSRMatrixPrefetchToDevice(hypre_CSRMatrix *A){
+  if (hypre_CSRMatrixNumNonzeros(A)==0) return;
+
+  PUSH_RANGE_PAYLOAD("hypre_CSRMatrixPrefetchToDevice",0,hypre_CSRMatrixNumNonzeros(A));
+  if ((!A->on_device)&&(hypre_CSRMatrixNumNonzeros(A)>8192)){
+    gpuErrchk(cudaMemPrefetchAsync(hypre_CSRMatrixData(A),hypre_CSRMatrixNumNonzeros(A)*sizeof(HYPRE_Complex),HYPRE_DEVICE,HYPRE_STREAM(4)));
+    gpuErrchk(cudaMemPrefetchAsync(hypre_CSRMatrixI(A),(hypre_CSRMatrixNumRows(A)+1)*sizeof(HYPRE_Int),HYPRE_DEVICE,HYPRE_STREAM(5)));
+    gpuErrchk(cudaMemPrefetchAsync(hypre_CSRMatrixJ(A),hypre_CSRMatrixNumNonzeros(A)*sizeof(HYPRE_Int),HYPRE_DEVICE,HYPRE_STREAM(6)));
+    gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(4)));
+    gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(5)));
+    gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(6)));
+    A->on_device=1;
+  }
+  POP_RANGE;
+}
+void hypre_CSRMatrixPrefetchToHost(hypre_CSRMatrix *A){
+  PUSH_RANGE("hypre_CSRMatrixPrefetchToDevice",0);
+  if (A->on_device){
+    A->on_device=0;
+    gpuErrchk(cudaMemPrefetchAsync(hypre_CSRMatrixData(A),hypre_CSRMatrixNumNonzeros(A)*sizeof(HYPRE_Complex),cudaCpuDeviceId,HYPRE_STREAM(4)));
+    gpuErrchk(cudaMemPrefetchAsync(hypre_CSRMatrixI(A),(hypre_CSRMatrixNumRows(A)+1)*sizeof(HYPRE_Int),cudaCpuDeviceId,HYPRE_STREAM(4)));
+    gpuErrchk(cudaMemPrefetchAsync(hypre_CSRMatrixJ(A),hypre_CSRMatrixNumNonzeros(A)*sizeof(HYPRE_Int),cudaCpuDeviceId,HYPRE_STREAM(4)));
+    gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(4)));
+  }
+  POP_RANGE;
+}
+hypre_int hypre_CSRMatrixIsManaged(hypre_CSRMatrix *a){
+  return ((pointerIsManaged((void*)hypre_CSRMatrixData(a))) 
+	  && (pointerIsManaged((void*)hypre_CSRMatrixI(a)))
+	  && (pointerIsManaged((void*)hypre_CSRMatrixJ(a))));
+}
+#endif
diff -Nru hypre-2.11.2/src/seq_mv/csr_matvec.c hypre-2.13.0/src/seq_mv/csr_matvec.c
--- hypre-2.11.2/src/seq_mv/csr_matvec.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/seq_mv/csr_matvec.c	2017-10-20 17:42:22.000000000 +0000
@@ -19,6 +19,7 @@
 #include "seq_mv.h"
 #include <assert.h>
 
+
 /*--------------------------------------------------------------------------
  * hypre_CSRMatrixMatvec
  *--------------------------------------------------------------------------*/
@@ -36,7 +37,15 @@
 #ifdef HYPRE_PROFILE
    HYPRE_Real time_begin = hypre_MPI_Wtime();
 #endif
-
+#ifdef HYPRE_USE_GPU
+   PUSH_RANGE_PAYLOAD("MATVEC",0, hypre_CSRMatrixNumRows(A));
+   HYPRE_Int ret=hypre_CSRMatrixMatvecDevice( alpha,A,x,beta,b,y,offset);
+   POP_RANGE;
+  return ret;
+#ifdef HYPRE_PROFILE
+   hypre_profile_times[HYPRE_TIMER_ID_MATVEC] += hypre_MPI_Wtime() - time_begin;
+#endif
+#endif
    HYPRE_Complex    *A_data   = hypre_CSRMatrixData(A);
    HYPRE_Int        *A_i      = hypre_CSRMatrixI(A) + offset;
    HYPRE_Int        *A_j      = hypre_CSRMatrixJ(A);
@@ -765,3 +774,77 @@
 
    return ierr;
 }
+#ifdef HYPRE_USE_GPU
+HYPRE_Int
+hypre_CSRMatrixMatvecDevice( HYPRE_Complex    alpha,
+                       hypre_CSRMatrix *A,
+                       hypre_Vector    *x,
+                       HYPRE_Complex    beta,
+		       hypre_Vector    *b,
+		       hypre_Vector    *y,
+		       HYPRE_Int offset )
+{
+
+  static cusparseHandle_t handle;
+  static cusparseMatDescr_t descr;
+  static HYPRE_Int FirstCall=1;
+  cusparseStatus_t status;
+  static cudaStream_t s[10];
+  static HYPRE_Int myid;
+
+  if (b!=y){
+
+    PUSH_RANGE_PAYLOAD("MEMCPY",1,y->size-offset);
+    VecCopy(y->data,b->data,(y->size-offset),HYPRE_STREAM(4));
+    POP_RANGE
+  }
+
+  if (x==y) fprintf(stderr,"ERROR::x and y are the same pointer in hypre_CSRMatrixMatvecDevice\n");
+
+  if (FirstCall){
+    PUSH_RANGE("FIRST_CALL",4);
+
+    handle=getCusparseHandle();
+    
+    status= cusparseCreateMatDescr(&descr); 
+    if (status != CUSPARSE_STATUS_SUCCESS) {
+      printf("ERROR:: Matrix descriptor initialization failed\n");
+      exit(2);
+    } 
+    
+    cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
+    cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);
+    
+    FirstCall=0;
+    hypre_int jj;
+    for(jj=0;jj<5;jj++)
+      s[jj]=HYPRE_STREAM(jj);
+    nvtxNameCudaStreamA(s[4], "HYPRE_COMPUTE_STREAM");
+    hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid );
+    myid++;
+    POP_RANGE;
+  }
+
+  PUSH_RANGE("PREFETCH+SPMV",2);
+
+  hypre_CSRMatrixPrefetchToDevice(A);
+  hypre_SeqVectorPrefetchToDevice(x);
+  hypre_SeqVectorPrefetchToDevice(y);
+  
+  if (offset!=0) printf("WARNING:: Offset is not zero in hypre_CSRMatrixMatvecDevice :: %d \n",offset);
+  cusparseErrchk(cusparseDcsrmv(handle ,
+				CUSPARSE_OPERATION_NON_TRANSPOSE, 
+				A->num_rows-offset, A->num_cols, A->num_nonzeros,
+				&alpha, descr,
+				A->data ,A->i+offset,A->j,
+				x->data, &beta, y->data+offset));
+  
+  if (!GetAsyncMode()){
+  gpuErrchk(cudaStreamSynchronize(s[4]));
+  }
+  POP_RANGE;
+  
+  return 0;
+  
+}
+#endif
diff -Nru hypre-2.11.2/src/seq_mv/gpukernels.cu hypre-2.13.0/src/seq_mv/gpukernels.cu
--- hypre-2.11.2/src/seq_mv/gpukernels.cu	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/seq_mv/gpukernels.cu	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,243 @@
+#include <stdio.h>
+#include <cuda_runtime.h>
+//#include <cublas_v2.h>
+#include "_hypre_utilities.h"
+#define gpuErrchk2(ans) { gpuAssert2((ans), __FILE__, __LINE__); }
+inline void gpuAssert2(cudaError_t code, const char *file, hypre_int line)
+{
+   if (code != cudaSuccess) 
+   {
+     printf("GPUassert2: %s %s %d\n", cudaGetErrorString(code), file, line);
+     exit(2);
+   }
+}
+
+
+
+extern "C"{
+  __global__
+  void VecScaleKernelText(HYPRE_Complex *__restrict__ u, const HYPRE_Complex *__restrict__ v, const HYPRE_Complex *__restrict__ l1_norm, hypre_int num_rows){
+    hypre_int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i<num_rows){
+      u[i]+=__ldg(v+i)/__ldg(l1_norm+i);
+    }
+  }
+}
+
+extern "C"{
+  __global__
+  void VecScaleKernel(HYPRE_Complex *__restrict__ u, const HYPRE_Complex *__restrict__ v, const HYPRE_Complex * __restrict__ l1_norm, hypre_int num_rows){
+    hypre_int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i<num_rows){
+      u[i]+=v[i]/l1_norm[i];
+  }
+  }
+}
+
+extern "C"{
+  void VecScale(HYPRE_Complex *u, HYPRE_Complex *v, HYPRE_Complex *l1_norm, hypre_int num_rows,cudaStream_t s){
+    PUSH_RANGE_PAYLOAD("VECSCALE",1,num_rows);
+    const hypre_int tpb=64;
+    hypre_int num_blocks=num_rows/tpb+1;
+#ifdef CATCH_LAUNCH_ERRORS
+    gpuErrchk2(cudaPeekAtLastError());
+    gpuErrchk2(cudaDeviceSynchronize());
+#endif
+    MemPrefetchSized(l1_norm,num_rows*sizeof(HYPRE_Complex),HYPRE_DEVICE,s);
+    VecScaleKernel<<<num_blocks,tpb,0,s>>>(u,v,l1_norm,num_rows);
+#ifdef CATCH_LAUNCH_ERRORS    
+    gpuErrchk2(cudaPeekAtLastError());
+    gpuErrchk2(cudaDeviceSynchronize());
+#endif
+    gpuErrchk2(cudaStreamSynchronize(s));
+    POP_RANGE;
+  }
+}
+
+
+extern "C"{
+
+  __global__
+  void VecCopyKernel(HYPRE_Complex* __restrict__ tgt, const HYPRE_Complex* __restrict__ src, hypre_int size){
+    hypre_int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i<size) tgt[i]=src[i];
+}
+  void VecCopy(HYPRE_Complex* tgt, const HYPRE_Complex* src, hypre_int size,cudaStream_t s){
+    hypre_int tpb=64;
+    hypre_int num_blocks=size/tpb+1;
+    PUSH_RANGE_PAYLOAD("VecCopy",5,size);
+    //MemPrefetch(tgt,0,s);
+    //MemPrefetch(src,0,s);
+    VecCopyKernel<<<num_blocks,tpb,0,s>>>(tgt,src,size);
+    //gpuErrchk2(cudaStreamSynchronize(s));
+    POP_RANGE;
+  }
+}
+extern "C"{
+
+  __global__
+  void VecSetKernel(HYPRE_Complex* __restrict__ tgt, const HYPRE_Complex value,hypre_int size){
+    hypre_int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i<size) tgt[i]=value;
+}
+  void VecSet(HYPRE_Complex* tgt, hypre_int size, HYPRE_Complex value, cudaStream_t s){
+    hypre_int tpb=64;
+    //cudaDeviceSynchronize();
+    MemPrefetchSized(tgt,size*sizeof(HYPRE_Complex),HYPRE_DEVICE,s);
+    hypre_int num_blocks=size/tpb+1;
+    VecSetKernel<<<num_blocks,tpb,0,s>>>(tgt,value,size);
+    cudaStreamSynchronize(s);
+    //cudaDeviceSynchronize();
+  }
+}
+extern "C"{
+  __global__
+  void  PackOnDeviceKernel(HYPRE_Complex* __restrict__ send_data,const HYPRE_Complex* __restrict__ x_local_data, const hypre_int* __restrict__ send_map, hypre_int begin,hypre_int end){
+    hypre_int i = begin+blockIdx.x * blockDim.x + threadIdx.x;
+    if (i<end){
+      send_data[i-begin]=x_local_data[send_map[i]];
+    }
+  }
+  void PackOnDevice(HYPRE_Complex *send_data,HYPRE_Complex *x_local_data, hypre_int *send_map, hypre_int begin,hypre_int end,cudaStream_t s){
+    if ((end-begin)<=0) return;
+    hypre_int tpb=64;
+    hypre_int num_blocks=(end-begin)/tpb+1;
+#ifdef CATCH_LAUNCH_ERRORS
+    gpuErrchk2(cudaPeekAtLastError());
+    gpuErrchk2(cudaDeviceSynchronize());
+#endif
+    PackOnDeviceKernel<<<num_blocks,tpb,0,s>>>(send_data,x_local_data,send_map,begin,end);
+#ifdef CATCH_LAUNCH_ERRORS
+    gpuErrchk2(cudaPeekAtLastError());
+    gpuErrchk2(cudaDeviceSynchronize());
+#endif
+    PUSH_RANGE("PACK_PREFETCH",1);
+#ifndef HYPRE_GPU_USE_PINNED
+    MemPrefetchSized((void*)send_data,(end-begin)*sizeof(HYPRE_Complex),cudaCpuDeviceId,s);
+#endif
+    POP_RANGE;
+    //gpuErrchk2(cudaStreamSynchronize(s));
+  }
+}
+  
+  // Scale vector by scalar
+
+extern "C"{
+__global__
+void VecScaleScalarKernel(HYPRE_Complex *__restrict__ u, const HYPRE_Complex alpha ,hypre_int num_rows){
+  hypre_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  //if (i<5) printf("DEVICE %d %lf %lf %lf\n",i,u[i],v[i],l1_norm[i]);
+  if (i<num_rows){
+    u[i]*=alpha;
+    //if (i==0) printf("Diff Device %d %lf %lf %lf\n",i,u[i],v[i],l1_norm[i]);
+  }
+}
+}
+extern "C"{
+  hypre_int VecScaleScalar(HYPRE_Complex *u, const HYPRE_Complex alpha,  hypre_int num_rows,cudaStream_t s){
+    PUSH_RANGE("SEQVECSCALE",4);
+    hypre_int num_blocks=num_rows/64+1;
+    
+#ifdef CATCH_LAUNCH_ERRORS
+    gpuErrchk2(cudaPeekAtLastError());
+    gpuErrchk2(cudaDeviceSynchronize());
+#endif
+    VecScaleScalarKernel<<<num_blocks,64,0,s>>>(u,alpha,num_rows);
+#ifdef CATCH_LAUNCH_ERRORS
+    gpuErrchk2(cudaPeekAtLastError());
+    gpuErrchk2(cudaDeviceSynchronize());
+#endif
+    gpuErrchk2(cudaStreamSynchronize(s));
+    POP_RANGE;
+    return 0;
+  }
+}
+
+
+extern "C"{
+__global__
+void SpMVCudaKernel(HYPRE_Complex* __restrict__ y,HYPRE_Complex alpha, const HYPRE_Complex* __restrict__ A_data, const hypre_int* __restrict__ A_i, const hypre_int* __restrict__ A_j, const HYPRE_Complex* __restrict__ x, HYPRE_Complex beta, hypre_int num_rows)
+{
+  hypre_int i= blockIdx.x * blockDim.x + threadIdx.x;
+  if (i<num_rows){
+    HYPRE_Complex temp = 0.0;
+    hypre_int jj;
+    for (jj = A_i[i]; jj < A_i[i+1]; jj++){
+      hypre_int ajj=A_j[jj];
+      temp += A_data[jj] * x[ajj];
+    }
+    y[i] =y[i]*beta+alpha*temp;
+  }
+}
+
+__global__
+void SpMVCudaKernelZB(HYPRE_Complex* __restrict__ y,HYPRE_Complex alpha, const HYPRE_Complex* __restrict__ A_data, const hypre_int* __restrict__ A_i, const hypre_int* __restrict__ A_j, const HYPRE_Complex* __restrict__ x, hypre_int num_rows)
+{
+  hypre_int i= blockIdx.x * blockDim.x + threadIdx.x;
+  if (i<num_rows){
+    HYPRE_Complex temp = 0.0;
+    hypre_int jj;
+    for (jj = A_i[i]; jj < A_i[i+1]; jj++){
+      hypre_int ajj=A_j[jj];
+      temp += A_data[jj] * x[ajj];
+    }
+    y[i] = alpha*temp;
+  }
+}
+  void SpMVCuda(hypre_int num_rows,HYPRE_Complex alpha, HYPRE_Complex *A_data,hypre_int *A_i, hypre_int *A_j, HYPRE_Complex *x, HYPRE_Complex beta, HYPRE_Complex *y){
+    hypre_int num_threads=64;
+    hypre_int num_blocks=num_rows/num_threads+1;
+#ifdef CATCH_LAUNCH_ERRORS
+    gpuErrchk2(cudaPeekAtLastError());
+    gpuErrchk2(cudaDeviceSynchronize());
+#endif    
+    if (beta==0.0)
+      SpMVCudaKernelZB<<<num_blocks,num_threads>>>(y,alpha,A_data,A_i,A_j,x,num_rows);
+    else
+      SpMVCudaKernel<<<num_blocks,num_threads>>>(y,alpha,A_data,A_i,A_j,x,beta,num_rows);
+#ifdef CATCH_LAUNCH_ERRORS
+    gpuErrchk2(cudaPeekAtLastError());
+    gpuErrchk2(cudaDeviceSynchronize());
+#endif
+
+}
+}
+extern "C"{
+  __global__
+  void CompileFlagSafetyCheck(hypre_int actual){
+#ifdef __CUDA_ARCH__
+    hypre_int cudarch=__CUDA_ARCH__;
+    if (cudarch!=actual){
+      printf("WARNING :: nvcc -arch flag does not match actual device architecture\nWARNING :: The code can fail silently and produce wrong results\n");
+      printf("Arch specified at compile = sm_%d Actual device = sm_%d\n",cudarch/10,actual/10);
+    } 
+#else
+    printf("ERROR:: CUDA_ ARCH is not defined \n This should not be happening\n");
+#endif
+  }
+}
+extern "C"{
+  void CudaCompileFlagCheck(){
+    hypre_int devCount;
+    cudaGetDeviceCount(&devCount);
+    hypre_int i;
+    hypre_int cudarch_actual;
+    for(i = 0; i < devCount; ++i)
+      {
+	struct cudaDeviceProp props;
+	cudaGetDeviceProperties(&props, i);
+	cudarch_actual=props.major*100+props.minor*10;
+    }
+    gpuErrchk2(cudaPeekAtLastError());
+    gpuErrchk2(cudaDeviceSynchronize());
+    CompileFlagSafetyCheck<<<1,1,0,0>>>(cudarch_actual);
+    cudaError_t code=cudaPeekAtLastError();
+    if (code != cudaSuccess)
+      {
+	fprintf(stderr,"ERROR in CudaCompileFlagCheck%s \n", cudaGetErrorString(code));
+	fprintf(stderr,"ERROR :: Check if compile arch flags match actual device arch = sm_%d\n",cudarch_actual/10);
+	exit(2);
+      }
+    gpuErrchk2(cudaDeviceSynchronize());
+  }
+}
diff -Nru hypre-2.11.2/src/seq_mv/gpukernels.h hypre-2.13.0/src/seq_mv/gpukernels.h
--- hypre-2.11.2/src/seq_mv/gpukernels.h	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/seq_mv/gpukernels.h	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,9 @@
+#ifdef HYPRE_USE_GPU
+#include <cuda_runtime_api.h>
+int VecScaleScalar(double *u, const double alpha,  int num_rows,cudaStream_t s);
+void VecCopy(double* tgt, const double* src, int size,cudaStream_t s);
+void VecSet(double* tgt, int size, double value, cudaStream_t s);
+void VecScale(double *u, double *v, double *l1_norm, int num_rows,cudaStream_t s);
+void VecScaleSplit(double *u, double *v, double *l1_norm, int num_rows,cudaStream_t s);
+void CudaCompileFlagCheck();
+#endif
diff -Nru hypre-2.11.2/src/seq_mv/headers hypre-2.13.0/src/seq_mv/headers
--- hypre-2.11.2/src/seq_mv/headers	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/seq_mv/headers	2017-10-20 17:42:22.000000000 +0000
@@ -49,6 +49,7 @@
 cat mapped_matrix.h       >> $INTERNAL_HEADER	
 cat multiblock_matrix.h   >> $INTERNAL_HEADER	
 cat vector.h              >> $INTERNAL_HEADER	
+cat gpukernel.h           >> $INTERNAL_HEADER
 
 ../utilities/protos *.c               >> $INTERNAL_HEADER
 
diff -Nru hypre-2.11.2/src/seq_mv/Makefile hypre-2.13.0/src/seq_mv/Makefile
--- hypre-2.11.2/src/seq_mv/Makefile	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/seq_mv/Makefile	2017-10-20 17:42:22.000000000 +0000
@@ -13,6 +13,9 @@
 
 include ../config/Makefile.config
 
+FILES_NVCC =
+include $(HYPRE_NVCC_MAKEFILE)
+
 CINCLUDES = ${INCLUDES} ${MPIINCLUDE}
 
 C_COMPILE_FLAGS =\
@@ -44,6 +47,7 @@
  vector.c
 
 OBJS = ${FILES:.c=.o}
+CUOBJS = ${FILES_NVCC:.cu=.o}
 
 SONAME = libHYPRE_seq_mv-${HYPRE_RELEASE_VERSION}.so
 
@@ -71,12 +75,12 @@
 # Rules
 ##################################################################
 
-libHYPRE_seq_mv.a: ${OBJS}
+libHYPRE_seq_mv.a: ${OBJS} ${CUOBJS}
 	@echo  "Building $@ ... "
 	${AR} $@ ${OBJS}
 	${RANLIB} $@
 
-libHYPRE_seq_mv.so: ${OBJS}
+libHYPRE_seq_mv.so: ${OBJS} ${CUOBJS}
 	@echo  "Building $@ ... "
 	${BUILD_CC_SHARED} -o ${SONAME} ${OBJS} ${SHARED_SET_SONAME}${SONAME}
 	ln -s ${SONAME} $@
diff -Nru hypre-2.11.2/src/seq_mv/Makefile.empty hypre-2.13.0/src/seq_mv/Makefile.empty
--- hypre-2.11.2/src/seq_mv/Makefile.empty	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/seq_mv/Makefile.empty	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1 @@
+
diff -Nru hypre-2.11.2/src/seq_mv/Makefile.nvcc hypre-2.13.0/src/seq_mv/Makefile.nvcc
--- hypre-2.11.2/src/seq_mv/Makefile.nvcc	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/seq_mv/Makefile.nvcc	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,4 @@
+
+FILES_NVCC =\
+ gpukernels.cu
+
diff -Nru hypre-2.11.2/src/seq_mv/seq_mv.h hypre-2.13.0/src/seq_mv/seq_mv.h
--- hypre-2.11.2/src/seq_mv/seq_mv.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/seq_mv/seq_mv.h	2017-10-20 17:42:22.000000000 +0000
@@ -59,6 +59,11 @@
    HYPRE_Int     *rownnz;
    HYPRE_Int      num_rownnz;
 
+#ifdef HYPRE_USE_GPU
+  /* Flag to keeping track of prefetching */
+  HYPRE_Int on_device;
+#endif
+
 } hypre_CSRMatrix;
 
 /*--------------------------------------------------------------------------
@@ -208,6 +213,9 @@
       With rowwise storage, vj[i] = data[ j + num_vectors*i] */
    HYPRE_Int  vecstride, idxstride;
    /* ... so vj[i] = data[ j*vecstride + i*idxstride ] regardless of row_storage.*/
+#ifdef HYPRE_USE_GPU
+  HYPRE_Int on_device;
+#endif
 
 } hypre_Vector;
 
@@ -225,6 +233,20 @@
 
 #endif
 
+#ifndef hypre_GPUKERNELS_HEADER
+#define hypre_GPUKERNELS_HEADER
+#ifdef HYPRE_USE_GPU
+#include <cuda_runtime_api.h>
+int VecScaleScalar(double *u, const double alpha,  int num_rows,cudaStream_t s);
+void VecCopy(double* tgt, const double* src, int size,cudaStream_t s);
+void VecSet(double* tgt, int size, double value, cudaStream_t s);
+void VecScale(double *u, double *v, double *l1_norm, int num_rows,cudaStream_t s);
+void VecScaleSplit(double *u, double *v, double *l1_norm, int num_rows,cudaStream_t s);
+void CudaCompileFlagCheck();
+void PackOnDevice(HYPRE_Complex *send_data,HYPRE_Complex *x_local_data, hypre_int *send_map, hypre_int begin,hypre_int end,cudaStream_t s);
+#endif
+#endif
+
 /* csr_matop.c */
 hypre_CSRMatrix *hypre_CSRMatrixAdd ( hypre_CSRMatrix *A , hypre_CSRMatrix *B );
 hypre_CSRMatrix *hypre_CSRMatrixMultiply ( hypre_CSRMatrix *A , hypre_CSRMatrix *B );
@@ -245,6 +267,11 @@
 HYPRE_Int hypre_CSRMatrixCopy ( hypre_CSRMatrix *A , hypre_CSRMatrix *B , HYPRE_Int copy_data );
 hypre_CSRMatrix *hypre_CSRMatrixClone ( hypre_CSRMatrix *A );
 hypre_CSRMatrix *hypre_CSRMatrixUnion ( hypre_CSRMatrix *A , hypre_CSRMatrix *B , HYPRE_Int *col_map_offd_A , HYPRE_Int *col_map_offd_B , HYPRE_Int **col_map_offd_C );
+#ifdef HYPRE_USE_GPU
+void hypre_CSRMatrixPrefetchToDevice(hypre_CSRMatrix *A);
+void hypre_CSRMatrixPrefetchToHost(hypre_CSRMatrix *A);
+hypre_int hypre_CSRMatrixIsManaged(hypre_CSRMatrix *a);
+#endif
 
 /* csr_matvec.c */
 // y[offset:end] = alpha*A[offset:end,:]*x + beta*b[offset:end]
@@ -253,7 +280,9 @@
 HYPRE_Int hypre_CSRMatrixMatvec ( HYPRE_Complex alpha , hypre_CSRMatrix *A , hypre_Vector *x , HYPRE_Complex beta , hypre_Vector *y );
 HYPRE_Int hypre_CSRMatrixMatvecT ( HYPRE_Complex alpha , hypre_CSRMatrix *A , hypre_Vector *x , HYPRE_Complex beta , hypre_Vector *y );
 HYPRE_Int hypre_CSRMatrixMatvec_FF ( HYPRE_Complex alpha , hypre_CSRMatrix *A , hypre_Vector *x , HYPRE_Complex beta , hypre_Vector *y , HYPRE_Int *CF_marker_x , HYPRE_Int *CF_marker_y , HYPRE_Int fpt );
-
+#ifdef HYPRE_USE_GPU
+HYPRE_Int hypre_CSRMatrixMatvecDevice( HYPRE_Complex alpha , hypre_CSRMatrix *A , hypre_Vector *x , HYPRE_Complex beta , hypre_Vector *b, hypre_Vector *y, HYPRE_Int offset );
+#endif
 /* genpart.c */
 HYPRE_Int hypre_GeneratePartitioning ( HYPRE_Int length , HYPRE_Int num_procs , HYPRE_Int **part_ptr );
 HYPRE_Int hypre_GenerateLocalPartitioning ( HYPRE_Int length , HYPRE_Int num_procs , HYPRE_Int myid , HYPRE_Int **part_ptr );
@@ -337,7 +366,16 @@
 HYPRE_Int hypre_SeqVectorAxpy ( HYPRE_Complex alpha , hypre_Vector *x , hypre_Vector *y );
 HYPRE_Real hypre_SeqVectorInnerProd ( hypre_Vector *x , hypre_Vector *y );
 HYPRE_Complex hypre_VectorSumElts ( hypre_Vector *vector );
-
+#ifdef HYPRE_USE_GPU
+HYPRE_Complex hypre_VectorSumAbsElts ( hypre_Vector *vector );
+HYPRE_Int hypre_SeqVectorCopyDevice ( hypre_Vector *x , hypre_Vector *y );
+HYPRE_Int hypre_SeqVectorAxpyDevice( HYPRE_Complex alpha , hypre_Vector *x , hypre_Vector *y );
+HYPRE_Real hypre_SeqVectorInnerProdDevice ( hypre_Vector *x , hypre_Vector *y );
+void hypre_SeqVectorPrefetchToDevice(hypre_Vector *x);
+void hypre_SeqVectorPrefetchToHost(hypre_Vector *x);
+void hypre_SeqVectorPrefetchToDeviceInStream(hypre_Vector *x, HYPRE_Int index);
+hypre_int hypre_SeqVectorIsManaged(hypre_Vector *x);
+#endif
 #ifdef __cplusplus
 }
 #endif
diff -Nru hypre-2.11.2/src/seq_mv/vector.c hypre-2.13.0/src/seq_mv/vector.c
--- hypre-2.11.2/src/seq_mv/vector.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/seq_mv/vector.c	2017-10-20 17:42:22.000000000 +0000
@@ -18,6 +18,11 @@
 
 #include "seq_mv.h"
 #include <assert.h>
+#ifdef HYPRE_USE_GPU
+#include <cublas_v2.h>
+#include <cusparse.h>
+#include "gpukernels.h"
+#endif
 
 /*--------------------------------------------------------------------------
  * hypre_SeqVectorCreate
@@ -28,7 +33,11 @@
 {
    hypre_Vector  *vector;
 
-   vector = hypre_CTAlloc(hypre_Vector, 1);
+   vector = hypre_HostCTAlloc(hypre_Vector, 1);
+
+#ifdef HYPRE_USE_GPU
+   vector->on_device=0;
+#endif
 
    hypre_VectorData(vector) = NULL;
    hypre_VectorSize(vector) = size;
@@ -69,7 +78,7 @@
       {
          hypre_TFree(hypre_VectorData(vector));
       }
-      hypre_TFree(vector);
+      hypre_HostTFree(vector);
    }
 
    return ierr;
@@ -246,6 +255,10 @@
 hypre_SeqVectorSetConstantValues( hypre_Vector *v,
                                   HYPRE_Complex value )
 {
+#ifdef HYPRE_USE_GPU
+  VecSet(hypre_VectorData(v),hypre_VectorSize(v),value,HYPRE_STREAM(4));
+  return 0;
+#endif
 #ifdef HYPRE_PROFILE
    hypre_profile_times[HYPRE_TIMER_ID_BLAS1] -= hypre_MPI_Wtime();
 #endif
@@ -310,6 +323,9 @@
 hypre_SeqVectorCopy( hypre_Vector *x,
                      hypre_Vector *y )
 {
+#ifdef HYPRE_USE_GPU
+  return hypre_SeqVectorCopyDevice(x,y);
+#endif
 #ifdef HYPRE_PROFILE
    hypre_profile_times[HYPRE_TIMER_ID_BLAS1] -= hypre_MPI_Wtime();
 #endif
@@ -394,7 +410,10 @@
 #ifdef HYPRE_PROFILE
    hypre_profile_times[HYPRE_TIMER_ID_BLAS1] -= hypre_MPI_Wtime();
 #endif
-
+   
+#ifdef HYPRE_USE_GPU
+   return VecScaleScalar(y->data,alpha, hypre_VectorSize(y),HYPRE_STREAM(4));
+#endif
    HYPRE_Complex *y_data = hypre_VectorData(y);
    HYPRE_Int      size   = hypre_VectorSize(y);
            
@@ -426,6 +445,9 @@
                      hypre_Vector *x,
                      hypre_Vector *y     )
 {
+#ifdef  HYPRE_USE_GPU
+  return hypre_SeqVectorAxpyDevice(alpha,x,y);
+#endif
 #ifdef HYPRE_PROFILE
    hypre_profile_times[HYPRE_TIMER_ID_BLAS1] -= hypre_MPI_Wtime();
 #endif
@@ -460,6 +482,9 @@
 HYPRE_Real   hypre_SeqVectorInnerProd( hypre_Vector *x,
                                        hypre_Vector *y )
 {
+#ifdef HYPRE_USE_GPU
+  return hypre_SeqVectorInnerProdDevice(x,y);
+#endif
 #ifdef HYPRE_PROFILE
    hypre_profile_times[HYPRE_TIMER_ID_BLAS1] -= hypre_MPI_Wtime();
 #endif
@@ -506,3 +531,133 @@
 
    return sum;
 }
+
+#ifdef HYPRE_USE_GPU
+/* Sums of the absolute value of the elements for comparison to cublas device side routine */
+HYPRE_Complex hypre_VectorSumAbsElts( hypre_Vector *vector )
+{
+   HYPRE_Complex  sum = 0;
+   HYPRE_Complex *data = hypre_VectorData( vector );
+   HYPRE_Int      size = hypre_VectorSize( vector );
+   HYPRE_Int      i;
+
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(i) reduction(+:sum) HYPRE_SMP_SCHEDULE
+#endif
+   for ( i=0; i<size; ++i ) sum += fabs(data[i]); 
+
+   return sum;
+}
+HYPRE_Int
+hypre_SeqVectorCopyDevice( hypre_Vector *x,
+                     hypre_Vector *y )
+{
+  
+  HYPRE_Complex *x_data = hypre_VectorData(x);
+  HYPRE_Complex *y_data = hypre_VectorData(y);
+  HYPRE_Int      size   = hypre_VectorSize(x);
+  HYPRE_Int      size_y   = hypre_VectorSize(y);
+  
+  HYPRE_Int      i;
+  
+  HYPRE_Int      ierr = 0;
+  
+  if (size > size_y) size = size_y;
+  size *=hypre_VectorNumVectors(x);
+  PUSH_RANGE_PAYLOAD("VECCOPYDEVICE",2,size);
+  hypre_SeqVectorPrefetchToDevice(x);
+  hypre_SeqVectorPrefetchToDevice(y);
+  VecCopy(y_data,x_data,size,HYPRE_STREAM(4));
+  cudaStreamSynchronize(HYPRE_STREAM(4));
+  POP_RANGE;
+  return ierr;
+}
+HYPRE_Int
+hypre_SeqVectorAxpyDevice( HYPRE_Complex alpha,
+                     hypre_Vector *x,
+		     hypre_Vector *y     ){
+
+  HYPRE_Complex *x_data = hypre_VectorData(x);
+  HYPRE_Complex *y_data = hypre_VectorData(y);
+  HYPRE_Int      size   = hypre_VectorSize(x);
+           
+  HYPRE_Int      i;
+           
+  HYPRE_Int      ierr = 0;
+  cublasStatus_t stat;
+  size *=hypre_VectorNumVectors(x);
+
+  PUSH_RANGE_PAYLOAD("DEVAXPY",0,hypre_VectorSize(x));
+  hypre_SeqVectorPrefetchToDevice(x);
+  hypre_SeqVectorPrefetchToDevice(y);
+  static cublasHandle_t handle;
+  static HYPRE_Int firstcall=1;
+  if (firstcall){
+    handle=getCublasHandle();
+    firstcall=0;
+  }
+  cublasErrchk(cublasDaxpy(handle,(HYPRE_Int)size,&alpha,x_data,1,y_data,1));
+  gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(4)));
+  POP_RANGE;
+  return ierr;
+}
+
+HYPRE_Real   hypre_SeqVectorInnerProdDevice( hypre_Vector *x,
+                                       hypre_Vector *y )
+{
+  PUSH_RANGE_PAYLOAD("DEVDOT",4,hypre_VectorSize(x));
+  static cublasHandle_t handle;
+  static HYPRE_Int firstcall=1;
+
+  HYPRE_Complex *x_data = hypre_VectorData(x);
+  HYPRE_Complex *y_data = hypre_VectorData(y);
+  HYPRE_Int      size   = hypre_VectorSize(x);
+           
+  HYPRE_Int      i;
+
+  HYPRE_Real     result = 0.0;
+  cublasStatus_t stat;
+  if (firstcall){
+    handle = getCublasHandle();
+    firstcall=0;
+  }
+  PUSH_RANGE_PAYLOAD("DEVDOT-PRFETCH",5,hypre_VectorSize(x));
+  //hypre_SeqVectorPrefetchToDevice(x);
+  //hypre_SeqVectorPrefetchToDevice(y);
+  POP_RANGE;
+  PUSH_RANGE_PAYLOAD("DEVDOT-ACTUAL",0,hypre_VectorSize(x));
+  stat=cublasDdot(handle, (HYPRE_Int)size,
+		  x_data, 1,
+		  y_data, 1,
+		  &result);
+  gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(4)));
+  POP_RANGE;
+  POP_RANGE;
+  return result;
+  
+}
+void hypre_SeqVectorPrefetchToDevice(hypre_Vector *x){
+  if (hypre_VectorSize(x)==0) return;
+  PUSH_RANGE("hypre_SeqVectorPrefetchToDevice",0);
+  gpuErrchk(cudaMemPrefetchAsync(hypre_VectorData(x),hypre_VectorSize(x)*sizeof(HYPRE_Complex),HYPRE_DEVICE,HYPRE_STREAM(4)));
+  gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(4)));
+  POP_RANGE;
+}
+void hypre_SeqVectorPrefetchToHost(hypre_Vector *x){
+  if (hypre_VectorSize(x)==0) return;
+  PUSH_RANGE("hypre_SeqVectorPrefetchToHost",0);
+  gpuErrchk(cudaMemPrefetchAsync(hypre_VectorData(x),hypre_VectorSize(x)*sizeof(HYPRE_Complex),cudaCpuDeviceId,HYPRE_STREAM(4)));
+  gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(4)));
+  POP_RANGE;
+}
+void hypre_SeqVectorPrefetchToDeviceInStream(hypre_Vector *x, HYPRE_Int index){
+  if (hypre_VectorSize(x)==0) return;
+  PUSH_RANGE("hypre_SeqVectorPrefetchToDevice",0);
+  gpuErrchk(cudaMemPrefetchAsync(hypre_VectorData(x),hypre_VectorSize(x)*sizeof(HYPRE_Complex),HYPRE_DEVICE,HYPRE_STREAM(index)));
+  gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(index)));
+  POP_RANGE;
+}
+hypre_int hypre_SeqVectorIsManaged(hypre_Vector *x){
+  return pointerIsManaged((void*)hypre_VectorData(x));
+}
+#endif
diff -Nru hypre-2.11.2/src/sstruct_ls/fac_amr_fcoarsen.c hypre-2.13.0/src/sstruct_ls/fac_amr_fcoarsen.c
--- hypre-2.11.2/src/sstruct_ls/fac_amr_fcoarsen.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/sstruct_ls/fac_amr_fcoarsen.c	2017-10-20 17:42:22.000000000 +0000
@@ -53,13 +53,13 @@
    }
 
 
-#define AbsStencilShape(stencil, abs_shape)     \
-   {                                            \
-      HYPRE_Int ii,jj,kk;                       \
-      ii = hypre_IndexX(stencil);               \
-      jj = hypre_IndexY(stencil);               \
-      kk = hypre_IndexZ(stencil);               \
-      abs_shape= hypre_abs(ii) + hypre_abs(jj) + hypre_abs(kk);   \
+#define AbsStencilShape(stencil, abs_shape)                     \
+   {                                                            \
+      HYPRE_Int ii,jj,kk;                                       \
+      ii = hypre_IndexX(stencil);                               \
+      jj = hypre_IndexY(stencil);                               \
+      kk = hypre_IndexZ(stencil);                               \
+      abs_shape= hypre_abs(ii) + hypre_abs(jj) + hypre_abs(kk); \
    }
 
 /*--------------------------------------------------------------------------
@@ -157,7 +157,7 @@
 
    HYPRE_Int               i, j, k, l, m, n, ll, kk, jj;
    HYPRE_Int               nvars, var1, var2, var2_start; 
-   HYPRE_Int               iA, iAc, iA_shift_z, iA_shift_zy, iA_shift_zyx;
+   HYPRE_Int               iA_shift_z, iA_shift_zy, iA_shift_zyx;
 
    hypre_Index             lindex;
    hypre_Index             index1, index2;
@@ -1868,19 +1868,17 @@
                   /*----------------------------------------------------------------
                    * Loop over interior grid box. 
                    *----------------------------------------------------------------*/
+
                   hypre_BoxGetSize(&fine_box, loop_size);
 
-                  hypre_BoxLoop2Begin(ndim, loop_size,
-                                      A_dbox, fstart, stridef, iA,
-                                      crse_dbox, cstart, stridec, iAc);
+                  hypre_SerialBoxLoop2Begin(ndim, loop_size,
+                                            A_dbox, fstart, stridef, iA,
+                                            crse_dbox, cstart, stridec, iAc);
 #if 0
 #ifdef HYPRE_USING_OPENMP
 #pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc,i,rank,index1,index2,m,l,k,j,iA_shift_z,iA_shift_zy,iA_shift_zyx,stencil_i,sum,vals) HYPRE_SMP_SCHEDULE
 #endif
-#else
-                  hypre_BoxLoopSetOneBlock();
 #endif
-                  hypre_BoxLoop2For(iA, iAc)
                   {
                      for (i= 0; i< stencil_size; i++)
                      {
@@ -1986,7 +1984,7 @@
                      }
                      crse_ptrs[ rank_stencils[0] ][iAc]= sum;
                   }
-                  hypre_BoxLoop2End(iA, iAc);
+                  hypre_SerialBoxLoop2End(iA, iAc);
                }    /* end hypre_ForBoxI(fi, fbox_interior_ci) */
 
                /*------------------------------------------------------------------
@@ -2050,19 +2048,12 @@
                      /*--------------------------------------------------------------
                       * Loop over boundary grid box.
                       *--------------------------------------------------------------*/
+
                      hypre_BoxGetSize(&fine_box, loop_size);
 
-                     hypre_BoxLoop2Begin(ndim, loop_size,
-                                         A_dbox, fstart, stridef, iA,
-                                         crse_dbox, cstart, stridec, iAc);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc,i,rank,index1,index2,m,l,k,j,iA_shift_z,iA_shift_zy,iA_shift_zyx,stencil_i,temp3,ll,kk,jj,temp2,cnt1,index_temp,boxman_entry,found,Uventry,nUentries,ncols,rows,cols,vals2,sum,vals) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop2For(iA, iAc)
+                     hypre_SerialBoxLoop2Begin(ndim, loop_size,
+                                               A_dbox, fstart, stridef, iA,
+                                               crse_dbox, cstart, stridec, iAc);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         for (i= 0; i< stencil_size; i++)
@@ -2338,7 +2329,7 @@
                         hypre_TFree(temp3);
 
                      }
-                     hypre_BoxLoop2End(iA, iAc);
+                     hypre_SerialBoxLoop2End(iA, iAc);
 
                   }  /* hypre_ForBoxI(fi, fbox_bdy_ci_fi) */
                }      /* hypre_ForBoxArrayI(arrayi, fbox_bdy_ci) */
@@ -2940,7 +2931,7 @@
                   }
 
                   hypre_qsort1(interface_stencil_ranks[i], (HYPRE_Real *) temp1, 0,
-                         coarse_stencil_cnt[i]-1);
+                               coarse_stencil_cnt[i]-1);
 
                   /*---------------------------------------------------------------
                    * swap the stencil_vals to agree with the rank swapping.
@@ -3494,6 +3485,7 @@
 #endif
                hypre_BoxLoop1For(iA)
                {
+                  HYPRE_Int i;
                   for (i= 0; i< stencil_size; i++)
                   {
                      if (i != centre)
diff -Nru hypre-2.11.2/src/sstruct_ls/fac_amr_rap.c hypre-2.13.0/src/sstruct_ls/fac_amr_rap.c
--- hypre-2.11.2/src/sstruct_ls/fac_amr_rap.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/sstruct_ls/fac_amr_rap.c	2017-10-20 17:42:22.000000000 +0000
@@ -86,7 +86,6 @@
    hypre_Index                  index, stride, zero_index;
    HYPRE_Int                    nvars, var1, var2, part, cbox;
    HYPRE_Int                    i, j, k, size;
-   HYPRE_Int                    iA, iAc;
 
    HYPRE_Int                    myid;
    HYPRE_Int                    ierr= 0;
@@ -220,7 +219,7 @@
                                       smatrix_dbox, ilower, stride, iA,
                                       fac_smatrix_dbox, ilower, stride, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(iA, iAc)
                   {
@@ -277,7 +276,7 @@
                                       smatrix_dbox, ilower, stride, iA,
                                       fac_smatrix_dbox, ilower, stride, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(iA, iAc)
                   {
@@ -415,7 +414,7 @@
                                       smatrix_dbox, ilower, stride, iA,
                                       fac_smatrix_dbox, ilower, stride, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(iA, iAc)
                   {
diff -Nru hypre-2.11.2/src/sstruct_ls/fac_cf_coarsen.c hypre-2.13.0/src/sstruct_ls/fac_cf_coarsen.c
--- hypre-2.11.2/src/sstruct_ls/fac_cf_coarsen.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/sstruct_ls/fac_cf_coarsen.c	2017-10-20 17:42:22.000000000 +0000
@@ -48,17 +48,17 @@
          jj= -1;                                \
       if (kk==2)                                \
          kk= -1;                                \
-      hypre_SetIndex3(stencil, ii, jj, kk);      \
+      hypre_SetIndex3(stencil, ii, jj, kk);     \
    }
 
 
-#define AbsStencilShape(stencil, abs_shape)     \
-   {                                            \
-      HYPRE_Int ii,jj,kk;                       \
-      ii = hypre_IndexX(stencil);               \
-      jj = hypre_IndexY(stencil);               \
-      kk = hypre_IndexZ(stencil);               \
-      abs_shape= hypre_abs(ii) + hypre_abs(jj) + hypre_abs(kk);   \
+#define AbsStencilShape(stencil, abs_shape)                     \
+   {                                                            \
+      HYPRE_Int ii,jj,kk;                                       \
+      ii = hypre_IndexX(stencil);                               \
+      jj = hypre_IndexY(stencil);                               \
+      kk = hypre_IndexZ(stencil);                               \
+      abs_shape= hypre_abs(ii) + hypre_abs(jj) + hypre_abs(kk); \
    }
 
 /*--------------------------------------------------------------------------
@@ -130,7 +130,7 @@
    HYPRE_Int               rank, startrank;
    HYPRE_Real             *vals;
 
-   HYPRE_Int               i, j, iA;
+   HYPRE_Int               i, j;
    HYPRE_Int               nvars, var1; 
 
    hypre_Index             lindex, zero_index;
@@ -219,7 +219,7 @@
          hypre_StructMapCoarseToFine(hypre_BoxIMin(cgrid_box), zero_index,
                                      refine_factors, hypre_BoxIMin(&refined_box));
          hypre_SetIndex3(index1, refine_factors[0]-1, refine_factors[1]-1,
-                        refine_factors[2]-1);
+                         refine_factors[2]-1);
          hypre_StructMapCoarseToFine(hypre_BoxIMax(cgrid_box), index1,
                                      refine_factors, hypre_BoxIMax(&refined_box));
 
@@ -340,17 +340,9 @@
                   fgrid_cinterface= hypre_BoxArrayBox(cinterface_array, boxi);
                   hypre_CopyIndex(hypre_BoxIMin(fgrid_cinterface), node_extents);
                   hypre_BoxGetSize(fgrid_cinterface, loop_size);
-                    
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      A_dbox, node_extents, stridec, iA);
-#if 0 /* Are private static arrays a problem? */
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,lindex,i,index_temp,boxman_entry,rank,found,Uventry,nUentries,temp1,cnt1,ncols,rows,cols,temp2,vals,index2,index1,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(iA)
+
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            A_dbox, node_extents, stridec, iA);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      for (i= 0; i< stencil_size; i++)
@@ -482,7 +474,7 @@
                         }   /* if (Uventry != NULL) */
                      }       /* if (nUventries > 0) */
                   }
-                  hypre_BoxLoop1End(iA);
+                  hypre_SerialBoxLoop1End(iA);
                }  /* for (boxi= stencil_size; boxi< box_array_size; boxi++) */
             }     /* hypre_ForBoxArrayI(fi, cinterface_arrays) */
          }        /* hypre_ForBoxI(ci, cgrid_boxes) */
diff -Nru hypre-2.11.2/src/sstruct_ls/fac_interp2.c hypre-2.13.0/src/sstruct_ls/fac_interp2.c
--- hypre-2.11.2/src/sstruct_ls/fac_interp2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/sstruct_ls/fac_interp2.c	2017-10-20 17:42:22.000000000 +0000
@@ -739,9 +739,6 @@
    hypre_StructVector     *e_var;
    hypre_StructVector     *recv_var;
 
-   HYPRE_Int               xci;
-   HYPRE_Int               ei;
-
    HYPRE_Real           ***xcp;
    HYPRE_Real           ***ep;
 
@@ -967,17 +964,9 @@
             hypre_CopyIndex(hypre_BoxIMin(ownbox), startc);
             hypre_BoxGetSize(ownbox, loop_size);
 
-            hypre_BoxLoop2Begin(ndim, loop_size,
-                                e_dbox,  start,  stride,  ei,
-                                xc_dbox, startc, stridec, xci);
-#if 1
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ei,xci,lindex,imax,jmax,kmax,k,offset_kp1,zweight2,kshift,zweight1,j,offset_jp1,yweight2,jshift,yweight1,i,offset_ip1,xweight2,ishift,xweight1) HYPRE_SMP_SCHEDULE
-#endif
-#else
-            hypre_BoxLoopSetOneBlock();
-#endif
-            hypre_BoxLoop2For(ei, xci)
+            hypre_SerialBoxLoop2Begin(ndim, loop_size,
+                                      e_dbox,  start,  stride,  ei,
+                                      xc_dbox, startc, stridec, xci);
             {
                /*--------------------------------------------------------
                 * Linear interpolation. Determine the weights and the
@@ -1191,7 +1180,7 @@
                   }         /* for (j= 0; j< jmax; j++) */
                }            /* for (k= 0; k< kmax; k++) */ 
             }
-            hypre_BoxLoop2End(ei, xci);
+            hypre_SerialBoxLoop2End(ei, xci);
 
          }/* hypre_ForBoxI(bi, own_abox) */
       }   /* hypre_ForBoxArray(fi, fgrid_boxes) */
@@ -1297,7 +1286,7 @@
                for (j=0; j< jsize; j++)
                {
                   hypre_SetIndex3(temp_index2, 
-                                 ptr_ishift, j+ptr_jshift, k+ptr_kshift);
+                                  ptr_ishift, j+ptr_jshift, k+ptr_kshift);
                   xcp[k][j]= hypre_StructVectorBoxData(recv_var, bi) +
                      hypre_BoxOffsetDistance(xc_dbox, temp_index2);
                }
@@ -1306,17 +1295,9 @@
             hypre_CopyIndex(hypre_BoxIMin(ownbox), startc);
             hypre_BoxGetSize(ownbox, loop_size);
 
-            hypre_BoxLoop2Begin(ndim, loop_size,
-                                e_dbox,  start,  stride,  ei,
-                                xc_dbox, startc, stridec, xci);
-#if 1
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ei,xci,lindex,imax,jmax,kmax,k,offset_kp1,zweight2,kshift,zweight1,j,offset_jp1,yweight2,jshift,yweight1,i,offset_ip1,xweight2,ishift,xweight1) HYPRE_SMP_SCHEDULE
-#endif
-#else
-            hypre_BoxLoopSetOneBlock();
-#endif
-            hypre_BoxLoop2For(ei, xci)
+            hypre_SerialBoxLoop2Begin(ndim, loop_size,
+                                      e_dbox,  start,  stride,  ei,
+                                      xc_dbox, startc, stridec, xci);
             {
                /*--------------------------------------------------------
                 * Linear interpolation. Determine the weights and the
@@ -1533,7 +1514,7 @@
                   }         /* for (j= 0; j< jmax; j++) */
                }            /* for (k= 0; k< kmax; k++) */
             }
-            hypre_BoxLoop2End(ei, xci);
+            hypre_SerialBoxLoop2End(ei, xci);
 
          }  /* if (hypre_BoxVolume(ownbox)) */
       }     /* hypre_ForBoxI(bi, own_abox) */
diff -Nru hypre-2.11.2/src/sstruct_ls/fac_restrict2.c hypre-2.13.0/src/sstruct_ls/fac_restrict2.c
--- hypre-2.11.2/src/sstruct_ls/fac_restrict2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/sstruct_ls/fac_restrict2.c	2017-10-20 17:42:22.000000000 +0000
@@ -38,7 +38,7 @@
       ii = (ij%2);                              \
       jj = (ij-ii)/2;                           \
       kk = (rank-2*jj-ii)/4;                    \
-      hypre_SetIndex3(stencil, ii, jj, kk);      \
+      hypre_SetIndex3(stencil, ii, jj, kk);     \
    }
 
 /*--------------------------------------------------------------------------
@@ -60,8 +60,8 @@
 
    hypre_CommPkg       **interlevel_comm;
 /*   hypre_CommPkg       **intralevel_comm;*/ /* may need to build an intra comm so
-                                                 that each processor only fullwts its
-                                                 own fine data- may need to add contrib */
+     that each processor only fullwts its
+     own fine data- may need to add contrib */
 
 } hypre_FacSemiRestrictData2;
 
@@ -518,9 +518,6 @@
    hypre_StructVector     *xc_var;
    hypre_StructVector     *xf_var;
 
-   HYPRE_Int               xci;
-   HYPRE_Int               xfi;
-
    HYPRE_Real           ***xfp;
    HYPRE_Real           ***xcp;
    HYPRE_Real           ***xcp_temp;
@@ -730,17 +727,9 @@
          hypre_BoxGetSize(fgrid_box, temp_index1);
          hypre_StructMapFineToCoarse(temp_index1, temp_index2, rfactors, loop_size);
 
-         hypre_BoxLoop2Begin(ndim, loop_size,
-                             xf_dbox, start, stride,  xfi,
-                             xc_temp_dbox, startc, stridec, xci);
-#if 0 /* Are private static arrays a problem? */
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xfi,xci,imax,jmax,kmax,k,kcell,j,jcell,i,icell,ijkcell,temp_index2) HYPRE_SMP_SCHEDULE
-#endif
-#else
-         hypre_BoxLoopSetOneBlock();
-#endif
-         hypre_BoxLoop2For(xfi, xci)
+         hypre_SerialBoxLoop2Begin(ndim, loop_size,
+                                   xf_dbox, start, stride,  xfi,
+                                   xc_temp_dbox, startc, stridec, xci);
          {
             /*-----------------------------------------------------------------
              * Arithmetic average the refinement patch values to get 
@@ -804,7 +793,7 @@
             }
 
          }
-         hypre_BoxLoop2End(xfi, xci);
+         hypre_SerialBoxLoop2End(xfi, xci);
 
       }   /* hypre_ForBoxI(fi, fgrid_boxes) */
    }      /* for (var= 0; var< nvars; var++)*/
diff -Nru hypre-2.11.2/src/sstruct_ls/fac_setup2.c hypre-2.13.0/src/sstruct_ls/fac_setup2.c
--- hypre-2.11.2/src/sstruct_ls/fac_setup2.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/sstruct_ls/fac_setup2.c	2017-10-20 17:42:22.000000000 +0000
@@ -25,7 +25,7 @@
                  hypre_SStructVector  *b,
                  hypre_SStructVector  *x )
 {
-	hypre_FACData          *fac_data      =  (hypre_FACData*)fac_vdata;
+   hypre_FACData          *fac_data      =  (hypre_FACData*)fac_vdata;
 
    HYPRE_Int              *plevels       = (fac_data-> plevels);
    hypre_Index            *rfactors      = (fac_data-> prefinements);
@@ -105,7 +105,6 @@
    HYPRE_Int              *stencil_vars;
    HYPRE_Real             *values;
    HYPRE_Real             *A_smatrix_value;
-   HYPRE_Int               iA;
  
    HYPRE_Int              *nrows;
    HYPRE_Int             **ncols;
@@ -126,8 +125,8 @@
    HYPRE_Int               ierr = 0;
 /*hypre_SStructMatrix *nested_A;
 
-nested_A= hypre_TAlloc(hypre_SStructMatrix , 1);
-nested_A= hypre_CoarsenAMROp(fac_vdata, A);*/
+  nested_A= hypre_TAlloc(hypre_SStructMatrix , 1);
+  nested_A= hypre_CoarsenAMROp(fac_vdata, A);*/
 
    /* generate the composite operator with the computed coarse-grid operators */
    hypre_AMR_RAP(A_in, rfactors, &A_rap);
@@ -498,7 +497,7 @@
                                    sgrid_box, box_start, stride, k,
                                    A_smatrix_dbox, box_start, stride, iA);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,k,iA) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop2For(k, iA)
                {
@@ -573,7 +572,7 @@
                                       sgrid_box, box_start, stride, k,
                                       A_smatrix_dbox, box_start, stride, iA);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,k,iA) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(k, iA)
                   {
diff -Nru hypre-2.11.2/src/sstruct_ls/fac_zero_stencilcoef.c hypre-2.13.0/src/sstruct_ls/fac_zero_stencilcoef.c
--- hypre-2.11.2/src/sstruct_ls/fac_zero_stencilcoef.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/sstruct_ls/fac_zero_stencilcoef.c	2017-10-20 17:42:22.000000000 +0000
@@ -13,13 +13,13 @@
 #include "_hypre_sstruct_ls.h"
 #include "fac.h"
 
-#define AbsStencilShape(stencil, abs_shape)     \
-   {                                            \
-      HYPRE_Int ii,jj,kk;                       \
-      ii = hypre_IndexX(stencil);               \
-      jj = hypre_IndexY(stencil);               \
-      kk = hypre_IndexZ(stencil);               \
-      abs_shape= hypre_abs(ii) + hypre_abs(jj) + hypre_abs(kk);   \
+#define AbsStencilShape(stencil, abs_shape)                     \
+   {                                                            \
+      HYPRE_Int ii,jj,kk;                                       \
+      ii = hypre_IndexX(stencil);                               \
+      jj = hypre_IndexY(stencil);                               \
+      kk = hypre_IndexZ(stencil);                               \
+      abs_shape= hypre_abs(ii) + hypre_abs(jj) + hypre_abs(kk); \
    }
 
 /*--------------------------------------------------------------------------
@@ -71,7 +71,6 @@
    HYPRE_Real            *ac_ptr;
    hypre_Index            loop_size;
 
-   HYPRE_Int              iac;
    HYPRE_Int              ci, i, j;
 
    HYPRE_Int              abs_shape;
@@ -176,7 +175,7 @@
                                                ac_dbox, hypre_BoxIMin(shift_ibox),
                                                stride, iac);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iac) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                            hypre_BoxLoop1For(iac)
                            {
@@ -258,7 +257,6 @@
    HYPRE_Real            *a_ptr;
    hypre_Index            loop_size;
 
-   HYPRE_Int              ia;
    HYPRE_Int              fi, fj, i, j;
    HYPRE_Int              abs_shape;
    HYPRE_Int              myid, proc;
@@ -402,7 +400,7 @@
                                             a_dbox, hypre_BoxIMin(&intersect_box),
                                             stride, ia);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ia) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                         hypre_BoxLoop1For(ia)
                         {
diff -Nru hypre-2.11.2/src/sstruct_ls/HYPRE_sstruct_int.c hypre-2.13.0/src/sstruct_ls/HYPRE_sstruct_int.c
--- hypre-2.11.2/src/sstruct_ls/HYPRE_sstruct_int.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/sstruct_ls/HYPRE_sstruct_int.c	2017-10-20 17:42:22.000000000 +0000
@@ -27,12 +27,12 @@
    hypre_StructVector *svector;
    HYPRE_Int           var;
 
-   srand( seed );
+   hypre_SeedRand( seed );
 
    for (var = 0; var < nvars; var++)
    {
       svector = hypre_SStructPVectorSVector(pvector, var);
-	  seed = rand();
+      seed = hypre_RandI();
       hypre_StructVectorSetRandomValues(svector, seed);
    }
 
@@ -47,12 +47,12 @@
    hypre_SStructPVector *pvector;
    HYPRE_Int             part;
 
-   srand( seed );
+   hypre_SeedRand( seed );
 
    for (part = 0; part < nparts; part++)
    {
       pvector = hypre_SStructVectorPVector(vector, part);
-	  seed = rand();
+      seed = hypre_RandI();
       hypre_SStructPVectorSetRandomValues(pvector, seed);
    }
 
diff -Nru hypre-2.11.2/src/sstruct_ls/maxwell_grad.c hypre-2.13.0/src/sstruct_ls/maxwell_grad.c
--- hypre-2.11.2/src/sstruct_ls/maxwell_grad.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/sstruct_ls/maxwell_grad.c	2017-10-20 17:42:22.000000000 +0000
@@ -326,15 +326,13 @@
                hypre_BoxGetSize(box_piece, loop_size);
                hypre_CopyIndex(hypre_BoxIMin(box_piece), start);
          
-               hypre_BoxLoop0Begin(ndim, loop_size);
+               hypre_SerialBoxLoop0Begin(ndim, loop_size);
 #if 0 /* Are private static arrays a problem? */
 #ifdef HYPRE_USING_OPENMP
 #pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,index,rank) HYPRE_SMP_SCHEDULE
 #endif
 #else
-               hypre_BoxLoopSetOneBlock();
 #endif
-               hypre_BoxLoop0For()
                {
                   hypre_BoxLoopGetIndex(lindex);
                   hypre_SetIndex3(index, lindex[0], lindex[1], lindex[2]);
@@ -344,7 +342,7 @@
                                                         &rank, matrix_type);
                   nflag[rank-start_rank1]= 0; 
                }
-               hypre_BoxLoop0End();
+               hypre_SerialBoxLoop0End();
             }  /* if (hypre_BoxVolume(box_piece) < i) */
 
          }  /* for (m= 0; m< hypre_BoxArraySize(tmp_box_array1); m++) */
@@ -435,15 +433,13 @@
                      hypre_BoxGetSize(box_piece, loop_size);
                      hypre_CopyIndex(hypre_BoxIMin(box_piece), start);
 
-                     hypre_BoxLoop0Begin(ndim, loop_size);
+                     hypre_SerialBoxLoop0Begin(ndim, loop_size);
 #if 0 /* Are private static arrays a problem? */
 #ifdef HYPRE_USING_OPENMP
 #pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,index,rank) HYPRE_SMP_SCHEDULE
 #endif
 #else
-                     hypre_BoxLoopSetOneBlock();
 #endif
-                     hypre_BoxLoop0For()
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(index, lindex[0], lindex[1], lindex[2]);
@@ -453,7 +449,7 @@
                                                               &rank, matrix_type);
                         eflag[rank-start_rank2]= 0;
                      }
-                     hypre_BoxLoop0End();
+                     hypre_SerialBoxLoop0End();
                   }  /* if (hypre_BoxVolume(box_piece) < i) */
                }     /* for (k= 0; k< hypre_BoxArraySize(tmp_box_array1); k++) */
 
@@ -467,15 +463,13 @@
                      hypre_BoxGetSize(box_piece, loop_size);
                      hypre_CopyIndex(hypre_BoxIMin(box_piece), start);
 
-                     hypre_BoxLoop0Begin(ndim, loop_size);
+                     hypre_SerialBoxLoop0Begin(ndim, loop_size);
 #if 0 /* Are private static arrays a problem? */
 #ifdef HYPRE_USING_OPENMP
 #pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,index,rank) HYPRE_SMP_SCHEDULE
 #endif
 #else
-                     hypre_BoxLoopSetOneBlock();
 #endif
-                     hypre_BoxLoop0For()
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(index, lindex[0], lindex[1], lindex[2]);
@@ -485,7 +479,7 @@
                                                               &rank, matrix_type);
                         eflag[rank-start_rank2]= 0;
                      }
-                     hypre_BoxLoop0End();
+                     hypre_SerialBoxLoop0End();
                   }  /* if (hypre_BoxVolume(box_piece) < i) */
                }     /* for (k= 0; k< hypre_BoxArraySize(tmp_box_array2); k++) */
                hypre_BoxArrayDestroy(tmp_box_array2);
@@ -603,15 +597,13 @@
             /* Interior box- loop over each edge and find the row rank and 
                then the column ranks for the connected nodes. Change the 
                appropriate values to 1. */
-            hypre_BoxLoop0Begin(ndim, loop_size);
+            hypre_SerialBoxLoop0Begin(ndim, loop_size);
 #if 0
 #ifdef HYPRE_USING_OPENMP
 #pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,index,entry,m,i,nrows) HYPRE_SMP_SCHEDULE
 #endif
 #else
-            hypre_BoxLoopSetOneBlock();
 #endif
-            hypre_BoxLoop0For()
             {
                hypre_BoxLoopGetIndex(lindex);
                hypre_SetIndex3(index, lindex[0], lindex[1], lindex[2]);
@@ -641,7 +633,7 @@
                ncols[nrows]= 2;
                nrows++;
             }
-            hypre_BoxLoop0End();
+            hypre_SerialBoxLoop0End();
 
             /* now the boundary layers. To cases to consider: is the
                edge totally on the boundary or is the edge connected
@@ -672,15 +664,13 @@
                   hypre_BoxGetSize(&layer, loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&layer), start);
 
-                  hypre_BoxLoop0Begin(ndim, loop_size);
+                  hypre_SerialBoxLoop0Begin(ndim, loop_size);
 #if 0
 #ifdef HYPRE_USING_OPENMP
 #pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,index,entry,m,i,nrows) HYPRE_SMP_SCHEDULE
 #endif
 #else
-                  hypre_BoxLoopSetOneBlock();
 #endif
-                  hypre_BoxLoop0For()
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(index, lindex[0], lindex[1], lindex[2]);
@@ -755,7 +745,7 @@
                      }  /* if (eflag[m-start_rank2]) */
 
                   }
-                  hypre_BoxLoop0End();
+                  hypre_SerialBoxLoop0End();
                }  /* for (ndirection= 0; ndirection< 2; ndirection++) */
             }     /* for (d= 0; d< ndim; d++) */
 
diff -Nru hypre-2.11.2/src/sstruct_ls/maxwell_physbdy.c hypre-2.13.0/src/sstruct_ls/maxwell_physbdy.c
--- hypre-2.11.2/src/sstruct_ls/maxwell_physbdy.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/sstruct_ls/maxwell_physbdy.c	2017-10-20 17:42:22.000000000 +0000
@@ -476,15 +476,7 @@
                      hypre_BoxGetSize(box, loop_size);
                      hypre_CopyIndex(hypre_BoxIMin(box), start);
       
-                     hypre_BoxLoop0Begin(ndim, loop_size);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,index,boxman_entry,cnt) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop0For()
+                     hypre_SerialBoxLoop0Begin(ndim, loop_size);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(index, lindex[0], lindex[1], lindex[2]);
@@ -497,7 +489,7 @@
                         cnt++;
 
                      }
-                     hypre_BoxLoop0End();
+                     hypre_SerialBoxLoop0End();
                   }  /* hypre_ForBoxI(p, box_array) */
                }     /* hypre_ForBoxArrayI(m, fbdry) */
 
diff -Nru hypre-2.11.2/src/sstruct_ls/maxwell_PNedelec.c hypre-2.13.0/src/sstruct_ls/maxwell_PNedelec.c
--- hypre-2.11.2/src/sstruct_ls/maxwell_PNedelec.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/sstruct_ls/maxwell_PNedelec.c	2017-10-20 17:42:22.000000000 +0000
@@ -69,7 +69,7 @@
    HYPRE_Int              nvars, Edge_nvars, part, var;
    HYPRE_Int              tot_vars= 8;
 
-   HYPRE_Int              t, i, j, k, l, m, n, p, r, size;
+   HYPRE_Int              t, i, j, k, l, m, n, p, size;
 
    HYPRE_Int              ilower, iupper;
    HYPRE_Int              jlower, jupper;
@@ -538,16 +538,8 @@
 
             hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-            hypre_BoxLoop1Begin(ndim, loop_size,
-                                &copy_box, start, stride, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,entry,p,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-            hypre_BoxLoopSetOneBlock();
-#endif
-            hypre_BoxLoop1For(m)
+            hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                      &copy_box, start, stride, m);
             {
                hypre_BoxLoopGetIndex(lindex);
                hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -568,7 +560,7 @@
                   j++;
                }
             }
-            hypre_BoxLoop1End(m);
+            hypre_SerialBoxLoop1End(m);
 
          }   /* hypre_ForBoxI */
 
@@ -663,16 +655,8 @@
 
                      /* increase the loop_size by one in the Z plane direction */
                      loop_size[2]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -711,7 +695,7 @@
                            }  /* for (n= 1; n< rfactor[1]; n++) */
                         }     /* for (p= 0; p< rfactor[0]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
 
                      /* Y_Face */
                      hypre_CopyBox(cellbox, &copy_box);
@@ -744,16 +728,9 @@
                                                  loop_size);
                      hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
                      loop_size[1]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -788,7 +765,7 @@
                            }  /* for (n= 1; n< rfactor[2]; n++) */
                         }     /* for (p= 0; p< rfactor[0]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
                   }  /* hypre_ForBoxI(i, fboxes) */
 
                   break;
@@ -847,16 +824,9 @@
                      /* increase the loop_size by one in the Z_Face direction to
                         cover upper boundary Z_Faces. */
                      loop_size[2]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -893,7 +863,7 @@
                            }  /* for (n= 1; n< rfactor[0]; n++) */
                         }     /* for (p= 0; p< rfactor[1]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
 
                      /* X_Face */
                      hypre_CopyBox(cellbox, &copy_box);
@@ -927,16 +897,9 @@
                      hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
                      loop_size[0]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -973,7 +936,7 @@
                            }  /* for (n= 1; n< rfactor[2]; n++) */
                         }     /* for (p= 0; p< rfactor[1]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
                   }  /* hypre_ForBoxI(i, fboxes) */
 
                   break;
@@ -1031,16 +994,9 @@
 
                      /* increase the loop_size by one in the X_Face direction */
                      loop_size[0]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -1076,7 +1032,7 @@
                            }  /* for (n= 1; n< rfactor[1]; n++) */
                         }     /* for (p= 0; p< rfactor[2]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
 
                      /* Y_Face */
                      hypre_CopyBox(cellbox, &copy_box);
@@ -1109,16 +1065,9 @@
                      hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
                      loop_size[1]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -1154,7 +1103,7 @@
                            }  /* for (n= 1; n< rfactor[0]; n++) */
                         }     /* for (p= 0; p< rfactor[2]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
                   }  /* hypre_ForBoxI(i, fboxes) */
 
                   break;
@@ -1204,16 +1153,8 @@
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(m)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, m);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -1245,7 +1186,7 @@
                      }     /* for (p= 1; p< rfactor[0]; p++) */
 
                   }
-                  hypre_BoxLoop1End(m);
+                  hypre_SerialBoxLoop1End(m);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
@@ -1268,16 +1209,8 @@
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(m)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, m);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -1311,7 +1244,7 @@
                      }     /* for (p= 1; p< rfactor[1]; p++) */
 
                   }
-                  hypre_BoxLoop1End(m);
+                  hypre_SerialBoxLoop1End(m);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
@@ -1334,16 +1267,8 @@
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(m)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, m);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -1386,7 +1311,7 @@
                      }  /* for (p= 1; p< rfactor[2]; p++) */
 
                   }
-                  hypre_BoxLoop1End(m);
+                  hypre_SerialBoxLoop1End(m);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
@@ -1409,16 +1334,8 @@
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(m)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, m);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -1461,7 +1378,7 @@
                      }  /* for (p= 1; p< rfactor[2]; p++) */
 
                   }
-                  hypre_BoxLoop1End(m);
+                  hypre_SerialBoxLoop1End(m);
                }  /* hypre_ForBoxI(i, fboxes) */
 
                break;
@@ -1485,16 +1402,8 @@
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(m)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, m);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -1536,7 +1445,7 @@
                         var_index[0]-= (rfactor[0]-1);
                      }  /* for (p= 1; p< rfactor[1]; p++) */
                   }
-                  hypre_BoxLoop1End(m);
+                  hypre_SerialBoxLoop1End(m);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
@@ -1727,16 +1636,8 @@
 
             /* note that the correct cbox corresponding to this non-vanishing
                fbox is used. */
-            hypre_BoxLoop1Begin(ndim, loop_size,
-                                &copy_box, start, stride, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,j,entry,p,cindex,l,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-            hypre_BoxLoopSetOneBlock();
-#endif
-            hypre_BoxLoop1For(m)
+            hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                      &copy_box, start, stride, m);
             {
                hypre_BoxLoopGetIndex(lindex);
                hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -1774,8 +1675,7 @@
                   k++;
                }  /* if ((p <= upper_ranks[part][t]) && (p >= lower_ranks[part][t])) */
             }
-
-            hypre_BoxLoop1End(m);
+            hypre_SerialBoxLoop1End(m);
          }   /* hypre_ForBoxI */
 
          hypre_TFree(boxoffset);
@@ -1871,16 +1771,9 @@
 
                      /* increase the loop_size by one in the Z plane direction */
                      loop_size[2]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -1943,7 +1836,7 @@
                            }  /* for (n= 1; n< rfactor[1]; n++) */
                         }     /* for (p= 0; p< rfactor[0]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
 
                      /* Y plane direction */
                      hypre_CopyIndex(Edge_cstarts[part][i], cstart);
@@ -1980,16 +1873,9 @@
                      hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
                      loop_size[1]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2048,7 +1934,7 @@
                            }  /* for (n= 1; n< rfactor[2]; n++) */
                         }     /* for (p= 0; p< rfactor[0]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
                   }  /* hypre_ForBoxI(i, fboxes) */
                   break;
                }
@@ -2113,16 +1999,9 @@
 
                      /* increase the loop_size by one in the Z plane direction */
                      loop_size[2]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2183,7 +2062,7 @@
                            }  /* for (n= 1; n< rfactor[0]; n++) */
                         }     /* for (p= 0; p< rfactor[1]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
 
                      /* X_Face */
                      hypre_CopyBox(cellbox, &copy_box);
@@ -2221,16 +2100,9 @@
                      hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
                      loop_size[0]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+                     
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2289,7 +2161,7 @@
                         }     /* for (p= 0; p< rfactor[1]; p++) */
 
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
                   }  /* hypre_ForBoxI(i, fboxes) */
                   break;
                }
@@ -2350,16 +2222,9 @@
 
                      /* increase the loop_size by one in the X plane direction */
                      loop_size[0]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2420,7 +2285,7 @@
                            }  /* for (n= 1; n< rfactor[1]; n++) */
                         }     /* for (p= 0; p< rfactor[2]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
 
                      /* Y plane */
                      hypre_CopyBox(cellbox, &copy_box);
@@ -2457,16 +2322,9 @@
                      hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
                      loop_size[1]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2525,7 +2383,7 @@
                         }     /* for (p= 0; p< rfactor[2]; p++) */
 
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
                   }  /* hypre_ForBoxI(i, fboxes) */
                   break;
                }
@@ -2576,16 +2434,8 @@
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, r);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(r)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, r);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2627,7 +2477,7 @@
                      }     /* for (p= 1; p< rfactor[0]; p++) */
 
                   }
-                  hypre_BoxLoop1End(r);
+                  hypre_SerialBoxLoop1End(r);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
@@ -2652,16 +2502,8 @@
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, r);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(r)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, r);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2703,7 +2545,7 @@
                      }     /* for (p= 1; p< rfactor[1]; p++) */
 
                   }
-                  hypre_BoxLoop1End(r);
+                  hypre_SerialBoxLoop1End(r);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
@@ -2728,16 +2570,8 @@
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, r);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,m,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(r)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, r);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2806,7 +2640,7 @@
                         }     /* for (n= 1; n< rfactor[1]; n++) */
                      }        /* for (p= 1; p< rfactor[2]; p++) */
                   }
-                  hypre_BoxLoop1End(r);
+                  hypre_SerialBoxLoop1End(r);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
@@ -2831,16 +2665,8 @@
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, r);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,m,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(r)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, r);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2909,7 +2735,7 @@
                      }        /* for (p= 1; p< rfactor[2]; p++) */
 
                   }
-                  hypre_BoxLoop1End(r);
+                  hypre_SerialBoxLoop1End(r);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
@@ -2934,16 +2760,8 @@
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, r);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,m,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(r)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, r);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -3011,7 +2829,7 @@
                         }     /* for (n= 1; n< rfactor[0]; n++) */
                      }        /* for (p= 1; p< rfactor[1]; p++) */
                   }
-                  hypre_BoxLoop1End(r);
+                  hypre_SerialBoxLoop1End(r);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
diff -Nru hypre-2.11.2/src/sstruct_ls/maxwell_semi_interp.c hypre-2.13.0/src/sstruct_ls/maxwell_semi_interp.c
--- hypre-2.11.2/src/sstruct_ls/maxwell_semi_interp.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/sstruct_ls/maxwell_semi_interp.c	2017-10-20 17:42:22.000000000 +0000
@@ -48,7 +48,7 @@
 HYPRE_Int
 hypre_DestroyPTopology(void *PTopology_vdata)
 {
-	hypre_PTopology       *PTopology= (hypre_PTopology       *)PTopology_vdata;
+   hypre_PTopology       *PTopology= (hypre_PTopology       *)PTopology_vdata;
    HYPRE_Int              ierr     = 0;
 
    if (PTopology)
@@ -148,7 +148,7 @@
    HYPRE_Int              nvars, Face_nvars, Edge_nvars, part, var, box, fboxi;
    HYPRE_Int              tot_vars= 8;
 
-   HYPRE_Int              t, i, j, k, l, m, n, p, r;
+   HYPRE_Int              t, i, j, k, l, m, n, p;
 
    HYPRE_Int              ilower, iupper;
    HYPRE_Int              jlower, jupper;
@@ -1000,15 +1000,7 @@
 
          /* loop over each cell and find the row rank of Element_edge and then
             the column ranks of the connected fine edges. */
-         hypre_BoxLoop0Begin(ndim, loop_size);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,cindex,findex,entry,rank,nElements,low_index,t,hi_index,var,m,k,j,var_index,nElements_iedges) HYPRE_SMP_SCHEDULE
-#endif
-#else
-         hypre_BoxLoopSetOneBlock();
-#endif
-         hypre_BoxLoop0For()
+         hypre_SerialBoxLoop0Begin(ndim, loop_size);
          {
             hypre_BoxLoopGetIndex(lindex);
             hypre_SetIndex3(cindex, lindex[0], lindex[1], lindex[2]);
@@ -1122,7 +1114,7 @@
                }        /* for (t= 0; t< Face_nvars; t++) */
             }           /* if (ndim == 2) */
          }
-         hypre_BoxLoop0End();
+         hypre_SerialBoxLoop0End();
       }  /* hypre_ForBoxI(i, cboxes) */
    }     /* for (part= 0; part< nparts; part++) */
 
@@ -1277,15 +1269,7 @@
                      hypre_ClearIndex(stride);
                      hypre_CopyIndex(upper_shifts[part][fboxi], stride);
 
-                     hypre_BoxLoop0Begin(ndim, loop_size);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,cindex,entry,rank,nFaces,cell_index,findex,j,ilower,k,var_index,nFaces_iedges) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop0For()
+                     hypre_SerialBoxLoop0Begin(ndim, loop_size);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(cindex, lindex[0], lindex[1], lindex[2]);
@@ -1354,7 +1338,7 @@
                               (rank >= clower_ranks[part][var])) */
                      }
 
-                     hypre_BoxLoop0End();
+                     hypre_SerialBoxLoop0End();
                   }  /* hypre_ForBoxI(i, cboxes) */
                   break;
                }   /* case 2:  x_Faces-> y_iedges, z_iedges */
@@ -1382,15 +1366,7 @@
                      hypre_ClearIndex(stride);
                      hypre_CopyIndex(upper_shifts[part][fboxi], stride);
 
-                     hypre_BoxLoop0Begin(ndim, loop_size);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,cindex,entry,rank,nFaces,cell_index,findex,j,ilower,k,var_index,nFaces_iedges) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop0For()
+                     hypre_SerialBoxLoop0Begin(ndim, loop_size);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(cindex, lindex[0], lindex[1], lindex[2]);
@@ -1458,7 +1434,7 @@
                               (rank >= clower_ranks[part][var])) */
                      }
 
-                     hypre_BoxLoop0End();
+                     hypre_SerialBoxLoop0End();
                   }  /* hypre_ForBoxI(i, cboxes) */
                   break;
                }   /* case 3:  y_Faces-> x_iedges, z_iedges */
@@ -1486,15 +1462,7 @@
                      hypre_ClearIndex(stride);
                      hypre_CopyIndex(upper_shifts[part][fboxi], stride);
 
-                     hypre_BoxLoop0Begin(ndim, loop_size);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,entry,rank,nFaces,cell_index,findex,j,ilower,k,var_index,nFaces_iedges) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop0For()
+                     hypre_SerialBoxLoop0Begin(ndim, loop_size);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(cindex, lindex[0], lindex[1], lindex[2]);
@@ -1562,7 +1530,7 @@
                         }  /* if ((rank <= cupper_ranks[part][var]) &&
                               (rank >= clower_ranks[part][var])) */
                      }
-                     hypre_BoxLoop0End();
+                     hypre_SerialBoxLoop0End();
                   }  /* hypre_ForBoxI(i, cboxes) */
                   break;
                }   /* case 4:  z_Faces-> x_iedges, y_iedges */
@@ -1713,15 +1681,7 @@
             hypre_ClearIndex(stride);
             hypre_CopyIndex(upper_shifts[part][fboxi], stride);
 
-            hypre_BoxLoop0Begin(ndim, loop_size);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,entry,rank,nEdges,cell_index,findex,j,var_index,m,entry,rank,nEdges_iedges) HYPRE_SMP_SCHEDULE
-#endif
-#else
-            hypre_BoxLoopSetOneBlock();
-#endif
-            hypre_BoxLoop0For()
+            hypre_SerialBoxLoop0Begin(ndim, loop_size);
             {
                hypre_BoxLoopGetIndex(lindex);
                hypre_SetIndex3(cindex, lindex[0], lindex[1], lindex[2]);
@@ -1852,7 +1812,7 @@
                }   /* if ((rank <= cupper_ranks[part][var]) &&
                       (rank >= clower_ranks[part][var])) */
             }
-            hypre_BoxLoop0End();
+            hypre_SerialBoxLoop0End();
 
          }  /* hypre_ForBoxI(i, cboxes) */
       }     /* for (t= 0; t< Edge_nvars; t++) */
@@ -1935,15 +1895,7 @@
             hypre_BoxGetSize(cbox, loop_size);
             hypre_CopyIndex(hypre_BoxIMin(cbox), start);
 
-            hypre_BoxLoop0Begin(ndim, loop_size);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,cindex,t,var,entry,rank,nElements_Faces,var_index) HYPRE_SMP_SCHEDULE
-#endif
-#else
-            hypre_BoxLoopSetOneBlock();
-#endif
-            hypre_BoxLoop0For()
+            hypre_SerialBoxLoop0Begin(ndim, loop_size);
             {
                hypre_BoxLoopGetIndex(lindex);
                hypre_SetIndex3(cindex, lindex[0], lindex[1], lindex[2]);
@@ -1973,7 +1925,7 @@
                }
 
             }
-            hypre_BoxLoop0End();
+            hypre_SerialBoxLoop0End();
          }  /* hypre_ForBoxI(i, cboxes) */
       }  /* if (ndim == 3) */
 
@@ -1994,15 +1946,7 @@
          hypre_BoxGetSize(cbox, loop_size);
          hypre_CopyIndex(hypre_BoxIMin(cbox), start);
 
-         hypre_BoxLoop0Begin(ndim, loop_size);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,cindex,t,var,entry,rank,nElements_Edges,var_index) HYPRE_SMP_SCHEDULE
-#endif
-#else
-         hypre_BoxLoopSetOneBlock();
-#endif
-         hypre_BoxLoop0For()
+         hypre_SerialBoxLoop0Begin(ndim, loop_size);
          {
             hypre_BoxLoopGetIndex(lindex);
             hypre_SetIndex3(cindex, lindex[0], lindex[1], lindex[2]);
@@ -2161,7 +2105,7 @@
                }   /* switch (var) */
             }      /* for (t= 0; t< Edge_nvars; t++) */
          }
-         hypre_BoxLoop0End();
+         hypre_SerialBoxLoop0End();
       }  /* hypre_ForBoxI(i, cboxes) */
    }     /* for (part= 0; part< nparts; part++) */
 
@@ -2368,16 +2312,8 @@
 
             hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-            hypre_BoxLoop1Begin(ndim, loop_size,
-                                &copy_box, start, stride, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,entry,rank,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-            hypre_BoxLoopSetOneBlock();
-#endif
-            hypre_BoxLoop1For(m)
+            hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                      &copy_box, start, stride, m);
             {
                hypre_BoxLoopGetIndex(lindex);
                hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2398,7 +2334,7 @@
                   j++;
                }
             }
-            hypre_BoxLoop1End(m);
+            hypre_SerialBoxLoop1End(m);
 
          }   /* hypre_ForBoxI */
          hypre_TFree(boxoffset);
@@ -2499,16 +2435,8 @@
 
                      /* increase the loop_size by one in the Z plane direction */
                      loop_size[2]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,p,var_index,n,entry,rank,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2549,7 +2477,7 @@
                            }  /* for (n= 1; n< rfactor[1]; n++) */
                         }     /* for (p= 0; p< rfactor[0]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
 
                      /* Y_Face */
                      hypre_CopyBox(cellbox, &copy_box);
@@ -2582,16 +2510,9 @@
 
                      hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
                      loop_size[1]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j,l) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2631,7 +2552,7 @@
                            }  /* for (n= 1; n< rfactor[2]; n++) */
                         }     /* for (p= 0; p< rfactor[0]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
                   }  /* hypre_ForBoxI(i, fboxes) */
 
                   break;
@@ -2690,16 +2611,9 @@
 
                      /* reset and then increase the loop_size by one in the Z_Face direction */
                      loop_size[2]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j,l) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2738,7 +2652,7 @@
                            }  /* for (n= 1; n< rfactor[0]; n++) */
                         }     /* for (p= 0; p< rfactor[1]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
 
                      /* X_Face */
                      hypre_CopyBox(cellbox, &copy_box);
@@ -2772,16 +2686,9 @@
                      hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
                      loop_size[0]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j,l) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2820,7 +2727,7 @@
                            }  /* for (n= 1; n< rfactor[2]; n++) */
                         }     /* for (p= 0; p< rfactor[1]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
                   }  /* hypre_ForBoxI(i, fboxes) */
 
                   break;
@@ -2879,16 +2786,9 @@
 
                      /* increase the loop_size by one in the X_Face direction */
                      loop_size[0]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j,l) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2930,7 +2830,7 @@
                            }  /* for (n= 1; n< rfactor[1]; n++) */
                         }     /* for (p= 0; p< rfactor[2]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
 
                      /* Y_Face */
                      hypre_CopyBox(cellbox, &copy_box);
@@ -2964,16 +2864,9 @@
                      hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
                      loop_size[1]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j,l) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -3014,7 +2907,7 @@
                            }  /* for (n= 1; n< rfactor[0]; n++) */
                         }     /* for (p= 0; p< rfactor[2]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
                   }  /* hypre_ForBoxI(i, fboxes) */
 
                   break;
@@ -3064,16 +2957,8 @@
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(m)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, m);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -3105,7 +2990,7 @@
                      }     /* for (p= 1; p< rfactor[0]; p++) */
 
                   }
-                  hypre_BoxLoop1End(m);
+                  hypre_SerialBoxLoop1End(m);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
@@ -3128,16 +3013,8 @@
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(m)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, m);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -3170,7 +3047,7 @@
                         }  /* for (n= 0; n< rfactor[0]; n++) */
                      }     /* for (p= 1; p< rfactor[1]; p++) */
                   }
-                  hypre_BoxLoop1End(m);
+                  hypre_SerialBoxLoop1End(m);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
@@ -3193,16 +3070,8 @@
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(m)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, m);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -3245,7 +3114,7 @@
                      }  /* for (p= 1; p< rfactor[2]; p++) */
 
                   }
-                  hypre_BoxLoop1End(m);
+                  hypre_SerialBoxLoop1End(m);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
@@ -3268,16 +3137,8 @@
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(m)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, m);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -3320,7 +3181,7 @@
                      }  /* for (p= 1; p< rfactor[2]; p++) */
 
                   }
-                  hypre_BoxLoop1End(m);
+                  hypre_SerialBoxLoop1End(m);
                }  /* hypre_ForBoxI(i, fboxes) */
 
                break;
@@ -3344,16 +3205,8 @@
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(m)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, m);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -3396,7 +3249,7 @@
                      }  /* for (p= 1; p< rfactor[1]; p++) */
 
                   }
-                  hypre_BoxLoop1End(m);
+                  hypre_SerialBoxLoop1End(m);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
@@ -3596,16 +3449,8 @@
             /* note that the correct cbox corresponding to this non-vanishing
                fbox is used. */
 
-            hypre_BoxLoop1Begin(ndim, loop_size,
-                                &copy_box, start, stride, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,j,entry,cindex,var_index,rank,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-            hypre_BoxLoopSetOneBlock();
-#endif
-            hypre_BoxLoop1For(m)
+            hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                      &copy_box, start, stride, m);
             {
                hypre_BoxLoopGetIndex(lindex);
                hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -3643,7 +3488,7 @@
                   k++;
                }
             }
-            hypre_BoxLoop1End(m);
+            hypre_SerialBoxLoop1End(m);
          }   /* hypre_ForBoxI */
          hypre_TFree(boxoffset);
          hypre_TFree(suboffset);
@@ -3754,16 +3599,9 @@
 
                      /* increase the loop_size by one in the Z plane direction */
                      loop_size[2]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,face_w1,face_w2,off_proc_flag,stencil_vals,lower,diag,upper,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -3862,7 +3700,7 @@
                            }
                         }  /* for (p= 0; p< rfactor[0]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
 
                      /* Y_Face */
                      hypre_CopyIndex(Edge_cstarts[part][i], cstart);
@@ -3899,16 +3737,9 @@
                      hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
                      loop_size[1]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,face_w1,face_w2,off_proc_flag,stencil_vals,lower,diag,upper,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -4006,7 +3837,7 @@
                         }  /* for (p= 0; p< rfactor[0]; p++) */
 
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
                   }  /* hypre_ForBoxI(i, fboxes) */
                   break;
                }
@@ -4070,16 +3901,9 @@
 
                      /* increase the loop_size by one in the Z plane direction */
                      loop_size[2]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,face_w1,face_w2,off_proc_flag,stencil_vals,lower,diag,upper,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -4172,7 +3996,7 @@
                            }
                         }  /* for (p= 0; p< rfactor[1]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
 
                      /* X_Face */
                      hypre_CopyBox(cellbox, &copy_box);
@@ -4210,16 +4034,9 @@
                      hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
                      loop_size[0]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,face_w1,face_w2,off_proc_flag,stencil_vals,lower,diag,upper,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -4317,7 +4134,7 @@
                         }  /* for (p= 0; p< rfactor[1]; p++) */
 
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
                   }  /* hypre_ForBoxI(i, fboxes) */
                   break;
                }
@@ -4381,16 +4198,9 @@
 
                      /* increase the loop_size by one in the X plane direction */
                      loop_size[0]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,face_w1,face_w2,off_proc_flag,stencil_vals,lower,diag,upper,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -4483,7 +4293,7 @@
                            }
                         }  /* for (p= 0; p< rfactor[2]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
 
                      /* Y_Face */
                      hypre_CopyBox(cellbox, &copy_box);
@@ -4520,16 +4330,9 @@
                      hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
                      loop_size[1]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,face_w1,face_w2,off_proc_flag,stencil_vals,lower,diag,upper,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -4627,7 +4430,7 @@
                         }  /* for (p= 0; p< rfactor[2]; p++) */
 
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
                   }  /* hypre_ForBoxI(i, fboxes) */
                   break;
                }
@@ -4685,16 +4488,8 @@
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, r);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(r)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, r);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -4751,7 +4546,7 @@
                      }     /* for (p= 1; p< rfactor[0]; p++) */
 
                   }
-                  hypre_BoxLoop1End(r);
+                  hypre_SerialBoxLoop1End(r);
                }  /* hypre_ForBoxI(i, fboxes) */
 
                break;
@@ -4776,16 +4571,8 @@
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, r);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(r)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, r);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -4842,7 +4629,7 @@
                      }     /* for (p= 1; p< rfactor[1]; p++) */
 
                   }
-                  hypre_BoxLoop1End(r);
+                  hypre_SerialBoxLoop1End(r);
                }  /* hypre_ForBoxI(i, fboxes) */
 
                break;
@@ -4867,16 +4654,8 @@
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, r);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,m,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(r)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, r);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -5004,7 +4783,7 @@
                         }     /* for (n= 1; n< rfactor[1]; n++) */
                      }        /* for (p= 1; p< rfactor[2]; p++) */
                   }
-                  hypre_BoxLoop1End(r);
+                  hypre_SerialBoxLoop1End(r);
                }  /* hypre_ForBoxI(i, fboxes) */
 
                break;
@@ -5029,16 +4808,8 @@
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, r);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,m,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(r)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, r);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -5167,7 +4938,7 @@
                      }        /* for (p= 1; p< rfactor[2]; p++) */
 
                   }
-                  hypre_BoxLoop1End(r);
+                  hypre_SerialBoxLoop1End(r);
                }  /* hypre_ForBoxI(i, fboxes) */
 
                break;
@@ -5192,16 +4963,8 @@
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, r);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,m,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(r)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, r);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -5330,7 +5093,7 @@
                      }        /* for (p= 1; p< rfactor[1]; p++) */
 
                   }
-                  hypre_BoxLoop1End(r);
+                  hypre_SerialBoxLoop1End(r);
                }  /* hypre_ForBoxI(i, fboxes) */
 
                break;
diff -Nru hypre-2.11.2/src/sstruct_ls/maxwell_TV_setup.c hypre-2.13.0/src/sstruct_ls/maxwell_TV_setup.c
--- hypre-2.11.2/src/sstruct_ls/maxwell_TV_setup.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/sstruct_ls/maxwell_TV_setup.c	2017-10-20 17:42:22.000000000 +0000
@@ -32,7 +32,7 @@
                       hypre_SStructVector  *b_in,
                       hypre_SStructVector  *x_in)
 {
-	hypre_MaxwellData     *maxwell_TV_data = (hypre_MaxwellData     *)maxwell_vdata;
+   hypre_MaxwellData     *maxwell_TV_data = (hypre_MaxwellData     *)maxwell_vdata;
 
    MPI_Comm               comm = hypre_SStructMatrixComm(Aee_in);
 
@@ -387,13 +387,12 @@
                hypre_BoxGetSize(box_piece, loop_size);
                hypre_CopyIndex(hypre_BoxIMin(box_piece), start);
 
-               hypre_BoxLoop0Begin(ndim, loop_size);
+               hypre_SerialBoxLoop0Begin(ndim, loop_size);
 #if 0 /* Are private static arrays a problem? */
 #ifdef HYPRE_USING_OPENMP
 #pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,index,rank) HYPRE_SMP_SCHEDULE
 #endif
 #endif
-               hypre_BoxLoop0For()
                {
                   hypre_BoxLoopGetIndex(lindex);
                   hypre_SetIndex3(index, lindex[0], lindex[1], lindex[2]);
@@ -404,7 +403,7 @@
                   flag[rank-start_rank] = 0;
                   flag2[rank-start_rank]= rank;
                }
-               hypre_BoxLoop0End();
+               hypre_SerialBoxLoop0End();
             }  /* if (hypre_BoxVolume(box_piece) < i) */
          }  /* for (m= 0; m< hypre_BoxArraySize(tmp_box_array); m++) */
          hypre_BoxArrayDestroy(tmp_box_array);
@@ -490,7 +489,7 @@
                         hypre_SStructVectorParVector(bn),
                         hypre_SStructVectorParVector(xn));
    {
-	   amg_data = (hypre_ParAMGData*) amg_vdata;
+      amg_data = (hypre_ParAMGData*) amg_vdata;
 
       node_numlevels= hypre_ParAMGDataNumLevels(amg_data);
 
diff -Nru hypre-2.11.2/src/sstruct_ls/node_relax.c hypre-2.13.0/src/sstruct_ls/node_relax.c
--- hypre-2.11.2/src/sstruct_ls/node_relax.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/sstruct_ls/node_relax.c	2017-10-20 17:42:22.000000000 +0000
@@ -51,7 +51,7 @@
    HYPRE_Real           **A_loc;
    HYPRE_Real            *x_loc;
 
-   /* pointers for vector and matrix data */	
+   /* pointers for vector and matrix data */    
    HYPRE_Real          ***Ap;
    HYPRE_Real           **bp;
    HYPRE_Real           **xp;
@@ -118,7 +118,7 @@
 HYPRE_Int
 hypre_NodeRelaxDestroy( void *relax_vdata )
 {
-	hypre_NodeRelaxData  *relax_data = (hypre_NodeRelaxData  *)relax_vdata;
+   hypre_NodeRelaxData  *relax_data = (hypre_NodeRelaxData  *)relax_vdata;
    HYPRE_Int             i,vi;
    HYPRE_Int             nvars;
 
@@ -147,8 +147,8 @@
       hypre_TFree(relax_data -> compute_pkgs);
       hypre_SStructPVectorDestroy(relax_data -> t);
 
-      hypre_TFree(relax_data -> x_loc);
-      hypre_TFree((relax_data ->A_loc)[0]);
+      hypre_UMTFree(relax_data -> x_loc);
+      hypre_UMTFree((relax_data ->A_loc)[0]);
       hypre_TFree(relax_data -> A_loc);
       hypre_TFree(relax_data -> bp);
       hypre_TFree(relax_data -> xp);
@@ -277,9 +277,9 @@
     * Allocate storage used to invert local diagonal blocks
     *----------------------------------------------------------*/
 
-   x_loc    = hypre_TAlloc(HYPRE_Real   , hypre_NumThreads()*nvars);
+   x_loc    = hypre_UMTAlloc(HYPRE_Real   , hypre_NumThreads()*nvars);
    A_loc    = hypre_TAlloc(HYPRE_Real  *, hypre_NumThreads()*nvars);
-   A_loc[0] = hypre_TAlloc(HYPRE_Real   , hypre_NumThreads()*nvars*nvars);
+   A_loc[0] = hypre_UMTAlloc(HYPRE_Real   , hypre_NumThreads()*nvars*nvars);
    for (vi = 1; vi < hypre_NumThreads()*nvars; vi++)
    {
       A_loc[vi] = A_loc[0] + vi*nvars;
@@ -564,15 +564,8 @@
    hypre_Box             *x_data_box;
    hypre_Box             *t_data_box;
                         
-   HYPRE_Int              Ai;
-   HYPRE_Int              bi;
-   HYPRE_Int              xi;
-   HYPRE_Int              ti;
-                        
    HYPRE_Real           **tA_loc = (relax_data -> A_loc);
    HYPRE_Real            *tx_loc = (relax_data -> x_loc);
-   HYPRE_Real           **A_loc;
-   HYPRE_Real            *x_loc;
 
    HYPRE_Real          ***Ap = (relax_data -> Ap);
    HYPRE_Real           **bp = (relax_data -> bp);
@@ -704,12 +697,14 @@
                                    b_data_box, start, stride, bi,
                                    x_data_box, start, stride, xi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,bi,xi,vi,vj,x_loc,A_loc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop3For(Ai, bi, xi)
                {
-                  A_loc = &tA_loc[hypre_BoxLoopBlock()*nvars];
-                  x_loc = &tx_loc[hypre_BoxLoopBlock()*nvars];
+                  HYPRE_Real   **A_loc = &tA_loc[hypre_BoxLoopBlock()*nvars];
+                  HYPRE_Real    *x_loc = &tx_loc[hypre_BoxLoopBlock()*nvars];
+                  HYPRE_Int vi, vj;
+                   
                   /*------------------------------------------------
                    * Copy rhs and matrix for diagonal coupling
                    * (intra-nodal) into local storage.
@@ -840,6 +835,7 @@
 #endif
                hypre_BoxLoop2For(bi, ti)
                {
+                  HYPRE_Int vi;
                   /* Copy rhs into temp vector */ 
                   for (vi = 0; vi < nvars; vi++)
                   {
@@ -904,12 +900,14 @@
                                    A_data_box, start, stride, Ai,
                                    t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,ti,vi,vj,x_loc,A_loc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop2For(Ai, ti)
                {
-                  A_loc = &tA_loc[hypre_BoxLoopBlock()*nvars];
-                  x_loc = &tx_loc[hypre_BoxLoopBlock()*nvars];
+                  HYPRE_Real   **A_loc = &tA_loc[hypre_BoxLoopBlock()*nvars];
+                  HYPRE_Real    *x_loc = &tx_loc[hypre_BoxLoopBlock()*nvars];
+                  HYPRE_Int vi, vj;
+                  
                   /*------------------------------------------------
                    * Copy rhs and matrix for diagonal coupling
                    * (intra-nodal) into local storage.
diff -Nru hypre-2.11.2/src/sstruct_ls/sstruct_sharedDOFComm.c hypre-2.13.0/src/sstruct_ls/sstruct_sharedDOFComm.c
--- hypre-2.11.2/src/sstruct_ls/sstruct_sharedDOFComm.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/sstruct_ls/sstruct_sharedDOFComm.c	2017-10-20 17:42:22.000000000 +0000
@@ -48,7 +48,7 @@
 HYPRE_Int
 hypre_MaxwellOffProcRowDestroy(void *OffProcRow_vdata)
 {
-	hypre_MaxwellOffProcRow  *OffProcRow= (hypre_MaxwellOffProcRow  *)OffProcRow_vdata;
+   hypre_MaxwellOffProcRow  *OffProcRow= (hypre_MaxwellOffProcRow  *)OffProcRow_vdata;
    HYPRE_Int                 ierr= 0;
 
    if (OffProcRow)
@@ -689,15 +689,7 @@
                      hypre_BoxGetSize(&boxman_entry_box, loop_size);
                      hypre_CopyIndex(hypre_BoxIMin(&boxman_entry_box), start);
 
-                     hypre_BoxLoop0Begin(ndim, loop_size);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,index,entry,rank,tot_nsendRowsNcols,n,col_inds,values,send_ColsData_alloc,k,tot_sendColsData) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop0For()
+                     hypre_SerialBoxLoop0Begin(ndim, loop_size);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(index, lindex[0], lindex[1], lindex[2]);
@@ -742,7 +734,7 @@
                            }  /* if (rank <= end_rank && rank >= start_rank) */
                         }     /* if (entry) */
                      }
-                     hypre_BoxLoop0End();
+                     hypre_SerialBoxLoop0End();
 
                   }  /* if (proc != myproc) */
                }     /* for (m= 0; m< nboxman_entries; m++) */
diff -Nru hypre-2.11.2/src/sstruct_mv/headers hypre-2.13.0/src/sstruct_mv/headers
--- hypre-2.11.2/src/sstruct_mv/headers	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/sstruct_mv/headers	2017-10-20 17:42:22.000000000 +0000
@@ -19,6 +19,9 @@
 
 cat > $INTERNAL_HEADER <<@
 
+/*** DO NOT EDIT THIS FILE DIRECTLY (use 'headers' to generate) ***/
+
+
 #ifndef hypre_SSTRUCT_MV_HEADER
 #define hypre_SSTRUCT_MV_HEADER
 
diff -Nru hypre-2.11.2/src/sstruct_mv/_hypre_sstruct_mv.h hypre-2.13.0/src/sstruct_mv/_hypre_sstruct_mv.h
--- hypre-2.11.2/src/sstruct_mv/_hypre_sstruct_mv.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/sstruct_mv/_hypre_sstruct_mv.h	2017-10-20 17:42:22.000000000 +0000
@@ -1,4 +1,7 @@
 
+/*** DO NOT EDIT THIS FILE DIRECTLY (use 'headers' to generate) ***/
+
+
 #ifndef hypre_SSTRUCT_MV_HEADER
 #define hypre_SSTRUCT_MV_HEADER
 
diff -Nru hypre-2.11.2/src/sstruct_mv/sstruct_matrix.c hypre-2.13.0/src/sstruct_mv/sstruct_matrix.c
--- hypre-2.11.2/src/sstruct_mv/sstruct_matrix.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/sstruct_mv/sstruct_matrix.c	2017-10-20 17:42:22.000000000 +0000
@@ -728,7 +728,7 @@
             box = hypre_BoxArrayBox(boxes, b);
             hypre_CopyBox(box, ghost_box);
             if (matrix_type == HYPRE_SSTRUCT || matrix_type == HYPRE_STRUCT)
-	    {
+            {
                hypre_BoxGrowByArray(ghost_box, hypre_StructGridNumGhost(sgrid));
             }
             start = hypre_BoxIMin(box);
@@ -963,7 +963,7 @@
    hypre_IndexRef        start;
    hypre_Index           rs, cs;
    HYPRE_Int             row_base, col_base;
-   HYPRE_Int             d, ei, entry, ii, jj, i, mi, vi;
+   HYPRE_Int             ei, entry, ii, jj, i;
    HYPRE_Int             matrix_type = hypre_SStructMatrixObjectType(matrix);
 
    box  = hypre_BoxCreate(ndim);
@@ -982,7 +982,7 @@
       int_box = hypre_BoxCreate(ndim);
 
       nrows    = hypre_BoxVolume(vbox)*nentries;
-      ncols    = hypre_CTAlloc(HYPRE_Int, nrows);
+      ncols    = hypre_UMCTAlloc(HYPRE_Int, nrows);
 #ifdef HYPRE_USING_OPENMP
 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
 #endif
@@ -990,9 +990,9 @@
       {
          ncols[i] = 1;
       }
-      rows     = hypre_CTAlloc(HYPRE_Int, nrows);
-      cols     = hypre_CTAlloc(HYPRE_Int, nrows);
-      ijvalues = hypre_CTAlloc(HYPRE_Complex, nrows);
+      rows     = hypre_UMCTAlloc(HYPRE_Int, nrows);
+      cols     = hypre_UMCTAlloc(HYPRE_Int, nrows);
+      ijvalues = hypre_UMCTAlloc(HYPRE_Complex, nrows);
 
       hypre_SetIndex(stride, 1);
 
@@ -1045,14 +1045,18 @@
                      
                start = hypre_BoxIMin(int_box);
                hypre_BoxGetSize(int_box, loop_size);
-               hypre_BoxLoop2Begin(ndim, loop_size,
+               /*FIXME: It has to be the old boxloop */
+               zypre_BoxLoop2Begin(ndim, loop_size,
                                    int_box, start, stride, mi,
                                    vbox,    start, stride, vi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,mi,vi,index,d) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-               hypre_BoxLoop2For(mi, vi)
+               zypre_BoxLoop2For(mi, vi)
                {
+                  hypre_Index index;
+                  HYPRE_Int   d;
+                  
                   hypre_BoxLoopGetIndex(index);
                   rows[nrows + mi] = row_base;
                   cols[nrows + mi] = col_base;
@@ -1063,7 +1067,7 @@
                   }
                   ijvalues[nrows + mi] = values[ei + vi*nentries];
                }
-               hypre_BoxLoop2End(mi, vi);
+               zypre_BoxLoop2End(mi, vi);
 
                nrows += hypre_BoxVolume(int_box);
 
@@ -1099,10 +1103,10 @@
 
       hypre_TFree(boxman_entries);
       
-      hypre_TFree(ncols);
-      hypre_TFree(rows);
-      hypre_TFree(cols);
-      hypre_TFree(ijvalues);
+      hypre_UMTFree(ncols);
+      hypre_UMTFree(rows);
+      hypre_UMTFree(cols);
+      hypre_UMTFree(ijvalues);
 
       hypre_BoxDestroy(to_box);
       hypre_BoxDestroy(map_box);
@@ -1117,9 +1121,7 @@
    {
       /* RDF: THREAD (Check safety on UMatrixSetValues call) */
       hypre_BoxGetSize(vbox, loop_size);
-      hypre_BoxLoop0Begin(ndim, loop_size);
-      hypre_BoxLoopSetOneBlock();
-      hypre_BoxLoop0For()
+      hypre_SerialBoxLoop0Begin(ndim, loop_size);
       {
          hypre_BoxLoopGetIndex(index);
          hypre_AddIndexes(index, hypre_BoxIMin(vbox), ndim, index);
@@ -1127,7 +1129,7 @@
                                        nentries, entries, values, action);
          values += nentries;
       }
-      hypre_BoxLoop0End();
+      hypre_SerialBoxLoop0End();
    }
 
    hypre_BoxDestroy(box);
@@ -1383,7 +1385,7 @@
    hypre_SStructBoxManInfo *frinfo, *toinfo;
    HYPRE_Complex           *tvalues = NULL;
    HYPRE_Int                nfrentries, ntoentries, frpart, topart;
-   HYPRE_Int                entry, sentry, ei, fri, toi, vi, mi;
+   HYPRE_Int                entry, sentry, ei, fri, toi;
 
    pmatrix = hypre_SStructMatrixPMatrix(matrix, part);
 
@@ -1481,7 +1483,7 @@
                                          ibox1, start, stride, mi,
                                          vbox,  start, stride, vi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,mi,vi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                      hypre_BoxLoop2For(mi, vi)
                      {
@@ -1513,7 +1515,7 @@
                                          ibox1, start, stride, mi,
                                          vbox,  start, stride, vi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,mi,vi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                      hypre_BoxLoop2For(mi, vi)
                      {
diff -Nru hypre-2.11.2/src/sstruct_mv/sstruct_vector.c hypre-2.13.0/src/sstruct_mv/sstruct_vector.c
--- hypre-2.11.2/src/sstruct_mv/sstruct_vector.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/sstruct_mv/sstruct_vector.c	2017-10-20 17:42:22.000000000 +0000
@@ -614,11 +614,9 @@
    hypre_SStructPVector *pvector;
    hypre_StructVector   *y;
    hypre_Box            *y_data_box;
-   HYPRE_Int             yi;
    HYPRE_Complex        *yp;
    hypre_BoxArray       *boxes;
    hypre_Box            *box;
-   HYPRE_Int             bi;
    hypre_Index           loop_size;
    hypre_IndexRef        start;
    hypre_Index           stride;
@@ -655,7 +653,7 @@
                                 y_data_box, start, stride, yi,
                                 box,        start, stride, bi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,bi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(yi, bi)
             {
@@ -698,11 +696,9 @@
    hypre_SStructPVector *pvector;
    hypre_StructVector   *y;
    hypre_Box            *y_data_box;
-   HYPRE_Int             yi;
    HYPRE_Complex        *yp;
    hypre_BoxArray       *boxes;
    hypre_Box            *box;
-   HYPRE_Int             bi;
    hypre_Index           loop_size;
    hypre_IndexRef        start;
    hypre_Index           stride;
@@ -741,7 +737,7 @@
                                    y_data_box, start, stride, yi,
                                    box,        start, stride, bi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,bi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop2For(yi, bi)
                {
diff -Nru hypre-2.11.2/src/struct_ls/cyclic_reduction.c hypre-2.13.0/src/struct_ls/cyclic_reduction.c
--- hypre-2.11.2/src/struct_ls/cyclic_reduction.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/cyclic_reduction.c	2017-10-20 17:42:22.000000000 +0000
@@ -26,7 +26,7 @@
 #define hypre_CycRedSetCIndex(base_index, base_stride, level, cdir, cindex) \
    {                                                                    \
       if (level > 0)                                                    \
-         hypre_SetIndex3(cindex, 0, 0, 0);                               \
+         hypre_SetIndex3(cindex, 0, 0, 0);                              \
       else                                                              \
          hypre_CopyIndex(base_index,  cindex);                          \
       hypre_IndexD(cindex, cdir) += 0;                                  \
@@ -35,7 +35,7 @@
 #define hypre_CycRedSetFIndex(base_index, base_stride, level, cdir, findex) \
    {                                                                    \
       if (level > 0)                                                    \
-         hypre_SetIndex3(findex, 0, 0, 0);                               \
+         hypre_SetIndex3(findex, 0, 0, 0);                              \
       else                                                              \
          hypre_CopyIndex(base_index,  findex);                          \
       hypre_IndexD(findex, cdir) += 1;                                  \
@@ -44,7 +44,7 @@
 #define hypre_CycRedSetStride(base_index, base_stride, level, cdir, stride) \
    {                                                                    \
       if (level > 0)                                                    \
-         hypre_SetIndex3(stride, 1, 1, 1);                               \
+         hypre_SetIndex3(stride, 1, 1, 1);                              \
       else                                                              \
          hypre_CopyIndex(base_stride, stride);                          \
       hypre_IndexD(stride, cdir) *= 2;                                  \
@@ -238,11 +238,8 @@
 
    HYPRE_Real             *a_cc, *a_cw, *a_ce;
    HYPRE_Real             *ac_cc, *ac_cw, *ac_ce;
-                    
-   HYPRE_Int               iA, iAm1, iAp1;
-   HYPRE_Int               iAc;
                          
-   HYPRE_Int               offsetA; 
+   HYPRE_Int               offsetA;
                          
    stridef = cstride;
    hypre_SetIndex3(stridec, 1, 1, 1);
@@ -334,12 +331,12 @@
                              A_dbox, fstart, stridef, iA,
                              Ac_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc,iAm1,iAp1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop2For(iA, iAc)
          {
-            iAm1 = iA - offsetA;
-            iAp1 = iA + offsetA;
+            HYPRE_Int iAm1 = iA - offsetA;
+            HYPRE_Int iAp1 = iA + offsetA;
 
             ac_cw[iAc] = - a_cw[iA] *a_cw[iAm1] / a_cc[iAm1];
 
@@ -365,12 +362,12 @@
                              A_dbox, fstart, stridef, iA,
                              Ac_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc,iAm1,iAp1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop2For(iA, iAc)
          {
-            iAm1 = iA - offsetA;
-            iAp1 = iA + offsetA;
+            HYPRE_Int iAm1 = iA - offsetA;
+            HYPRE_Int iAp1 = iA + offsetA;
 
             ac_cw[iAc] = - a_cw[iA] *a_cw[iAm1] / a_cc[iAm1];
 
@@ -431,7 +428,7 @@
             hypre_BoxLoop1Begin(hypre_StructMatrixNDim(A), loop_size,
                                 Ac_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop1For(iAc)
             {
@@ -453,7 +450,7 @@
             hypre_BoxLoop1Begin(hypre_StructMatrixNDim(A), loop_size,
                                 Ac_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop1For(iAc)
             {
@@ -482,7 +479,7 @@
                             hypre_StructVector *b,
                             hypre_StructVector *x             )
 {
-	hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *) cyc_red_vdata;
+   hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *) cyc_red_vdata;
 
    MPI_Comm                comm        = (cyc_red_data -> comm);
    HYPRE_Int               cdir        = (cyc_red_data -> cdir);
@@ -594,7 +591,7 @@
    /*-----------------------------------------------------
     * Set up matrix and vector structures
     *-----------------------------------------------------*/
-
+   
    A_l  = hypre_TAlloc(hypre_StructMatrix *, num_levels);
    x_l  = hypre_TAlloc(hypre_StructVector *, num_levels);
 
@@ -615,8 +612,9 @@
       data_size += hypre_StructVectorDataSize(x_l[l+1]);
    }
 
-   data = hypre_SharedCTAlloc(HYPRE_Real, data_size);
-
+   //data = hypre_SharedCTAlloc(HYPRE_Real, data_size);
+   data =  hypre_DeviceCTAlloc(HYPRE_Real,data_size);
+   
    (cyc_red_data -> data) = data;
 
    for (l = 0; l < (num_levels - 1); l++)
@@ -739,7 +737,7 @@
                        hypre_StructVector *b,
                        hypre_StructVector *x             )
 {
-	hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *)cyc_red_vdata;
+   hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *)cyc_red_vdata;
 
    HYPRE_Int             num_levels      = (cyc_red_data -> num_levels);
    HYPRE_Int             cdir            = (cyc_red_data -> cdir);
@@ -773,11 +771,6 @@
    HYPRE_Real           *xp, *xwp, *xep;
    HYPRE_Real           *bp;
    HYPRE_Real           *xcp;
-                       
-   HYPRE_Int             Ai;
-   HYPRE_Int             xi;
-   HYPRE_Int             bi;
-   HYPRE_Int             xci;
                      
    hypre_Index           cindex;
    hypre_Index           stride;
@@ -821,12 +814,12 @@
 
       hypre_CopyIndex(hypre_BoxIMin(compute_box), start);
       hypre_BoxGetStrideSize(compute_box, base_stride, loop_size);
-
+          
       hypre_BoxLoop2Begin(hypre_StructVectorNDim(x), loop_size,
                           x_dbox, start, base_stride, xi,
                           b_dbox, start, base_stride, bi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,bi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(xi, bi)
       {
@@ -879,7 +872,7 @@
                              A_dbox, start, stride, Ai,
                              x_dbox, start, stride, xi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop2For(Ai, xi)
          {
@@ -952,13 +945,13 @@
                hypre_StructMapFineToCoarse(start, cindex, stride, startc);
 
                hypre_BoxGetStrideSize(compute_box, stride, loop_size);
-
+                           
                hypre_BoxLoop3Begin(hypre_StructVectorNDim(x), loop_size,
                                    A_dbox, start, stride, Ai,
                                    x_dbox, start, stride, xi,
                                    xc_dbox, startc, stridec, xci);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,xci) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop3For(Ai, xi, xci)
                {
@@ -1003,7 +996,7 @@
                           A_dbox, start, stride, Ai,
                           x_dbox, start, stride, xi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(Ai, xi)
       {
@@ -1128,7 +1121,7 @@
                                    A_dbox, start, stride, Ai,
                                    x_dbox, start, stride, xi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop2For(Ai, xi)
                {
@@ -1159,7 +1152,7 @@
                               hypre_Index  base_index,
                               hypre_Index  base_stride )
 {
-	hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *)cyc_red_vdata;
+   hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *)cyc_red_vdata;
    HYPRE_Int                d;
  
    for (d = 0; d < 3; d++)
@@ -1181,7 +1174,7 @@
 hypre_CyclicReductionSetCDir( void        *cyc_red_vdata,
                               HYPRE_Int    cdir )
 {
-	hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *)cyc_red_vdata;
+   hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *)cyc_red_vdata;
  
    (cyc_red_data -> cdir) = cdir;
 
@@ -1195,7 +1188,7 @@
 HYPRE_Int
 hypre_CyclicReductionDestroy( void *cyc_red_vdata )
 {
-	hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *)cyc_red_vdata;
+   hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *)cyc_red_vdata;
 
    HYPRE_Int l;
 
@@ -1215,7 +1208,7 @@
          hypre_ComputePkgDestroy(cyc_red_data -> up_compute_pkg_l[l]);
       }
       hypre_BoxArrayDestroy(cyc_red_data -> fine_points_l[l]);
-      hypre_SharedTFree(cyc_red_data -> data); 
+      hypre_DeviceTFree(cyc_red_data -> data);
       hypre_TFree(cyc_red_data -> grid_l);
       hypre_TFree(cyc_red_data -> fine_points_l);
       hypre_TFree(cyc_red_data -> A_l);
diff -Nru hypre-2.11.2/src/struct_ls/HYPRE_struct_int.c hypre-2.13.0/src/struct_ls/HYPRE_struct_int.c
--- hypre-2.11.2/src/struct_ls/HYPRE_struct_int.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/HYPRE_struct_int.c	2017-10-20 17:42:22.000000000 +0000
@@ -19,7 +19,6 @@
 {
    hypre_Box          *v_data_box;
                     
-   HYPRE_Int           vi;
    HYPRE_Real         *vp;
 
    hypre_BoxArray     *boxes;
@@ -34,7 +33,8 @@
     * Set the vector coefficients
     *-----------------------------------------------------------------------*/
 
-   srand( seed );
+//   srand( seed );
+   hypre_SeedRand(seed);
 
    hypre_SetIndex3(unit_stride, 1, 1, 1);
  
@@ -53,11 +53,12 @@
       hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size,
                           v_data_box, start, unit_stride, vi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,vi ) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop1For(vi)
       {
-         vp[vi] = 2.0*rand()/RAND_MAX - 1.0;
+//         vp[vi] = 2.0*rand()/RAND_MAX - 1.0;
+         vp[vi] = 2.0*hypre_Rand() - 1.0;
       }
       hypre_BoxLoop1End(vi);
    }
diff -Nru hypre-2.11.2/src/struct_ls/_hypre_struct_ls.h hypre-2.13.0/src/struct_ls/_hypre_struct_ls.h
--- hypre-2.11.2/src/struct_ls/_hypre_struct_ls.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/_hypre_struct_ls.h	2017-10-20 17:42:22.000000000 +0000
@@ -32,8 +32,6 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-
-
 /* coarsen.c */
 HYPRE_Int hypre_StructMapFineToCoarse ( hypre_Index findex , hypre_Index index , hypre_Index stride , hypre_Index cindex );
 HYPRE_Int hypre_StructMapCoarseToFine ( hypre_Index cindex , hypre_Index index , hypre_Index stride , hypre_Index findex );
diff -Nru hypre-2.11.2/src/struct_ls/HYPRE_struct_pcg.c hypre-2.13.0/src/struct_ls/HYPRE_struct_pcg.c
--- hypre-2.11.2/src/struct_ls/HYPRE_struct_pcg.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/HYPRE_struct_pcg.c	2017-10-20 17:42:22.000000000 +0000
@@ -203,10 +203,6 @@
    HYPRE_Real           *Ap;
    HYPRE_Real           *yp;
    HYPRE_Real           *xp;
-                       
-   HYPRE_Int             Ai;
-   HYPRE_Int             yi;
-   HYPRE_Int             xi;
                      
    hypre_Index           index;
    hypre_IndexRef        start;
@@ -240,7 +236,7 @@
                           x_data_box, start, stride, xi,
                           y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop3For(Ai, xi, yi)
       {
diff -Nru hypre-2.11.2/src/struct_ls/pfmg2_setup_rap.c hypre-2.13.0/src/struct_ls/pfmg2_setup_rap.c
--- hypre-2.11.2/src/struct_ls/pfmg2_setup_rap.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/pfmg2_setup_rap.c	2017-10-20 17:42:22.000000000 +0000
@@ -311,11 +311,7 @@
    HYPRE_Real           *rap_cc, *rap_cw, *rap_cs;
    HYPRE_Real           *rap_csw, *rap_cse;
 
-   HYPRE_Int             iA, iAm1, iAp1;
    HYPRE_Int             iA_offd, iA_offdm1, iA_offdp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
                       
    HYPRE_Int             yOffsetA, yOffsetA_diag, yOffsetA_offd; 
    HYPRE_Int             xOffsetP; 
@@ -475,21 +471,20 @@
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - yOffsetA;
-         iAp1 = iA + yOffsetA;
-
-         iP1 = iP - yOffsetP - xOffsetP;
+         HYPRE_Int iAm1 = iA - yOffsetA;
+         HYPRE_Int iAp1 = iA + yOffsetA;
+           
+         HYPRE_Int iP1 = iP - yOffsetP - xOffsetP;
          rap_csw[iAc] = rb[iR] * a_cw[iAm1] * pa[iP1];
-
+           
          iP1 = iP - yOffsetP;
          rap_cs[iAc] = rb[iR] * a_cc[iAm1] * pa[iP1]
             +          rb[iR] * a_cs[iAm1]
             +                   a_cs[iA]   * pa[iP1];
-
          iP1 = iP - yOffsetP + xOffsetP;
          rap_cse[iAc] = rb[iR] * a_ce[iAm1] * pa[iP1];
 
@@ -497,7 +492,7 @@
          rap_cw[iAc] =          a_cw[iA]
             +          rb[iR] * a_cw[iAm1] * pb[iP1]
             +          ra[iR] * a_cw[iAp1] * pa[iP1];
-
+           
          rap_cc[iAc] =          a_cc[iA]
             +          rb[iR] * a_cc[iAm1] * pb[iP]
             +          ra[iR] * a_cc[iAp1] * pa[iP]
@@ -529,14 +524,14 @@
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - yOffsetA_diag;
-         iAp1 = iA + yOffsetA_diag;
+         HYPRE_Int iAm1 = iA - yOffsetA_diag;
+         HYPRE_Int iAp1 = iA + yOffsetA_diag;
 
-         iP1 = iP - yOffsetP - xOffsetP;
+         HYPRE_Int iP1 = iP - yOffsetP - xOffsetP;
          rap_csw[iAc] = rb[iR] * a_cw_offdm1 * pa[iP1];
 
          iP1 = iP - yOffsetP;
@@ -817,11 +812,7 @@
    HYPRE_Real           *rap_cc, *rap_cw, *rap_cs;
    HYPRE_Real           *rap_csw, *rap_cse;
 
-   HYPRE_Int             iA, iAm1, iAp1;
    HYPRE_Int             iA_offd, iA_offdm1, iA_offdp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
                       
    HYPRE_Int             yOffsetA, yOffsetA_diag, yOffsetA_offd; 
    HYPRE_Int             xOffsetP; 
@@ -1000,14 +991,14 @@
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - yOffsetA;
-         iAp1 = iA + yOffsetA;
+         HYPRE_Int iAm1 = iA - yOffsetA;
+         HYPRE_Int iAp1 = iA + yOffsetA;
 
-         iP1 = iP - yOffsetP - xOffsetP;
+         HYPRE_Int iP1 = iP - yOffsetP - xOffsetP;
          rap_csw[iAc] = rb[iR] * a_cw[iAm1] * pa[iP1]
             +           rb[iR] * a_csw[iAm1]
             +                    a_csw[iA]  * pa[iP1];
@@ -1064,21 +1055,20 @@
       a_cnw_offd = a_cnw[iA_offd];
       a_cnw_offdm1 = a_cnw[iA_offdm1];
 
-
       hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                           P_dbox, cstart, stridec, iP,
                           R_dbox, cstart, stridec, iR,
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - yOffsetA_diag;
-         iAp1 = iA + yOffsetA_diag;
+         HYPRE_Int iAm1 = iA - yOffsetA_diag;
+         HYPRE_Int iAp1 = iA + yOffsetA_diag;
 
-         iP1 = iP - yOffsetP - xOffsetP;
+         HYPRE_Int iP1 = iP - yOffsetP - xOffsetP;
          rap_csw[iAc] = rb[iR] * a_cw_offdm1 * pa[iP1]
             +           rb[iR] * a_csw_offdm1
             +                    a_csw_offd  * pa[iP1];
@@ -1511,10 +1501,8 @@
    HYPRE_Real           *rap_ce, *rap_cn;
    HYPRE_Real           *rap_cnw, *rap_cne;
 
-   HYPRE_Int             iA, iAm1, iAp1, iA_offd, iA_offdm1, iA_offdp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
+   HYPRE_Int             iA_offd, iA_offdm1, iA_offdp1;
+
    HYPRE_Int             yOffsetA, yOffsetA_diag, yOffsetA_offd;
    HYPRE_Int             xOffsetP;
    HYPRE_Int             yOffsetP;
@@ -1668,20 +1656,21 @@
    if ( constant_coefficient_A == 0 )
    {
       /*hypre_printf("nosym 5.0.0\n");*/
+
       hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                           P_dbox, cstart, stridec, iP,
                           R_dbox, cstart, stridec, iR,
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - yOffsetA;
-         iAp1 = iA + yOffsetA;
+         HYPRE_Int iAm1 = iA - yOffsetA;
+         HYPRE_Int iAp1 = iA + yOffsetA;
 
-         iP1 = iP + yOffsetP + xOffsetP;
+         HYPRE_Int iP1 = iP + yOffsetP + xOffsetP;
          rap_cne[iAc] = ra[iR] * a_ce[iAp1] * pb[iP1];
 
          iP1 = iP + yOffsetP;
@@ -1713,20 +1702,20 @@
       a_ce_offd = a_ce[iA_offd];
       a_ce_offdm1 = a_ce[iA_offdm1];
       a_ce_offdp1 = a_ce[iA_offdp1];
-
+ 
       hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                           P_dbox, cstart, stridec, iP,
                           R_dbox, cstart, stridec, iR,
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAp1 = iA + yOffsetA_diag;
+         HYPRE_Int iAp1 = iA + yOffsetA_diag;
 
-         iP1 = iP + yOffsetP + xOffsetP;
+         HYPRE_Int iP1 = iP + yOffsetP + xOffsetP;
          rap_cne[iAc] = ra[iR] * a_ce_offdp1 * pb[iP1];
 
          iP1 = iP + yOffsetP;
@@ -1981,10 +1970,7 @@
    HYPRE_Real           *rap_ce, *rap_cn;
    HYPRE_Real           *rap_cnw, *rap_cne;
 
-   HYPRE_Int             iA, iAm1, iAp1, iA_offd, iA_offdm1, iA_offdp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
+   HYPRE_Int             iA_offd, iA_offdm1, iA_offdp1;
    HYPRE_Int             yOffsetA, yOffsetA_diag, yOffsetA_offd;
    HYPRE_Int             xOffsetP;
    HYPRE_Int             yOffsetP;
@@ -2164,20 +2150,21 @@
    if ( constant_coefficient_A==0 )
    {
       /*hypre_printf("nosym 9.0.0\n");*/
+
       hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                           P_dbox, cstart, stridec, iP,
                           R_dbox, cstart, stridec, iR,
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - yOffsetA;
-         iAp1 = iA + yOffsetA;
+         HYPRE_Int iAm1 = iA - yOffsetA;
+         HYPRE_Int iAp1 = iA + yOffsetA;
 
-         iP1 = iP + yOffsetP + xOffsetP;
+         HYPRE_Int iP1 = iP + yOffsetP + xOffsetP;
          rap_cne[iAc] = ra[iR] * a_ce[iAp1] * pb[iP1]
             +           ra[iR] * a_cne[iAp1]
             +                    a_cne[iA]  * pb[iP1];
@@ -2231,14 +2218,13 @@
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - yOffsetA_diag;
-         iAp1 = iA + yOffsetA_diag;
+         HYPRE_Int iAp1 = iA + yOffsetA_diag;
 
-         iP1 = iP + yOffsetP + xOffsetP;
+         HYPRE_Int iP1 = iP + yOffsetP + xOffsetP;
          rap_cne[iAc] = ra[iR] * a_ce_offdp1 * pb[iP1]
             +           ra[iR] * a_cne_offdp1
             +                    a_cne_offd  * pb[iP1];
diff -Nru hypre-2.11.2/src/struct_ls/pfmg3_setup_rap.c hypre-2.13.0/src/struct_ls/pfmg3_setup_rap.c
--- hypre-2.11.2/src/struct_ls/pfmg3_setup_rap.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/pfmg3_setup_rap.c	2017-10-20 17:42:22.000000000 +0000
@@ -359,10 +359,7 @@
    HYPRE_Real           *rap_cc, *rap_cw, *rap_cs;
    HYPRE_Real           *rap_bc, *rap_bw, *rap_be, *rap_bs, *rap_bn;
    HYPRE_Real           *rap_csw, *rap_cse;
-   HYPRE_Int             iA, iAm1, iAp1, iA_offd, iA_offdm1, iA_offdp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
+   HYPRE_Int             iA_offd, iA_offdm1, iA_offdp1;
                         
    HYPRE_Int             zOffsetA; 
    HYPRE_Int             zOffsetA_diag; 
@@ -565,14 +562,14 @@
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - zOffsetA;
-         iAp1 = iA + zOffsetA;
+         HYPRE_Int iAm1 = iA - zOffsetA;
+         HYPRE_Int iAp1 = iA + zOffsetA;
 
-         iP1 = iP - zOffsetP - yOffsetP;
+         HYPRE_Int iP1 = iP - zOffsetP - yOffsetP;
          rap_bs[iAc] = rb[iR] * a_cs[iAm1] * pa[iP1];
 
          iP1 = iP - zOffsetP - xOffsetP;
@@ -632,21 +629,21 @@
       a_bc_offdp1 = a_bc[iA_offdp1];
       a_ac_offd = a_ac[iA_offd];
       a_ac_offdm1 = a_ac[iA_offdm1];
-
+       
       hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                           P_dbox, cstart, stridec, iP,
                           R_dbox, cstart, stridec, iR,
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - zOffsetA_diag;
-         iAp1 = iA + zOffsetA_diag;
+         HYPRE_Int iAm1 = iA - zOffsetA_diag;
+         HYPRE_Int iAp1 = iA + zOffsetA_diag;
 
-         iP1 = iP - zOffsetP - yOffsetP;
+         HYPRE_Int iP1 = iP - zOffsetP - yOffsetP;
          rap_bs[iAc] = rb[iR] * a_cs_offdm1 * pa[iP1];
 
          iP1 = iP - zOffsetP - xOffsetP;
@@ -656,7 +653,7 @@
          rap_bc[iAc] =          a_bc_offd   * pa[iP1]
             +          rb[iR] * a_cc[iAm1] * pa[iP1]
             +          rb[iR] * a_bc_offdm1;
- 
+
          iP1 = iP - zOffsetP + xOffsetP;
          rap_be[iAc] = rb[iR] * a_ce_offdm1 * pa[iP1];
  
@@ -1015,11 +1012,8 @@
    HYPRE_Real           *rap_csw, *rap_cse;
    HYPRE_Real           *rap_bsw, *rap_bse, *rap_bnw, *rap_bne;
 
-   HYPRE_Int             iA, iAm1, iAp1, iA_offd, iA_offdm1, iA_offdp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
-                        
+   HYPRE_Int             iA_offd, iA_offdm1, iA_offdp1;
+            
    HYPRE_Int             zOffsetA; 
    HYPRE_Int             zOffsetA_diag; 
    HYPRE_Int             zOffsetA_offd; 
@@ -1305,14 +1299,14 @@
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - zOffsetA;
-         iAp1 = iA + zOffsetA;
+         HYPRE_Int iAm1 = iA - zOffsetA;
+         HYPRE_Int iAp1 = iA + zOffsetA;
 
-         iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
+         HYPRE_Int iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
          rap_bsw[iAc] = rb[iR] * a_csw[iAm1] * pa[iP1];
 
          iP1 = iP - zOffsetP - yOffsetP;
@@ -1327,7 +1321,7 @@
          rap_bw[iAc] = rb[iR] * a_cw[iAm1] * pa[iP1]
             +          rb[iR] * a_bw[iAm1]
             +                   a_bw[iA]   * pa[iP1];
- 
+
          iP1 = iP - zOffsetP; 
          rap_bc[iAc] =          a_bc[iA] * pa[iP1]
             +          rb[iR] * a_cc[iAm1] * pa[iP1]
@@ -1348,7 +1342,7 @@
  
          iP1 = iP - zOffsetP + yOffsetP + xOffsetP;
          rap_bne[iAc] = rb[iR] * a_cne[iAm1] * pa[iP1];
-
+         
          iP1 = iP - yOffsetP - xOffsetP;
          rap_csw[iAc] =         a_csw[iA]
             +          rb[iR] * a_csw[iAm1] * pb[iP1]
@@ -1435,14 +1429,14 @@
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - zOffsetA_diag;
-         iAp1 = iA + zOffsetA_diag;
+         HYPRE_Int iAm1 = iA - zOffsetA_diag;
+         HYPRE_Int iAp1 = iA + zOffsetA_diag;
 
-         iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
+         HYPRE_Int iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
          rap_bsw[iAc] = rb[iR] * a_csw_offdm1 * pa[iP1];
 
          iP1 = iP - zOffsetP - yOffsetP;
@@ -1457,12 +1451,12 @@
          rap_bw[iAc] = rb[iR] * a_cw_offdm1 * pa[iP1]
             +          rb[iR] * a_bw_offdm1
             +                   a_bw_offd   * pa[iP1];
- 
+
          iP1 = iP - zOffsetP; 
          rap_bc[iAc] =          a_bc_offd * pa[iP1]
             +          rb[iR] * a_cc[iAm1] * pa[iP1]
             +          rb[iR] * a_bc_offdm1;
- 
+
          iP1 = iP - zOffsetP + xOffsetP;
          rap_be[iAc] = rb[iR] * a_ce_offdm1 * pa[iP1]
             +          rb[iR] * a_be_offdm1
@@ -1483,7 +1477,7 @@
          rap_csw[iAc] =         a_csw_offd
             +          rb[iR] * a_csw_offdm1 * pb[iP1]
             +          ra[iR] * a_csw_offdp1 * pa[iP1];
-
+         
          iP1 = iP - yOffsetP;
          rap_cs[iAc] =          a_cs_offd
             +          rb[iR] * a_cs_offdm1 * pb[iP1]
@@ -1975,10 +1969,8 @@
    HYPRE_Real           *rap_csw, *rap_cse;
    HYPRE_Real           *rap_bsw, *rap_bse, *rap_bnw, *rap_bne;
 
-   HYPRE_Int             iA, iAm1, iAp1, iA_offd, iA_offdm1, iA_offdp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
+   HYPRE_Int             iA_offd, iA_offdm1, iA_offdp1;
+
                         
    HYPRE_Int             zOffsetA; 
    HYPRE_Int             zOffsetA_diag; 
@@ -2302,14 +2294,14 @@
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - zOffsetA;
-         iAp1 = iA + zOffsetA;
+         HYPRE_Int iAm1 = iA - zOffsetA;
+         HYPRE_Int iAp1 = iA + zOffsetA;
 
-         iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
+         HYPRE_Int iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
          rap_bsw[iAc] = rb[iR] * a_csw[iAm1] * pa[iP1]
             +           rb[iR] * a_bsw[iAm1]
             +                    a_bsw[iA]   * pa[iP1];
@@ -2333,7 +2325,7 @@
          rap_bc[iAc] =          a_bc[iA]   * pa[iP1]
             +          rb[iR] * a_cc[iAm1] * pa[iP1]
             +          rb[iR] * a_bc[iAm1];
- 
+
          iP1 = iP - zOffsetP + xOffsetP;
          rap_be[iAc] = rb[iR] * a_ce[iAm1] * pa[iP1]
             +          rb[iR] * a_be[iAm1]
@@ -2362,7 +2354,7 @@
             +                   a_asw[iA]   * pa[iP1]
             +          rb[iR] * a_asw[iAm1]
             +          ra[iR] * a_bsw[iAp1];
-
+         
          iP1 = iP - yOffsetP;
          rap_cs[iAc] =          a_cs[iA]
             +          rb[iR] * a_cs[iAm1] * pb[iP1]
@@ -2461,14 +2453,14 @@
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - zOffsetA_diag;
-         iAp1 = iA + zOffsetA_diag;
+         HYPRE_Int iAm1 = iA - zOffsetA_diag;
+         HYPRE_Int iAp1 = iA + zOffsetA_diag;
 
-         iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
+         HYPRE_Int iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
          rap_bsw[iAc] = rb[iR] * a_csw_offdm1 * pa[iP1]
             +           rb[iR] * a_bsw_offdm1
             +                    a_bsw_offd   * pa[iP1];
@@ -2492,7 +2484,7 @@
          rap_bc[iAc] =          a_bc_offd   * pa[iP1]
             +          rb[iR] * a_cc[iAm1] * pa[iP1]
             +          rb[iR] * a_bc_offdm1;
- 
+         
          iP1 = iP - zOffsetP + xOffsetP;
          rap_be[iAc] = rb[iR] * a_ce_offdm1 * pa[iP1]
             +          rb[iR] * a_be_offdm1
@@ -2512,7 +2504,7 @@
          rap_bne[iAc] = rb[iR] * a_cne_offdm1 * pa[iP1]
             +           rb[iR] * a_bne_offdm1
             +                    a_bne_offd   * pa[iP1];
-
+         
          iP1 = iP - yOffsetP - xOffsetP;
          rap_csw[iAc] =          a_csw_offd
             +          rb[iR] * a_csw_offdm1 * pb[iP1]
@@ -2530,7 +2522,7 @@
             +                   a_as_offd   * pa[iP1]
             +          rb[iR] * a_as_offdm1
             +          ra[iR] * a_bs_offdp1;
- 
+         
          iP1 = iP - yOffsetP + xOffsetP;
          rap_cse[iAc] =          a_cse_offd
             +          rb[iR] * a_cse_offdm1 * pb[iP1]
@@ -2539,7 +2531,7 @@
             +                   a_ase_offd   * pa[iP1]
             +          rb[iR] * a_ase_offdm1
             +          ra[iR] * a_bse_offdp1;
-
+                 
          iP1 = iP - xOffsetP;
          rap_cw[iAc] =          a_cw_offd
             +          rb[iR] * a_cw_offdm1 * pb[iP1]
@@ -3179,10 +3171,8 @@
    HYPRE_Real           *rap_ce, *rap_cn;
    HYPRE_Real           *rap_ac, *rap_aw, *rap_ae, *rap_as, *rap_an;
    HYPRE_Real           *rap_cnw, *rap_cne;
-   HYPRE_Int             iA, iAm1, iAp1, iA_offd, iA_offdm1, iA_offdp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
+   HYPRE_Int             iA_offd, iA_offdm1, iA_offdp1;
+
    HYPRE_Int             zOffsetA;
    HYPRE_Int             zOffsetA_diag; 
    HYPRE_Int             zOffsetA_offd; 
@@ -3382,42 +3372,42 @@
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - zOffsetA;
-         iAp1 = iA + zOffsetA;
-
-         iP1 = iP + zOffsetP + yOffsetP;
+         HYPRE_Int iAm1 = iA - zOffsetA;
+         HYPRE_Int iAp1 = iA + zOffsetA;
+           
+         HYPRE_Int iP1 = iP + zOffsetP + yOffsetP;
          rap_an[iAc] = ra[iR] * a_cn[iAp1] * pb[iP1];
 
          iP1 = iP + zOffsetP + xOffsetP;
          rap_ae[iAc] = ra[iR] * a_ce[iAp1] * pb[iP1];
- 
+           
          iP1 = iP + zOffsetP; 
          rap_ac[iAc] =          a_ac[iA]   * pb[iP1]
             +          ra[iR] * a_cc[iAp1] * pb[iP1]
             +          ra[iR] * a_ac[iAp1];
- 
+           
          iP1 = iP + zOffsetP - xOffsetP;
          rap_aw[iAc] = ra[iR] * a_cw[iAp1] * pb[iP1];
- 
+           
          iP1 = iP + zOffsetP - yOffsetP;
          rap_as[iAc] = ra[iR] * a_cs[iAp1] * pb[iP1];
- 
+           
          iP1 = iP + yOffsetP;
          rap_cn[iAc] =          a_cn[iA]
             +          rb[iR] * a_cn[iAm1] * pb[iP1]
             +          ra[iR] * a_cn[iAp1] * pa[iP1];
- 
+           
          iP1 = iP + xOffsetP;
          rap_ce[iAc] =          a_ce[iA]
             +          rb[iR] * a_ce[iAm1] * pb[iP1]
             +          ra[iR] * a_ce[iAp1] * pa[iP1];
-
+           
          rap_cnw[iAc] = 0.0;
- 
+           
          rap_cne[iAc] = 0.0;
       }
       hypre_BoxLoop4End(iP, iR, iA, iAc);
@@ -3444,14 +3434,14 @@
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - zOffsetA_diag;
-         iAp1 = iA + zOffsetA_diag;
+         //HYPRE_Int iAm1 = iA - zOffsetA_diag;
+         HYPRE_Int iAp1 = iA + zOffsetA_diag;
 
-         iP1 = iP + zOffsetP + yOffsetP;
+         HYPRE_Int iP1 = iP + zOffsetP + yOffsetP;
          rap_an[iAc] = ra[iR] * a_cn_offdp1 * pb[iP1];
 
          iP1 = iP + zOffsetP + xOffsetP;
@@ -3785,10 +3775,7 @@
    HYPRE_Real           *rap_ac, *rap_aw, *rap_ae, *rap_as, *rap_an;
    HYPRE_Real           *rap_cnw, *rap_cne;
    HYPRE_Real           *rap_asw, *rap_ase, *rap_anw, *rap_ane;
-   HYPRE_Int             iA, iAm1, iAp1, iA_offd, iA_offdm1, iA_offdp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
+   HYPRE_Int             iA_offd, iA_offdm1, iA_offdp1;
    HYPRE_Int             zOffsetA;
    HYPRE_Int             zOffsetA_diag; 
    HYPRE_Int             zOffsetA_offd; 
@@ -4068,20 +4055,21 @@
 
    if ( constant_coefficient_A == 0 )
    {
+
       hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                           P_dbox, cstart, stridec, iP,
                           R_dbox, cstart, stridec, iR,
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - zOffsetA;
-         iAp1 = iA + zOffsetA;
+         HYPRE_Int iAm1 = iA - zOffsetA;
+         HYPRE_Int iAp1 = iA + zOffsetA;
 
-         iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
+         HYPRE_Int iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
          rap_ane[iAc] = ra[iR] * a_cne[iAp1] * pb[iP1];
 
          iP1 = iP + zOffsetP + yOffsetP;
@@ -4106,7 +4094,7 @@
          rap_aw[iAc] = ra[iR] * a_cw[iAp1] * pb[iP1]
             +          ra[iR] * a_aw[iAp1]
             +                   a_aw[iA]   * pb[iP1];
- 
+
          iP1 = iP + zOffsetP - yOffsetP + xOffsetP;
          rap_ase[iAc] = ra[iR] * a_cse[iAp1] * pb[iP1];
 
@@ -4193,14 +4181,14 @@
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - zOffsetA_diag;
-         iAp1 = iA + zOffsetA_diag;
+         //HYPRE_Int iAm1 = iA - zOffsetA_diag;
+         HYPRE_Int iAp1 = iA + zOffsetA_diag;
 
-         iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
+         HYPRE_Int iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
          rap_ane[iAc] = ra[iR] * a_cne_offdp1 * pb[iP1];
 
          iP1 = iP + zOffsetP + yOffsetP;
@@ -4233,7 +4221,7 @@
          rap_as[iAc] = ra[iR] * a_cs_offdp1 * pb[iP1]
             +          ra[iR] * a_as_offdp1
             +                   a_as_offd   * pb[iP1];
- 
+
          iP1 = iP + zOffsetP - yOffsetP - xOffsetP;
          rap_asw[iAc] = ra[iR] * a_csw_offdp1 * pb[iP1];
 
@@ -4250,7 +4238,7 @@
             +                   a_an_offd   * pa[iP1]
             +          rb[iR] * a_an_offdm1
             +          ra[iR] * a_bn_offdp1;
- 
+
          iP1 = iP + yOffsetP - xOffsetP;
          rap_cnw[iAc] =         a_cnw_offd
             +          rb[iR] * a_cnw_offdm1 * pb[iP1]
@@ -4700,10 +4688,7 @@
    HYPRE_Real           *rap_cnw, *rap_cne;
    HYPRE_Real           *rap_asw, *rap_ase, *rap_anw, *rap_ane;
 
-   HYPRE_Int             iA, iAm1, iAp1, iA_offd, iA_offdm1, iA_offdp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
+   HYPRE_Int             iA_offd, iA_offdm1, iA_offdp1;
                  
    HYPRE_Int             zOffsetA;
    HYPRE_Int             zOffsetA_diag; 
@@ -5021,20 +5006,21 @@
 
    if ( constant_coefficient_A == 0 )
    {
+
       hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                           P_dbox, cstart, stridec, iP,
                           R_dbox, cstart, stridec, iR,
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - zOffsetA;
-         iAp1 = iA + zOffsetA;
+         HYPRE_Int iAm1 = iA - zOffsetA;
+         HYPRE_Int iAp1 = iA + zOffsetA;
 
-         iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
+         HYPRE_Int iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
          rap_ane[iAc] = ra[iR] * a_cne[iAp1] * pb[iP1]
             +           ra[iR] * a_ane[iAp1]
             +                    a_ane[iA]   * pb[iP1];
@@ -5063,7 +5049,7 @@
          rap_aw[iAc] = ra[iR] * a_cw[iAp1] * pb[iP1]
             +          ra[iR] * a_aw[iAp1]
             +                   a_aw[iA]   * pb[iP1];
- 
+
          iP1 = iP + zOffsetP - yOffsetP + xOffsetP;
          rap_ase[iAc] = ra[iR] * a_cse[iAp1] * pb[iP1]
             +           ra[iR] * a_ase[iAp1]
@@ -5088,7 +5074,7 @@
             +                   a_ane[iA]   * pa[iP1]
             +          rb[iR] * a_ane[iAm1]
             +          ra[iR] * a_bne[iAp1];
-
+         
          iP1 = iP + yOffsetP;
          rap_cn[iAc] =          a_cn[iA]
             +          rb[iR] * a_cn[iAm1] * pb[iP1]
@@ -5177,14 +5163,14 @@
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - zOffsetA_diag;
-         iAp1 = iA + zOffsetA_diag;
+         //HYPRE_Int iAm1 = iA - zOffsetA_diag;
+         HYPRE_Int iAp1 = iA + zOffsetA_diag;
 
-         iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
+         HYPRE_Int iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
          rap_ane[iAc] = ra[iR] * a_cne_offdp1 * pb[iP1]
             +           ra[iR] * a_ane_offdp1
             +                    a_ane_offd   * pb[iP1];
@@ -5208,12 +5194,12 @@
          rap_ac[iAc] =          a_ac_offd   * pb[iP1]
             +          ra[iR] * a_cc[iAp1] * pb[iP1]
             +          ra[iR] * a_ac_offdp1;
- 
+
          iP1 = iP + zOffsetP - xOffsetP;
          rap_aw[iAc] = ra[iR] * a_cw_offdp1 * pb[iP1]
             +          ra[iR] * a_aw_offdp1
             +                   a_aw_offd   * pb[iP1];
- 
+
          iP1 = iP + zOffsetP - yOffsetP + xOffsetP;
          rap_ase[iAc] = ra[iR] * a_cse_offdp1 * pb[iP1]
             +           ra[iR] * a_ase_offdp1
@@ -5229,7 +5215,6 @@
             +           ra[iR] * a_asw_offdp1
             +                    a_asw_offd   * pb[iP1];
 
-
          iP1 = iP + yOffsetP + xOffsetP;
          rap_cne[iAc] =         a_cne_offd
             +          rb[iR] * a_cne_offdm1 * pb[iP1]
@@ -5247,7 +5232,7 @@
             +                   a_an_offd   * pa[iP1]
             +          rb[iR] * a_an_offdm1
             +          ra[iR] * a_bn_offdp1;
- 
+
          iP1 = iP + yOffsetP - xOffsetP;
          rap_cnw[iAc] =         a_cnw_offd
             +          rb[iR] * a_cnw_offdm1 * pb[iP1]
diff -Nru hypre-2.11.2/src/struct_ls/pfmg.c hypre-2.13.0/src/struct_ls/pfmg.c
--- hypre-2.11.2/src/struct_ls/pfmg.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/pfmg.c	2017-10-20 17:42:22.000000000 +0000
@@ -57,7 +57,7 @@
 HYPRE_Int
 hypre_PFMGDestroy( void *pfmg_vdata )
 {
-	hypre_PFMGData *pfmg_data = (hypre_PFMGData *)pfmg_vdata;
+   hypre_PFMGData *pfmg_data = (hypre_PFMGData *)pfmg_vdata;
 
    HYPRE_Int l;
    
@@ -73,6 +73,9 @@
 
       if ((pfmg_data -> num_levels) > -1)
       {
+         HYPRE_Int constant_coefficient =
+            hypre_StructMatrixConstantCoefficient(pfmg_data -> A_l[0]);
+
          for (l = 0; l < (pfmg_data -> num_levels); l++)
          {
             if (pfmg_data -> active_l[l])
@@ -106,7 +109,11 @@
             hypre_StructVectorDestroy(pfmg_data -> x_l[l+1]);
             hypre_StructVectorDestroy(pfmg_data -> tx_l[l+1]);
          }
-         hypre_SharedTFree(pfmg_data -> data);
+         if (constant_coefficient == 0)
+	   {hypre_DeviceTFree(pfmg_data -> data);}
+         else
+	   {hypre_UMTFree(pfmg_data -> data);}
+      
          hypre_TFree(pfmg_data -> cdir_l);
          hypre_TFree(pfmg_data -> active_l);
          hypre_TFree(pfmg_data -> grid_l);
diff -Nru hypre-2.11.2/src/struct_ls/pfmg_setup.c hypre-2.13.0/src/struct_ls/pfmg_setup.c
--- hypre-2.11.2/src/struct_ls/pfmg_setup.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/pfmg_setup.c	2017-10-20 17:42:22.000000000 +0000
@@ -17,19 +17,19 @@
 
 #define hypre_PFMGSetCIndex(cdir, cindex)       \
    {                                            \
-      hypre_SetIndex3(cindex, 0, 0, 0);          \
+      hypre_SetIndex3(cindex, 0, 0, 0);         \
       hypre_IndexD(cindex, cdir) = 0;           \
    }
 
 #define hypre_PFMGSetFIndex(cdir, findex)       \
    {                                            \
-      hypre_SetIndex3(findex, 0, 0, 0);          \
+      hypre_SetIndex3(findex, 0, 0, 0);         \
       hypre_IndexD(findex, cdir) = 1;           \
    }
 
 #define hypre_PFMGSetStride(cdir, stride)       \
    {                                            \
-      hypre_SetIndex3(stride, 1, 1, 1);          \
+      hypre_SetIndex3(stride, 1, 1, 1);         \
       hypre_IndexD(stride, cdir) = 2;           \
    }
 
@@ -42,7 +42,7 @@
                  hypre_StructVector *b,
                  hypre_StructVector *x        )
 {
-	hypre_PFMGData       *pfmg_data = (hypre_PFMGData       *)pfmg_vdata;
+   hypre_PFMGData       *pfmg_data = (hypre_PFMGData       *)pfmg_vdata;
 
    MPI_Comm              comm = (pfmg_data -> comm);
                      
@@ -104,6 +104,8 @@
    HYPRE_Int             b_num_ghost[]  = {0, 0, 0, 0, 0, 0};
    HYPRE_Int             x_num_ghost[]  = {1, 1, 1, 1, 1, 1};
 
+   HYPRE_Int              constant_coefficient;
+   
 #if DEBUG
    char                  filename[255];
 #endif
@@ -116,7 +118,8 @@
 
    grid  = hypre_StructMatrixGrid(A);
    ndim  = hypre_StructGridNDim(grid);
-
+   constant_coefficient = hypre_StructMatrixConstantCoefficient(A);
+   
    /* Compute a new max_levels value based on the grid */
    cbox = hypre_BoxDuplicate(hypre_StructGridBoundingBox(grid));
    max_levels = 1;
@@ -377,7 +380,12 @@
       hypre_StructVectorInitializeShell(tx_l[l+1]);
    }
 
-   data = hypre_SharedCTAlloc(HYPRE_Real, data_size);
+   //data = hypre_DeviceCTAlloc(HYPRE_Real,data_size);
+   if (constant_coefficient == 0)
+      data = hypre_DeviceCTAlloc(HYPRE_Real,data_size);
+   else
+      data = hypre_UMCTAlloc(HYPRE_Real,data_size);
+   
    (pfmg_data -> data) = data;
 
    hypre_StructVectorInitializeData(tx_l[0], data);
@@ -730,6 +738,244 @@
       /* constant_coefficient==0, all coefficients vary with space */
       else
       {
+#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS) || defined(HYPRE_USE_CUDA)
+        /*FIXME: need reduction for more variables*/
+	HYPRE_Int tmp = 0;
+	hypre_MatrixIndexMove(A, stencil_size, i, tmp, 3);
+#ifdef HYPRE_BOX_PRIVATE_VAR
+#undef HYPRE_BOX_PRIVATE_VAR
+#endif
+#define HYPRE_BOX_PRIVATE_VAR Ai,si,Ap,diag,Astenc,tcx
+#ifdef HYPRE_BOX_REDUCTION
+#undef HYPRE_BOX_REDUCTION
+#endif
+#define HYPRE_BOX_REDUCTION reduction(+:cx)
+         hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size,
+                                         A_dbox, start, stride, Ai,cx);
+         {
+            HYPRE_Int tcx = 0.0;
+            HYPRE_Complex *Ap;
+            HYPRE_Int Astenc,si;
+            HYPRE_Real diag;
+            
+            /* get sign of diagonal */
+            Ap = hypre_StructGetMatrixBoxData(A, i, sdiag);
+            //Ap = (data_A + indices_d[sdiag]);
+            
+            diag = 1.0;
+            if (Ap[Ai] < 0)
+            {
+               diag = -1.0;
+            }
+
+            for (si = 0; si < stencil_size; si++)
+            {
+               Ap = hypre_StructGetMatrixBoxData(A, i, si);
+               /* x-direction */
+               Astenc = hypre_StructGetIndexD(stencil_shape[si], 0,stencil_shape_d[si]);
+               if (Astenc)
+               {
+                  tcx -= Ap[Ai]*diag;
+               }
+            }
+
+            cx += tcx;
+         }
+         hypre_newBoxLoop1ReductionEnd(Ai,cx);
+#ifdef HYPRE_BOX_PRIVATE_VAR
+#undef HYPRE_BOX_PRIVATE_VAR
+#endif
+#define HYPRE_BOX_PRIVATE_VAR Ai,si,Ap,diag,Astenc,tcx
+#ifdef HYPRE_BOX_REDUCTION
+#undef HYPRE_BOX_REDUCTION
+#endif
+#define HYPRE_BOX_REDUCTION reduction(+:sqcx)
+         hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size,
+                                         A_dbox, start, stride, Ai,sqcx);
+         {
+            HYPRE_Int tcx = 0.0;
+            HYPRE_Real *Ap;
+            HYPRE_Int Astenc,si;
+            HYPRE_Real diag;
+            
+            /* get sign of diagonal */
+            Ap = hypre_StructGetMatrixBoxData(A, i, sdiag);
+            diag = 1.0;
+            if (Ap[Ai] < 0)
+            {
+               diag = -1.0;
+            }
+
+            for (si = 0; si < stencil_size; si++)
+            {
+               Ap = hypre_StructGetMatrixBoxData(A, i, si);
+
+               /* x-direction */
+               Astenc = hypre_StructGetIndexD(stencil_shape[si], 0,stencil_shape_d[si]);
+               if (Astenc)
+               {
+                  tcx -= Ap[Ai]*diag;
+               }
+            }       
+            sqcx += (tcx*tcx);
+         }
+         hypre_newBoxLoop1ReductionEnd(Ai,sqcx);
+#ifdef HYPRE_BOX_PRIVATE_VAR
+#undef HYPRE_BOX_PRIVATE_VAR
+#endif
+#define HYPRE_BOX_PRIVATE_VAR Ai,si,Ap,diag,Astenc,tcy
+#ifdef HYPRE_BOX_REDUCTION
+#undef HYPRE_BOX_REDUCTION
+#endif
+#define HYPRE_BOX_REDUCTION reduction(+:cy)
+         hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size,
+                                         A_dbox, start, stride, Ai,cy);
+         {
+            HYPRE_Int tcy = 0.0;
+            HYPRE_Real *Ap;
+            HYPRE_Int Astenc,si;
+            HYPRE_Real diag;
+            
+            /* get sign of diagonal */
+            Ap = hypre_StructGetMatrixBoxData(A, i, sdiag);
+            diag = 1.0;
+            if (Ap[Ai] < 0)
+            {
+               diag = -1.0;
+            }
+
+            for (si = 0; si < stencil_size; si++)
+            {
+               Ap = hypre_StructGetMatrixBoxData(A, i, si);
+              
+               /* y-direction */
+               Astenc = hypre_StructGetIndexD(stencil_shape[si], 1,stencil_shape_d[stencil_size+si]);
+               if (Astenc)
+               {
+                  tcy -= Ap[Ai]*diag;
+               }
+            }
+
+            cy += tcy;            
+         }
+         hypre_newBoxLoop1ReductionEnd(Ai,cy);
+#ifdef HYPRE_BOX_PRIVATE_VAR
+#undef HYPRE_BOX_PRIVATE_VAR
+#endif
+#define HYPRE_BOX_PRIVATE_VAR Ai,si,Ap,diag,Astenc,tcy
+#ifdef HYPRE_BOX_REDUCTION
+#undef HYPRE_BOX_REDUCTION
+#endif
+#define HYPRE_BOX_REDUCTION reduction(+:sqcy)
+         hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size,
+                                         A_dbox, start, stride, Ai,sqcy);
+         {
+            HYPRE_Int tcy = 0.0;
+            HYPRE_Real *Ap;
+            HYPRE_Int Astenc,si;
+            HYPRE_Real diag;
+            
+            /* get sign of diagonal */
+            Ap = hypre_StructGetMatrixBoxData(A, i, sdiag);
+            diag = 1.0;
+            if (Ap[Ai] < 0)
+            {
+               diag = -1.0;
+            }
+
+            for (si = 0; si < stencil_size; si++)
+            {
+               Ap = hypre_StructGetMatrixBoxData(A, i, si);
+              
+               /* y-direction */
+               Astenc = hypre_StructGetIndexD(stencil_shape[si], 1,stencil_shape_d[stencil_size+si]);
+               if (Astenc)
+               {
+                  tcy -= Ap[Ai]*diag;
+               }
+            }       
+            sqcy += (tcy*tcy);
+         }
+         hypre_newBoxLoop1ReductionEnd(Ai,sqcy);
+#ifdef HYPRE_BOX_PRIVATE_VAR
+#undef HYPRE_BOX_PRIVATE_VAR
+#endif
+#define HYPRE_BOX_PRIVATE_VAR Ai,si,Ap,diag,Astenc,tcz
+#ifdef HYPRE_BOX_REDUCTION
+#undef HYPRE_BOX_REDUCTION
+#endif
+#define HYPRE_BOX_REDUCTION reduction(+:cz)
+         hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size,
+                                         A_dbox, start, stride, Ai,cz);
+         {
+            HYPRE_Int tcz = 0.0;
+            HYPRE_Real *Ap;
+            HYPRE_Int Astenc,si;
+            HYPRE_Real diag;
+            
+            /* get sign of diagonal */
+            Ap = hypre_StructGetMatrixBoxData(A, i, sdiag);
+            diag = 1.0;
+            if (Ap[Ai] < 0)
+            {
+               diag = -1.0;
+            }
+
+            for (si = 0; si < stencil_size; si++)
+            {
+               Ap = hypre_StructGetMatrixBoxData(A, i, si);
+              
+               /* z-direction */
+               Astenc = hypre_StructGetIndexD(stencil_shape[si], 2,stencil_shape_d[2*stencil_size+si]);
+               if (Astenc)
+               {
+                  tcz -= Ap[Ai]*diag;
+               }
+            }
+
+            cz += tcz;            
+         }
+         hypre_newBoxLoop1ReductionEnd(Ai,cz);
+#ifdef HYPRE_BOX_PRIVATE_VAR
+#undef HYPRE_BOX_PRIVATE_VAR
+#endif
+#define HYPRE_BOX_PRIVATE_VAR Ai,si,Ap,diag,Astenc,tcz
+#ifdef HYPRE_BOX_REDUCTION
+#undef HYPRE_BOX_REDUCTION
+#endif
+#define HYPRE_BOX_REDUCTION reduction(+:sqcz)
+         hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size,
+                                         A_dbox, start, stride, Ai,sqcz);
+         {
+            HYPRE_Int tcz = 0.0;
+            HYPRE_Real *Ap;
+            HYPRE_Int Astenc,si;
+            HYPRE_Real diag;
+            
+            /* get sign of diagonal */
+            Ap = hypre_StructGetMatrixBoxData(A, i, sdiag);
+            diag = 1.0;
+            if (Ap[Ai] < 0)
+            {
+               diag = -1.0;
+            }
+
+            for (si = 0; si < stencil_size; si++)
+            {
+               Ap = hypre_StructGetMatrixBoxData(A, i, si);
+
+               /* z-direction */
+               Astenc = hypre_StructGetIndexD(stencil_shape[si], 2,stencil_shape_d[2*stencil_size+si]);
+               if (Astenc)
+               {
+                  tcz -= Ap[Ai]*diag;
+               }
+            }       
+            sqcz += (tcz*tcz);
+         }
+         hypre_newBoxLoop1ReductionEnd(Ai,sqcz);
+         hypre_StructCleanIndexD();
+#else
          hypre_BoxLoop1Begin(hypre_StructMatrixNDim(A), loop_size,
                              A_dbox, start, stride, Ai);
 #ifdef HYPRE_USING_OPENMP
@@ -737,6 +983,11 @@
 #endif
          hypre_BoxLoop1For(Ai)
          {
+            HYPRE_Int tcx,tcy,tcz;
+            HYPRE_Real *Ap;
+            HYPRE_Int Astenc,si;
+            HYPRE_Real diag;
+            
             tcx = 0.0;
             tcy = 0.0;
             tcz = 0.0;
@@ -784,9 +1035,10 @@
             sqcz += (tcz*tcz);
          }
          hypre_BoxLoop1End(Ai);
+#endif
       }
    }
-
+   
    cxyz[0] = cx;
    cxyz[1] = cy;
    cxyz[2] = cz;
@@ -850,7 +1102,7 @@
       }
       else
       {
-         dxyz[d] = 1.0e+123;
+         dxyz[d] = HYPRE_REAL_MAX/1000;
       }
    }
 
@@ -910,6 +1162,11 @@
       }
       else
       {
+          /*FIXME: need reduction for multiplication*/
+#if defined(HYPRE_USE_CUDA) || defined(HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS)
+	hypre_newBoxLoop1ReductionMult(hypre_StructMatrixNDim(A), loop_size,
+				       A_dbox, start, stride, Ai,Ap,diag_product);
+#else
          hypre_BoxLoop1Begin(hypre_StructMatrixNDim(A), loop_size,
                              A_dbox, start, stride, Ai);
 #ifdef HYPRE_USING_OPENMP
@@ -920,6 +1177,7 @@
             diag_product *= Ap[Ai];
          }
          hypre_BoxLoop1End(Ai);
+#endif
       }
    }
 
diff -Nru hypre-2.11.2/src/struct_ls/pfmg_setup_interp.c hypre-2.13.0/src/struct_ls/pfmg_setup_interp.c
--- hypre-2.11.2/src/struct_ls/pfmg_setup_interp.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/pfmg_setup_interp.c	2017-10-20 17:42:22.000000000 +0000
@@ -239,25 +239,25 @@
   HYPRE_Int           si0,
   HYPRE_Int           si1 )
 {
-   HYPRE_Int              si;
-   HYPRE_Int              Ai, Pi;
-   HYPRE_Real            *Ap;
-   HYPRE_Real             center;
-   HYPRE_Int              Astenc;
-   HYPRE_Int              mrk0, mrk1;
    hypre_StructStencil   *stencil = hypre_StructMatrixStencil(A);
    hypre_Index           *stencil_shape = hypre_StructStencilShape(stencil);
    HYPRE_Int              stencil_size = hypre_StructStencilSize(stencil);
    HYPRE_Int              warning_cnt= 0;
 
+   hypre_MatrixIndexMove(A, stencil_size, i, cdir,1);
+   
    hypre_BoxLoop2Begin(hypre_StructMatrixNDim(A), loop_size,
                        A_dbox, start, stride, Ai,
                        P_dbox, startc, stridec, Pi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,Pi,si,center,Ap,Astenc,mrk0,mrk1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
    hypre_BoxLoop2For(Ai, Pi)
    {
+      HYPRE_Int si,mrk0,mrk1,Astenc;
+      HYPRE_Real center;
+      HYPRE_Real *Ap;
+           
       center  = 0.0;
       Pp0[Pi] = 0.0;
       Pp1[Pi] = 0.0;
@@ -266,9 +266,10 @@
 
       for (si = 0; si < stencil_size; si++)
       {
-         Ap = hypre_StructMatrixBoxData(A, i, si);
-         Astenc = hypre_IndexD(stencil_shape[si], cdir);
-
+         Ap = hypre_StructGetMatrixBoxData(A, i, si);
+        
+         Astenc = hypre_StructGetIndexD(stencil_shape[si], cdir,stencil_shape_d[si]);
+        
          if (Astenc == 0)
          {
             center += Ap[Ai];
@@ -281,7 +282,7 @@
          {
             Pp1[Pi] -= Ap[Ai];
          }
-
+         
          if (si == si0 && Ap[Ai] == 0.0)
             mrk0++;
          if (si == si1 && Ap[Ai] == 0.0)
@@ -290,7 +291,7 @@
 
       if (!center)
       {
-         warning_cnt++;
+         //warning_cnt++;
          Pp0[Pi] = 0.0;
          Pp1[Pi] = 0.0;  
       }
@@ -449,9 +450,9 @@
    HYPRE_Int              Pi;
    HYPRE_Real            *Ap;
    HYPRE_Real             P0, P1;
-   HYPRE_Real             center, center_offd;
+   HYPRE_Real             center_offd;
    HYPRE_Int              Astenc;
-   HYPRE_Int              mrk0, mrk1, mrk0_offd, mrk1_offd;
+   HYPRE_Int              mrk0_offd, mrk1_offd;
    hypre_StructStencil   *stencil = hypre_StructMatrixStencil(A);
    hypre_Index           *stencil_shape = hypre_StructStencilShape(stencil);
    HYPRE_Int              stencil_size = hypre_StructStencilSize(stencil);
@@ -512,23 +513,34 @@
       }
 
       si = diag_rank;
+      
+      hypre_MatrixIndexMove(A, stencil_size, i, si, 1);
       hypre_BoxLoop2Begin(hypre_StructMatrixNDim(A), loop_size,
                           A_dbox, start, stride, Ai,
                           P_dbox, startc, stridec, Pi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,Pi,center,Ap,Astenc,mrk0,mrk1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(Ai, Pi)
       {
-         Pp0[Pi] = P0;
-         Pp1[Pi] = P1;
+         HYPRE_Int   mrk0,mrk1;
+         HYPRE_Real  center;
+         HYPRE_Real *Ap;
+         HYPRE_Real p0val,p1val;
+         
+         //Pp0[Pi] = P0;
+         //Pp1[Pi] = P1;
+         p0val = P0;
+         p1val = P1;
+         
          center = center_offd;
          mrk0 = mrk0_offd;
          mrk1 = mrk1_offd;
+         
+         Ap = hypre_StructGetMatrixBoxData(A, i, si);
+         //Astenc = hypre_IndexD(stencil_shape[si], cdir);
+         //hypre_assert( Astenc==0 );
 
-         Ap = hypre_StructMatrixBoxData(A, i, si);
-         Astenc = hypre_IndexD(stencil_shape[si], cdir);
-         hypre_assert( Astenc==0 );
          center += Ap[Ai];
 
          if (si == si0 && Ap[Ai] == 0.0)
@@ -538,14 +550,18 @@
 
          if (!center)
          {
-            warning_cnt++;
-            Pp0[Pi] = 0.0;
-            Pp1[Pi] = 0.0;  
+            //warning_cnt++;
+            //Pp0[Pi] = 0.0;
+            //Pp1[Pi] = 0.0;
+            p0val = 0;
+            p1val = 0;
          }
          else
          {
-            Pp0[Pi] /= center;
-            Pp1[Pi] /= center;  
+            //Pp0[Pi] /= center;
+            //Pp1[Pi] /= center;
+            p0val /= center;
+            p1val /= center;
          }
 
          /*----------------------------------------------
@@ -554,13 +570,17 @@
           * interpolation and operator stencils reaching
           * outside domain.
           *----------------------------------------------*/
+
          if (mrk0 != 0)
             Pp0[Pi] = 0.0;
          if (mrk1 != 0)
             Pp1[Pi] = 0.0;
+         Pp0[Pi] = p0val;
+         Pp1[Pi] = p1val;
 
       }
       hypre_BoxLoop2End(Ai, Pi);
+      //hypre_StructCleanIndexD();
    }
 
    if (warning_cnt)
diff -Nru hypre-2.11.2/src/struct_ls/pfmg_setup_rap5.c hypre-2.13.0/src/struct_ls/pfmg_setup_rap5.c
--- hypre-2.11.2/src/struct_ls/pfmg_setup_rap5.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/pfmg_setup_rap5.c	2017-10-20 17:42:22.000000000 +0000
@@ -194,12 +194,8 @@
 
    HYPRE_Real           *rap_cc, *rap_cw, *rap_ce;
    HYPRE_Real           *rap_cb, *rap_ca;
-   HYPRE_Real            west, east;
-   HYPRE_Real            center_int, center_bdy;
 
-   HYPRE_Int             iA, iAm1, iAp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iPm1, iPp1;
+   HYPRE_Real            center_int, center_bdy;
                       
    HYPRE_Int             OffsetA; 
    HYPRE_Int             OffsetP; 
@@ -348,10 +344,13 @@
                              A_dbox, fstart, stridef, iA,
                              RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iA,iAc,iAm1,iAp1,iPm1,iPp1,west,east) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop3For(iP, iA, iAc)
          {
+            HYPRE_Int iAm1,iAp1,iPm1,iPp1;
+            HYPRE_Real  west, east;
+             
             iAm1 = iA - OffsetA;
             iAp1 = iA + OffsetA;
 
@@ -407,7 +406,7 @@
                              A_dbox, fstart, stridef, iA,
                              RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop2For(iA, iAc)
          {
@@ -442,7 +441,7 @@
                                 A_dbox, bfstart, stridef, iA,
                                 RAP_dbox, bcstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(iA, iAc)
             {
diff -Nru hypre-2.11.2/src/struct_ls/pfmg_setup_rap7.c hypre-2.13.0/src/struct_ls/pfmg_setup_rap7.c
--- hypre-2.11.2/src/struct_ls/pfmg_setup_rap7.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/pfmg_setup_rap7.c	2017-10-20 17:42:22.000000000 +0000
@@ -201,12 +201,7 @@
 
    HYPRE_Real           *rap_cc, *rap_cw, *rap_ce, *rap_cs, *rap_cn;
    HYPRE_Real           *rap_cb, *rap_ca;
-   HYPRE_Real            west, east, south, north;
    HYPRE_Real            center_int, center_bdy;
-
-   HYPRE_Int             iA, iAm1, iAp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iPm1, iPp1;
                       
    HYPRE_Int             OffsetA; 
    HYPRE_Int             OffsetP; 
@@ -373,10 +368,13 @@
                              A_dbox, fstart, stridef, iA,
                              RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iA,iAc,iAm1,iAp1,iPm1,iPp1,west,east,south,north) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop3For(iP, iA, iAc)
          {
+            HYPRE_Int iAm1,iAp1,iPm1,iPp1;
+            HYPRE_Real west,east,south,north;
+             
             iAm1 = iA - OffsetA;
             iAp1 = iA + OffsetA;
 
diff -Nru hypre-2.11.2/src/struct_ls/pfmg_solve.c hypre-2.13.0/src/struct_ls/pfmg_solve.c
--- hypre-2.11.2/src/struct_ls/pfmg_solve.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/pfmg_solve.c	2017-10-20 17:42:22.000000000 +0000
@@ -64,7 +64,7 @@
    HYPRE_Int            *active_l        = (pfmg_data -> active_l);
 
    HYPRE_Real            b_dot_b = 0, r_dot_r, eps = 0;
-   HYPRE_Real            e_dot_e, x_dot_x;
+   HYPRE_Real            e_dot_e = 0.0, x_dot_x = 1.0;
                     
    HYPRE_Int             i, l;
    HYPRE_Int             constant_coefficient;
diff -Nru hypre-2.11.2/src/struct_ls/point_relax.c hypre-2.13.0/src/struct_ls/point_relax.c
--- hypre-2.11.2/src/struct_ls/point_relax.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/point_relax.c	2017-10-20 17:42:22.000000000 +0000
@@ -102,7 +102,7 @@
 HYPRE_Int
 hypre_PointRelaxDestroy( void *relax_vdata )
 {
-	hypre_PointRelaxData *relax_data = (hypre_PointRelaxData *)relax_vdata;
+   hypre_PointRelaxData *relax_data = (hypre_PointRelaxData *)relax_vdata;
    HYPRE_Int             i;
 
    if (relax_data)
@@ -350,9 +350,6 @@
    void                  *matvec_data = NULL;
 
    HYPRE_Int              Ai;
-   HYPRE_Int              bi;
-   HYPRE_Int              xi;
-   HYPRE_Int              ti;
                         
    hypre_IndexRef         stride;
    hypre_IndexRef         start;
@@ -450,7 +447,7 @@
                hypre_BoxArrayBox(hypre_StructVectorDataSpace(b), i);
             x_data_box =
                hypre_BoxArrayBox(hypre_StructVectorDataSpace(x), i);
-
+             
             Ap = hypre_StructMatrixBoxData(A, i, diag_rank);
             bp = hypre_StructVectorBoxData(b, i);
             xp = hypre_StructVectorBoxData(x, i);
@@ -471,7 +468,7 @@
                                       b_data_box, start, stride, bi,
                                       x_data_box, start, stride, xi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,bi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(bi, xi)
                   {
@@ -488,7 +485,7 @@
                                       b_data_box, start, stride, bi,
                                       x_data_box, start, stride, xi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,bi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop3For(Ai, bi, xi)
                   {
@@ -608,7 +605,7 @@
                                       A_data_box, start, stride, Ai,
                                       t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(Ai, ti)
                   {
@@ -711,10 +708,6 @@
    hypre_IndexRef         start;
    hypre_Index            loop_size;
    HYPRE_Int              si, sk, ssi[MAX_DEPTH], depth, k;
-   HYPRE_Int              Ai;
-   HYPRE_Int              bi;
-   HYPRE_Int              xi;
-   HYPRE_Int              ti;
 
    stencil       = hypre_StructMatrixStencil(A);
    stencil_shape = hypre_StructStencilShape(stencil);
@@ -726,7 +719,7 @@
                        b_data_box, start, stride, bi,
                        t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,bi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
    hypre_BoxLoop2For(bi, ti)
    {
@@ -802,7 +795,7 @@
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop3For(Ai, xi, ti)
             {
@@ -824,7 +817,7 @@
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop3For(Ai, xi, ti)
             {
@@ -845,7 +838,7 @@
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop3For(Ai, xi, ti)
             {
@@ -865,7 +858,7 @@
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop3For(Ai, xi, ti)
             {
@@ -884,7 +877,7 @@
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop3For(Ai, xi, ti)
             {
@@ -902,7 +895,7 @@
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop3For(Ai, xi, ti)
             {
@@ -919,7 +912,7 @@
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop3For(Ai, xi, ti)
             {
@@ -991,9 +984,6 @@
    hypre_Index            loop_size;
    HYPRE_Int              si, sk, ssi[MAX_DEPTH], depth, k;
    HYPRE_Int              Ai;
-   HYPRE_Int              bi;
-   HYPRE_Int              xi;
-   HYPRE_Int              ti;
 
    stencil       = hypre_StructMatrixStencil(A);
    stencil_shape = hypre_StructStencilShape(stencil);
@@ -1018,7 +1008,7 @@
                           b_data_box, start, stride, bi,
                           t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,bi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(bi, ti)
       {
@@ -1033,7 +1023,7 @@
                           b_data_box, start, stride, bi,
                           t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,bi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(bi, ti)
       {
@@ -1117,7 +1107,7 @@
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(xi, ti)
             {
@@ -1144,7 +1134,7 @@
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(xi, ti)
             {
@@ -1169,7 +1159,7 @@
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(xi, ti)
             {
@@ -1192,7 +1182,7 @@
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(xi, ti)
             {
@@ -1213,7 +1203,7 @@
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(xi, ti)
             {
@@ -1232,7 +1222,7 @@
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(xi, ti)
             {
@@ -1249,7 +1239,7 @@
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(xi, ti)
             {
@@ -1514,7 +1504,7 @@
 
    HYPRE_Real weightc = 1 - weight;
    HYPRE_Real *xp, *tp;
-   HYPRE_Int compute_i, i, j, xi, ti;
+   HYPRE_Int compute_i, i, j;
 
    hypre_BoxArrayArray   *compute_box_aa;
    hypre_BoxArray        *compute_box_a;
@@ -1565,7 +1555,7 @@
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(xi, ti)
             {
@@ -1598,7 +1588,7 @@
    hypre_Index            loop_size;
 
    HYPRE_Real *xp, *tp;
-   HYPRE_Int compute_i, i, j, xi, ti;
+   HYPRE_Int compute_i, i, j;
 
    hypre_BoxArrayArray   *compute_box_aa;
    hypre_BoxArray        *compute_box_a;
@@ -1649,7 +1639,7 @@
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(xi, ti)
             {
diff -Nru hypre-2.11.2/src/struct_ls/red_black_constantcoef_gs.c hypre-2.13.0/src/struct_ls/red_black_constantcoef_gs.c
--- hypre-2.11.2/src/struct_ls/red_black_constantcoef_gs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/red_black_constantcoef_gs.c	2017-10-20 17:42:22.000000000 +0000
@@ -26,7 +26,7 @@
                               hypre_StructVector *b,
                               hypre_StructVector *x )
 {
-	hypre_RedBlackGSData  *relax_data = (hypre_RedBlackGSData  *)relax_vdata;
+   hypre_RedBlackGSData  *relax_data = (hypre_RedBlackGSData  *)relax_vdata;
 
    HYPRE_Int              max_iter    = (relax_data -> max_iter);
    HYPRE_Int              zero_guess  = (relax_data -> zero_guess);
@@ -46,8 +46,8 @@
    hypre_Box             *x_dbox;
                         
    HYPRE_Int              Ai, Astart, Ani, Anj;
-   HYPRE_Int              bi, bstart, bni, bnj;
-   HYPRE_Int              xi, xstart, xni, xnj;
+   HYPRE_Int              bstart, bni, bnj;
+   HYPRE_Int              xstart, xni, xnj;
    HYPRE_Int              xoff0, xoff1, xoff2, xoff3, xoff4, xoff5;
                         
    HYPRE_Real            *Ap;
@@ -68,7 +68,7 @@
    HYPRE_Int              offd[6];
                         
    HYPRE_Int              iter, rb, redblack, d;
-   HYPRE_Int              compute_i, i, j, ii, jj, kk;
+   HYPRE_Int              compute_i, i, j;
    HYPRE_Int              ni, nj, nk;
 
    /*----------------------------------------------------------
@@ -194,22 +194,17 @@
                   Ai= hypre_CCBoxIndexRank(A_dbox, start);
                   AApd= 1.0/Ap[Ai];
 
+                  hypre_RedBlackLoopInit();
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ii,jj,bi,xi,kk) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-                  for (kk = 0; kk < nk; kk++)
+                  hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack,
+                                                      bstart,bni,bnj,bi,
+                                                      xstart,xni,xnj,xi);
                   {
-                     for (jj = 0; jj < nj; jj++)
-                     {
-                        ii = (kk + jj + redblack) % 2;
-                        bi = bstart + kk*bnj*bni + jj*bni + ii;
-                        xi = xstart + kk*xnj*xni + jj*xni + ii;
-                        for (; ii < ni; ii+=2, bi+=2, xi+=2)
-                        {
-                           xp[xi] = bp[bi]*AApd;
-                        }
-                     }
+                     xp[xi] = bp[bi]*AApd;
                   }
+                  hypre_RedBlackConstantcoefLoopEnd();                  
                }
 
                else      /* variable coefficient diag */
@@ -218,23 +213,18 @@
                   Ani = hypre_BoxSizeX(A_dbox);
                   Anj = hypre_BoxSizeY(A_dbox);
 
+                  hypre_RedBlackLoopInit();
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ii,jj,Ai,bi,xi,kk) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-                  for (kk = 0; kk < nk; kk++)
+                  hypre_RedBlackLoopBegin(ni,nj,nk,redblack,
+                                          Astart,Ani,Anj,Ai,
+                                          bstart,bni,bnj,bi,
+                                          xstart,xni,xnj,xi);
                   {
-                     for (jj = 0; jj < nj; jj++)
-                     {
-                        ii = (kk + jj + redblack) % 2;
-                        Ai = Astart + kk*Anj*Ani + jj*Ani + ii;
-                        bi = bstart + kk*bnj*bni + jj*bni + ii;
-                        xi = xstart + kk*xnj*xni + jj*xni + ii;
-                        for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2)
-                        {
-                           xp[xi] = bp[bi] / Ap[Ai];
-                        }
-                     }
+                     xp[xi] = bp[bi] / Ap[Ai];
                   }
+                  hypre_RedBlackLoopEnd();
                }
 
             }
@@ -358,75 +348,61 @@
                   switch(stencil_size)
                   {
                      case 7:
+                        hypre_RedBlackLoopInit();
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ii,jj,bi,xi,kk) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-                        for (kk = 0; kk < nk; kk++)
+                        hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack,
+                                                            bstart,bni,bnj,bi,
+                                                            xstart,xni,xnj,xi);
                         {
-                           for (jj = 0; jj < nj; jj++)
-                           {
-                              ii = (kk + jj + redblack) % 2;
-                              bi = bstart + kk*bnj*bni + jj*bni + ii;
-                              xi = xstart + kk*xnj*xni + jj*xni + ii;
-                              for (; ii < ni; ii+=2, bi+=2, xi+=2)
-                              {
-                                 xp[xi] =
-                                    (bp[bi] - 
-                                     App0*xp[xi + xoff0] -
-                                     App1*xp[xi + xoff1] -
-                                     App2*xp[xi + xoff2] -
-                                     App3*xp[xi + xoff3] -
-                                     App4*xp[xi + xoff4] -
-                                     App5*xp[xi + xoff5])*AApd;
-                              }
-                           }
+                           xp[xi] =
+                              (bp[bi] - 
+                               App0*xp[xi + xoff0] -
+                               App1*xp[xi + xoff1] -
+                               App2*xp[xi + xoff2] -
+                               App3*xp[xi + xoff3] -
+                               App4*xp[xi + xoff4] -
+                               App5*xp[xi + xoff5])*AApd;
                         }
+                        hypre_RedBlackConstantcoefLoopEnd();
+                        
                         break;
 
                      case 5:
+                        hypre_RedBlackLoopInit();
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ii,jj,bi,xi,kk) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-                        for (kk = 0; kk < nk; kk++)
+                        hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack,
+                                                            bstart,bni,bnj,bi,
+                                                            xstart,xni,xnj,xi);
                         {
-                           for (jj = 0; jj < nj; jj++)
-                           {
-                              ii = (kk + jj + redblack) % 2;
-                              bi = bstart + kk*bnj*bni + jj*bni + ii;
-                              xi = xstart + kk*xnj*xni + jj*xni + ii;
-                              for (; ii < ni; ii+=2, bi+=2, xi+=2)
-                              {
-                                 xp[xi] =
-                                    (bp[bi] -
-                                     App0*xp[xi + xoff0] -
-                                     App1*xp[xi + xoff1] -
-                                     App2*xp[xi + xoff2] -
-                                     App3*xp[xi + xoff3])*AApd;
-                              }
-                           }
+                           xp[xi] =
+                              (bp[bi] -
+                               App0*xp[xi + xoff0] -
+                               App1*xp[xi + xoff1] -
+                               App2*xp[xi + xoff2] -
+                               App3*xp[xi + xoff3])*AApd;
                         }
+                        hypre_RedBlackConstantcoefLoopEnd();
                         break;
 
                      case 3:
+                        hypre_RedBlackLoopInit();
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ii,jj,bi,xi,kk) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-                        for (kk = 0; kk < nk; kk++)
+                        hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack,
+                                                            bstart,bni,bnj,bi,
+                                                            xstart,xni,xnj,xi);
                         {
-                           for (jj = 0; jj < nj; jj++)
-                           {
-                              ii = (kk + jj + redblack) % 2;
-                              bi = bstart + kk*bnj*bni + jj*bni + ii;
-                              xi = xstart + kk*xnj*xni + jj*xni + ii;
-                              for (; ii < ni; ii+=2, bi+=2, xi+=2)
-                              {
-                                 xp[xi] =
-                                    (bp[bi] -
-                                     App0*xp[xi + xoff0] -
-                                     App1*xp[xi + xoff1])*AApd;
-                              }
-                           }
+                           xp[xi] =
+                              (bp[bi] -
+                               App0*xp[xi + xoff0] -
+                               App1*xp[xi + xoff1])*AApd;
                         }
+                        hypre_RedBlackConstantcoefLoopEnd();
                         break;
                   }
 
@@ -441,78 +417,63 @@
                   switch(stencil_size)
                   {
                      case 7:
+                        hypre_RedBlackLoopInit();
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ii,jj,Ai,bi,xi,kk) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-                        for (kk = 0; kk < nk; kk++)
+                        hypre_RedBlackLoopBegin(ni,nj,nk,redblack,
+                                                Astart,Ani,Anj,Ai,
+                                                bstart,bni,bnj,bi,
+                                                xstart,xni,xnj,xi);
                         {
-                           for (jj = 0; jj < nj; jj++)
-                           {
-                              ii = (kk + jj + redblack) % 2;
-                              Ai = Astart + kk*Anj*Ani + jj*Ani + ii;
-                              bi = bstart + kk*bnj*bni + jj*bni + ii;
-                              xi = xstart + kk*xnj*xni + jj*xni + ii;
-                              for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2)
-                              {
-                                 xp[xi] =
-                                    (bp[bi] - 
-                                     App0*xp[xi + xoff0] -
-                                     App1*xp[xi + xoff1] -
-                                     App2*xp[xi + xoff2] -
-                                     App3*xp[xi + xoff3] -
-                                     App4*xp[xi + xoff4] -
-                                     App5*xp[xi + xoff5]) / Ap[Ai];
-                              }
-                           }
+                           xp[xi] =
+                              (bp[bi] - 
+                               App0*xp[xi + xoff0] -
+                               App1*xp[xi + xoff1] -
+                               App2*xp[xi + xoff2] -
+                               App3*xp[xi + xoff3] -
+                               App4*xp[xi + xoff4] -
+                               App5*xp[xi + xoff5]) / Ap[Ai];
                         }
+                        hypre_RedBlackLoopEnd();
                         break;
 
                      case 5:
+                        hypre_RedBlackLoopInit();
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ii,jj,Ai,bi,xi,kk) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-                        for (kk = 0; kk < nk; kk++)
+                        hypre_RedBlackLoopBegin(ni,nj,nk,redblack,
+                                                Astart,Ani,Anj,Ai,
+                                                bstart,bni,bnj,bi,
+                                                xstart,xni,xnj,xi);
                         {
-                           for (jj = 0; jj < nj; jj++)
-                           {
-                              ii = (kk + jj + redblack) % 2;
-                              Ai = Astart + kk*Anj*Ani + jj*Ani + ii;
-                              bi = bstart + kk*bnj*bni + jj*bni + ii;
-                              xi = xstart + kk*xnj*xni + jj*xni + ii;
-                              for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2)
-                              {
-                                 xp[xi] =
-                                    (bp[bi] - 
-                                     App0*xp[xi + xoff0] -
-                                     App1*xp[xi + xoff1] -
-                                     App2*xp[xi + xoff2] -
-                                     App3*xp[xi + xoff3]) / Ap[Ai]; 
-                              }
-                           }
+                           xp[xi] =
+                              (bp[bi] - 
+                               App0*xp[xi + xoff0] -
+                               App1*xp[xi + xoff1] -
+                               App2*xp[xi + xoff2] -
+                               App3*xp[xi + xoff3]) / Ap[Ai]; 
                         }
+                        hypre_RedBlackLoopEnd();
                         break;
 
                      case 3:
+                        hypre_RedBlackLoopInit();
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ii,jj,Ai,bi,xi,kk) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-                        for (kk = 0; kk < nk; kk++)
+                        hypre_RedBlackLoopBegin(ni,nj,nk,redblack,
+                                                Astart,Ani,Anj,Ai,
+                                                bstart,bni,bnj,bi,
+                                                xstart,xni,xnj,xi);
                         {
-                           for (jj = 0; jj < nj; jj++)
-                           {
-                              ii = (kk + jj + redblack) % 2;
-                              Ai = Astart + kk*Anj*Ani + jj*Ani + ii;
-                              bi = bstart + kk*bnj*bni + jj*bni + ii;
-                              xi = xstart + kk*xnj*xni + jj*xni + ii;
-                              for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2)
-                              {
-                                 xp[xi] =
-                                    (bp[bi] -
-                                     App0*xp[xi + xoff0] -
-                                     App1*xp[xi + xoff1]) / Ap[Ai]; 
-                              }
-                           }
+                           xp[xi] =
+                              (bp[bi] -
+                               App0*xp[xi + xoff0] -
+                               App1*xp[xi + xoff1]) / Ap[Ai]; 
                         }
+                        hypre_RedBlackLoopEnd();
                         break;
 
                   }  /* switch(stencil_size) */
diff -Nru hypre-2.11.2/src/struct_ls/red_black_gs.c hypre-2.13.0/src/struct_ls/red_black_gs.c
--- hypre-2.11.2/src/struct_ls/red_black_gs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/red_black_gs.c	2017-10-20 17:42:22.000000000 +0000
@@ -53,7 +53,7 @@
 HYPRE_Int
 hypre_RedBlackGSDestroy( void *relax_vdata )
 {
-	hypre_RedBlackGSData *relax_data = (hypre_RedBlackGSData *)relax_vdata;
+   hypre_RedBlackGSData *relax_data = (hypre_RedBlackGSData *)relax_vdata;
 
    if (relax_data)
    {
@@ -147,9 +147,9 @@
    hypre_Box             *b_dbox;
    hypre_Box             *x_dbox;
                         
-   HYPRE_Int              Ai, Astart, Ani, Anj;
-   HYPRE_Int              bi, bstart, bni, bnj;
-   HYPRE_Int              xi, xstart, xni, xnj;
+   HYPRE_Int              Astart, Ani, Anj;
+   HYPRE_Int              bstart, bni, bnj;
+   HYPRE_Int              xstart, xni, xnj;
    HYPRE_Int              xoff0, xoff1, xoff2, xoff3, xoff4, xoff5;
                         
    HYPRE_Real            *Ap;
@@ -166,7 +166,7 @@
    HYPRE_Int              offd[6];
                         
    HYPRE_Int              iter, rb, redblack, d;
-   HYPRE_Int              compute_i, i, j, ii, jj, kk;
+   HYPRE_Int              compute_i, i, j;
    HYPRE_Int              ni, nj, nk;
 
    /*----------------------------------------------------------
@@ -288,23 +288,18 @@
                   }
                }
 
+               hypre_RedBlackLoopInit();
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ii,jj,Ai,bi,xi,kk) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-               for (kk = 0; kk < nk; kk++)
+               hypre_RedBlackLoopBegin(ni,nj,nk,redblack,
+                                       Astart,Ani,Anj,Ai,
+                                       bstart,bni,bnj,bi,
+                                       xstart,xni,xnj,xi);
                {
-                  for (jj = 0; jj < nj; jj++)
-                  {
-                     ii = (kk + jj + redblack) % 2;
-                     Ai = Astart + kk*Anj*Ani + jj*Ani + ii;
-                     bi = bstart + kk*bnj*bni + jj*bni + ii;
-                     xi = xstart + kk*xnj*xni + jj*xni + ii;
-                     for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2)
-                     {
-                        xp[xi] = bp[bi] / Ap[Ai];
-                     }
-                  }
+                  xp[xi] = bp[bi] / Ap[Ai];
                }
+               hypre_RedBlackLoopEnd();
             }
          }
       }
@@ -418,78 +413,64 @@
                switch(stencil_size)
                {
                   case 7:
+                     hypre_RedBlackLoopInit();
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ii,jj,Ai,bi,xi,kk) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-                     for (kk = 0; kk < nk; kk++)
+                     hypre_RedBlackLoopBegin(ni,nj,nk,redblack,
+                                             Astart,Ani,Anj,Ai,
+                                             bstart,bni,bnj,bi,
+                                             xstart,xni,xnj,xi);
                      {
-                        for (jj = 0; jj < nj; jj++)
-                        {
-                           ii = (kk + jj + redblack) % 2;
-                           Ai = Astart + kk*Anj*Ani + jj*Ani + ii;
-                           bi = bstart + kk*bnj*bni + jj*bni + ii;
-                           xi = xstart + kk*xnj*xni + jj*xni + ii;
-                           for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2)
-                           {
-                              xp[xi] =
-                                 (bp[bi] - 
-                                  Ap0[Ai] * xp[xi + xoff0] -
-                                  Ap1[Ai] * xp[xi + xoff1] -
-                                  Ap2[Ai] * xp[xi + xoff2] -
-                                  Ap3[Ai] * xp[xi + xoff3] -
-                                  Ap4[Ai] * xp[xi + xoff4] -
-                                  Ap5[Ai] * xp[xi + xoff5]) / Ap[Ai];
-                           }
-                        }
+                        xp[xi] =
+                           (bp[bi] - 
+                            Ap0[Ai] * xp[xi + xoff0] -
+                            Ap1[Ai] * xp[xi + xoff1] -
+                            Ap2[Ai] * xp[xi + xoff2] -
+                            Ap3[Ai] * xp[xi + xoff3] -
+                            Ap4[Ai] * xp[xi + xoff4] -
+                            Ap5[Ai] * xp[xi + xoff5]) / Ap[Ai];
                      }
+                     hypre_RedBlackLoopEnd();
                      break;
 
                   case 5:
+                     hypre_RedBlackLoopInit();
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ii,jj,Ai,bi,xi,kk) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-                     for (kk = 0; kk < nk; kk++)
+                     hypre_RedBlackLoopBegin(ni,nj,nk,redblack,
+                                             Astart,Ani,Anj,Ai,
+                                             bstart,bni,bnj,bi,
+                                             xstart,xni,xnj,xi);
                      {
-                        for (jj = 0; jj < nj; jj++)
-                        {
-                           ii = (kk + jj + redblack) % 2;
-                           Ai = Astart + kk*Anj*Ani + jj*Ani + ii;
-                           bi = bstart + kk*bnj*bni + jj*bni + ii;
-                           xi = xstart + kk*xnj*xni + jj*xni + ii;
-                           for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2)
-                           {
-                              xp[xi] =
-                                 (bp[bi] - 
-                                  Ap0[Ai] * xp[xi + xoff0] -
-                                  Ap1[Ai] * xp[xi + xoff1] -
-                                  Ap2[Ai] * xp[xi + xoff2] -
-                                  Ap3[Ai] * xp[xi + xoff3]) / Ap[Ai];
-                           }
-                        }
+                        xp[xi] =
+                           (bp[bi] - 
+                            Ap0[Ai] * xp[xi + xoff0] -
+                            Ap1[Ai] * xp[xi + xoff1] -
+                            Ap2[Ai] * xp[xi + xoff2] -
+                            Ap3[Ai] * xp[xi + xoff3]) / Ap[Ai];
                      }
+                     hypre_RedBlackLoopEnd();
                      break;
 
                   case 3:
+                     hypre_RedBlackLoopInit();
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ii,jj,Ai,bi,xi,kk) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-                     for (kk = 0; kk < nk; kk++)
+                     hypre_RedBlackLoopBegin(ni,nj,nk,redblack,
+                                             Astart,Ani,Anj,Ai,
+                                             bstart,bni,bnj,bi,
+                                             xstart,xni,xnj,xi);
                      {
-                        for (jj = 0; jj < nj; jj++)
-                        {
-                           ii = (kk + jj + redblack) % 2;
-                           Ai = Astart + kk*Anj*Ani + jj*Ani + ii;
-                           bi = bstart + kk*bnj*bni + jj*bni + ii;
-                           xi = xstart + kk*xnj*xni + jj*xni + ii;
-                           for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2)
-                           {
-                              xp[xi] =
-                                 (bp[bi] - 
-                                  Ap0[Ai] * xp[xi + xoff0] -
-                                  Ap1[Ai] * xp[xi + xoff1]) / Ap[Ai];
-                           }
-                        }
+                        xp[xi] =
+                           (bp[bi] - 
+                            Ap0[Ai] * xp[xi + xoff0] -
+                            Ap1[Ai] * xp[xi + xoff1]) / Ap[Ai];
                      }
+                     hypre_RedBlackLoopEnd();
+
                      break;
                }
             }
diff -Nru hypre-2.11.2/src/struct_ls/red_black_gs.h hypre-2.13.0/src/struct_ls/red_black_gs.h
--- hypre-2.11.2/src/struct_ls/red_black_gs.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/red_black_gs.h	2017-10-20 17:42:22.000000000 +0000
@@ -42,3 +42,226 @@
 
 } hypre_RedBlackGSData;
 
+#ifdef HYPRE_USE_RAJA
+#define HYPRE_REDBLACK_PRIVATE hypre__global_error
+#define hypre_RedBlackLoopInit()
+#define hypre_RedBlackLoopBegin(ni,nj,nk,redblack,\
+				Astart,Ani,Anj,Ai,	\
+				bstart,bni,bnj,bi,	\
+				xstart,xni,xnj,xi)	\
+{					  \
+    HYPRE_Int hypre__tot = nk*nj*((ni+1)/2);				\
+    forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+    {									\
+        HYPRE_Int idx_local = idx;					\
+	HYPRE_Int ii,jj,kk,Ai,bi,xi;					\
+	HYPRE_Int local_ii;						\
+	kk = idx_local % nk;						\
+	idx_local = idx_local / nk;					\
+	jj = idx_local % nj;						\
+	idx_local = idx_local / nj;					\
+	local_ii = (kk + jj + redblack) % 2;				\
+	ii = 2*idx_local + local_ii;					\
+	if (ii < ni)							\
+	{								\
+	    Ai = Astart + kk*Anj*Ani + jj*Ani + ii;			\
+	    bi = bstart + kk*bnj*bni + jj*bni + ii;			\
+	    xi = xstart + kk*xnj*xni + jj*xni + ii;			\
+
+#define hypre_RedBlackLoopEnd()			\
+         }						\
+     });						\
+     hypre_fence();					\
+}
+
+#define hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack,\
+				bstart,bni,bnj,bi,	\
+				xstart,xni,xnj,xi)	\
+{					  \
+    HYPRE_Int hypre__tot = nk*nj*((ni+1)/2);				\
+    forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+    {									\
+        HYPRE_Int idx_local = idx;					\
+	HYPRE_Int ii,jj,kk,bi,xi;					\
+	HYPRE_Int local_ii;						\
+	kk = idx_local % nk;						\
+	idx_local = idx_local / nk;					\
+	jj = idx_local % nj;						\
+	idx_local = idx_local / nj;					\
+	local_ii = (kk + jj + redblack) % 2;				\
+	ii = 2*idx_local + local_ii;					\
+	if (ii < ni)							\
+	{								\
+	    bi = bstart + kk*bnj*bni + jj*bni + ii;			\
+	    xi = xstart + kk*xnj*xni + jj*xni + ii;			\
+
+#define hypre_RedBlackConstantcoefLoopEnd()			\
+         }						\
+     });						\
+     hypre_fence();					\
+}  
+#elif defined(HYPRE_USE_KOKKOS)
+#define HYPRE_REDBLACK_PRIVATE hypre__global_error
+#define hypre_RedBlackLoopInit()
+#define hypre_RedBlackLoopBegin(ni,nj,nk,redblack,\
+				Astart,Ani,Anj,Ai,	\
+				bstart,bni,bnj,bi,	\
+				xstart,xni,xnj,xi)	\
+{					  \
+    HYPRE_Int hypre__tot = nk*nj*((ni+1)/2);				\
+    HYPRE_Int hypre_fake = 0;						\
+    Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx) \
+    {									\
+        HYPRE_Int idx_local = idx;					\
+	HYPRE_Int ii,jj,kk,Ai,bi,xi;					\
+	HYPRE_Int local_ii;						\
+	kk = idx_local % nk;						\
+	idx_local = idx_local / nk;					\
+	jj = idx_local % nj;						\
+	idx_local = idx_local / nj;					\
+	local_ii = (kk + jj + redblack) % 2;				\
+	ii = 2*idx_local + local_ii;					\
+	if (ii < ni)							\
+	{								\
+	    Ai = Astart + kk*Anj*Ani + jj*Ani + ii;			\
+	    bi = bstart + kk*bnj*bni + jj*bni + ii;			\
+	    xi = xstart + kk*xnj*xni + jj*xni + ii;			\
+
+#define hypre_RedBlackLoopEnd()			\
+         }						\
+     });						\
+     hypre_fence();					\
+}
+
+#define hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack,\
+				bstart,bni,bnj,bi,	\
+				xstart,xni,xnj,xi)	\
+{					  \
+    HYPRE_Int hypre__tot = nk*nj*((ni+1)/2);				\
+    Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx) \
+    {									\
+        HYPRE_Int idx_local = idx;					\
+	HYPRE_Int ii,jj,kk,bi,xi;					\
+	HYPRE_Int local_ii;						\
+	kk = idx_local % nk;						\
+	idx_local = idx_local / nk;					\
+	jj = idx_local % nj;						\
+	idx_local = idx_local / nj;					\
+	local_ii = (kk + jj + redblack) % 2;				\
+	ii = 2*idx_local + local_ii;					\
+	if (ii < ni)							\
+	{								\
+	    bi = bstart + kk*bnj*bni + jj*bni + ii;			\
+	    xi = xstart + kk*xnj*xni + jj*xni + ii;			\
+
+#define hypre_RedBlackConstantcoefLoopEnd()			\
+         }						\
+     });						\
+     hypre_fence();					\
+}  
+#elif defined(HYPRE_USE_CUDA)
+#define HYPRE_REDBLACK_PRIVATE hypre__global_error
+#define hypre_RedBlackLoopInit()
+#define hypre_RedBlackLoopBegin(ni,nj,nk,redblack,\
+				Astart,Ani,Anj,Ai,	\
+				bstart,bni,bnj,bi,	\
+				xstart,xni,xnj,xi)	\
+{					  \
+    HYPRE_Int hypre__tot = nk*nj*((ni+1)/2);				\
+    BoxLoopforall(cuda_traversal(),hypre__tot,[=] __device__ (HYPRE_Int idx) \
+    {									\
+        HYPRE_Int idx_local = idx;					\
+	HYPRE_Int ii,jj,kk,Ai,bi,xi;					\
+	HYPRE_Int local_ii;						\
+	kk = idx_local % nk;						\
+	idx_local = idx_local / nk;					\
+	jj = idx_local % nj;						\
+	idx_local = idx_local / nj;					\
+	local_ii = (kk + jj + redblack) % 2;				\
+	ii = 2*idx_local + local_ii;					\
+	if (ii < ni)							\
+	{								\
+	    Ai = Astart + kk*Anj*Ani + jj*Ani + ii;			\
+	    bi = bstart + kk*bnj*bni + jj*bni + ii;			\
+	    xi = xstart + kk*xnj*xni + jj*xni + ii;			\
+
+#define hypre_RedBlackLoopEnd()			\
+         }						\
+     });						\
+     hypre_fence();					\
+}
+	   
+#define hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack,\
+					    bstart,bni,bnj,bi,	\
+					    xstart,xni,xnj,xi)	\
+{					  \
+    HYPRE_Int hypre__tot = nk*nj*((ni+1)/2);				\
+    BoxLoopforall(cuda_traversal(),hypre__tot,[=] __device__ (HYPRE_Int idx) \
+    {									\
+        HYPRE_Int idx_local = idx;					\
+	HYPRE_Int ii,jj,kk,bi,xi;					\
+	HYPRE_Int local_ii;						\
+	kk = idx_local % nk;						\
+	idx_local = idx_local / nk;					\
+	jj = idx_local % nj;						\
+	idx_local = idx_local / nj;					\
+	local_ii = (kk + jj + redblack) % 2;				\
+	ii = 2*idx_local + local_ii;					\
+	if (ii < ni)							\
+	{								\
+	    bi = bstart + kk*bnj*bni + jj*bni + ii;			\
+	    xi = xstart + kk*xnj*xni + jj*xni + ii;			\
+
+#define hypre_RedBlackConstantcoefLoopEnd()			\
+         }						\
+     });						\
+     hypre_fence();					\
+}
+#else
+#define HYPRE_REDBLACK_PRIVATE hypre__kk
+#define hypre_RedBlackLoopInit()\
+{\
+   HYPRE_Int hypre__kk;
+
+#define hypre_RedBlackLoopBegin(ni,nj,nk,redblack,\
+				Astart,Ani,Anj,Ai,\
+				bstart,bni,bnj,bi,\
+				xstart,xni,xnj,xi)\
+   for (hypre__kk = 0; hypre__kk < nk; hypre__kk++)\
+   {\
+      HYPRE_Int ii,jj,Ai,bi,xi;\
+      for (jj = 0; jj < nj; jj++)\
+      {\
+         ii = (hypre__kk + jj + redblack) % 2;\
+         Ai = Astart + hypre__kk*Anj*Ani + jj*Ani + ii;\
+         bi = bstart + hypre__kk*bnj*bni + jj*bni + ii;\
+         xi = xstart + hypre__kk*xnj*xni + jj*xni + ii;\
+         for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2)\
+         {
+
+#define hypre_RedBlackLoopEnd()\
+         }\
+      }\
+   }\
+}
+
+#define hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack,\
+                                            bstart,bni,bnj,bi,\
+                                            xstart,xni,xnj,xi)\
+   for (hypre__kk = 0; hypre__kk < nk; hypre__kk++)\
+   {\
+      HYPRE_Int ii,jj,bi,xi;\
+      for (jj = 0; jj < nj; jj++)\
+      {\
+         ii = (hypre__kk + jj + redblack) % 2;\
+         bi = bstart + hypre__kk*bnj*bni + jj*bni + ii;\
+         xi = xstart + hypre__kk*xnj*xni + jj*xni + ii;\
+         for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2)\
+         {
+
+#define hypre_RedBlackConstantcoefLoopEnd()\
+         }\
+      }\
+   }\
+}
+#endif
diff -Nru hypre-2.11.2/src/struct_ls/semi_interp.c hypre-2.13.0/src/struct_ls/semi_interp.c
--- hypre-2.11.2/src/struct_ls/semi_interp.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/semi_interp.c	2017-10-20 17:42:22.000000000 +0000
@@ -125,8 +125,6 @@
    hypre_Box              *e_dbox;
                        
    HYPRE_Int               Pi;
-   HYPRE_Int               xci;
-   HYPRE_Int               ei;
    HYPRE_Int               constant_coefficient;
                          
    HYPRE_Real             *Pp0, *Pp1;
@@ -201,7 +199,7 @@
                           e_dbox, start, stride, ei,
                           xc_dbox, startc, stridec, xci);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ei,xci) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(ei, xci)
       {
@@ -280,7 +278,7 @@
                hypre_BoxLoop1Begin(hypre_StructMatrixNDim(P), loop_size,
                                    e_dbox, start, stride, ei);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ei) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop1For(ei)
                {
@@ -295,7 +293,7 @@
                                    P_dbox, startc, stridec, Pi,
                                    e_dbox, start, stride, ei);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Pi,ei) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop2For(Pi, ei)
                {
diff -Nru hypre-2.11.2/src/struct_ls/semi_restrict.c hypre-2.13.0/src/struct_ls/semi_restrict.c
--- hypre-2.11.2/src/struct_ls/semi_restrict.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/semi_restrict.c	2017-10-20 17:42:22.000000000 +0000
@@ -123,8 +123,6 @@
    hypre_Box              *rc_dbox;
                        
    HYPRE_Int               Ri;
-   HYPRE_Int               ri;
-   HYPRE_Int               rci;
    HYPRE_Int               constant_coefficient;
 
    HYPRE_Real             *Rp0, *Rp1;
@@ -248,7 +246,7 @@
                                    r_dbox,  start,  stride,  ri,
                                    rc_dbox, startc, stridec, rci);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ri,rci) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop2For(ri, rci)
                {
@@ -264,7 +262,7 @@
                                    r_dbox,  start,  stride,  ri,
                                    rc_dbox, startc, stridec, rci);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ri,ri,rci) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop3For(Ri, ri, rci)
                {
diff -Nru hypre-2.11.2/src/struct_ls/semi_setup_rap.c hypre-2.13.0/src/struct_ls/semi_setup_rap.c
--- hypre-2.11.2/src/struct_ls/semi_setup_rap.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/semi_setup_rap.c	2017-10-20 17:42:22.000000000 +0000
@@ -35,7 +35,7 @@
       if (imacro==2) imacro=-1;                         \
       if (jmacro==2) jmacro=-1;                         \
       if (kmacro==2) kmacro=-1;                         \
-      hypre_SetIndex3(indexRAP,imacro,jmacro,kmacro);    \
+      hypre_SetIndex3(indexRAP,imacro,jmacro,kmacro);   \
    }
 
 /*--------------------------------------------------------------------------
@@ -322,11 +322,6 @@
    HYPRE_Real           *rap_ptrS, *rap_ptrU, *rap_ptrD;
 
    HYPRE_Int             symm_path_multiplier;
-
-   HYPRE_Int             iA, iAp;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iPp;
-   HYPRE_Int             iR;
                         
    HYPRE_Int             COffsetA; 
    HYPRE_Int             COffsetP; 
@@ -561,10 +556,11 @@
                                             A_dbox, fstart, stridef, iA,
                                             RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAp,iPp) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                         hypre_BoxLoop4For(iP, iR, iA, iAc)
                         {
+                           HYPRE_Int iAp,iPp;
                            /* path 1 : (stay,stay) */
                            rap_ptrS[iAc] +=          a_ptr[iA]           ;
 
@@ -597,10 +593,11 @@
                                             A_dbox, fstart, stridef, iA,
                                             RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAp,iPp) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                         hypre_BoxLoop4For(iP, iR, iA, iAc)
                         {
+                           HYPRE_Int iAp,iPp;
                            /* path 1 : (stay,stay) */
                            rap_ptrS[iAc] +=          a_ptr[iA]           ;
 
@@ -670,10 +667,11 @@
                                          A_dbox, fstart, stridef, iA,
                                          RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAp,iPp) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                      hypre_BoxLoop4For(iP, iR, iA, iAc)
                      {
+                        HYPRE_Int iAp,iPp;
                         /* Path 1 : (stay,up) & symmetric path  */
                         iPp = iP + AOffsetP; 
                         rap_ptrS[iAc] += symm_path_multiplier *
@@ -739,10 +737,11 @@
                                          A_dbox, fstart, stridef, iA,
                                          RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAp,iPp) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                      hypre_BoxLoop4For(iP, iR, iA, iAc)
                      {
+                        HYPRE_Int iAp,iPp;
                         /* Path 1 : (stay,up) */
                         iPp = iP + COffsetP + AOffsetP; 
                         rap_ptrU[iAc] +=          a_ptr[iA]  * pb[iPp];
diff -Nru hypre-2.11.2/src/struct_ls/smg2_setup_rap.c hypre-2.13.0/src/struct_ls/smg2_setup_rap.c
--- hypre-2.11.2/src/struct_ls/smg2_setup_rap.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/smg2_setup_rap.c	2017-10-20 17:42:22.000000000 +0000
@@ -186,11 +186,6 @@
    HYPRE_Real           *rap_cc, *rap_cw, *rap_cs;
    HYPRE_Real           *rap_csw, *rap_cse;
 
-   HYPRE_Int            iA, iAm1, iAp1;
-   HYPRE_Int            iAc;
-   HYPRE_Int            iP, iP1;
-   HYPRE_Int            iR;
-
    HYPRE_Int            yOffsetA; 
    HYPRE_Int            xOffsetP; 
    HYPRE_Int            yOffsetP; 
@@ -337,7 +332,7 @@
        * Switch statement to direct control to apropriate BoxLoop depending
        * on stencil size. Default is full 9-point.
        *-----------------------------------------------------------------*/
-
+          
       switch (fine_stencil_size)
       {
 
@@ -357,29 +352,28 @@
                                 A_dbox,   fstart, stridef, iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - yOffsetA;
-               iAp1 = iA + yOffsetA;
-
-               iP1 = iP - yOffsetP - xOffsetP;
+               HYPRE_Int iAm1 = iA - yOffsetA;
+               HYPRE_Int iAp1 = iA + yOffsetA;
+               HYPRE_Int iP1 = iP - yOffsetP - xOffsetP;
                rap_csw[iAc] = rb[iR] * a_cw[iAm1] * pa[iP1];
-
+                
                iP1 = iP - yOffsetP;
                rap_cs[iAc] = rb[iR] * a_cc[iAm1] * pa[iP1]
                   +          rb[iR] * a_cs[iAm1]
                   +                   a_cs[iA]   * pa[iP1];
-
+                
                iP1 = iP - yOffsetP + xOffsetP;
                rap_cse[iAc] = rb[iR] * a_ce[iAm1] * pa[iP1];
-
+                
                iP1 = iP - xOffsetP;
                rap_cw[iAc] =          a_cw[iA]
                   +          rb[iR] * a_cw[iAm1] * pb[iP1]
                   +          ra[iR] * a_cw[iAp1] * pa[iP1];
-
+                
                rap_cc[iAc] =          a_cc[iA]
                   +          rb[iR] * a_cc[iAm1] * pb[iP]
                   +          ra[iR] * a_cc[iAp1] * pa[iP]
@@ -387,10 +381,9 @@
                   +          ra[iR] * a_cs[iAp1]
                   +                   a_cs[iA]   * pb[iP]
                   +                   a_cn[iA]   * pa[iP];
-
             }
             hypre_BoxLoop4End(iP, iR, iA, iAc);
-
+            
             break;
 
             /*--------------------------------------------------------------
@@ -409,14 +402,14 @@
                                 A_dbox,   fstart, stridef, iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - yOffsetA;
-               iAp1 = iA + yOffsetA;
+               HYPRE_Int iAm1 = iA - yOffsetA;
+               HYPRE_Int iAp1 = iA + yOffsetA;
 
-               iP1 = iP - yOffsetP - xOffsetP;
+               HYPRE_Int iP1 = iP - yOffsetP - xOffsetP;
                rap_csw[iAc] = rb[iR] * a_cw[iAm1] * pa[iP1]
                   +           rb[iR] * a_csw[iAm1]
                   +                    a_csw[iA]  * pa[iP1];
@@ -506,11 +499,6 @@
    HYPRE_Real           *rap_ce, *rap_cn;
    HYPRE_Real           *rap_cnw, *rap_cne;
 
-   HYPRE_Int            iA, iAm1, iAp1;
-   HYPRE_Int            iAc;
-   HYPRE_Int            iP, iP1;
-   HYPRE_Int            iR;
-
    HYPRE_Int            yOffsetA;
    HYPRE_Int            xOffsetP;
    HYPRE_Int            yOffsetP;
@@ -663,7 +651,7 @@
           *--------------------------------------------------------------*/
 
          case 5:
-
+ 
             hypre_BoxGetSize(cgrid_box, loop_size);
             hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                                 PT_dbox,  cstart, stridec, iP,
@@ -671,14 +659,14 @@
                                 A_dbox,   fstart, stridef, iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - yOffsetA;
-               iAp1 = iA + yOffsetA;
+               HYPRE_Int iAm1 = iA - yOffsetA;
+               HYPRE_Int iAp1 = iA + yOffsetA;
 
-               iP1 = iP + yOffsetP + xOffsetP;
+               HYPRE_Int iP1 = iP + yOffsetP + xOffsetP;
                rap_cne[iAc] = ra[iR] * a_ce[iAp1] * pb[iP1];
 
                iP1 = iP + yOffsetP;
@@ -706,7 +694,6 @@
              *--------------------------------------------------------------*/
 
          default:
-
             hypre_BoxGetSize(cgrid_box, loop_size);
             hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                                 PT_dbox,  cstart, stridec, iP,
@@ -714,14 +701,14 @@
                                 A_dbox,   fstart, stridef, iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - yOffsetA;
-               iAp1 = iA + yOffsetA;
+               HYPRE_Int iAm1 = iA - yOffsetA;
+               HYPRE_Int iAp1 = iA + yOffsetA;
 
-               iP1 = iP + yOffsetP + xOffsetP;
+               HYPRE_Int iP1 = iP + yOffsetP + xOffsetP;
                rap_cne[iAc] = ra[iR] * a_ce[iAp1] * pb[iP1]
                   +           ra[iR] * a_cne[iAp1]
                   +                    a_cne[iA]  * pb[iP1];
@@ -785,9 +772,6 @@
    HYPRE_Real           *rap_cc, *rap_cw, *rap_cs;
    HYPRE_Real           *rap_csw, *rap_cse;
 
-   HYPRE_Int            iAc;
-   HYPRE_Int            iAcm1;
-
    HYPRE_Int            xOffset;
 
    HYPRE_Real           zero = 0.0;
@@ -836,11 +820,11 @@
          hypre_BoxLoop1Begin(hypre_StructMatrixNDim(RAP), loop_size,
                              RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc,iAcm1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop1For(iAc)
          {
-            iAcm1 = iAc - xOffset;
+            HYPRE_Int iAcm1 = iAc - xOffset;
                
             rap_cw[iAc] += (rap_cse[iAcm1] + rap_csw[iAc]);
             rap_cc[iAc] += (2.0 * rap_cs[iAc]);
@@ -850,7 +834,7 @@
          hypre_BoxLoop1Begin(hypre_StructMatrixNDim(RAP), loop_size,
                              RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop1For(iAc)
          {
@@ -895,8 +879,6 @@
    HYPRE_Real           *rap_ce, *rap_cn;
    HYPRE_Real           *rap_cnw, *rap_cne;
 
-   HYPRE_Int            iAc;
-
    HYPRE_Real           zero = 0.0;
 
    hypre_SetIndex3(stridec, 1, 1, 1);
@@ -950,7 +932,7 @@
          hypre_BoxLoop1Begin(hypre_StructMatrixNDim(RAP), loop_size,
                              RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop1For(iAc)
          {
diff -Nru hypre-2.11.2/src/struct_ls/smg3_setup_rap.c hypre-2.13.0/src/struct_ls/smg3_setup_rap.c
--- hypre-2.11.2/src/struct_ls/smg3_setup_rap.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/smg3_setup_rap.c	2017-10-20 17:42:22.000000000 +0000
@@ -268,11 +268,6 @@
    HYPRE_Real           *rap_bc, *rap_bw, *rap_be, *rap_bs, *rap_bn;
    HYPRE_Real           *rap_csw, *rap_cse;
    HYPRE_Real           *rap_bsw, *rap_bse, *rap_bnw, *rap_bne;
-
-   HYPRE_Int             iA, iAm1, iAp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
                         
    HYPRE_Int             zOffsetA; 
    HYPRE_Int             xOffsetP; 
@@ -574,14 +569,14 @@
                                 A_dbox,   fstart, stridef, iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
-            {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+            {                   
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP - zOffsetP - yOffsetP;
+               HYPRE_Int iP1 = iP - zOffsetP - yOffsetP;
                rap_bs[iAc] = rb[iR] * a_cs[iAm1] * pa[iP1];
 
                iP1 = iP - zOffsetP - xOffsetP;
@@ -638,14 +633,14 @@
                                 A_dbox,   fstart, stridef, iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP - zOffsetP - yOffsetP;
+               HYPRE_Int iP1 = iP - zOffsetP - yOffsetP;
                rap_bs[iAc] = rb[iR] * a_cs[iAm1] * pa[iP1]
                   +          rb[iR] * a_bs[iAm1]
                   +                   a_bs[iA]   * pa[iP1];
@@ -719,14 +714,14 @@
                                 A_dbox,   fstart, stridef, iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
+               HYPRE_Int iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
                rap_bsw[iAc] = rb[iR] * a_csw[iAm1] * pa[iP1];
 
                iP1 = iP - zOffsetP - yOffsetP;
@@ -759,7 +754,7 @@
                rap_bn[iAc] = rb[iR] * a_cn[iAm1] * pa[iP1]
                   +          rb[iR] * a_bn[iAm1]
                   +                   a_bn[iA]   * pa[iP1];
- 
+
                iP1 = iP - zOffsetP + yOffsetP + xOffsetP;
                rap_bne[iAc] = rb[iR] * a_cne[iAm1] * pa[iP1];
 
@@ -822,14 +817,14 @@
                                 A_dbox,   fstart, stridef, iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
+               HYPRE_Int iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
                rap_bsw[iAc] = rb[iR] * a_csw[iAm1] * pa[iP1]
                   +           rb[iR] * a_bsw[iAm1]
                   +                    a_bsw[iA]   * pa[iP1];
@@ -982,11 +977,6 @@
    HYPRE_Real           *rap_cnw, *rap_cne;
    HYPRE_Real           *rap_asw, *rap_ase, *rap_anw, *rap_ane;
 
-   HYPRE_Int            iA, iAm1, iAp1;
-   HYPRE_Int            iAc;
-   HYPRE_Int            iP, iP1;
-   HYPRE_Int            iR;
-
    HYPRE_Int            zOffsetA;
    HYPRE_Int            xOffsetP;
    HYPRE_Int            yOffsetP;
@@ -1282,14 +1272,14 @@
                                 A_dbox,   fstart, stridef, iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP + zOffsetP + yOffsetP;
+               HYPRE_Int iP1 = iP + zOffsetP + yOffsetP;
                rap_an[iAc] = ra[iR] * a_cn[iAp1] * pb[iP1];
 
                iP1 = iP + zOffsetP + xOffsetP;
@@ -1337,14 +1327,14 @@
                                 A_dbox,   fstart, stridef, iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP + zOffsetP + yOffsetP;
+               HYPRE_Int iP1 = iP + zOffsetP + yOffsetP;
                rap_an[iAc] = ra[iR] * a_cn[iAp1] * pb[iP1]
                   +          ra[iR] * a_an[iAp1]
                   +                   a_an[iA]   * pb[iP1];
@@ -1411,14 +1401,14 @@
                                 A_dbox,   fstart, stridef, iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
+               HYPRE_Int  iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
                rap_ane[iAc] = ra[iR] * a_cne[iAp1] * pb[iP1];
 
                iP1 = iP + zOffsetP + yOffsetP;
@@ -1443,7 +1433,7 @@
                rap_aw[iAc] = ra[iR] * a_cw[iAp1] * pb[iP1]
                   +          ra[iR] * a_aw[iAp1]
                   +                   a_aw[iA]   * pb[iP1];
- 
+
                iP1 = iP + zOffsetP - yOffsetP + xOffsetP;
                rap_ase[iAc] = ra[iR] * a_cse[iAp1] * pb[iP1];
 
@@ -1506,14 +1496,14 @@
                                 A_dbox,   fstart, stridef, iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
+               HYPRE_Int iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
                rap_ane[iAc] = ra[iR] * a_cne[iAp1] * pb[iP1]
                   +           ra[iR] * a_ane[iAp1]
                   +                    a_ane[iA]   * pb[iP1];
@@ -1637,12 +1627,6 @@
    HYPRE_Real           *rap_bsw, *rap_bse, *rap_bnw, *rap_bne;
    HYPRE_Real           *rap_csw, *rap_cse;
 
-   HYPRE_Int            iAc;
-   HYPRE_Int            iAcmx;
-   HYPRE_Int            iAcmy;
-   HYPRE_Int            iAcmxmy;
-   HYPRE_Int            iAcpxmy;
-
    HYPRE_Int            xOffset;
    HYPRE_Int            yOffset;
 
@@ -1739,12 +1723,12 @@
          hypre_BoxLoop1Begin(hypre_StructMatrixNDim(RAP), loop_size,
                              RAP_dbox,  cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc,iAcmx,iAcmy) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop1For(iAc)
          {
-            iAcmx = iAc - xOffset;
-            iAcmy = iAc - yOffset;
+            HYPRE_Int iAcmx = iAc - xOffset;
+            HYPRE_Int iAcmy = iAc - yOffset;
                
             rap_cc[iAc] += (2.0 * rap_bc[iAc]);
             rap_cw[iAc] += (rap_bw[iAc] + rap_be[iAcmx]);
@@ -1755,7 +1739,7 @@
          hypre_BoxLoop1Begin(hypre_StructMatrixNDim(RAP), loop_size,
                              RAP_dbox,  cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop1For(iAc)
          {
@@ -1778,12 +1762,12 @@
             hypre_BoxLoop1Begin(hypre_StructMatrixNDim(RAP), loop_size,
                                 RAP_dbox,  cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc,iAcmxmy,iAcpxmy) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop1For(iAc)
             {
-               iAcmxmy = iAc - xOffset - yOffset;
-               iAcpxmy = iAc + xOffset - yOffset;
+               HYPRE_Int iAcmxmy = iAc - xOffset - yOffset;
+               HYPRE_Int iAcpxmy = iAc + xOffset - yOffset;
                   
                rap_csw[iAc] += (rap_bsw[iAc] + rap_bne[iAcmxmy]);
                   
@@ -1795,7 +1779,7 @@
             hypre_BoxLoop1Begin(hypre_StructMatrixNDim(RAP), loop_size,
                                 RAP_dbox,  cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop1For(iAc)
             {
@@ -1845,8 +1829,6 @@
    HYPRE_Real           *rap_csw, *rap_cse, *rap_cnw, *rap_cne;
    HYPRE_Real           *rap_asw, *rap_ase, *rap_anw, *rap_ane;
 
-   HYPRE_Int            iAc;
-
    HYPRE_Real           zero = 0.0;
 
    hypre_StructStencil *stencil;
@@ -1974,7 +1956,7 @@
                              RAP_dbox,  cstart, stridec, iAc);
 
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
 
          hypre_BoxLoop1For(iAc)
@@ -2012,7 +1994,7 @@
             hypre_BoxLoop1Begin(hypre_StructMatrixNDim(RAP), loop_size,
                                 RAP_dbox,  cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop1For(iAc)
             {
diff -Nru hypre-2.11.2/src/struct_ls/smg_axpy.c hypre-2.13.0/src/struct_ls/smg_axpy.c
--- hypre-2.11.2/src/struct_ls/smg_axpy.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/smg_axpy.c	2017-10-20 17:42:22.000000000 +0000
@@ -25,9 +25,6 @@
    HYPRE_Int         ndim = hypre_StructVectorNDim(x);
    hypre_Box        *x_data_box;
    hypre_Box        *y_data_box;
-                 
-   HYPRE_Int         xi;
-   HYPRE_Int         yi;
                     
    HYPRE_Real       *xp;
    HYPRE_Real       *yp;
@@ -58,7 +55,7 @@
                           x_data_box, start, base_stride, xi,
                           y_data_box, start, base_stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,yi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(xi, yi)
       {
diff -Nru hypre-2.11.2/src/struct_ls/smg.c hypre-2.13.0/src/struct_ls/smg.c
--- hypre-2.11.2/src/struct_ls/smg.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/smg.c	2017-10-20 17:42:22.000000000 +0000
@@ -53,7 +53,7 @@
 HYPRE_Int
 hypre_SMGDestroy( void *smg_vdata )
 {
-	hypre_SMGData *smg_data = (hypre_SMGData *)smg_vdata;
+   hypre_SMGData *smg_data = (hypre_SMGData *)smg_vdata;
 
    HYPRE_Int l;
 
@@ -111,7 +111,7 @@
             hypre_StructVectorDestroy(smg_data -> tb_l[l+1]);
             hypre_StructVectorDestroy(smg_data -> tx_l[l+1]);
          }
-         hypre_SharedTFree(smg_data -> data);
+         hypre_DeviceTFree(smg_data -> data);
          hypre_TFree(smg_data -> grid_l);
          hypre_TFree(smg_data -> PT_grid_l);
          hypre_TFree(smg_data -> A_l);
@@ -468,7 +468,6 @@
 {
    hypre_Box          *v_data_box;
 
-   HYPRE_Int           vi;
    HYPRE_Real         *vp;
 
    hypre_Box          *box;
@@ -495,7 +494,7 @@
       hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size,
                           v_data_box, start, stride, vi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,vi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop1For(vi)
       {
diff -Nru hypre-2.11.2/src/struct_ls/smg_relax.c hypre-2.13.0/src/struct_ls/smg_relax.c
--- hypre-2.11.2/src/struct_ls/smg_relax.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/smg_relax.c	2017-10-20 17:42:22.000000000 +0000
@@ -263,7 +263,6 @@
    residual_data   = (relax_data -> residual_data);
    solve_data      = (relax_data -> solve_data);
 
-
    /*----------------------------------------------------------
     * Set zero values
     *----------------------------------------------------------*/
@@ -315,7 +314,6 @@
          (relax_data -> num_iterations) = (i + 1);
       }
    }
-
    /*----------------------------------------------------------
     * Free up memory according to memory_use parameter
     *----------------------------------------------------------*/
@@ -586,6 +584,7 @@
          hypre_SMGSetMemoryUse(solve_data[i], (relax_data -> memory_use));
          hypre_SMGSetTol(solve_data[i], 0.0);
          hypre_SMGSetMaxIter(solve_data[i], 1);
+		 
          hypre_SMGSetup(solve_data[i], A_sol, temp_vec, x);
       }
       else
diff -Nru hypre-2.11.2/src/struct_ls/smg_residual.c hypre-2.13.0/src/struct_ls/smg_residual.c
--- hypre-2.11.2/src/struct_ls/smg_residual.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/smg_residual.c	2017-10-20 17:42:22.000000000 +0000
@@ -138,11 +138,6 @@
    hypre_Box              *x_data_box;
    hypre_Box              *b_data_box;
    hypre_Box              *r_data_box;
-                       
-   HYPRE_Int               Ai;
-   HYPRE_Int               xi;
-   HYPRE_Int               bi;
-   HYPRE_Int               ri;
                          
    HYPRE_Real             *Ap;
    HYPRE_Real             *xp;
@@ -201,7 +196,7 @@
                                    b_data_box, start, base_stride, bi,
                                    r_data_box, start, base_stride, ri);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,bi,ri) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop2For(bi, ri)
                {
@@ -253,7 +248,7 @@
                                    x_data_box, start, base_stride, xi,
                                    r_data_box, start, base_stride, ri);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,ri) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop3For(Ai, xi, ri)
                {
diff -Nru hypre-2.11.2/src/struct_ls/smg_setup.c hypre-2.13.0/src/struct_ls/smg_setup.c
--- hypre-2.11.2/src/struct_ls/smg_setup.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/smg_setup.c	2017-10-20 17:42:22.000000000 +0000
@@ -186,6 +186,7 @@
    for (l = 0; l < (num_levels - 1); l++)
    {
       PT_l[l]  = hypre_SMGCreateInterpOp(A_l[l], PT_grid_l[l+1], cdir);
+	  
       hypre_StructMatrixInitializeShell(PT_l[l]);
       data_size += hypre_StructMatrixDataSize(PT_l[l]);
 
@@ -228,7 +229,8 @@
       hypre_StructVectorInitializeShell(tx_l[l+1]);
    }
 
-   data = hypre_SharedCTAlloc(HYPRE_Real, data_size);
+   data = hypre_DeviceCTAlloc(HYPRE_Real,data_size);
+
    (smg_data -> data) = data;
 
    hypre_StructVectorInitializeData(tb_l[0], data);
@@ -331,6 +333,7 @@
       hypre_SMGRelaxSetTempVec(relax_data_l[l], tb_l[l]);
       hypre_SMGRelaxSetNumPreRelax( relax_data_l[l], n_pre);
       hypre_SMGRelaxSetNumPostRelax( relax_data_l[l], n_post);
+
       hypre_SMGRelaxSetup(relax_data_l[l], A_l[l], b_l[l], x_l[l]);
 
       hypre_SMGSetupInterpOp(relax_data_l[l], A_l[l], b_l[l], x_l[l],
diff -Nru hypre-2.11.2/src/struct_ls/smg_setup_interp.c hypre-2.13.0/src/struct_ls/smg_setup_interp.c
--- hypre-2.11.2/src/struct_ls/smg_setup_interp.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/smg_setup_interp.c	2017-10-20 17:42:22.000000000 +0000
@@ -113,8 +113,6 @@
    hypre_Box            *x_data_box;
    HYPRE_Real           *PTp;
    HYPRE_Real           *xp;
-   HYPRE_Int             PTi;
-   HYPRE_Int             xi;
 
    hypre_Index           loop_size;
    hypre_Index           start;
@@ -263,7 +261,7 @@
                                    x_data_box,  start,  stride,  xi,
                                    PT_data_box, startc, stridec, PTi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,PTi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop2For(xi, PTi)
                {
diff -Nru hypre-2.11.2/src/struct_ls/sparse_msg2_setup_rap.c hypre-2.13.0/src/struct_ls/sparse_msg2_setup_rap.c
--- hypre-2.11.2/src/struct_ls/sparse_msg2_setup_rap.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/sparse_msg2_setup_rap.c	2017-10-20 17:42:22.000000000 +0000
@@ -201,11 +201,6 @@
    HYPRE_Real           *rap_cc, *rap_cw, *rap_cs;
    HYPRE_Real           *rap_csw, *rap_cse;
 
-   HYPRE_Int             iA, iAm1, iAp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
-                      
    HYPRE_Int             yOffsetA; 
    HYPRE_Int             xOffsetP; 
    HYPRE_Int             yOffsetP; 
@@ -397,14 +392,14 @@
                                 A_dbox, fstart, stridef,  iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - yOffsetA;
-               iAp1 = iA + yOffsetA;
+               HYPRE_Int iAm1 = iA - yOffsetA;
+               HYPRE_Int iAp1 = iA + yOffsetA;
 
-               iP1 = iP - yOffsetP - xOffsetP;
+               HYPRE_Int iP1 = iP - yOffsetP - xOffsetP;
                rap_csw[iAc] = rb[iR] * a_cw[iAm1] * pa[iP1];
 
                iP1 = iP - yOffsetP;
@@ -442,21 +437,20 @@
          default:
 
             hypre_BoxGetSize(cgrid_box, loop_size);
-
             hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                                 P_dbox, Pstart, stridePR, iP,
                                 R_dbox, Pstart, stridePR, iR,
                                 A_dbox, fstart, stridef,  iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - yOffsetA;
-               iAp1 = iA + yOffsetA;
+               HYPRE_Int iAm1 = iA - yOffsetA;
+               HYPRE_Int iAp1 = iA + yOffsetA;
 
-               iP1 = iP - yOffsetP - xOffsetP;
+               HYPRE_Int iP1 = iP - yOffsetP - xOffsetP;
                rap_csw[iAc] = rb[iR] * a_cw[iAm1] * pa[iP1]
                   +           rb[iR] * a_csw[iAm1]
                   +                    a_csw[iA]  * pa[iP1];
@@ -548,11 +542,6 @@
 
    HYPRE_Real           *rap_ce, *rap_cn;
    HYPRE_Real           *rap_cnw, *rap_cne;
-
-   HYPRE_Int             iA, iAm1, iAp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
                      
    HYPRE_Int             yOffsetA;
    HYPRE_Int             xOffsetP;
@@ -730,21 +719,20 @@
          case 5:
 
             hypre_BoxGetSize(cgrid_box, loop_size);
-
             hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                                 P_dbox, Pstart, stridePR, iP,
                                 R_dbox, Pstart, stridePR, iR,
                                 A_dbox, fstart, stridef,  iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - yOffsetA;
-               iAp1 = iA + yOffsetA;
+               HYPRE_Int iAm1 = iA - yOffsetA;
+               HYPRE_Int iAp1 = iA + yOffsetA;
 
-               iP1 = iP + yOffsetP + xOffsetP;
+               HYPRE_Int iP1 = iP + yOffsetP + xOffsetP;
                rap_cne[iAc] = ra[iR] * a_ce[iAp1] * pb[iP1];
 
                iP1 = iP + yOffsetP;
@@ -773,21 +761,20 @@
          default:
 
             hypre_BoxGetSize(cgrid_box, loop_size);
-
             hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                                 P_dbox, Pstart, stridePR, iP,
                                 R_dbox, Pstart, stridePR, iR,
                                 A_dbox, fstart, stridef,  iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - yOffsetA;
-               iAp1 = iA + yOffsetA;
+               HYPRE_Int iAm1 = iA - yOffsetA;
+               HYPRE_Int iAp1 = iA + yOffsetA;
 
-               iP1 = iP + yOffsetP + xOffsetP;
+               HYPRE_Int iP1 = iP + yOffsetP + xOffsetP;
                rap_cne[iAc] = ra[iR] * a_ce[iAp1] * pb[iP1]
                   +           ra[iR] * a_cne[iAp1]
                   +                    a_cne[iA]  * pb[iP1];
diff -Nru hypre-2.11.2/src/struct_ls/sparse_msg3_setup_rap.c hypre-2.13.0/src/struct_ls/sparse_msg3_setup_rap.c
--- hypre-2.11.2/src/struct_ls/sparse_msg3_setup_rap.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/sparse_msg3_setup_rap.c	2017-10-20 17:42:22.000000000 +0000
@@ -225,11 +225,6 @@
    HYPRE_Real           *rap_bc, *rap_bw, *rap_be, *rap_bs, *rap_bn;
    HYPRE_Real           *rap_csw, *rap_cse;
    HYPRE_Real           *rap_bsw, *rap_bse, *rap_bnw, *rap_bne;
-
-   HYPRE_Int             iA, iAm1, iAp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
                         
    HYPRE_Int             zOffsetA; 
    HYPRE_Int             xOffsetP; 
@@ -563,14 +558,14 @@
                                 A_dbox, fstart, stridef,  iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
-            {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+            {           
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP - zOffsetP - yOffsetP;
+               HYPRE_Int iP1 = iP - zOffsetP - yOffsetP;
                rap_bs[iAc] = rb[iR] * a_cs[iAm1] * pa[iP1];
 
                iP1 = iP - zOffsetP - xOffsetP;
@@ -633,14 +628,14 @@
                                 A_dbox, fstart, stridef,  iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
+               HYPRE_Int iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
                rap_bsw[iAc] = rb[iR] * a_csw[iAm1] * pa[iP1];
 
                iP1 = iP - zOffsetP - yOffsetP;
@@ -737,14 +732,14 @@
                                 A_dbox, fstart, stridef,  iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
+               HYPRE_Int iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
                rap_bsw[iAc] = rb[iR] * a_csw[iAm1] * pa[iP1]
                   +           rb[iR] * a_bsw[iAm1]
                   +                    a_bsw[iA]   * pa[iP1];
@@ -898,11 +893,6 @@
    HYPRE_Real           *rap_ac, *rap_aw, *rap_ae, *rap_as, *rap_an;
    HYPRE_Real           *rap_cnw, *rap_cne;
    HYPRE_Real           *rap_asw, *rap_ase, *rap_anw, *rap_ane;
-
-   HYPRE_Int             iA, iAm1, iAp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
                  
    HYPRE_Int             zOffsetA;
    HYPRE_Int             xOffsetP;
@@ -1221,21 +1211,20 @@
          case 7:
 
             hypre_BoxGetSize(cgrid_box, loop_size);
-
             hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                                 P_dbox, Pstart, stridePR, iP,
                                 R_dbox, Pstart, stridePR, iR,
                                 A_dbox, fstart, stridef,  iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP + zOffsetP + yOffsetP;
+               HYPRE_Int iP1 = iP + zOffsetP + yOffsetP;
                rap_an[iAc] = ra[iR] * a_cn[iAp1] * pb[iP1];
 
                iP1 = iP + zOffsetP + xOffsetP;
@@ -1289,14 +1278,14 @@
                                 A_dbox, fstart, stridef,  iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
+               HYPRE_Int iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
                rap_ane[iAc] = ra[iR] * a_cne[iAp1] * pb[iP1];
 
                iP1 = iP + zOffsetP + yOffsetP;
@@ -1385,14 +1374,14 @@
                                 A_dbox, fstart, stridef,  iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
+               HYPRE_Int iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
                rap_ane[iAc] = ra[iR] * a_cne[iAp1] * pb[iP1]
                   +           ra[iR] * a_ane[iAp1]
                   +                    a_ane[iA]   * pb[iP1];
diff -Nru hypre-2.11.2/src/struct_ls/sparse_msg_filter.c hypre-2.13.0/src/struct_ls/sparse_msg_filter.c
--- hypre-2.11.2/src/struct_ls/sparse_msg_filter.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/sparse_msg_filter.c	2017-10-20 17:42:22.000000000 +0000
@@ -325,22 +325,14 @@
    hypre_Box             *A_dbox;
    hypre_Box             *v_dbox;
                         
-   HYPRE_Int              Ai;
-   HYPRE_Int              vi;
-                        
-   HYPRE_Real            *Ap;
    HYPRE_Real            *vxp;
    HYPRE_Real            *vyp;
    HYPRE_Real            *vzp;
-   HYPRE_Real             lambdax;
-   HYPRE_Real             lambday;
-   HYPRE_Real             lambdaz;
+
                         
    hypre_StructStencil   *stencil;
    hypre_Index           *stencil_shape;
-   HYPRE_Int              stencil_size;
-                        
-   HYPRE_Int              Astenc;
+   HYPRE_Int              stencil_size;                        
                         
    hypre_Index            loop_size;
    hypre_Index            cindex;
@@ -349,7 +341,7 @@
    hypre_Index            stride;
    hypre_Index            stridev;
                         
-   HYPRE_Int              i, si;
+   HYPRE_Int              i;
 
    /*----------------------------------------------------------
     * Initialize some things
@@ -375,6 +367,9 @@
    compute_boxes = hypre_StructGridBoxes(hypre_StructMatrixGrid(A));
    hypre_ForBoxI(i, compute_boxes)
    {
+
+      hypre_MatrixIndexMove(A, stencil_size, i, ierr,3);
+     
       compute_box = hypre_BoxArrayBox(compute_boxes, i);
 
       A_dbox = hypre_BoxArrayBox(hypre_StructMatrixDataSpace(A), i);
@@ -392,20 +387,28 @@
                           A_dbox, start,  stride,  Ai,
                           v_dbox, startv, stridev, vi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,vi,lambdax,lambday,lambdaz,si,Ap,Astenc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(Ai, vi)
       {
+         HYPRE_Real lambdax,lambday,lambdaz;
+         HYPRE_Real *Ap;
+         HYPRE_Int si,Astenc;
+		  
          lambdax = 0.0;
          lambday = 0.0;
          lambdaz = 0.0;
 
+		 
          for (si = 0; si < stencil_size; si++)
          {
-            Ap = hypre_StructMatrixBoxData(A, i, si);
-
+            //Ap = hypre_StructMatrixBoxData(A, i, si);
+            //Ap = data_A + indices_d[si];
+            Ap = hypre_StructGetMatrixBoxData(A, i, si);
             /* compute lambdax */
-            Astenc = hypre_IndexD(stencil_shape[si], 0);
+            //Astenc = hypre_IndexD(stencil_shape[si], 0);
+            //Astenc = stencil_shape_d[si];
+            Astenc = hypre_StructGetIndexD(stencil_shape[si], 0,stencil_shape_d[si]);
             if (Astenc == 0)
             {
                lambdax += Ap[Ai];
@@ -416,7 +419,10 @@
             }
 
             /* compute lambday */
-            Astenc = hypre_IndexD(stencil_shape[si], 1);
+            //Astenc = hypre_IndexD(stencil_shape[si], 1);
+	    //Astenc = stencil_shape_d[stencil_size+si];
+	    Astenc = hypre_StructGetIndexD(stencil_shape[si], 1,stencil_shape_d[stencil_size+si]);
+	    
             if (Astenc == 0)
             {
                lambday += Ap[Ai];
@@ -427,7 +433,9 @@
             }
 
             /* compute lambdaz */
-            Astenc = hypre_IndexD(stencil_shape[si], 2);
+            //Astenc = hypre_IndexD(stencil_shape[si], 2);
+	    //Astenc = stencil_shape_d[2*stencil_size+si];
+	    Astenc = hypre_StructGetIndexD(stencil_shape[si], 2,stencil_shape_d[2*stencil_size+si]);
             if (Astenc == 0)
             {
                lambdaz += Ap[Ai];
@@ -447,6 +455,8 @@
          vzp[vi] = lambdaz / (lambdax + lambday + lambdaz);
       }
       hypre_BoxLoop2End(Ai, vi);
+
+      hypre_StructCleanIndexD();	  
    }
 
    return ierr;
@@ -472,9 +482,6 @@
    hypre_Box             *e_dbox;
    hypre_Box             *v_dbox;
                         
-   HYPRE_Int              ei;
-   HYPRE_Int              vi;
-                        
    HYPRE_Real            *ep;
    HYPRE_Real            *vp;
                         
@@ -519,7 +526,7 @@
                           e_dbox, start,  stride,  ei,
                           v_dbox, startv, stridev, vi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ei,vi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(ei, vi)
       {
diff -Nru hypre-2.11.2/src/struct_ls/sparse_msg_interp.c hypre-2.13.0/src/struct_ls/sparse_msg_interp.c
--- hypre-2.11.2/src/struct_ls/sparse_msg_interp.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/sparse_msg_interp.c	2017-10-20 17:42:22.000000000 +0000
@@ -58,7 +58,7 @@
                             hypre_Index         stride,
                             hypre_Index         strideP       )
 {
-	hypre_SparseMSGInterpData   *interp_data = (hypre_SparseMSGInterpData   *)interp_vdata;
+   hypre_SparseMSGInterpData   *interp_data = (hypre_SparseMSGInterpData   *)interp_vdata;
 
    hypre_StructGrid       *grid;
    hypre_StructStencil    *stencil;
@@ -131,10 +131,6 @@
    hypre_Box              *P_dbox;
    hypre_Box              *xc_dbox;
    hypre_Box              *e_dbox;
-                       
-   HYPRE_Int               Pi;
-   HYPRE_Int               xci;
-   HYPRE_Int               ei;
                          
    HYPRE_Real             *Pp0, *Pp1;
    HYPRE_Real             *xcp;
@@ -203,7 +199,7 @@
                           e_dbox,  start,  stride,  ei,
                           xc_dbox, startc, stridec, xci);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ei,xci) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(ei, xci)
       {
@@ -263,7 +259,7 @@
                                 P_dbox, startP, strideP, Pi,
                                 e_dbox, start,  stride,  ei);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Pi,ei) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(Pi, ei)
             {
diff -Nru hypre-2.11.2/src/struct_ls/sparse_msg_restrict.c hypre-2.13.0/src/struct_ls/sparse_msg_restrict.c
--- hypre-2.11.2/src/struct_ls/sparse_msg_restrict.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/sparse_msg_restrict.c	2017-10-20 17:42:22.000000000 +0000
@@ -58,7 +58,7 @@
                               hypre_Index         stride,
                               hypre_Index         strideR         )
 {
-	hypre_SparseMSGRestrictData *restrict_data = (hypre_SparseMSGRestrictData *)restrict_vdata;
+   hypre_SparseMSGRestrictData *restrict_data = (hypre_SparseMSGRestrictData *)restrict_vdata;
 
    hypre_StructGrid       *grid;
    hypre_StructStencil    *stencil;
@@ -129,10 +129,6 @@
    hypre_Box              *R_dbox;
    hypre_Box              *r_dbox;
    hypre_Box              *rc_dbox;
-                       
-   HYPRE_Int               Ri;
-   HYPRE_Int               ri;
-   HYPRE_Int               rci;
                          
    HYPRE_Real             *Rp0, *Rp1;
    HYPRE_Real             *rp, *rp0, *rp1;
@@ -231,7 +227,7 @@
                                 r_dbox,  start,  stride,  ri,
                                 rc_dbox, startc, stridec, rci);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ri,ri,rci) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop3For(Ri, ri, rci)
             {
diff -Nru hypre-2.11.2/src/struct_ls/sparse_msg_setup.c hypre-2.13.0/src/struct_ls/sparse_msg_setup.c
--- hypre-2.11.2/src/struct_ls/sparse_msg_setup.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_ls/sparse_msg_setup.c	2017-10-20 17:42:22.000000000 +0000
@@ -10,8 +10,6 @@
  * $Revision$
  ***********************************************************************EHEADER*/
 
-
-
 /******************************************************************************
  *
  *
@@ -24,23 +22,23 @@
 
 #define GRID 0
 
-#define hypre_SparseMSGSetCIndex(cdir, cindex) \
-{\
-   hypre_SetIndex3(cindex, 0, 0, 0);\
-   hypre_IndexD(cindex, cdir) = 0;\
-}
-
-#define hypre_SparseMSGSetFIndex(cdir, findex) \
-{\
-   hypre_SetIndex3(findex, 0, 0, 0);\
-   hypre_IndexD(findex, cdir) = 1;\
-}
-
-#define hypre_SparseMSGSetStride(cdir, stride) \
-{\
-   hypre_SetIndex3(stride, 1, 1, 1);\
-   hypre_IndexD(stride, cdir) = 2;\
-}
+#define hypre_SparseMSGSetCIndex(cdir, cindex)  \
+   {                                            \
+      hypre_SetIndex3(cindex, 0, 0, 0);         \
+      hypre_IndexD(cindex, cdir) = 0;           \
+   }
+
+#define hypre_SparseMSGSetFIndex(cdir, findex)  \
+   {                                            \
+      hypre_SetIndex3(findex, 0, 0, 0);         \
+      hypre_IndexD(findex, cdir) = 1;           \
+   }
+
+#define hypre_SparseMSGSetStride(cdir, stride)  \
+   {                                            \
+      hypre_SetIndex3(stride, 1, 1, 1);         \
+      hypre_IndexD(stride, cdir) = 2;           \
+   }
 
 /*--------------------------------------------------------------------------
  * hypre_SparseMSGSetup
@@ -52,7 +50,7 @@
                       hypre_StructVector *b,
                       hypre_StructVector *x          )
 {
-	hypre_SparseMSGData  *smsg_data = (hypre_SparseMSGData  *)smsg_vdata;
+   hypre_SparseMSGData  *smsg_data = (hypre_SparseMSGData  *)smsg_vdata;
 
    MPI_Comm              comm = (smsg_data -> comm);
                      
@@ -559,7 +557,7 @@
       }
    }
 
-   data = hypre_SharedCTAlloc(HYPRE_Real, data_size);
+   data = hypre_DeviceCTAlloc(HYPRE_Real,data_size);
    (smsg_data -> data) = data;
 
    hypre_StructVectorInitializeData(t_a[0], data);
diff -Nru hypre-2.11.2/src/struct_mv/assumed_part.c hypre-2.13.0/src/struct_mv/assumed_part.c
--- hypre-2.11.2/src/struct_mv/assumed_part.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_mv/assumed_part.c	2017-10-20 17:42:22.000000000 +0000
@@ -158,9 +158,7 @@
    }
 
    count = 0;
-   hypre_BoxLoop0Begin(ndim, div);
-   hypre_BoxLoopSetOneBlock();
-   hypre_BoxLoop0For()
+   hypre_SerialBoxLoop0Begin(ndim, div);
    {
       box = hypre_BoxArrayBox(box_array, count);
       hypre_BoxLoopGetIndex(index);
@@ -172,7 +170,7 @@
       }
       count++;
    }
-   hypre_BoxLoop0End();
+   hypre_SerialBoxLoop0End();
 
    /* clean up */
    for (i = 0; i < ndim; i++) 
@@ -1635,7 +1633,7 @@
 
    HYPRE_Int       i, d, p, q, r, myid;
    HYPRE_Int       num_regions, in_regions, this_region, proc_count, proc_start;
-   HYPRE_Int       adj_proc_id, extra, num_partitions, part_num;
+   HYPRE_Int       adj_proc_id, extra, num_partitions;
    HYPRE_Int       width;
    
    HYPRE_Int      *proc_array, proc_array_count;
@@ -1772,9 +1770,7 @@
       hypre_SetIndex(stride, 1);
       hypre_BoxGetSize(part_box, loop_size);
       hypre_BoxSetExtents(part_dbox, stride, div);
-      hypre_BoxLoop1Begin(ndim, loop_size, part_dbox, start, stride, part_num);
-      hypre_BoxLoopSetOneBlock();
-      hypre_BoxLoop1For(part_num)
+      hypre_SerialBoxLoop1Begin(ndim, loop_size, part_dbox, start, stride, part_num);
       {
          /*convert the partition number to a processor number*/
          if (part_num < (2*extra))
@@ -1795,7 +1791,7 @@
          proc_ids[num_proc_ids] = adj_proc_id + proc_start;
          num_proc_ids++;
       }
-      hypre_BoxLoop1End(part_num);
+      hypre_SerialBoxLoop1End(part_num);
 
    } /*end of for each region loop*/
 
diff -Nru hypre-2.11.2/src/struct_mv/box.h hypre-2.13.0/src/struct_mv/box.h
--- hypre-2.11.2/src/struct_mv/box.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_mv/box.h	2017-10-20 17:42:22.000000000 +0000
@@ -170,329 +170,155 @@
  * BoxLoop macros:
  *--------------------------------------------------------------------------*/
 
-#if 0 /* set to 0 to use the new box loops */
+#ifdef HYPRE_USE_RAJA
+#define hypre_Reductioninit(local_result)\
+HYPRE_Real       local_result;\
+local_result = 0.0;
+//ReduceSum< cuda_reduce<BLOCKSIZE>, HYPRE_Real> local_result(0.0);
+#else
+#define hypre_Reductioninit(local_result)\
+HYPRE_Real       local_result;\
+local_result = 0.0;
+#endif
 
-#define HYPRE_BOX_PRIVATE hypre__nx,hypre__ny,hypre__nz,hypre__i,hypre__j,hypre__k
+#if defined(HYPRE_MEMORY_GPU)
 
-#define hypre_BoxLoopDeclareS(dbox, stride, sx, sy, sz) \
-HYPRE_Int  sx = (hypre_IndexX(stride));\
-HYPRE_Int  sy = (hypre_IndexY(stride)*hypre_BoxSizeX(dbox));\
-HYPRE_Int  sz = (hypre_IndexZ(stride)*\
-           hypre_BoxSizeX(dbox)*hypre_BoxSizeY(dbox))
-
-#define hypre_BoxLoopDeclareN(loop_size) \
-HYPRE_Int  hypre__i, hypre__j, hypre__k;\
-HYPRE_Int  hypre__nx = hypre_IndexX(loop_size);\
-HYPRE_Int  hypre__ny = hypre_IndexY(loop_size);\
-HYPRE_Int  hypre__nz = hypre_IndexZ(loop_size);\
-HYPRE_Int  hypre__mx = hypre__nx;\
-HYPRE_Int  hypre__my = hypre__ny;\
-HYPRE_Int  hypre__mz = hypre__nz;\
-HYPRE_Int  hypre__dir, hypre__max;\
-HYPRE_Int  hypre__div, hypre__mod;\
-HYPRE_Int  hypre__block, hypre__num_blocks;\
-hypre__dir = 0;\
-hypre__max = hypre__nx;\
-if (hypre__ny > hypre__max)\
-{\
-   hypre__dir = 1;\
-   hypre__max = hypre__ny;\
-}\
-if (hypre__nz > hypre__max)\
-{\
-   hypre__dir = 2;\
-   hypre__max = hypre__nz;\
-}\
-hypre__num_blocks = hypre_NumThreads();\
-if (hypre__max < hypre__num_blocks)\
-{\
-   hypre__num_blocks = hypre__max;\
+#define hypre_MatrixIndexMove(A, stencil_size, i, cdir,size)\
+HYPRE_Int * indices_d;\
+HYPRE_Int indices_h[stencil_size];\
+HYPRE_Int * stencil_shape_d;\
+HYPRE_Int  stencil_shape_h[size*stencil_size];\
+HYPRE_Complex * data_A = hypre_StructMatrixData(A);\
+indices_d = hypre_DeviceTAlloc(HYPRE_Int, stencil_size);\
+stencil_shape_d = hypre_DeviceTAlloc(HYPRE_Int, size*stencil_size);\
+for (HYPRE_Int ii = 0; ii < stencil_size; ii++)\
+{\
+   HYPRE_Int jj = 0;\
+   indices_h[ii]       = hypre_StructMatrixDataIndices(A)[i][ii];\
+   if (size > 1) cdir = 0;\
+   stencil_shape_h[ii] = hypre_IndexD(stencil_shape[ii], cdir);\
+   for (jj = 1;jj < size;jj++)\
+      stencil_shape_h[jj*stencil_size+ii] = hypre_IndexD(stencil_shape[ii], jj);\
 }\
-if (hypre__num_blocks > 0)\
-{\
-   hypre__div = hypre__max / hypre__num_blocks;\
-   hypre__mod = hypre__max % hypre__num_blocks;\
-}
+hypre_DataCopyToData(indices_h,indices_d,HYPRE_Int,stencil_size);\
+hypre_DataCopyToData(stencil_shape_h,stencil_shape_d,HYPRE_Int,size*stencil_size);\
 
-#define hypre_BoxLoopSet(i, j, k) \
-i = 0;\
-j = 0;\
-k = 0;\
-hypre__nx = hypre__mx;\
-hypre__ny = hypre__my;\
-hypre__nz = hypre__mz;\
-if (hypre__num_blocks > 1)\
-{\
-   if (hypre__dir == 0)\
-   {\
-      i = hypre__block * hypre__div + hypre_min(hypre__mod, hypre__block);\
-      hypre__nx = hypre__div + ((hypre__mod > hypre__block) ? 1 : 0);\
-   }\
-   else if (hypre__dir == 1)\
-   {\
-      j = hypre__block * hypre__div + hypre_min(hypre__mod, hypre__block);\
-      hypre__ny = hypre__div + ((hypre__mod > hypre__block) ? 1 : 0);\
-   }\
-   else if (hypre__dir == 2)\
-   {\
-      k = hypre__block * hypre__div + hypre_min(hypre__mod, hypre__block);\
-      hypre__nz = hypre__div + ((hypre__mod > hypre__block) ? 1 : 0);\
-   }\
-}
+#define hypre_StructGetMatrixBoxData(A, i, si)  (data_A + indices_d[si])
 
-#define hypre_BoxLoopGetIndex(index) \
-index[0] = hypre__i; index[1] = hypre__j; index[2] = hypre__k
+#define hypre_StructGetIndexD(index,i,index_d) (index_d)
 
-/* Use this before the For macros below to force only one block */
-#define hypre_BoxLoopSetOneBlock() hypre__num_blocks = 1
+#define hypre_StructCleanIndexD()\
+hypre_DeviceTFree(indices_d);\
+hypre_DeviceTFree(stencil_shape_d);
 
-/* Use this to get the block iteration inside a BoxLoop */
-#define hypre_BoxLoopBlock() hypre__block
+#define hypre_StructPreparePrint()\
+HYPRE_Int tot_size = num_values*hypre_BoxVolume(hypre_BoxArrayBox(data_space, hypre_BoxArraySize(box_array)-1));\
+data_host = hypre_CTAlloc(HYPRE_Complex, tot_size);\
+hypre_DataCopyFromData(data_host,data,HYPRE_Complex,tot_size);
 
-/*-----------------------------------*/
+#define hypre_StructPostPrint() hypre_TFree(data_host)
 
-#define hypre_BoxLoop0Begin(ndim, loop_size)\
-{\
-   hypre_BoxLoopDeclareN(loop_size);
-
-#define hypre_BoxLoop0For()\
-   hypre__BoxLoop0For(hypre__i, hypre__j, hypre__k)
-#define hypre__BoxLoop0For(i, j, k)\
-   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
-   {\
-   hypre_BoxLoopSet(i, j, k);\
-   for (k = 0; k < hypre__nz; k++)\
-   {\
-      for (j = 0; j < hypre__ny; j++)\
-      {\
-         for (i = 0; i < hypre__nx; i++)\
-         {
-
-#define hypre_BoxLoop0End()\
-         }\
-      }\
-   }\
-   }\
-}
-  
-/*-----------------------------------*/
+#else
 
-#define hypre_BoxLoop1Begin(ndim, loop_size,\
-                            dbox1, start1, stride1, i1)\
-{\
-   HYPRE_Int  hypre__i1start = hypre_BoxIndexRank(dbox1, start1);\
-   hypre_BoxLoopDeclareS(dbox1, stride1, hypre__sx1, hypre__sy1, hypre__sz1);\
-   hypre_BoxLoopDeclareN(loop_size);
-
-#define hypre_BoxLoop1For(i1)\
-   hypre__BoxLoop1For(hypre__i, hypre__j, hypre__k, i1)
-#define hypre__BoxLoop1For(i, j, k, i1)\
-   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
-   {\
-   hypre_BoxLoopSet(i, j, k);\
-   i1 = hypre__i1start + i*hypre__sx1 + j*hypre__sy1 + k*hypre__sz1;\
-   for (k = 0; k < hypre__nz; k++)\
-   {\
-      for (j = 0; j < hypre__ny; j++)\
-      {\
-         for (i = 0; i < hypre__nx; i++)\
-         {
+#define hypre_MatrixIndexMove(A, stencil_size, i, cdir,size)
+#define hypre_StructGetMatrixBoxData(A, i, si) hypre_StructMatrixBoxData(A,i,si)
+#define hypre_StructGetIndexD(index,i,index_d) hypre_IndexD(index,i)
+#define hypre_StructCleanIndexD() {;}
+#define hypre_StructPreparePrint() data_host = data;
+#define hypre_StructPostPrint() {;}
 
-#define hypre_BoxLoop1End(i1)\
-            i1 += hypre__sx1;\
-         }\
-         i1 += hypre__sy1 - hypre__nx*hypre__sx1;\
-      }\
-      i1 += hypre__sz1 - hypre__ny*hypre__sy1;\
-   }\
-   }\
-}
+#endif
   
-/*-----------------------------------*/
-
-#define hypre_BoxLoop2Begin(ndim,loop_size,\
-                            dbox1, start1, stride1, i1,\
-                            dbox2, start2, stride2, i2)\
+#define hypre_SerialBoxLoop0Begin(ndim, loop_size)\
 {\
-   HYPRE_Int  hypre__i1start = hypre_BoxIndexRank(dbox1, start1);\
-   HYPRE_Int  hypre__i2start = hypre_BoxIndexRank(dbox2, start2);\
-   hypre_BoxLoopDeclareS(dbox1, stride1, hypre__sx1, hypre__sy1, hypre__sz1);\
-   hypre_BoxLoopDeclareS(dbox2, stride2, hypre__sx2, hypre__sy2, hypre__sz2);\
-   hypre_BoxLoopDeclareN(loop_size);
-
-#define hypre_BoxLoop2For(i1, i2)\
-   hypre__BoxLoop2For(hypre__i, hypre__j, hypre__k, i1, i2)
-#define hypre__BoxLoop2For(i, j, k, i1, i2)\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   hypre_BoxLoopSetOneBlock();\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
-   hypre_BoxLoopSet(i, j, k);\
-   i1 = hypre__i1start + i*hypre__sx1 + j*hypre__sy1 + k*hypre__sz1;\
-   i2 = hypre__i2start + i*hypre__sx2 + j*hypre__sy2 + k*hypre__sz2;\
-   for (k = 0; k < hypre__nz; k++)\
-   {\
-      for (j = 0; j < hypre__ny; j++)\
+      zypre_BoxLoopSet();\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
       {\
-         for (i = 0; i < hypre__nx; i++)\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
          {
 
-#define hypre_BoxLoop2End(i1, i2)\
-            i1 += hypre__sx1;\
-            i2 += hypre__sx2;\
+#define hypre_SerialBoxLoop0End()\
          }\
-         i1 += hypre__sy1 - hypre__nx*hypre__sx1;\
-         i2 += hypre__sy2 - hypre__nx*hypre__sx2;\
+         zypre_BoxLoopInc1();\
+         zypre_BoxLoopInc2();\
       }\
-      i1 += hypre__sz1 - hypre__ny*hypre__sy1;\
-      i2 += hypre__sz2 - hypre__ny*hypre__sy2;\
-   }\
    }\
 }
 
-/*-----------------------------------*/
-
-#define hypre_BoxLoop3Begin(ndim, loop_size,\
-                            dbox1, start1, stride1, i1,\
-                            dbox2, start2, stride2, i2,\
-                            dbox3, start3, stride3, i3)\
+#define hypre_SerialBoxLoop1Begin(ndim, loop_size,\
+                                  dbox1, start1, stride1, i1)\
 {\
-   HYPRE_Int  hypre__i1start = hypre_BoxIndexRank(dbox1, start1);\
-   HYPRE_Int  hypre__i2start = hypre_BoxIndexRank(dbox2, start2);\
-   HYPRE_Int  hypre__i3start = hypre_BoxIndexRank(dbox3, start3);\
-   hypre_BoxLoopDeclareS(dbox1, stride1, hypre__sx1, hypre__sy1, hypre__sz1);\
-   hypre_BoxLoopDeclareS(dbox2, stride2, hypre__sx2, hypre__sy2, hypre__sz2);\
-   hypre_BoxLoopDeclareS(dbox3, stride3, hypre__sx3, hypre__sy3, hypre__sz3);\
-   hypre_BoxLoopDeclareN(loop_size);
-
-#define hypre_BoxLoop3For(i1, i2, i3)\
-   hypre__BoxLoop3For(hypre__i, hypre__j, hypre__k, i1, i2, i3)
-#define hypre__BoxLoop3For(i, j, k, i1, i2, i3)\
+   HYPRE_Int i1;\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\
+   zypre_BoxLoopSetOneBlock();\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
-   hypre_BoxLoopSet(i, j, k);\
-   i1 = hypre__i1start + i*hypre__sx1 + j*hypre__sy1 + k*hypre__sz1;\
-   i2 = hypre__i2start + i*hypre__sx2 + j*hypre__sy2 + k*hypre__sz2;\
-   i3 = hypre__i3start + i*hypre__sx3 + j*hypre__sy3 + k*hypre__sz3;\
-   for (k = 0; k < hypre__nz; k++)\
-   {\
-      for (j = 0; j < hypre__ny; j++)\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
       {\
-         for (i = 0; i < hypre__nx; i++)\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
          {
 
-#define hypre_BoxLoop3End(i1, i2, i3)\
-            i1 += hypre__sx1;\
-            i2 += hypre__sx2;\
-            i3 += hypre__sx3;\
+#define hypre_SerialBoxLoop1End(i1)\
+            i1 += hypre__i0inc1;\
          }\
-         i1 += hypre__sy1 - hypre__nx*hypre__sx1;\
-         i2 += hypre__sy2 - hypre__nx*hypre__sx2;\
-         i3 += hypre__sy3 - hypre__nx*hypre__sx3;\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         zypre_BoxLoopInc2();\
       }\
-      i1 += hypre__sz1 - hypre__ny*hypre__sy1;\
-      i2 += hypre__sz2 - hypre__ny*hypre__sy2;\
-      i3 += hypre__sz3 - hypre__ny*hypre__sy3;\
-   }\
    }\
 }
 
-/*-----------------------------------*/
-
-#define hypre_BoxLoop4Begin(ndim, loop_size,\
-                            dbox1, start1, stride1, i1,\
-                            dbox2, start2, stride2, i2,\
-                            dbox3, start3, stride3, i3,\
-                            dbox4, start4, stride4, i4)\
+#define hypre_SerialBoxLoop2Begin(ndim, loop_size,\
+                                  dbox1, start1, stride1, i1,\
+                                  dbox2, start2, stride2, i2)\
 {\
-   HYPRE_Int  hypre__i1start = hypre_BoxIndexRank(dbox1, start1);\
-   HYPRE_Int  hypre__i2start = hypre_BoxIndexRank(dbox2, start2);\
-   HYPRE_Int  hypre__i3start = hypre_BoxIndexRank(dbox3, start3);\
-   HYPRE_Int  hypre__i4start = hypre_BoxIndexRank(dbox4, start4);\
-   hypre_BoxLoopDeclareS(dbox1, stride1, hypre__sx1, hypre__sy1, hypre__sz1);\
-   hypre_BoxLoopDeclareS(dbox2, stride2, hypre__sx2, hypre__sy2, hypre__sz2);\
-   hypre_BoxLoopDeclareS(dbox3, stride3, hypre__sx3, hypre__sy3, hypre__sz3);\
-   hypre_BoxLoopDeclareS(dbox4, stride4, hypre__sx4, hypre__sy4, hypre__sz4);\
-   hypre_BoxLoopDeclareN(loop_size);
-
-#define hypre_BoxLoop4For(i1, i2, i3, i4)\
-   hypre__BoxLoop4For(hypre__i, hypre__j, hypre__k, i1, i2, i3, i4)
-#define hypre__BoxLoop4For(i, j, k, i1, i2, i3, i4)\
+   HYPRE_Int i1,i2;\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopDeclareK(2);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\
+   zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);\
+   zypre_BoxLoopSetOneBlock();\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
-   hypre_BoxLoopSet(i, j, k);\
-   i1 = hypre__i1start + i*hypre__sx1 + j*hypre__sy1 + k*hypre__sz1;\
-   i2 = hypre__i2start + i*hypre__sx2 + j*hypre__sy2 + k*hypre__sz2;\
-   i3 = hypre__i3start + i*hypre__sx3 + j*hypre__sy3 + k*hypre__sz3;\
-   i4 = hypre__i4start + i*hypre__sx4 + j*hypre__sy4 + k*hypre__sz4;\
-   for (k = 0; k < hypre__nz; k++)\
-   {\
-      for (j = 0; j < hypre__ny; j++)\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      zypre_BoxLoopSetK(2, i2);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
       {\
-         for (i = 0; i < hypre__nx; i++)\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
          {
 
-#define hypre_BoxLoop4End(i1, i2, i3, i4)\
-            i1 += hypre__sx1;\
-            i2 += hypre__sx2;\
-            i3 += hypre__sx3;\
-            i4 += hypre__sx4;\
+#define hypre_SerialBoxLoop2End(i1, i2)\
+            i1 += hypre__i0inc1;\
+            i2 += hypre__i0inc2;\
          }\
-         i1 += hypre__sy1 - hypre__nx*hypre__sx1;\
-         i2 += hypre__sy2 - hypre__nx*hypre__sx2;\
-         i3 += hypre__sy3 - hypre__nx*hypre__sx3;\
-         i4 += hypre__sy4 - hypre__nx*hypre__sx4;\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         i2 += hypre__ikinc2[hypre__d];\
+         zypre_BoxLoopInc2();\
       }\
-      i1 += hypre__sz1 - hypre__ny*hypre__sy1;\
-      i2 += hypre__sz2 - hypre__ny*hypre__sy2;\
-      i3 += hypre__sz3 - hypre__ny*hypre__sy3;\
-      i4 += hypre__sz4 - hypre__ny*hypre__sy4;\
-   }\
    }\
 }
 
-/*-----------------------------------*/
-
+#if defined (HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS)
+#define HYPRE_BOX_PRIVATE hypre__global_error
 #else
-
-#define HYPRE_BOX_PRIVATE        ZYPRE_BOX_PRIVATE
-
-#define hypre_BoxLoopGetIndex    zypre_BoxLoopGetIndex
-#define hypre_BoxLoopSetOneBlock zypre_BoxLoopSetOneBlock
-#define hypre_BoxLoopBlock       zypre_BoxLoopBlock
-#define hypre_BoxLoop0Begin      zypre_BoxLoop0Begin
-#define hypre_BoxLoop0For        zypre_BoxLoop0For
-#define hypre_BoxLoop0End        zypre_BoxLoop0End
-#define hypre_BoxLoop1Begin      zypre_BoxLoop1Begin
-#define hypre_BoxLoop1For        zypre_BoxLoop1For
-#define hypre_BoxLoop1End        zypre_BoxLoop1End
-#define hypre_BoxLoop2Begin      zypre_BoxLoop2Begin
-#define hypre_BoxLoop2For        zypre_BoxLoop2For
-#define hypre_BoxLoop2End        zypre_BoxLoop2End
-#define hypre_BoxLoop3Begin      zypre_BoxLoop3Begin
-#define hypre_BoxLoop3For        zypre_BoxLoop3For
-#define hypre_BoxLoop3End        zypre_BoxLoop3End
-#define hypre_BoxLoop4Begin      zypre_BoxLoop4Begin
-#define hypre_BoxLoop4For        zypre_BoxLoop4For
-#define hypre_BoxLoop4End        zypre_BoxLoop4End
-
-#endif /* end if 1 */
-
+#define HYPRE_BOX_PRIVATE ZYPRE_BOX_PRIVATE
 #endif
-
-/******************************************************************************
- *
- * NEW BoxLoop STUFF
- *
- *****************************************************************************/
-
-#ifndef hypre_ZBOX_HEADER
-#define hypre_ZBOX_HEADER
-
 #define ZYPRE_BOX_PRIVATE hypre__IN,hypre__JN,hypre__I,hypre__J,hypre__d,hypre__i
 
-/*--------------------------------------------------------------------------
- * BoxLoop macros:
- *--------------------------------------------------------------------------*/
-
 #define zypre_BoxLoopDeclare() \
 HYPRE_Int  hypre__tot, hypre__div, hypre__mod;\
 HYPRE_Int  hypre__block, hypre__num_blocks;\
@@ -629,6 +455,7 @@
 #define zypre_BoxLoop1Begin(ndim, loop_size,\
                             dbox1, start1, stride1, i1)\
 {\
+   HYPRE_Int i1;\
    zypre_BoxLoopDeclare();\
    zypre_BoxLoopDeclareK(1);\
    zypre_BoxLoopInit(ndim, loop_size);\
@@ -637,6 +464,7 @@
 #define zypre_BoxLoop1For(i1)\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
+      HYPRE_Int i1;\
       zypre_BoxLoopSet();\
       zypre_BoxLoopSetK(1, i1);\
       for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
@@ -660,6 +488,7 @@
                             dbox1, start1, stride1, i1,\
                             dbox2, start2, stride2, i2)\
 {\
+   HYPRE_Int i1,i2;\
    zypre_BoxLoopDeclare();\
    zypre_BoxLoopDeclareK(1);\
    zypre_BoxLoopDeclareK(2);\
@@ -670,6 +499,7 @@
 #define zypre_BoxLoop2For(i1, i2)\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
+      HYPRE_Int i1,i2;\
       zypre_BoxLoopSet();\
       zypre_BoxLoopSetK(1, i1);\
       zypre_BoxLoopSetK(2, i2);\
@@ -697,6 +527,7 @@
                             dbox2, start2, stride2, i2,\
                             dbox3, start3, stride3, i3)\
 {\
+   HYPRE_Int i1,i2,i3;\
    zypre_BoxLoopDeclare();\
    zypre_BoxLoopDeclareK(1);\
    zypre_BoxLoopDeclareK(2);\
@@ -709,6 +540,7 @@
 #define zypre_BoxLoop3For(i1, i2, i3)\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
+      HYPRE_Int i1,i2,i3;\
       zypre_BoxLoopSet();\
       zypre_BoxLoopSetK(1, i1);\
       zypre_BoxLoopSetK(2, i2);\
@@ -740,6 +572,7 @@
                             dbox3, start3, stride3, i3,\
                             dbox4, start4, stride4, i4)\
 {\
+   HYPRE_Int i1,i2,i3,i4;\
    zypre_BoxLoopDeclare();\
    zypre_BoxLoopDeclareK(1);\
    zypre_BoxLoopDeclareK(2);\
@@ -754,6 +587,7 @@
 #define zypre_BoxLoop4For(i1, i2, i3, i4)\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
+      HYPRE_Int i1,i2,i3,i4;\
       zypre_BoxLoopSet();\
       zypre_BoxLoopSetK(1, i1);\
       zypre_BoxLoopSetK(2, i2);\
@@ -782,6 +616,32 @@
 
 /*-----------------------------------*/
 
+#define zypre_BasicBoxLoopInitK(k, stridek) \
+hypre__sk##k[0] = stridek[0];\
+hypre__ikinc##k[0] = 0;\
+for (hypre__d = 1; hypre__d < hypre__ndim; hypre__d++)\
+{\
+   hypre__sk##k[hypre__d] = stridek[hypre__d];\
+   hypre__ikinc##k[hypre__d] = hypre__ikinc##k[hypre__d-1] +\
+      hypre__sk##k[hypre__d] - hypre__n[hypre__d-1]*hypre__sk##k[hypre__d-1];\
+}\
+hypre__i0inc##k = hypre__sk##k[0];\
+hypre__ikinc##k[hypre__ndim] = 0;\
+hypre__ikstart##k = 0
+
+#define zypre_BasicBoxLoop2Begin(ndim, loop_size,\
+                                 stride1, i1,\
+                                 stride2, i2)\
+{\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopDeclareK(2);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BasicBoxLoopInitK(1, stride1);\
+   zypre_BasicBoxLoopInitK(2, stride2);
+
+/*-----------------------------------*/
+
 #endif
 
 
diff -Nru hypre-2.11.2/src/struct_mv/boxloop_cuda.h hypre-2.13.0/src/struct_mv/boxloop_cuda.h
--- hypre-2.11.2/src/struct_mv/boxloop_cuda.h	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/struct_mv/boxloop_cuda.h	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,717 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+/******************************************************************************
+ *
+ * Header info for the BoxLoop
+ *
+ *****************************************************************************/
+
+/*--------------------------------------------------------------------------
+ * BoxLoop macros:
+ *--------------------------------------------------------------------------*/
+
+#ifndef HYPRE_NEWBOXLOOP_HEADER
+#define HYPRE_NEWBOXLOOP_HEADER
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+struct cuda_traversal {HYPRE_Int cuda;};
+struct omp_traversal  {HYPRE_Int omp;};
+#define hypre_exec_policy cuda_traversal()
+#define HYPER_LAMBDA [=] __device__
+
+typedef struct hypre_Boxloop_struct
+{
+	HYPRE_Int lsize0,lsize1,lsize2;
+	HYPRE_Int strides0,strides1,strides2;
+	HYPRE_Int bstart0,bstart1,bstart2;
+	HYPRE_Int bsize0,bsize1,bsize2;
+} hypre_Boxloop;
+
+#define AxCheckError(err) CheckError(err, __FUNCTION__, __LINE__)
+inline void CheckError(cudaError_t const err, char const* const fun, const HYPRE_Int line)
+{
+    if (err)
+    {
+        printf("CUDA Error Code[%d]: %s\n%s() Line:%d\n", err, cudaGetErrorString(err), fun, line);
+    }
+}
+#define BLOCKSIZE 128
+
+#define hypre_fence() \
+  cudaError err = cudaGetLastError();		\
+if ( cudaSuccess != err )\
+{\
+   printf("\n ERROR hypre_newBoxLoop: %s in %s(%d) function %s\n",cudaGetErrorString(err),__FILE__,__LINE__,__FUNCTION__);\
+}									\
+AxCheckError(cudaDeviceSynchronize());
+
+extern "C++" {
+template <typename LOOP_BODY>
+__global__ void forall_kernel(LOOP_BODY loop_body, HYPRE_Int length)
+{
+	HYPRE_Int idx = blockDim.x * blockIdx.x + threadIdx.x;
+	if (idx < length)
+		loop_body(idx);
+}
+
+template<typename LOOP_BODY>
+void BoxLoopforall (cuda_traversal, HYPRE_Int length, LOOP_BODY loop_body)
+{	
+	size_t const blockSize = 128;
+	size_t gridSize  = (length + blockSize - 1) / blockSize;
+	if (gridSize == 0) gridSize = 1;
+	
+	//hypre_printf("length= %d, blocksize = %d, gridsize = %d\n",length,blockSize,gridSize);
+	forall_kernel<<<gridSize, blockSize>>>(loop_body,length);
+}
+
+template<typename LOOP_BODY>
+void BoxLoopforall (omp_traversal, HYPRE_Int length, LOOP_BODY loop_body)
+{
+
+#pragma omp parallel for schedule(static)
+	for (HYPRE_Int idx = 0;idx < length;idx++)
+		loop_body(idx);
+}
+
+#define zypre_BoxLoopIncK(k,box,i)					\
+{       								\
+HYPRE_Int idx = idx_local;						\
+local_idx  = idx % box.lsize0;					\
+idx        = idx / box.lsize0;					\
+i += (local_idx*box.strides0 + box.bstart0) * hypre_boxD##k;		\
+hypre_boxD##k *= hypre_max(0, box.bsize0 + 1);			\
+local_idx  = idx % box.lsize1;					\
+idx        = idx / box.lsize1;					\
+i += (local_idx*box.strides1 + box.bstart1) * hypre_boxD##k;		\
+hypre_boxD##k *= hypre_max(0, box.bsize1 + 1);			\
+local_idx  = idx % box.lsize2;					\
+idx  = idx / box.lsize2;					\
+i += (local_idx*box.strides2 + box.bstart2) * hypre_boxD##k;		\
+hypre_boxD##k *= hypre_max(0, box.bsize2 + 1);			\
+}
+
+
+template<class T>
+__global__ void reduction_mult (T * a, T * b, HYPRE_Int hypre__tot,
+				hypre_Boxloop box1)
+{
+    HYPRE_Int id = (blockIdx.x * blockDim.x) + threadIdx.x;
+    HYPRE_Int local_idx;
+    HYPRE_Int idx_local = id;
+    HYPRE_Int hypre_boxD1 = 1;
+    HYPRE_Int i1 = 0;
+    //// reducted output
+    __shared__ T shared_cache [BLOCKSIZE];
+    T sum = 1;
+    local_idx  = idx_local % box1.lsize0;
+    idx_local  = idx_local / box1.lsize0;
+    i1 += (local_idx*box1.strides0 + box1.bstart0) * hypre_boxD1;
+    hypre_boxD1 *= hypre_max(0, box1.bsize0 + 1);
+    local_idx  = idx_local % box1.lsize1;
+    idx_local  = idx_local / box1.lsize1;
+    i1 += (local_idx*box1.strides1 + box1.bstart1) * hypre_boxD1;
+    hypre_boxD1 *= hypre_max(0, box1.bsize1 + 1);	
+    local_idx  = idx_local % box1.lsize2;	      
+    idx_local  = idx_local / box1.lsize2;		      
+    i1 += (local_idx*box1.strides2 + box1.bstart2) * hypre_boxD1;
+    hypre_boxD1 *= hypre_max(0, box1.bsize2 + 1);	
+    if (id < hypre__tot)
+      sum = a[i1];
+    *(shared_cache + threadIdx.x) = sum;
+    
+    __syncthreads();
+    
+    ///////// sum of internal cache
+    
+    HYPRE_Int i;    
+    
+    for (i=(BLOCKSIZE /2); i>0 ; i= i/2){
+      if (threadIdx.x < i){
+	*(shared_cache + threadIdx.x) *= *(shared_cache + threadIdx.x + i);
+      }
+      __syncthreads();
+    }
+    
+    if ( threadIdx.x == 0){
+      *(b+ blockIdx.x) = shared_cache[0];
+    }
+}
+}
+
+#define hypre_BoxLoopInit(ndim,loop_size)					\
+	HYPRE_Int hypre__tot = 1;											\
+	for (HYPRE_Int i = 0;i < ndim;i ++)									\
+		hypre__tot *= loop_size[i];
+
+
+#define hypre_newBoxLoopDeclare()\
+	HYPRE_Int hypre__i,hypre__j,hypre__k;\
+	HYPRE_Int idx_local = idx;
+
+#define hypre_newBoxLoop0Begin(ndim, loop_size)				\
+{									\
+    hypre_BoxLoopInit(ndim,loop_size);						\
+    BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+    {
+
+#define hypre_newBoxLoop0End()					\
+    });									\
+    hypre_fence();							\
+}
+
+#define hypre_BoxLoopDataDeclareK(k,ndim,loop_size,dbox,start,stride)	\
+	hypre_Boxloop databox##k;											\
+	databox##k.lsize0 = loop_size[0];				\
+	databox##k.strides0 = stride[0];				\
+	databox##k.bstart0  = start[0] - dbox->imin[0];			\
+	databox##k.bsize0   = dbox->imax[0]-dbox->imin[0];		\
+	if (ndim > 1)							\
+	{								\
+	    databox##k.lsize1 = loop_size[1];				\
+	    databox##k.strides1 = stride[1];				\
+	    databox##k.bstart1  = start[1] - dbox->imin[1];		\
+	    databox##k.bsize1   = dbox->imax[1]-dbox->imin[1];	\
+	}								\
+	else						        	\
+	{							       	\
+		databox##k.lsize1 = 1;				       	\
+		databox##k.strides1 = 0;		       		\
+		databox##k.bstart1  = 0;	       			\
+		databox##k.bsize1   = 0;		       		\
+	}								\
+	if (ndim == 3)							\
+	{							      	\
+	      databox##k.lsize2 = loop_size[2];				\
+	      databox##k.strides2 = stride[2];				\
+	      databox##k.bstart2  = start[2] - dbox->imin[2];		\
+	      databox##k.bsize2   = dbox->imax[2]-dbox->imin[2];	\
+	}				                        	\
+	else						        	\
+	{							       	\
+		databox##k.lsize2 = 1;				       	\
+		databox##k.strides2 = 0;		       		\
+		databox##k.bstart2  = 0;	       			\
+		databox##k.bsize2   = 0;		       		\
+	}
+
+#define hypre_newBoxLoop1Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1)		\
+{									\
+    hypre_BoxLoopInit(ndim,loop_size);				\
+    hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+    BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+    {									\
+      hypre_newBoxLoopDeclare();					\
+      HYPRE_Int hypre_boxD1 = 1;					\
+      HYPRE_Int i1 = 0;							\
+      hypre__i  = idx_local % databox1.lsize0;				\
+      idx_local = idx_local / databox1.lsize0;				\
+      i1 += (hypre__i*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+      hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);			\
+      hypre__j  = idx_local % databox1.lsize1;				\
+      idx_local = idx_local / databox1.lsize1;				\
+      i1 += (hypre__j*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+      hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);			\
+      hypre__k  = idx_local % databox1.lsize2;				\
+      idx_local = idx_local / databox1.lsize2;				\
+      i1 += (hypre__k*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+      hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);
+      
+#define hypre_newBoxLoop1End(i1)				\
+    });									\
+    hypre_fence();							\
+}
+	
+#define hypre_newBoxLoop2Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1,		\
+			       dbox2, start2, stride2, i2)		\
+{									\
+    hypre_BoxLoopInit(ndim,loop_size);						\
+    hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+    hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+    BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+    {									\
+        hypre_newBoxLoopDeclare()					\
+        HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1;			\
+	HYPRE_Int i1 = 0, i2 = 0;					\
+	hypre__i  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += (hypre__i*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	i2 += (hypre__i*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	hypre__j  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += (hypre__j*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	i2 += (hypre__j*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	hypre__k  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += (hypre__k*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	i2 += (hypre__k*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+
+#define hypre_newBoxLoop2End(i1, i2)			\
+    });							\
+    hypre_fence();					\
+}
+
+#define hypre_newBoxLoop3Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1,		\
+			       dbox2, start2, stride2, i2,		\
+			       dbox3, start3, stride3, i3)		\
+{									\
+    hypre_BoxLoopInit(ndim,loop_size);						\
+    hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+    hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+    hypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3);	\
+    BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+    {									\
+	hypre_newBoxLoopDeclare();					\
+	HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1; \
+	HYPRE_Int i1 = 0, i2 = 0, i3 = 0;				\
+	hypre__i  = idx_local % databox1.lsize0;				\
+	idx_local  = idx_local / databox1.lsize0;				\
+	i1 += (hypre__i*databox1.strides0 + databox1.bstart0) * hypre_boxD1;	\
+	hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);			\
+	i2 += (hypre__i*databox2.strides0 + databox2.bstart0) * hypre_boxD2;	\
+	hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);			\
+	i3 += (hypre__i*databox3.strides0 + databox3.bstart0) * hypre_boxD3;	\
+	hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1);			\
+	hypre__j   = idx_local % databox1.lsize1;				\
+	idx_local  = idx_local / databox1.lsize1;				\
+	i1 += (hypre__j*databox1.strides1 + databox1.bstart1) * hypre_boxD1;	\
+	hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);			\
+	i2 += (hypre__j*databox2.strides1 + databox2.bstart1) * hypre_boxD2;	\
+	hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);			\
+	i3 += (hypre__j*databox3.strides1 + databox3.bstart1) * hypre_boxD3;	\
+	hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1);			\
+	hypre__k  = idx_local % databox1.lsize2;				\
+	idx_local  = idx_local / databox1.lsize2;				\
+	i1 += (hypre__k*databox1.strides2 + databox1.bstart2) * hypre_boxD1;	\
+	hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);			\
+	i2 += (hypre__k*databox2.strides2 + databox2.bstart2) * hypre_boxD2;	\
+	hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);			\
+	i3 += (hypre__k*databox3.strides2 +databox3.bstart2) * hypre_boxD3;	\
+	hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1);			\
+	
+
+#define hypre_newBoxLoop3End(i1, i2,i3)			\
+    });									\
+    hypre_fence();							\
+}
+
+#define hypre_newBoxLoop4Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1,		\
+			       dbox2, start2, stride2, i2,		\
+			       dbox3, start3, stride3, i3,		\
+			       dbox4, start4, stride4, i4)		\
+{								       \
+     hypre_BoxLoopInit(ndim,loop_size);			       \
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \
+     hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \
+     hypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3); \
+     hypre_BoxLoopDataDeclareK(4,ndim,loop_size,dbox4,start4,stride4); \
+     BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+     {									\
+        hypre_newBoxLoopDeclare();					\
+	HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1,hypre_boxD4 = 1; \
+	HYPRE_Int i1 = 0, i2 = 0, i3 = 0,i4 = 0;			\
+	hypre__i  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += (hypre__i*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	i2 += (hypre__i*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	i3 += (hypre__i*databox3.strides0 + databox3.bstart0) * hypre_boxD3; \
+	hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1);		\
+	i4 += (hypre__i*databox4.strides0 + databox4.bstart0) * hypre_boxD4; \
+	hypre_boxD4 *= hypre_max(0, databox4.bsize0 + 1);		\
+	hypre__j  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += (hypre__j*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	i2 += (hypre__j*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	i3 += (hypre__j*databox3.strides1 + databox3.bstart1) * hypre_boxD3; \
+	hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1);		\
+	i4 += (hypre__j*databox4.strides1 + databox4.bstart1) * hypre_boxD4; \
+	hypre_boxD4 *= hypre_max(0, databox4.bsize1 + 1);		\
+	hypre__k  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += (hypre__k*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	i2 += (hypre__k*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+	i3 += (hypre__k*databox3.strides2 + databox3.bstart2) * hypre_boxD3; \
+	hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1);		\
+	i4 += (hypre__k*databox4.strides2 + databox4.bstart2) * hypre_boxD4; \
+	hypre_boxD4 *= hypre_max(0, databox4.bsize2 + 1);		\
+		
+#define hypre_newBoxLoop4End(i1, i2, i3, i4)	\
+    });						\
+    hypre_fence();				\
+}
+
+#define MAX_BLOCK 512
+
+extern "C++" {
+template<class T>
+__inline__ __device__
+HYPRE_Int fake_shfl_down(T val, HYPRE_Int offset, HYPRE_Int width=32) {
+  static __shared__ T shared[MAX_BLOCK];
+  HYPRE_Int lane=threadIdx.x%32;
+
+  shared[threadIdx.x]=val;
+  __syncthreads();
+
+  val = (lane+offset<width) ? shared[threadIdx.x+offset] : 0;
+  __syncthreads();
+
+  return val;
+}
+
+template<class T>  
+__inline__ __device__
+HYPRE_Real warpReduceSum (T val) {
+  for (HYPRE_Int offset = warpSize/2; offset > 0; offset /= 2)
+    val += __shfl_down(val,offset);
+  return val;
+}
+
+
+template<class T> 
+__inline__ __device__
+HYPRE_Real blockReduceSum(T val) {
+  static __shared__ T shared[32];
+  HYPRE_Int lane=threadIdx.x%warpSize;
+  HYPRE_Int wid=threadIdx.x/warpSize;
+  val=warpReduceSum<T>(val);
+
+  //write reduced value to shared memory
+  if(lane==0) shared[wid]=val;
+  __syncthreads();
+
+  //ensure we only grab a value from shared memory if that warp existed
+  val = (threadIdx.x<blockDim.x/warpSize) ? shared[lane] : HYPRE_Int(0);
+  if(wid==0) val=warpReduceSum<T>(val);
+
+  return val;
+}
+
+template<class T>
+__global__ void hypre_device_reduce_stable_kernel(T*a, T*b, T* out, HYPRE_Int N,
+						  hypre_Boxloop box1,hypre_Boxloop box2) {
+  HYPRE_Int local_idx;
+  HYPRE_Int idx_local;
+  HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1;
+  HYPRE_Int i1 = 0, i2 = 0;
+  T sum=T(0);
+  HYPRE_Int i;
+  
+  for(i=blockIdx.x*blockDim.x+threadIdx.x;i<N;i+=blockDim.x*gridDim.x)
+  {
+    idx_local = i;
+    local_idx  = idx_local % box1.lsize0;
+    idx_local  = idx_local / box1.lsize0;
+    i1 += (local_idx*box1.strides0 + box1.bstart0) * hypre_boxD1;
+    hypre_boxD1 *= hypre_max(0, box1.bsize0 + 1);
+    i2 += (local_idx*box2.strides0 + box2.bstart0) * hypre_boxD2;
+    hypre_boxD2 *= hypre_max(0, box2.bsize0 + 1);
+    local_idx  = idx_local % box1.lsize1;
+    idx_local  = idx_local / box1.lsize1;
+    i1 += (local_idx*box1.strides1 + box1.bstart1) * hypre_boxD1;
+    hypre_boxD1 *= hypre_max(0, box1.bsize1 + 1);
+    i2 += (local_idx*box2.strides1 + box2.bstart1) * hypre_boxD2;   
+    hypre_boxD2 *= hypre_max(0, box2.bsize1 + 1);	
+    local_idx  = idx_local % box1.lsize2;	      
+    idx_local  = idx_local / box1.lsize2;		      
+    i1 += (local_idx*box1.strides2 + box1.bstart2) * hypre_boxD1;
+    hypre_boxD1 *= hypre_max(0, box1.bsize2 + 1);	
+    i2 += (local_idx*box2.strides2 + box2.bstart2) * hypre_boxD2;
+    hypre_boxD2 *= hypre_max(0, box2.bsize2 + 1);
+    sum += a[i1] * hypre_conj(b[i2]);
+  }
+  sum=blockReduceSum<T>(sum);
+  if(threadIdx.x==0)
+    out[blockIdx.x]=sum;
+}
+
+template<class T>       
+__global__ void hypre_device_reduce_stable_kernel2(T *in, T* out, HYPRE_Int N) {
+  T sum=T(0);
+  for(HYPRE_Int i=blockIdx.x*blockDim.x+threadIdx.x;i<N;i+=blockDim.x*gridDim.x) {
+    sum+=in[i];
+  }
+  sum=blockReduceSum<T>(sum);
+  if(threadIdx.x==0)
+    out[blockIdx.x]=sum;
+}
+
+template<class T>   
+void hypre_device_reduce_stable(T*a,T*b, T* out, HYPRE_Int N,
+				hypre_Boxloop box1,hypre_Boxloop box2) {
+  HYPRE_Int threads=512;
+  HYPRE_Int blocks=min((N+threads-1)/threads,1024);
+
+  hypre_device_reduce_stable_kernel<T><<<blocks,threads>>>(a,b,out,N,box1,box2);
+  hypre_device_reduce_stable_kernel2<T><<<1,1024>>>(out,out,blocks); 
+}
+
+}
+
+extern "C++" {
+template <typename LOOP_BODY>
+__global__ void hypre_device_reduction_kernel(HYPRE_Real* out,
+					      HYPRE_Int N,hypre_Boxloop box1,hypre_Boxloop box2,
+					      LOOP_BODY loop_body)
+{
+    HYPRE_Int local_idx;
+    HYPRE_Int idx_local;
+    HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1;
+    HYPRE_Int i1 = 0, i2 = 0;
+    HYPRE_Real sum = HYPRE_Real(0);
+    HYPRE_Int i;
+    
+    for(i=blockIdx.x*blockDim.x+threadIdx.x;i<N;i+=blockDim.x*gridDim.x)
+      {
+	idx_local = i;
+	local_idx  = idx_local % box1.lsize0;
+	idx_local  = idx_local / box1.lsize0;
+	i1 += (local_idx*box1.strides0 + box1.bstart0) * hypre_boxD1;
+	hypre_boxD1 *= hypre_max(0, box1.bsize0 + 1);
+	i2 += (local_idx*box2.strides0 + box2.bstart0) * hypre_boxD2;
+	hypre_boxD2 *= hypre_max(0, box2.bsize0 + 1);
+	local_idx  = idx_local % box1.lsize1;
+	idx_local  = idx_local / box1.lsize1;
+	i1 += (local_idx*box1.strides1 + box1.bstart1) * hypre_boxD1;
+	hypre_boxD1 *= hypre_max(0, box1.bsize1 + 1);
+	i2 += (local_idx*box2.strides1 + box2.bstart1) * hypre_boxD2;   
+	hypre_boxD2 *= hypre_max(0, box2.bsize1 + 1);	
+	local_idx  = idx_local % box1.lsize2;	      
+	idx_local  = idx_local / box1.lsize2;		      
+	i1 += (local_idx*box1.strides2 + box1.bstart2) * hypre_boxD1;
+	hypre_boxD1 *= hypre_max(0, box1.bsize2 + 1);	
+	i2 += (local_idx*box2.strides2 + box2.bstart2) * hypre_boxD2;
+	hypre_boxD2 *= hypre_max(0, box2.bsize2 + 1);
+	sum = loop_body(i1,i2,sum);
+      }
+    sum=blockReduceSum<HYPRE_Real>(sum);
+    if(threadIdx.x==0)
+      out[blockIdx.x]=sum;
+}
+
+template<typename LOOP_BODY>
+void hypre_device_reduction (HYPRE_Real* out,
+			     HYPRE_Int N,hypre_Boxloop box1,hypre_Boxloop box2,
+			     LOOP_BODY loop_body)
+{	
+  HYPRE_Int threads=512;
+  HYPRE_Int blocks=min((N+threads-1)/threads,1024);
+
+  hypre_device_reduction_kernel<<<blocks,threads>>>(out,N,box1,box2,loop_body);
+  hypre_device_reduce_stable_kernel2<HYPRE_Real><<<1,1024>>>(out,out,blocks);
+
+}
+}
+
+#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size,		\
+					dbox1, start1, stride1, i1, sum) \
+{    									   \
+   HYPRE_Real sum_old = sum;						\
+   sum = 0.0;								\
+   hypre_BoxLoopInit(ndim,loop_size);					\
+   hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+   HYPRE_Real *d_c;							\
+   cudaMalloc((void**) &d_c, 1024 * sizeof(HYPRE_Real));		\
+   hypre_device_reduction(d_c,hypre__tot,databox1,databox1,HYPER_LAMBDA(HYPRE_Int i1, HYPRE_Int i2, HYPRE_Real sum) \
+   {
+
+#define hypre_newBoxLoop1ReductionEnd(i1, sum)			\
+       return sum;								\
+   });									\
+  cudaMemcpy(&sum,d_c,sizeof(HYPRE_Real),cudaMemcpyDeviceToHost);	\
+  sum += sum_old;							\
+  cudaFree(d_c);							\
+}
+
+#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size,		\
+					dbox1, start1, stride1, i1,	\
+					dbox2, start2, stride2, i2,sum) \
+{    									   \
+   HYPRE_Real sum_old = sum;						\
+   sum = 0.0;								\
+   hypre_BoxLoopInit(ndim,loop_size);					\
+   hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+   hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+   HYPRE_Real *d_c;							\
+   cudaMalloc((void**) &d_c, 1024 * sizeof(HYPRE_Real));		\
+   hypre_device_reduction(d_c,hypre__tot,databox1,databox2,HYPER_LAMBDA(HYPRE_Int i1, HYPRE_Int i2, HYPRE_Real sum) \
+   {
+
+#define hypre_newBoxLoop2ReductionEnd(i1, i2, sum)			\
+      return sum;								\
+   });									\
+  cudaMemcpy(&sum,d_c,sizeof(HYPRE_Real),cudaMemcpyDeviceToHost);	\
+  sum += sum_old;							\
+  cudaFree(d_c);							\
+}
+
+
+
+#define hypre_newBoxLoop1ReductionMult(ndim, loop_size,			  \
+				       dbox1, start1, stride1, i1,xp,sum) \
+{    									  \
+   HYPRE_Real sum_old = sum;\
+   sum = 1.0;\
+   hypre_BoxLoopInit(ndim,loop_size);				  \
+   hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	  \
+   HYPRE_Int n_blocks = (hypre__tot+BLOCKSIZE-1)/BLOCKSIZE;		  \
+   HYPRE_Real *d_b;				        		  \
+   HYPRE_Real * b = new HYPRE_Real[n_blocks];				  \
+   cudaMalloc((void**) &d_b, n_blocks * sizeof(HYPRE_Real));			\
+   reduction_mult<HYPRE_Real><<< n_blocks ,BLOCKSIZE>>>(xp,d_b,hypre__tot,databox1);		\
+   hypre_fence();							\
+   for (HYPRE_Int j = 0 ; j< n_blocks ; ++j){								\
+     sum *= b[j];							\
+   }	  		\
+   delete [] b;		\
+   sum *=sum_old;\
+}
+
+#define hypre_LoopBegin(size,idx)					\
+{    														\
+	BoxLoopforall(hypre_exec_policy,size,HYPER_LAMBDA (HYPRE_Int idx) \
+	{
+
+#define hypre_LoopEnd()					\
+	});											\
+        hypre_fence();\
+}
+
+#define hypre_BoxBoundaryCopyBegin(ndim, loop_size, stride1, i1, idx) 	\
+{    														\
+    HYPRE_Int hypre__tot = 1;											\
+    hypre_Boxloop databox1;						\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];					\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    for (HYPRE_Int d = 0;d < ndim;d ++)					\
+    {									\
+	hypre__tot *= loop_size[d];					\
+    }									\
+    BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+    {									\
+	    hypre_newBoxLoopDeclare()											\
+	    HYPRE_Int i1 = 0;											\
+	    hypre__i  = idx_local % databox1.lsize0;			\
+	    idx_local  = idx_local / databox1.lsize0;			\
+	    i1 += hypre__i*databox1.strides0;				\
+	    hypre__j  = idx_local % databox1.lsize1;			\
+	    idx_local  = idx_local / databox1.lsize1;			\
+	    i1 += hypre__j*databox1.strides1;				\
+	    hypre__k  = idx_local % databox1.lsize2;			\
+	    idx_local  = idx_local / databox1.lsize2;			\
+	    i1 += hypre__k*databox1.strides2;				\
+		
+#define hypre_BoxBoundaryCopyEnd()				\
+    });									\
+    hypre_fence();							\
+}
+
+#define hypre_BoxDataExchangeBegin(ndim, loop_size,				\
+                                   stride1, i1,	\
+                                   stride2, i2)	\
+{    														\
+    HYPRE_Int hypre__tot = 1;											\
+    hypre_Boxloop databox1,databox2;					\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];									\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    databox2.lsize0 = loop_size[0];					\
+    databox2.lsize1 = loop_size[1];									\
+    databox2.lsize2 = loop_size[2];					\
+    databox2.strides0 = stride2[0];					\
+    databox2.strides1 = stride2[1];					\
+    databox2.strides2 = stride2[2];					\
+    for (HYPRE_Int d = 0;d < ndim;d ++)					\
+      {									\
+	hypre__tot *= loop_size[d];					\
+      }									\
+    BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+    {									\
+        hypre_newBoxLoopDeclare()					\
+	HYPRE_Int i1 = 0, i2 = 0;					\
+	hypre__i  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += hypre__i*databox1.strides0;				\
+	i2 += hypre__i*databox2.strides0;				\
+	hypre__j  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += hypre__j*databox1.strides1;				\
+	i2 += hypre__j*databox2.strides1;				\
+	hypre__k  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += hypre__k*databox1.strides2;				\
+	i2 += hypre__k*databox2.strides2;
+
+
+#define hypre_BoxDataExchangeEnd()				\
+     });								\
+     hypre_fence();							\
+}
+  
+#define hypre_newBoxLoop0For()
+
+#define hypre_newBoxLoop1For(i1)
+
+#define hypre_newBoxLoop2For(i1, i2) 
+ 
+#define hypre_newBoxLoop3For(i1, i2, i3)
+
+#define hypre_newBoxLoop4For(i1, i2, i3, i4)
+
+#define hypre_newBoxLoopGetIndex(index)					\
+  index[0] = hypre__i; index[1] = hypre__j; index[2] = hypre__k
+  
+#define hypre_BoxLoopGetIndex    zypre_BoxLoopGetIndex  
+#define hypre_BoxLoopSetOneBlock() ; 
+#define hypre_BoxLoopBlock()       0
+
+#define hypre_BoxLoop0Begin      hypre_newBoxLoop0Begin
+#define hypre_BoxLoop0For        hypre_newBoxLoop0For
+#define hypre_BoxLoop0End        hypre_newBoxLoop0End
+#define hypre_BoxLoop1Begin      hypre_newBoxLoop1Begin
+#define hypre_BoxLoop1For        hypre_newBoxLoop1For
+#define hypre_BoxLoop1End        hypre_newBoxLoop1End
+#define hypre_BoxLoop2Begin      hypre_newBoxLoop2Begin
+#define hypre_BoxLoop2For        hypre_newBoxLoop2For
+#define hypre_BoxLoop2End        hypre_newBoxLoop2End
+#define hypre_BoxLoop3Begin      hypre_newBoxLoop3Begin
+#define hypre_BoxLoop3For        hypre_newBoxLoop3For
+#define hypre_BoxLoop3End        hypre_newBoxLoop3End
+#define hypre_BoxLoop4Begin      hypre_newBoxLoop4Begin
+#define hypre_BoxLoop4For        hypre_newBoxLoop4For
+#define hypre_BoxLoop4End        hypre_newBoxLoop4End
+#endif
diff -Nru hypre-2.11.2/src/struct_mv/boxloop.h hypre-2.13.0/src/struct_mv/boxloop.h
--- hypre-2.11.2/src/struct_mv/boxloop.h	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/struct_mv/boxloop.h	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,388 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+/******************************************************************************
+ *
+ * Header info for the BoxLoop
+ *
+ *****************************************************************************/
+
+/*--------------------------------------------------------------------------
+ * BoxLoop macros:
+ *--------------------------------------------------------------------------*/
+
+#ifndef HYPRE_NEWBOXLOOP_HEADER
+#define HYPRE_NEWBOXLOOP_HEADER
+
+#ifdef HYPRE_USING_OPENMP
+#ifdef WIN32
+#define Pragma(x) __pragma(#x)
+#else
+#define Pragma(x) _Pragma(#x)
+#endif
+#define OMP1 Pragma(omp parallel for private(HYPRE_BOX_PRIVATE,HYPRE_BOX_PRIVATE_VAR) HYPRE_SMP_SCHEDULE)
+#define OMPREDUCTION() Pragma(omp parallel for private(HYPRE_BOX_PRIVATE,HYPRE_BOX_PRIVATE_VAR) HYPRE_BOX_REDUCTION HYPRE_SMP_SCHEDULE)
+#else
+#define OMP1
+#define OMPREDUCTION() ;
+#endif
+
+typedef struct hypre_Boxloop_struct
+  {
+    HYPRE_Int lsize0,lsize1,lsize2;
+    HYPRE_Int strides0,strides1,strides2;
+    HYPRE_Int bstart0,bstart1,bstart2;
+    HYPRE_Int bsize0,bsize1,bsize2;
+  }hypre_Boxloop;
+
+#define zypre_newBoxLoop0Begin(ndim, loop_size)				\
+{\
+   zypre_BoxLoopDeclare();									\
+   zypre_BoxLoopInit(ndim, loop_size);						\
+   OMP1\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
+   {\
+      zypre_BoxLoopSet();\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
+
+#define zypre_newBoxLoop0End()\
+         }\
+         zypre_BoxLoopInc1();\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
+
+#define zypre_newBoxLoop1Begin(ndim, loop_size,				\
+                               dbox1, start1, stride1, i1)              \
+	{														\
+	zypre_BoxLoopDeclare();									\
+	zypre_BoxLoopDeclareK(1);								\
+	zypre_BoxLoopInit(ndim, loop_size);						\
+	zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);					\
+	OMP1\
+	for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \
+	{																	\
+		zypre_BoxLoopSet();												\
+		zypre_BoxLoopSetK(1, i1);										\
+		for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)			\
+		{																\
+			for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)		\
+			{
+
+#define zypre_newBoxLoop1End(i1)				\
+	             i1 += hypre__i0inc1;						\
+		    }											\
+			zypre_BoxLoopInc1();					\
+	        i1 += hypre__ikinc1[hypre__d];				\
+	        zypre_BoxLoopInc2();						\
+		}											\
+	}											\
+}
+
+
+#define zypre_newBoxLoop2Begin(ndim, loop_size,\
+							   dbox1, start1, stride1, i1,	\
+							   dbox2, start2, stride2, i2)	\
+{\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopDeclareK(2);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\
+   zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);\
+   OMP1\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)	\
+   {\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      zypre_BoxLoopSetK(2, i2);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
+
+#define zypre_newBoxLoop2End(i1, i2)\
+            i1 += hypre__i0inc1;\
+            i2 += hypre__i0inc2;\
+         }\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         i2 += hypre__ikinc2[hypre__d];\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
+
+
+#define zypre_newBoxLoop3Begin(ndim, loop_size,\
+							   dbox1, start1, stride1, i1,	\
+							   dbox2, start2, stride2, i2,	\
+							   dbox3, start3, stride3, i3)	\
+{														\
+   zypre_BoxLoopDeclare();									\
+   zypre_BoxLoopDeclareK(1);								\
+   zypre_BoxLoopDeclareK(2);								\
+   zypre_BoxLoopDeclareK(3);								\
+   zypre_BoxLoopInit(ndim, loop_size);						\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);		\
+   zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);		\
+   zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3);		\
+   OMP1\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)	\
+   {\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      zypre_BoxLoopSetK(2, i2);\
+      zypre_BoxLoopSetK(3, i3);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
+
+#define zypre_newBoxLoop3End(i1, i2, i3)\
+            i1 += hypre__i0inc1;\
+            i2 += hypre__i0inc2;\
+            i3 += hypre__i0inc3;\
+         }\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         i2 += hypre__ikinc2[hypre__d];\
+         i3 += hypre__ikinc3[hypre__d];\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
+
+#define zypre_newBoxLoop4Begin(ndim, loop_size,\
+                            dbox1, start1, stride1, i1,\
+                            dbox2, start2, stride2, i2,\
+                            dbox3, start3, stride3, i3,\
+                            dbox4, start4, stride4, i4)\
+{\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopDeclareK(2);\
+   zypre_BoxLoopDeclareK(3);\
+   zypre_BoxLoopDeclareK(4);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\
+   zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);\
+   zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3);\
+   zypre_BoxLoopInitK(4, dbox4, start4, stride4, i4);\
+   OMP1\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
+   {\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      zypre_BoxLoopSetK(2, i2);\
+      zypre_BoxLoopSetK(3, i3);\
+      zypre_BoxLoopSetK(4, i4);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
+
+#define zypre_newBoxLoop4End(i1, i2, i3, i4)\
+            i1 += hypre__i0inc1;\
+            i2 += hypre__i0inc2;\
+            i3 += hypre__i0inc3;\
+            i4 += hypre__i0inc4;\
+         }\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         i2 += hypre__ikinc2[hypre__d];\
+         i3 += hypre__ikinc3[hypre__d];\
+         i4 += hypre__ikinc4[hypre__d];\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
+
+#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size,		\
+					dbox1, start1, stride1, i1,	\
+                                        sum)				\
+{									\
+   zypre_BoxLoopDeclare();						\
+   zypre_BoxLoopDeclareK(1);						\
+   zypre_BoxLoopInit(ndim, loop_size);					\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);			\
+   OMPREDUCTION()							\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \
+   {\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
+
+#define hypre_newBoxLoop1ReductionEnd(i1, sum)\
+            i1 += hypre__i0inc1;\
+         }\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
+
+#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size,				\
+					dbox1, start1, stride1, i1,	\
+					dbox2, start2, stride2, i2,	\
+                                        sum)							\
+{\
+   HYPRE_Int i1,i2;				\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopDeclareK(2);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\
+   zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);\
+   OMPREDUCTION()														\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)	\
+   {\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      zypre_BoxLoopSetK(2, i2);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
+
+#define hypre_newBoxLoop2ReductionEnd(i1, i2, sum)\
+            i1 += hypre__i0inc1;\
+            i2 += hypre__i0inc2;\
+         }\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         i2 += hypre__ikinc2[hypre__d];\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
+
+#define hypre_LoopBegin(size,idx)			\
+{									\
+   HYPRE_Int idx;							\
+   for (idx = 0;idx < size;idx ++)					\
+   {
+
+#define hypre_LoopEnd()					\
+  }							\
+}
+
+#define hypre_BoxBoundaryCopyBegin(ndim, loop_size, stride1, i1, idx) 	\
+{									\
+    HYPRE_Int hypre__tot = 1;						\
+    hypre_Boxloop databox1;						\
+    HYPRE_Int d,idx;							\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];					\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    for (d = 0;d < ndim;d ++)						\
+    {									\
+	hypre__tot *= loop_size[d];					\
+    }									\
+    for (idx = 0;idx < hypre__tot;idx++)				\
+      {									\
+	  HYPRE_Int local_idx;						\
+	  HYPRE_Int idx_local = idx;					\
+	  HYPRE_Int i1 = 0;						\
+	  local_idx  = idx_local % databox1.lsize0;			\
+	  idx_local  = idx_local / databox1.lsize0;			\
+	  i1 += local_idx*databox1.strides0;				\
+	  local_idx  = idx_local % databox1.lsize1;			\
+	  idx_local  = idx_local / databox1.lsize1;			\
+	  i1 += local_idx*databox1.strides1;				\
+	  local_idx  = idx_local % databox1.lsize2;			\
+	  idx_local  = idx_local / databox1.lsize2;			\
+	  i1 += local_idx*databox1.strides2;				\
+
+
+#define hypre_BoxBoundaryCopyEnd()					\
+  }									\
+}
+
+#define hypre_BoxDataExchangeBegin(ndim, loop_size,			\
+                                   stride1, i1,				\
+                                   stride2, i2)				\
+{									\
+   HYPRE_Int hypre__tot = 1,idx;					\
+   hypre_Boxloop databox1,databox2;					\
+   HYPRE_Int d;								\
+   databox1.lsize0 = loop_size[0];					\
+   databox1.lsize1 = loop_size[1];					\
+   databox1.lsize2 = loop_size[2];					\
+   databox1.strides0 = stride1[0];					\
+   databox1.strides1 = stride1[1];					\
+   databox1.strides2 = stride1[2];					\
+   databox2.lsize0 = loop_size[0];					\
+   databox2.lsize1 = loop_size[1];					\
+   databox2.lsize2 = loop_size[2];					\
+   databox2.strides0 = stride2[0];					\
+   databox2.strides1 = stride2[1];					\
+   databox2.strides2 = stride2[2];					\
+   for (d = 0;d < ndim;d ++)						\
+   {									\
+      hypre__tot *= loop_size[d];					\
+   }									\
+   for (idx = 0;idx < hypre__tot;idx++)					\
+   {									\
+      HYPRE_Int local_idx;						\
+      HYPRE_Int idx_local = idx;					\
+      HYPRE_Int i1 = 0, i2 = 0;						\
+      local_idx  = idx_local % databox1.lsize0;				\
+      idx_local  = idx_local / databox1.lsize0;				\
+      i1 += local_idx*databox1.strides0;				\
+      i2 += local_idx*databox2.strides0;				\
+      local_idx  = idx_local % databox1.lsize1;				\
+      idx_local  = idx_local / databox1.lsize1;				\
+      i1 += local_idx*databox1.strides1;				\
+      i2 += local_idx*databox2.strides1;				\
+      local_idx  = idx_local % databox1.lsize2;				\
+      idx_local  = idx_local / databox1.lsize2;				\
+      i1 += local_idx*databox1.strides2;				\
+      i2 += local_idx*databox2.strides2;
+
+#define hypre_BoxDataExchangeEnd()					\
+   }                                                                    \
+}
+
+#define hypre_newBoxLoopGetIndex zypre_BoxLoopGetIndex  
+#define hypre_BoxLoopGetIndex    zypre_BoxLoopGetIndex
+#define hypre_BoxLoopSetOneBlock zypre_BoxLoopSetOneBlock
+#define hypre_BoxLoopBlock       zypre_BoxLoopBlock
+#define hypre_BoxLoop0Begin      zypre_BoxLoop0Begin
+#define hypre_BoxLoop0For        zypre_BoxLoop0For
+#define hypre_BoxLoop0End        zypre_BoxLoop0End
+#define hypre_BoxLoop1Begin      zypre_BoxLoop1Begin
+#define hypre_BoxLoop1For        zypre_BoxLoop1For
+#define hypre_BoxLoop1End        zypre_BoxLoop1End
+#define hypre_BoxLoop2Begin      zypre_BoxLoop2Begin
+#define hypre_BoxLoop2For        zypre_BoxLoop2For
+#define hypre_BoxLoop2End        zypre_BoxLoop2End
+#define hypre_BoxLoop3Begin      zypre_BoxLoop3Begin
+#define hypre_BoxLoop3For        zypre_BoxLoop3For
+#define hypre_BoxLoop3End        zypre_BoxLoop3End
+#define hypre_BoxLoop4Begin      zypre_BoxLoop4Begin
+#define hypre_BoxLoop4For        zypre_BoxLoop4For
+#define hypre_BoxLoop4End        zypre_BoxLoop4End
+#define hypre_BasicBoxLoop2Begin zypre_BasicBoxLoop2Begin
+
+#endif
diff -Nru hypre-2.11.2/src/struct_mv/boxloop_kokkos.h hypre-2.13.0/src/struct_mv/boxloop_kokkos.h
--- hypre-2.11.2/src/struct_mv/boxloop_kokkos.h	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/struct_mv/boxloop_kokkos.h	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,542 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+/******************************************************************************
+ *
+ * Header info for the BoxLoop
+ *
+ *****************************************************************************/
+
+/*--------------------------------------------------------------------------
+ * BoxLoop macros:
+ *--------------------------------------------------------------------------*/
+
+#ifndef HYPRE_NEWBOXLOOP_HEADER
+#define HYPRE_NEWBOXLOOP_HEADER
+extern "C++" {
+#include <Kokkos_Core.hpp>
+}
+#if defined( KOKKOS_HAVE_MPI )
+#include <mpi.h>
+#endif
+
+ typedef struct hypre_Boxloop_struct
+ {
+	 HYPRE_Int lsize0,lsize1,lsize2;
+	 HYPRE_Int strides0,strides1,strides2;
+	 HYPRE_Int bstart0,bstart1,bstart2;
+	 HYPRE_Int bsize0,bsize1,bsize2;
+ } hypre_Boxloop;
+
+ #if defined(HYPRE_MEMORY_GPU)
+ #include <cuda.h>
+ #include <cuda_runtime.h>
+ #define AxCheckError(err) CheckError(err, __FUNCTION__, __LINE__)
+ inline void CheckError(cudaError_t const err, char const* const fun, const HYPRE_Int line)
+ {
+     if (err)
+     {
+	 printf("CUDA Error Code[%d]: %s\n%s() Line:%d\n", err, cudaGetErrorString(err), fun, line);
+     }
+ }
+ #define BLOCKSIZE 256
+
+ #define hypre_fence() \
+ cudaError err = cudaGetLastError();\
+ if ( cudaSuccess != err ) {\
+ printf("\n ERROR hypre_newBoxLoop: %s in %s(%d) function %s\n",cudaGetErrorString(err),__FILE__,__LINE__,__FUNCTION__); \
+ }\
+ AxCheckError(cudaDeviceSynchronize());
+ #elif defined(HYPRE_USE_OPENMP)
+    #define hypre_fence() ;
+ #elif defined(HYPRE_USING_OPENMP_ACC)
+ #define hypre_fence()  
+ #else 
+    #define hypre_fence();
+ #endif
+
+ #define hypre_newBoxLoopInit(ndim,loop_size)					\
+	 HYPRE_Int hypre__tot = 1;											\
+	 for (HYPRE_Int i = 0;i < ndim;i ++)									\
+		 hypre__tot *= loop_size[i];
+
+
+ #define hypre_BoxLoopIncK(k,box,i)					\
+ {									\
+    HYPRE_Int idx = idx_local;						\
+    local_idx  = idx % box.lsize0;					\
+    idx        = idx / box.lsize0;					\
+    i += (local_idx*box.strides0 + box.bstart0) * hypre_boxD##k;		\
+    hypre_boxD##k *= hypre_max(0, box.bsize0 + 1);			\
+    local_idx  = idx % box.lsize1;					\
+    idx        = idx / box.lsize1;					\
+    i += (local_idx*box.strides1 + box.bstart1) * hypre_boxD##k;		\
+    hypre_boxD##k *= hypre_max(0, box.bsize1 + 1);			\
+    local_idx  = idx % box.lsize2;					\
+    idx  = idx / box.lsize2;					\
+    i += (local_idx*box.strides2 + box.bstart2) * hypre_boxD##k;		\
+    hypre_boxD##k *= hypre_max(0, box.bsize2 + 1);			\
+ }
+
+ #define hypre_BoxLoopDataDeclareK(k,ndim,loop_size,dbox,start,stride)	\
+	 hypre_Boxloop databox##k;     					\
+	 databox##k.lsize0 = loop_size[0];				\
+	 databox##k.strides0 = stride[0];				\
+	 databox##k.bstart0  = start[0] - dbox->imin[0];		\
+	 databox##k.bsize0   = dbox->imax[0]-dbox->imin[0];		\
+	 if (ndim > 1)							\
+	 {								\
+	    databox##k.lsize1 = loop_size[1];				\
+	    databox##k.strides1 = stride[1];				\
+	    databox##k.bstart1  = start[1] - dbox->imin[1];		\
+	    databox##k.bsize1   = dbox->imax[1]-dbox->imin[1];   	\
+	 }								\
+	 else						        	\
+	 {							       	\
+	    databox##k.lsize1 = 1;				       	\
+	    databox##k.strides1 = 0;					\
+	    databox##k.bstart1  = 0;					\
+	    databox##k.bsize1   = 0;					\
+	 }								\
+	 if (ndim == 3)							\
+	 {								\
+	    databox##k.lsize2 = loop_size[2];				\
+	    databox##k.strides2 = stride[2];				\
+	    databox##k.bstart2  = start[2] - dbox->imin[2];		\
+	    databox##k.bsize2   = dbox->imax[2]-dbox->imin[2];	\
+	 }								\
+	 else								\
+	 {								\
+	   databox##k.lsize2 = 1;					\
+	   databox##k.strides2 = 0;					\
+	   databox##k.bstart2  = 0;					\
+	   databox##k.bsize2   = 0;					\
+	 }
+
+ #define hypre_newBoxLoopDeclare()										\
+	 HYPRE_Int local_idx;												\
+	 HYPRE_Int idx_local = idx;
+
+ #define hypre_newBoxLoop0Begin(ndim, loop_size) 	\
+ {									\
+     hypre_newBoxLoopInit(ndim,loop_size);					\
+     Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)		\
+     {
+
+
+ #define hypre_newBoxLoop0End(i1)				\
+	 });											\
+ }
+
+
+ #define hypre_newBoxLoop1Begin(ndim, loop_size,				\
+				dbox1, start1, stride1, i1)		\
+ {									\
+     hypre_newBoxLoopInit(ndim,loop_size)						\
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+     Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)		\
+     {									\
+       hypre_newBoxLoopDeclare();						\
+       HYPRE_Int hypre_boxD1 = 1;					\
+       HYPRE_Int i1 = 0;							\
+       local_idx  = idx_local % databox1.lsize0;				\
+       idx_local  = idx_local / databox1.lsize0;				\
+       i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+       hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);			\
+       local_idx  = idx_local % databox1.lsize1;				\
+       idx_local  = idx_local / databox1.lsize1;				\
+       i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+       hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);			\
+       local_idx  = idx_local % databox1.lsize2;				\
+       idx_local  = idx_local / databox1.lsize2;				\
+       i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+       hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);
+
+
+ #define hypre_newBoxLoop1End(i1)					\
+     });									\
+     hypre_fence();							\
+ }
+
+
+ #define hypre_newBoxLoop2Begin(ndim, loop_size,				\
+				dbox1, start1, stride1, i1,		\
+				dbox2, start2, stride2, i2)		\
+ {    														\
+     hypre_newBoxLoopInit(ndim,loop_size);						\
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+     hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+     Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)		\
+     {									\
+	 hypre_newBoxLoopDeclare()					\
+	 HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1;			\
+	 HYPRE_Int i1 = 0, i2 = 0;					\
+	 local_idx  = idx_local % databox1.lsize0;			\
+	 idx_local  = idx_local / databox1.lsize0;			\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	 i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	 local_idx  = idx_local % databox1.lsize1;			\
+	 idx_local  = idx_local / databox1.lsize1;			\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	 i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	 local_idx  = idx_local % databox1.lsize2;			\
+	 idx_local  = idx_local / databox1.lsize2;			\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	 i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+
+ #define hypre_newBoxLoop2End(i1, i2)			\
+      });							\
+      hypre_fence();						\
+ }
+
+
+ #define hypre_newBoxLoop3Begin(ndim, loop_size,\
+				dbox1, start1, stride1, i1,		\
+				dbox2, start2, stride2, i2,		\
+				dbox3, start3, stride3, i3)		\
+ {																	\
+  hypre_newBoxLoopInit(ndim,loop_size);						\
+      hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+      hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+      hypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3);	\
+      Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)		\
+      {									\
+	 hypre_newBoxLoopDeclare();					\
+	 HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1; \
+	 HYPRE_Int i1 = 0, i2 = 0, i3 = 0;				\
+	 local_idx  = idx_local % databox1.lsize0;				\
+	 idx_local  = idx_local / databox1.lsize0;				\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1;	\
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);			\
+	 i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2;	\
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);			\
+	 i3 += (local_idx*databox3.strides0 + databox3.bstart0) * hypre_boxD3;	\
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1);			\
+	 local_idx  = idx_local % databox1.lsize1;				\
+	 idx_local  = idx_local / databox1.lsize1;				\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1;	\
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);			\
+	 i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2;	\
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);			\
+	 i3 += (local_idx*databox3.strides1 + databox3.bstart1) * hypre_boxD3;	\
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1);			\
+	 local_idx  = idx_local % databox1.lsize2;				\
+	 idx_local  = idx_local / databox1.lsize2;				\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1;	\
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);			\
+	 i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2;	\
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);			\
+	 i3 += (local_idx*databox3.strides2 +databox3.bstart2) * hypre_boxD3;	\
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1);
+
+ #define hypre_newBoxLoop3End(i1, i2, i3)			\
+     });							\
+     hypre_fence();					\
+ }
+
+ #define hypre_newBoxLoop4Begin(ndim, loop_size,\
+				dbox1, start1, stride1, i1,		\
+				dbox2, start2, stride2, i2,		\
+				dbox3, start3, stride3, i3,		\
+				dbox4, start4, stride4, i4)		\
+ {									\
+  hypre_newBoxLoopInit(ndim,loop_size);						\
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+     hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+     hypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3);	\
+     hypre_BoxLoopDataDeclareK(4,ndim,loop_size,dbox4,start4,stride4);	\
+     Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)		\
+     {									\
+	 hypre_newBoxLoopDeclare();					\
+	 HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1,hypre_boxD4 = 1; \
+	 HYPRE_Int i1 = 0, i2 = 0, i3 = 0,i4 = 0;			\
+	 local_idx  = idx_local % databox1.lsize0;			\
+	 idx_local  = idx_local / databox1.lsize0;			\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	 i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	 i3 += (local_idx*databox3.strides0 + databox3.bstart0) * hypre_boxD3; \
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1);		\
+	 i4 += (local_idx*databox4.strides0 + databox4.bstart0) * hypre_boxD4; \
+	 hypre_boxD4 *= hypre_max(0, databox4.bsize0 + 1);		\
+	 local_idx  = idx_local % databox1.lsize1;			\
+	 idx_local  = idx_local / databox1.lsize1;			\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	 i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	 i3 += (local_idx*databox3.strides1 + databox3.bstart1) * hypre_boxD3; \
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1);		\
+	 i4 += (local_idx*databox4.strides1 + databox4.bstart1) * hypre_boxD4; \
+	 hypre_boxD4 *= hypre_max(0, databox4.bsize1 + 1);		\
+	 local_idx  = idx_local % databox1.lsize2;			\
+	 idx_local  = idx_local / databox1.lsize2;			\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	 i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+	 i3 += (local_idx*databox3.strides2 + databox3.bstart2) * hypre_boxD3; \
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1);		\
+	 i4 += (local_idx*databox4.strides2 + databox4.bstart2) * hypre_boxD4; \
+	 hypre_boxD4 *= hypre_max(0, databox4.bsize2 + 1);		\
+
+
+ #define hypre_newBoxLoop4End(i1, i2, i3, i4)		\
+     });							\
+     hypre_fence();					\
+ }
+
+ #define hypre_newBoxLoop1ReductionBegin(ndim, loop_size,		\
+					 dbox1, start1, stride1, i1, sum) \
+ {									\
+     HYPRE_Real sum_tmp = sum;						\
+     sum = 0;								\
+     hypre_newBoxLoopInit(ndim,loop_size);					\
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+     Kokkos::parallel_reduce (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx,HYPRE_Real &sum) \
+     {									\
+	 hypre_newBoxLoopDeclare()					\
+	 HYPRE_Int hypre_boxD1 = 1;					\
+	 HYPRE_Int i1 = 0;						\
+	 local_idx  = idx_local % databox1.lsize0;			\
+	 idx_local  = idx_local / databox1.lsize0;			\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	 local_idx  = idx_local % databox1.lsize1;			\
+	 idx_local  = idx_local / databox1.lsize1;			\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	 local_idx  = idx_local % databox1.lsize2;			\
+	 idx_local  = idx_local / databox1.lsize2;			\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+
+
+
+ #define hypre_newBoxLoop1ReductionEnd(i1, sum)				\
+     },sum);								\
+     hypre_fence();							\
+     sum += sum_tmp;							\
+ }
+
+ #define hypre_newBoxLoop2ReductionBegin(ndim, loop_size,		\
+					 dbox1, start1, stride1, i1,	\
+					 dbox2, start2, stride2, i2, sum) \
+ {									\
+     HYPRE_Real sum_tmp = sum;						\
+     sum = 0.0;								\
+     hypre_newBoxLoopInit(ndim,loop_size);				\
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+     hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+     Kokkos::parallel_reduce (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx,HYPRE_Real &sum) \
+     {									\
+	 hypre_newBoxLoopDeclare()					\
+	 HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1;			\
+	 HYPRE_Int i1 = 0, i2 = 0;					\
+	 local_idx  = idx_local % databox1.lsize0;			\
+	 idx_local  = idx_local / databox1.lsize0;			\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	 i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	 local_idx  = idx_local % databox1.lsize1;			\
+	 idx_local  = idx_local / databox1.lsize1;			\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	 i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	 local_idx  = idx_local % databox1.lsize2;			\
+	 idx_local  = idx_local / databox1.lsize2;			\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	 i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+
+
+ #define hypre_newBoxLoop2ReductionEnd(i1, i2, sum)			\
+     },sum);								\
+     hypre_fence();							\
+     sum +=sum_tmp;							\
+ }
+
+ #define hypre_newBoxLoop1ReductionMult(ndim, loop_size,		\
+					dbox1, start1, stride1, i1, xp, sum) \
+ {									\
+     HYPRE_Real sum_tmp = sum;						\
+     sum = 1.0;								\
+     hypre_newBoxLoopInit(ndim,loop_size);						\
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+     Kokkos::parallel_reduce (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx,HYPRE_Real &sum) \
+     {									\
+	 hypre_newBoxLoopDeclare()					\
+	 HYPRE_Int hypre_boxD1 = 1;					\
+	 HYPRE_Int i1 = 0;						\
+	 local_idx  = idx_local % databox1.lsize0;			\
+	 idx_local  = idx_local / databox1.lsize0;			\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	 local_idx  = idx_local % databox1.lsize1;			\
+	 idx_local  = idx_local / databox1.lsize1;			\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	 local_idx  = idx_local % databox1.lsize2;			\
+	 idx_local  = idx_local / databox1.lsize2;			\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	 sum *= xp[i1];							\
+     },sum);								\
+     hypre_fence();							\
+     sum *=sum_tmp;								\
+}
+
+
+#define hypre_LoopBegin(size,idx)					\
+{    														\
+    Kokkos::parallel_for(size, KOKKOS_LAMBDA (HYPRE_Int idx)	\
+    {
+
+#define hypre_LoopEnd()							\
+    });									\
+    hypre_fence();							\
+}
+  
+#define hypre_BoxBoundaryCopyBegin(ndim, loop_size, stride1, i1, idx) 	\
+{    														\
+    HYPRE_Int hypre__tot = 1;											\
+    hypre_Boxloop databox1;						\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];					\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    for (HYPRE_Int d = 0;d < ndim;d ++)					\
+    {									\
+       hypre__tot *= loop_size[d];					\
+    }									\
+    Kokkos::parallel_for(hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)	\
+    {									\
+        hypre_newBoxLoopDeclare()					\
+        HYPRE_Int i1 = 0;						\
+	local_idx  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += local_idx*databox1.strides0;				\
+	local_idx  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += local_idx*databox1.strides1;				\
+	local_idx  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += local_idx*databox1.strides2;				\
+		
+#define hypre_BoxBoundaryCopyEnd()				\
+	});							\
+	hypre_fence();						\
+}
+
+#define hypre_BoxDataExchangeBegin(ndim, loop_size,				\
+                                   stride1, i1,	\
+                                   stride2, i2)	\
+{    														\
+    HYPRE_Int hypre__tot = 1;											\
+    hypre_Boxloop databox1,databox2;					\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];					\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    databox2.lsize0 = loop_size[0];					\
+    databox2.lsize1 = loop_size[1];					\
+    databox2.lsize2 = loop_size[2];					\
+    databox2.strides0 = stride2[0];					\
+    databox2.strides1 = stride2[1];					\
+    databox2.strides2 = stride2[2];					\
+    for (HYPRE_Int d = 0;d < ndim;d ++)					\
+      {									\
+	hypre__tot *= loop_size[d];					\
+      }									\
+    Kokkos::parallel_for(hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)	\
+    {									\
+        hypre_newBoxLoopDeclare()					\
+	HYPRE_Int i1 = 0, i2 = 0;					\
+	local_idx  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += local_idx*databox1.strides0;				\
+	i2 += local_idx*databox2.strides0;				\
+	local_idx  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += local_idx*databox1.strides1;				\
+	i2 += local_idx*databox2.strides1;				\
+	local_idx  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += local_idx*databox1.strides2;				\
+	i2 += local_idx*databox2.strides2;
+
+
+
+#define hypre_BoxDataExchangeEnd()				\
+	});											\
+	hypre_fence();							\
+}
+
+#define zypre_newBoxLoop0For()
+
+#define zypre_newBoxLoop1For(i1)
+
+#define zypre_newBoxLoop2For(i1, i2) 
+ 
+#define zypre_newBoxLoop3For(i1, i2, i3)
+
+#define zypre_newBoxLoop4For(i1, i2, i3, i4)
+ 
+#define hypre_newBoxLoopSetOneBlock() {}
+
+#define hypre_newBoxLoopGetIndex(index)					\
+  index[0] = hypre__i; index[1] = hypre__j; index[2] = hypre__k
+
+#define hypre_BoxLoopGetIndex    zypre_BoxLoopGetIndex
+#define hypre_BoxLoopSetOneBlock hypre_newBoxLoopSetOneBlock
+#define hypre_BoxLoopBlock()       0
+#define hypre_BoxLoop0Begin      hypre_newBoxLoop0Begin
+#define hypre_BoxLoop0For        hypre_newBoxLoop0For
+#define hypre_BoxLoop0End        hypre_newBoxLoop0End
+#define hypre_BoxLoop1Begin      hypre_newBoxLoop1Begin
+#define hypre_BoxLoop1For        hypre_newBoxLoop1For
+#define hypre_BoxLoop1End        hypre_newBoxLoop1End
+#define hypre_BoxLoop2Begin      hypre_newBoxLoop2Begin
+#define hypre_BoxLoop2For        hypre_newBoxLoop2For
+#define hypre_BoxLoop2End        hypre_newBoxLoop2End
+#define hypre_BoxLoop3Begin      hypre_newBoxLoop3Begin
+#define hypre_BoxLoop3For        hypre_newBoxLoop3For
+#define hypre_BoxLoop3End        hypre_newBoxLoop3End
+#define hypre_BoxLoop4Begin      hypre_newBoxLoop4Begin
+#define hypre_BoxLoop4For        hypre_newBoxLoop4For
+#define hypre_BoxLoop4End        hypre_newBoxLoop4End
+
+//#define hypre_newBoxLoop1ReductionBegin hypre_newBoxLoop1ReductionBegin
+//#define hypre_newBoxLoop1ReductionEnd   hypre_newBoxLoop1ReductionEnd
+//#define hypre_newBoxLoop2ReductionBegin hypre_newBoxLoop2ReductionBegin
+//#define hypre_newBoxLoop2ReductionEnd   hypre_newBoxLoop2ReductionEnd
+//#define hypre_newBoxLoop1ReductionMult hypre_newBoxLoop1ReductionMult
+//#define hypre_BoxBoundaryCopyBegin zypre_BoxBoundaryCopyBegin
+//#define hypre_BoxBoundaryCopyEnd zypre_BoxBoundaryCopyEnd
+//#define hypre_BoxDataExchangeBegin zypre_BoxDataExchangeBegin
+//#define hypre_BoxDataExchangeEnd zypre_BoxDataExchangeEnd
+
+#endif
diff -Nru hypre-2.11.2/src/struct_mv/boxloop_raja.h hypre-2.13.0/src/struct_mv/boxloop_raja.h
--- hypre-2.11.2/src/struct_mv/boxloop_raja.h	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/struct_mv/boxloop_raja.h	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,845 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+/******************************************************************************
+ *
+ * Header info for the BoxLoop
+ *
+ *****************************************************************************/
+
+/*--------------------------------------------------------------------------
+ * BoxLoop macros:
+ *--------------------------------------------------------------------------*/
+
+#ifndef HYPRE_NEWBOXLOOP_HEADER
+#define HYPRE_NEWBOXLOOP_HEADER
+
+extern "C++" {
+#include <RAJA/RAJA.hxx>
+}
+using namespace RAJA;
+
+typedef struct hypre_Boxloop_struct
+{
+	HYPRE_Int lsize0,lsize1,lsize2;
+	HYPRE_Int strides0,strides1,strides2;
+	HYPRE_Int bstart0,bstart1,bstart2;
+	HYPRE_Int bsize0,bsize1,bsize2;
+} hypre_Boxloop;
+
+#define BLOCKSIZE 256
+
+#if defined(HYPRE_MEMORY_GPU)
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#define AxCheckError(err) CheckError(err, __FUNCTION__, __LINE__)
+inline void CheckError(cudaError_t const err, char const* const fun, const HYPRE_Int line)
+{
+    if (err)
+    {
+        printf("CUDA Error Code[%d]: %s\n%s() Line:%d\n", err, cudaGetErrorString(err), fun, line);
+		HYPRE_Int *p = NULL; *p = 1;
+    }
+}
+
+#define hypre_exec_policy cuda_exec<BLOCKSIZE>
+#define hypre_reduce_policy  cuda_reduce_atomic<BLOCKSIZE>
+#define hypre_fence() \
+cudaError err = cudaGetLastError();\
+if ( cudaSuccess != err ) {\
+printf("\n ERROR zypre_newBoxLoop: %s in %s(%d) function %s\n",cudaGetErrorString(err),__FILE__,__LINE__,__FUNCTION__); \
+}\
+AxCheckError(cudaDeviceSynchronize());
+
+#elif defined(HYPRE_USE_OPENMP)
+   #define hypre_exec_policy      omp_for_exec
+   #define hypre_reduce_policy omp_reduce
+   #define hypre_fence() 
+#elif defined(HYPRE_USING_OPENMP_ACC)
+   #define hypre_exec_policy      omp_parallel_for_acc
+   #define hypre_reduce_policy omp_acc_reduce
+#else 
+   #define hypre_exec_policy   seq_exec
+   #define hypre_reduce_policy seq_reduce
+   #define hypre_fence()
+#endif
+
+#define zypre_BoxLoopIncK(k,box,i)					\
+{									\
+   HYPRE_Int idx = idx_local;						\
+   local_idx  = idx % box.lsize0;					\
+   idx        = idx / box.lsize0;					\
+   i += (local_idx*box.strides0 + box.bstart0) * hypre_boxD##k;		\
+   hypre_boxD##k *= hypre_max(0, box.bsize0 + 1);			\
+   local_idx  = idx % box.lsize1;					\
+   idx        = idx / box.lsize1;					\
+   i += (local_idx*box.strides1 + box.bstart1) * hypre_boxD##k;		\
+   hypre_boxD##k *= hypre_max(0, box.bsize1 + 1);			\
+   local_idx  = idx % box.lsize2;					\
+   idx  = idx / box.lsize2;					\
+   i += (local_idx*box.strides2 + box.bstart2) * hypre_boxD##k;		\
+   hypre_boxD##k *= hypre_max(0, box.bsize2 + 1);			\
+}
+
+
+#define zypre_BoxLoopCUDAInit(ndim,loop_size)				\
+  HYPRE_Int hypre__tot = 1;						\
+  for (HYPRE_Int i = 0;i < ndim;i ++)					\
+      hypre__tot *= loop_size[i];
+
+
+#define zypre_BoxLoopCUDADeclare()										\
+	HYPRE_Int local_idx;												\
+	HYPRE_Int idx_local = idx;
+
+#define zypre_newBoxLoop0Begin(ndim, loop_size)			\
+{									\
+   zypre_BoxLoopCUDAInit(ndim,loop_size);					\
+   forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+   {
+
+
+#define zypre_newBoxLoop0End()					\
+	});											\
+	hypre_fence();      \
+}
+
+#define zypre_BoxLoopDataDeclareK(k,ndim,loop_size,dbox,start,stride)	\
+	hypre_Boxloop databox##k;					\
+	databox##k.lsize0 = loop_size[0];				\
+	databox##k.strides0 = stride[0];				\
+	databox##k.bstart0  = start[0] - dbox->imin[0];			\
+	databox##k.bsize0   = dbox->imax[0]-dbox->imin[0];		\
+	if (ndim > 1)							\
+	{								\
+	    databox##k.lsize1 = loop_size[1];				\
+	    databox##k.strides1 = stride[1];				\
+	    databox##k.bstart1  = start[1] - dbox->imin[1];		\
+	    databox##k.bsize1   = dbox->imax[1]-dbox->imin[1];   	\
+	}								\
+	else						        	\
+	{							       	\
+		databox##k.lsize1 = 1;				       	\
+		databox##k.strides1 = 0;		       		\
+		databox##k.bstart1  = 0;	       			\
+		databox##k.bsize1   = 0;		       		\
+	}								\
+	if (ndim == 3)							\
+	{								\
+	    databox##k.lsize2 = loop_size[2];				\
+	    databox##k.strides2 = stride[2];				\
+	    databox##k.bstart2  = start[2] - dbox->imin[2];		\
+	    databox##k.bsize2   = dbox->imax[2]-dbox->imin[2];   	\
+	}								\
+	else								\
+	{								\
+	    databox##k.lsize2 = 1;					\
+	    databox##k.strides2 = 0;					\
+	    databox##k.bstart2  = 0;					\
+	    databox##k.bsize2   = 0;					\
+	}
+
+#define zypre_newBoxLoop1Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1)		\
+{    														\
+    zypre_BoxLoopCUDAInit(ndim,loop_size);						\
+    zypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+    forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+    {									\
+      zypre_BoxLoopCUDADeclare();					\
+      HYPRE_Int hypre_boxD1 = 1;					\
+      HYPRE_Int i1 = 0;							\
+      zypre_BoxLoopIncK(1,databox1,i1);
+
+      
+#define zypre_newBoxLoop1End(i1)				\
+	});											\
+    hypre_fence();\
+}
+	
+#define zypre_newBoxLoop2Begin(ndim, loop_size,				\
+                                dbox1, start1, stride1, i1,	\
+                                dbox2, start2, stride2, i2)	\
+{    														\
+    zypre_BoxLoopCUDAInit(ndim,loop_size);						\
+    zypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+    zypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+    forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+    {									\
+        zypre_BoxLoopCUDADeclare()					\
+        HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1;			\
+		HYPRE_Int i1 = 0, i2 = 0;							\
+	local_idx  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	local_idx  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	local_idx  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+	
+
+
+#define zypre_newBoxLoop2End(i1, i2)			\
+	});											\
+    hypre_fence();\
+}
+
+#define zypre_newBoxLoop3Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1,		\
+			       dbox2, start2, stride2, i2,		\
+			       dbox3, start3, stride3, i3)		\
+  {									\
+  zypre_BoxLoopCUDAInit(ndim,loop_size);						\
+        zypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \
+        zypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \
+        zypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3); \
+        forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+	{								\
+	  zypre_BoxLoopCUDADeclare();					\
+	  HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1; \
+	  HYPRE_Int i1 = 0, i2 = 0, i3 = 0;				\
+	  local_idx  = idx_local % databox1.lsize0;				\
+	  idx_local  = idx_local / databox1.lsize0;				\
+	  i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1;	\
+	  hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);			\
+	  i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2;	\
+	  hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);			\
+	  i3 += (local_idx*databox3.strides0 + databox3.bstart0) * hypre_boxD3;	\
+	  hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1);			\
+	  local_idx  = idx_local % databox1.lsize1;				\
+	  idx_local  = idx_local / databox1.lsize1;				\
+	  i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1;	\
+	  hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);			\
+	  i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2;	\
+	  hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);			\
+	  i3 += (local_idx*databox3.strides1 + databox3.bstart1) * hypre_boxD3;	\
+	  hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1);			\
+	  local_idx  = idx_local % databox1.lsize2;				\
+	  idx_local  = idx_local / databox1.lsize2;				\
+	  i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1;	\
+	  hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);			\
+	  i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2;	\
+	  hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);			\
+	  i3 += (local_idx*databox3.strides2 + databox3.bstart2) * hypre_boxD3;	\
+	  hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1);			\
+	  
+
+#define zypre_newBoxLoop3End(i1, i2, i3)			\
+	});											\
+	hypre_fence();							\
+}
+
+#define zypre_newBoxLoop4Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1,		\
+			       dbox2, start2, stride2, i2,		\
+			       dbox3, start3, stride3, i3,		\
+			       dbox4, start4, stride4, i4)		\
+{								       \
+ zypre_BoxLoopCUDAInit(ndim,loop_size);					       \
+     zypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \
+     zypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \
+     zypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3); \
+     zypre_BoxLoopDataDeclareK(4,ndim,loop_size,dbox4,start4,stride4); \
+     forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+     {									\
+         zypre_BoxLoopCUDADeclare();					\
+		 HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1,hypre_boxD4 = 1; \
+	 HYPRE_Int i1 = 0, i2 = 0, i3 = 0,i4 = 0;			\
+	 local_idx  = idx_local % databox1.lsize0;			\
+	 idx_local  = idx_local / databox1.lsize0;			\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	 i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	 i3 += (local_idx*databox3.strides0 + databox3.bstart0) * hypre_boxD3; \
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1);		\
+	 i4 += (local_idx*databox4.strides0 + databox4.bstart0) * hypre_boxD4; \
+	 hypre_boxD4 *= hypre_max(0, databox4.bsize0 + 1);		\
+	 local_idx  = idx_local % databox1.lsize1;			\
+	 idx_local  = idx_local / databox1.lsize1;			\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	 i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	 i3 += (local_idx*databox3.strides1 + databox3.bstart1) * hypre_boxD3; \
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1);		\
+	 i4 += (local_idx*databox4.strides1 + databox4.bstart1) * hypre_boxD4; \
+	 hypre_boxD4 *= hypre_max(0, databox4.bsize1 + 1);		\
+	 local_idx  = idx_local % databox1.lsize2;			\
+	 idx_local  = idx_local / databox1.lsize2;			\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	 i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+	 i3 += (local_idx*databox3.strides2 + databox3.bstart2) * hypre_boxD3; \
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1);		\
+	 i4 += (local_idx*databox4.strides2 + databox4.bstart2) * hypre_boxD4; \
+	 hypre_boxD4 *= hypre_max(0, databox4.bsize2 + 1);		\
+	 
+#define zypre_newBoxLoop4End(i1, i2, i3, i4)	\
+  });						\
+  hypre_fence();				\
+}
+
+#define MAX_BLOCK BLOCKSIZE
+
+extern "C++" {
+#if defined(HYPRE_MEMORY_GPU)
+template<class T>
+class ReduceMult   
+{
+public:
+  /*!
+   * \brief Constructor takes initial reduction value (default constructor
+   * is disabled).
+   *
+   * Note: Constructor only executes on the host.
+   */
+  explicit ReduceMult(T init_val)
+  {
+    m_is_copy_host = false;
+    m_myID = getCudaReductionId();
+    getCudaReductionTallyBlock(m_myID,
+                               (void **)&m_tally_host,
+                               (void **)&m_tally_device);
+    m_tally_host->tally = init_val;
+  }
+
+  /*!
+   * \brief Initialize shared memory on device, request shared memory on host.
+   *
+   * Copy constructor executes on both host and device.
+   * On host requests dynamic shared memory and gets offset into dynamic
+   * shared memory if in forall.
+   * On device initializes dynamic shared memory to appropriate value.
+   */
+  RAJA_HOST_DEVICE
+  ReduceMult(const ReduceMult<T> &other)
+  {
+    *this = other;
+#if defined(__CUDA_ARCH__)
+    m_is_copy_device = true;
+    m_finish_reduction = !other.m_is_copy_device;
+    extern __shared__ unsigned char sd_block[];
+    T *sd = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
+
+    HYPRE_Int threadId = threadIdx.x + blockDim.x * threadIdx.y
+                   + (blockDim.x * blockDim.y) * threadIdx.z;
+
+    // initialize shared memory
+    T val = static_cast<T>(0);
+    for (HYPRE_Int i = BLOCKSIZE / 2; i > 0; i /= 2) {
+      // this descends all the way to 1
+      if (threadId < i) {
+        sd[threadId + i] = val;
+      }
+    }
+    if (threadId < 1) {
+      sd[threadId] = val;
+    }
+
+    __syncthreads();
+#else
+    m_is_copy_host = true;
+    m_smem_offset = getCudaSharedmemOffset(m_myID, BLOCKSIZE, sizeof(T));
+#endif
+  }
+
+  /*!
+   * \brief Finish reduction on device and free memory on host.
+   *
+   * Destruction on host releases the device memory chunk for
+   * reduction id and id itself for others to use.
+   * Destruction on device completes the reduction.
+   *
+   * Note: destructor executes on both host and device.
+   */
+  RAJA_HOST_DEVICE ~ReduceMult<T>()
+  {
+#if defined(__CUDA_ARCH__)
+    if (m_finish_reduction) {
+      extern __shared__ unsigned char sd_block[];
+      T *sd = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
+
+      HYPRE_Int threadId = threadIdx.x + blockDim.x * threadIdx.y
+                     + (blockDim.x * blockDim.y) * threadIdx.z;
+
+      T temp = 1;
+      __syncthreads();
+
+      for (HYPRE_Int i = BLOCKSIZE / 2; i >= WARP_SIZE; i /= 2) {
+        if (threadId < i) {
+          sd[threadId] *= sd[threadId + i];
+        }
+        __syncthreads();
+      }
+
+      if (threadId < WARP_SIZE) {
+        temp = sd[threadId];
+        for (HYPRE_Int i = WARP_SIZE / 2; i > 0; i /= 2) {
+          temp *= HIDDEN::shfl_xor<T>(temp, i);
+        }
+      }
+
+      // one thread adds to tally
+      if (threadId == 0) {
+        _atomicAdd<T>(&(m_tally_device->tally), temp);
+      }
+    }
+#else
+    if (!m_is_copy_host) {
+      releaseCudaReductionTallyBlock(m_myID);
+      releaseCudaReductionId(m_myID);
+    }
+#endif
+
+    
+  }
+
+  /*!
+   * \brief Operator that returns reduced sum value.
+   *
+   * Note: accessor only executes on host.
+   */
+  operator T()
+  {
+    beforeCudaReadTallyBlock<true>(m_myID);
+    return m_tally_host->tally;
+  }
+
+  /*!
+   * \brief Operator that returns reduced sum value.
+   *
+   * Note: accessor only executes on host.
+   */
+  T get() { return operator T(); }
+
+  /*!
+   * \brief Operator that adds value to sum.
+   *
+   * Note: only operates on device.
+   */
+  RAJA_DEVICE ReduceMult<T> const &
+  operator*=(T val) const
+  {
+    extern __shared__ unsigned char sd_block[];
+    T *sd = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
+
+    HYPRE_Int threadId = threadIdx.x + blockDim.x * threadIdx.y
+                   + (blockDim.x * blockDim.y) * threadIdx.z;
+
+    sd[threadId] *= val;
+
+    return *this;
+  }
+
+private:
+  /*!
+   * \brief Default constructor is declared private and not implemented.
+   */
+  ReduceMult<T>();
+
+  /*!
+   * \brief Pointer to host tally block cache slot for this reduction variable.
+   */
+  CudaReductionTallyTypeAtomic<T> *m_tally_host = nullptr;
+
+  /*!
+   * \brief Pointer to device tally block slot for this reduction variable.
+   */
+  CudaReductionTallyTypeAtomic<T> *m_tally_device = nullptr;
+
+  /*!
+   * \brief My cuda reduction variable ID.
+   */
+  HYPRE_Int m_myID = -1;
+
+  /*!
+   * \brief Byte offset into dynamic shared memory.
+   */
+  HYPRE_Int m_smem_offset = -1;
+
+  /*!
+   * \brief If this variable is a copy or not; only original may release memory 
+   *        or perform finalization.
+   */
+  bool m_is_copy_host = false;
+  bool m_is_copy_device = false;
+  bool m_finish_reduction = false;
+
+  // Sanity checks for block size and template type size
+  static constexpr bool powerOfTwoCheck = (!(BLOCKSIZE & (BLOCKSIZE - 1)));
+  static constexpr bool reasonableRangeCheck =
+      ((BLOCKSIZE >= 32) && (BLOCKSIZE <= 1024));
+  static constexpr bool sizeofcheck =
+      ((sizeof(T) <= sizeof(CudaReductionDummyDataType))
+       && (sizeof(CudaReductionTallyType<T>)
+           <= sizeof(CudaReductionDummyTallyType))
+       && (sizeof(CudaReductionBlockType<T>)
+           <= sizeof(CudaReductionDummyBlockType)));
+  static_assert(powerOfTwoCheck, "Error: block sizes must be a power of 2");
+  static_assert(reasonableRangeCheck,
+                "Error: block sizes must be between 32 and 1024");
+  static_assert(sizeofcheck,
+      "Error: type must be of size <= " 
+      RAJA_STRINGIFY_MACRO(RAJA_CUDA_REDUCE_VAR_MAXSIZE));
+};
+#elif defined(HYPRE_USING_OPENMP)
+    template <typename T>
+    class ReduceMult
+    {
+        using my_type = ReduceMult;
+        
+    public:
+        //
+        // Constructor takes default value (default ctor is disabled).
+        //
+        explicit ReduceMult(T init_val, T initializer = 1)
+        : m_parent(NULL), m_val(init_val), m_custom_init(initializer)
+        {
+        }
+        
+        //
+        // Copy ctor.
+        //
+        ReduceMult(const ReduceMult& other) :
+        m_parent(other.m_parent ? other.m_parent : &other),
+        m_val(other.m_custom_init),
+        m_custom_init(other.m_custom_init)
+        {
+        }
+        
+        //
+        // Destruction releases the shared memory block chunk for reduction id
+        // and id itself for others to use.
+        //
+        ~ReduceMult()
+        {
+            if (m_parent) {
+#pragma omp critical
+                {
+                    *m_parent *= m_val;
+                }
+            }
+        }
+        
+        //
+        // Operator that returns reduced sum value.
+        //
+        operator T()
+        {
+            return m_val;
+        }
+        
+        //
+        // Method that returns sum value.
+        //
+        T get() { return operator T(); }
+        
+        //
+        // += operator that adds value to sum for current thread.
+        //
+        const ReduceMult& operator*=(T rhs) const
+        {
+            this->m_val *= rhs;
+            return *this;
+        }
+        
+        ReduceMult& operator*=(T rhs)
+        {
+            this->m_val *= rhs;
+            return *this;
+        }
+        
+    private:
+        //
+        // Default ctor is declared private and not implemented.
+        //
+        ReduceMult();
+        
+        const my_type * m_parent;
+        
+        mutable T m_val;
+        T m_custom_init;
+        
+    };
+#else
+    template <typename T>
+    class ReduceMult
+    {
+        using my_type = ReduceMult;
+        
+    public:
+        //
+        // Constructor takes default value (default ctor is disabled).
+        //
+        explicit ReduceMult(T init_m_val, T initializer = 1) :
+        m_parent(NULL),
+        m_val(init_m_val),
+        m_custom_init(initializer)
+        {
+        }
+        
+        //
+        // Copy ctor.
+        //
+        ReduceMult(const ReduceMult& other) :
+        m_parent(other.m_parent ? other.m_parent : &other),
+        m_val(other.m_custom_init),
+        m_custom_init(other.m_custom_init)
+        {
+        }
+        
+        //
+        // Destruction releases the shared memory block chunk for reduction id
+        // and id itself for others to use.
+        //
+        ~ReduceMult()
+        {
+            if (m_parent) {
+                *m_parent *= m_val;
+            }
+        }
+        
+        //
+        // Operator that returns reduced sum value.
+        //
+        operator T()
+        {
+            return m_val;
+        }
+        
+        //
+        // Method that returns reduced sum value.
+        //
+        T get() { return operator T(); }
+        
+        //
+        // += operator that adds value to sum.
+        //
+        ReduceMult& operator*=(T rhs)
+        {
+            this->m_val *= rhs;
+            return *this;
+        }
+        
+        const ReduceMult& operator*=(T rhs) const
+        {
+            this->m_val *= rhs;
+            return *this;
+        }
+        
+    private:
+        //
+        // Default ctor is declared private and not implemented.
+        //
+        ReduceMult();
+        
+        const my_type * m_parent;
+        
+        mutable T m_val;
+        T m_custom_init;
+    };
+#endif
+}
+
+
+#define zypre_newBoxLoop1ReductionBegin(ndim, loop_size,		\
+					dbox1, start1, stride1, i1,sum) \
+{									\
+   HYPRE_Real sum_tmp;							\
+   {									\
+      ReduceSum< hypre_reduce_policy, HYPRE_Real> sum(0.0);				\
+      zypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1,i1)	\
+      {
+
+#define zypre_newBoxLoop1ReductionEnd(i1,sum)				\
+      }									\
+      zypre_newBoxLoop1End(i1);					\
+      hypre_fence();						\
+      sum_tmp = (HYPRE_Real)(sum);				\
+   }								\
+   sum += sum_tmp; \
+}
+		    
+#define zypre_newBoxLoop2ReductionBegin(ndim, loop_size,		\
+					dbox1, start1, stride1, i1,	\
+					dbox2, start2, stride2, i2,sum)	\
+{									\
+   HYPRE_Real sum_tmp;							\
+   {									\
+      ReduceSum< hypre_reduce_policy, HYPRE_Real> sum(0.0);				\
+      zypre_newBoxLoop2Begin(ndim, loop_size, \
+			     dbox1, start1, stride1,i1,\
+			     dbox2, start2, stride2,i2)	\
+      {
+
+#define zypre_newBoxLoop2ReductionEnd(i1,i2,sum)			\
+      }									\
+      zypre_newBoxLoop2End(i1,i2);					\
+      hypre_fence();							\
+      sum_tmp = (HYPRE_Real)(sum);					\
+   }								\
+   sum += sum_tmp; \
+}
+
+#define zypre_newBoxLoop1ReductionMult(ndim, loop_size,				\
+				       dbox1, start1, stride1, i1,xp,sum) \
+{									\
+   ReduceMult<HYPRE_Real> local_result_raja(1.0);				\
+   zypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \
+   {									\
+       local_result_raja *= xp[i1];					\
+   }									\
+   zypre_newBoxLoop1End(i1)						\
+   hypre_fence();							\
+   sum *= (HYPRE_Real)(local_result_raja);				\
+}
+
+
+#define hypre_LoopBegin(size,idx)					\
+{									\
+   forall< hypre_exec_policy >(0, size, [=] RAJA_DEVICE (HYPRE_Int idx)	\
+   {
+
+#define hypre_LoopEnd()					\
+   });							\
+   hypre_fence();		\
+}
+  
+#define zypre_BoxBoundaryCopyBegin(ndim, loop_size, stride1, i1, idx) 	\
+{									\
+    zypre_BoxLoopCUDAInit(ndim,loop_size);				\
+    hypre_Boxloop databox1;						\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];					\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+    {									\
+        zypre_BoxLoopCUDADeclare()					\
+        HYPRE_Int i1 = 0;							\
+        local_idx  = idx_local % databox1.lsize0;			\
+        idx_local  = idx_local / databox1.lsize0;			\
+        i1 += local_idx*databox1.strides0;				\
+        local_idx  = idx_local % databox1.lsize1;			\
+        idx_local  = idx_local / databox1.lsize1;			\
+        i1 += local_idx*databox1.strides1;				\
+        local_idx  = idx_local % databox1.lsize2;			\
+        idx_local  = idx_local / databox1.lsize2;			\
+        i1 += local_idx*databox1.strides2;				\
+		
+#define zypre_BoxBoundaryCopyEnd()				\
+	});											\
+	hypre_fence();							\
+}
+
+#define zypre_BoxDataExchangeBegin(ndim, loop_size,				\
+                                   stride1, i1,	\
+                                   stride2, i2)	\
+{    														\
+    zypre_BoxLoopCUDAInit(ndim,loop_size);					\
+    hypre_Boxloop databox1,databox2;					\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];					\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    databox2.lsize0 = loop_size[0];					\
+    databox2.lsize1 = loop_size[1];					\
+    databox2.lsize2 = loop_size[2];					\
+    databox2.strides0 = stride2[0];					\
+    databox2.strides1 = stride2[1];					\
+    databox2.strides2 = stride2[2];					\
+    forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+    {									\
+        zypre_BoxLoopCUDADeclare()					\
+	HYPRE_Int i1 = 0, i2 = 0;					\
+	local_idx  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += local_idx*databox1.strides0;				\
+	i2 += local_idx*databox2.strides0;				\
+	local_idx  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += local_idx*databox1.strides1;				\
+	i2 += local_idx*databox2.strides1;				\
+	local_idx  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += local_idx*databox1.strides2;				\
+	i2 += local_idx*databox2.strides2;
+
+
+
+#define zypre_BoxDataExchangeEnd()				\
+	});											\
+	hypre_fence();							\
+}
+
+#define zypre_newBoxLoop0For()
+
+#define zypre_newBoxLoop1For(i1)
+
+#define zypre_newBoxLoop2For(i1, i2) 
+ 
+#define zypre_newBoxLoop3For(i1, i2, i3)
+
+#define zypre_newBoxLoop4For(i1, i2, i3, i4)
+
+#define zypre_newBoxLoopSetOneBlock()
+
+#define hypre_newBoxLoopGetIndex(index)					\
+  index[0] = hypre__i; index[1] = hypre__j; index[2] = hypre__k
+
+#define hypre_BoxLoopGetIndex    zypre_BoxLoopGetIndex
+#define hypre_BoxLoopSetOneBlock zypre_newBoxLoopSetOneBlock
+#define hypre_BoxLoopBlock()       0
+#define hypre_BoxLoop0Begin      zypre_newBoxLoop0Begin
+#define hypre_BoxLoop0For        zypre_newBoxLoop0For
+#define hypre_BoxLoop0End        zypre_newBoxLoop0End
+#define hypre_BoxLoop1Begin      zypre_newBoxLoop1Begin
+#define hypre_BoxLoop1For        zypre_newBoxLoop1For
+#define hypre_BoxLoop1End        zypre_newBoxLoop1End
+#define hypre_BoxLoop2Begin      zypre_newBoxLoop2Begin
+#define hypre_BoxLoop2For        zypre_newBoxLoop2For
+#define hypre_BoxLoop2End        zypre_newBoxLoop2End
+#define hypre_BoxLoop3Begin      zypre_newBoxLoop3Begin
+#define hypre_BoxLoop3For        zypre_newBoxLoop3For
+#define hypre_BoxLoop3End        zypre_newBoxLoop3End
+#define hypre_BoxLoop4Begin      zypre_newBoxLoop4Begin
+#define hypre_BoxLoop4For        zypre_newBoxLoop4For
+#define hypre_BoxLoop4End        zypre_newBoxLoop4End
+
+#define hypre_newBoxLoop1ReductionBegin zypre_newBoxLoop1ReductionBegin
+#define hypre_newBoxLoop1ReductionEnd   zypre_newBoxLoop1ReductionEnd
+#define hypre_newBoxLoop2ReductionBegin zypre_newBoxLoop2ReductionBegin
+#define hypre_newBoxLoop2ReductionEnd   zypre_newBoxLoop2ReductionEnd
+#define hypre_newBoxLoop1ReductionMult zypre_newBoxLoop1ReductionMult
+#define hypre_BoxBoundaryCopyBegin zypre_BoxBoundaryCopyBegin
+#define hypre_BoxBoundaryCopyEnd zypre_BoxBoundaryCopyEnd
+#define hypre_BoxDataExchangeBegin zypre_BoxDataExchangeBegin
+#define hypre_BoxDataExchangeEnd zypre_BoxDataExchangeEnd
+#endif
diff -Nru hypre-2.11.2/src/struct_mv/box_manager.c hypre-2.13.0/src/struct_mv/box_manager.c
--- hypre-2.11.2/src/struct_mv/box_manager.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_mv/box_manager.c	2017-10-20 17:42:22.000000000 +0000
@@ -2226,7 +2226,7 @@
       HYPRE_Int  size[HYPRE_MAXDIM];
       HYPRE_Int  iminmax[2];
       HYPRE_Int  index_not_there;
-      HYPRE_Int  d, e, ii, itsize;
+      HYPRE_Int  d, e, itsize;
       HYPRE_Int  mystart, myfinish;
       HYPRE_Int  imin[HYPRE_MAXDIM];
       HYPRE_Int  imax[HYPRE_MAXDIM];
@@ -2388,9 +2388,7 @@
             /* set up index table */
             hypre_BoxSetExtents(index_box, imin, imax);
             hypre_BoxGetSize(index_box, loop_size);
-            hypre_BoxLoop1Begin(ndim, loop_size, table_box, imin, stride, ii);
-            hypre_BoxLoopSetOneBlock();
-            hypre_BoxLoop1For(ii)
+            hypre_SerialBoxLoop1Begin(ndim, loop_size, table_box, imin, stride, ii);
             {
                if (!index_table[ii]) /* no entry- add one */
                {
@@ -2403,7 +2401,7 @@
                   index_table[ii] = entry;
                }
             }
-            hypre_BoxLoop1End(ii);
+            hypre_SerialBoxLoop1End(ii);
 
          } /* end of subset of entries */
       }/* end of three loops over subsets */
@@ -2464,7 +2462,7 @@
                         HYPRE_Int *nentries_ptr )
 {
    HYPRE_Int           ndim = hypre_BoxManNDim(manager);
-   HYPRE_Int           d, ii;
+   HYPRE_Int           d;
    HYPRE_Int           find_index_d, current_index_d;
    HYPRE_Int          *man_indexes_d;
    HYPRE_Int           man_index_size_d;
@@ -2581,9 +2579,7 @@
    hypre_BoxShiftNeg(table_box, stride); /* Want box to start at 0*/
    hypre_BoxSetExtents(index_box, man_ilower, man_iupper);
    hypre_BoxGetSize(index_box, loop_size);
-   hypre_BoxLoop1Begin(ndim, loop_size, table_box, man_ilower, stride, ii);
-   hypre_BoxLoopSetOneBlock();
-   hypre_BoxLoop1For(ii)
+   hypre_SerialBoxLoop1Begin(ndim, loop_size, table_box, man_ilower, stride, ii);
    {
       entry = index_table[ii];
 
@@ -2601,7 +2597,7 @@
          entry = hypre_BoxManEntryNext(entry);
       }
    }
-   hypre_BoxLoop1End(ii);
+   hypre_SerialBoxLoop1End(ii);
 
    entries  = hypre_TReAlloc(entries, hypre_BoxManEntry *, nentries);
 
diff -Nru hypre-2.11.2/src/struct_mv/communication_info.c hypre-2.13.0/src/struct_mv/communication_info.c
--- hypre-2.11.2/src/struct_mv/communication_info.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_mv/communication_info.c	2017-10-20 17:42:22.000000000 +0000
@@ -431,14 +431,13 @@
       hypre_BoxSetExtents(sbox, istart, istop);
       start = hypre_BoxIMin(sbox);
       hypre_BoxGetSize(sbox, loop_size);
-      hypre_BoxLoop1Begin(ndim, loop_size,
+
+      hypre_SerialBoxLoop1Begin(ndim, loop_size,
                           stencil_box, start, stride, si);
-      hypre_BoxLoopSetOneBlock();
-      hypre_BoxLoop1For(si)
       {
          stencil_grid[si] = 1;
       }
-      hypre_BoxLoop1End(si);
+      hypre_SerialBoxLoop1End(si);
    }
 
    /*------------------------------------------------------
@@ -816,9 +815,7 @@
    size = 0;
    start = hypre_BoxIMin(box);
    hypre_BoxGetSize(box, loop_size);
-   hypre_BoxLoop0Begin(ndim, loop_size);
-   hypre_BoxLoopSetOneBlock();
-   hypre_BoxLoop0For()
+   hypre_SerialBoxLoop0Begin(ndim, loop_size);
    {
       hypre_BoxLoopGetIndex(ii);
       for (d = 0; d < ndim; d++)
@@ -835,7 +832,7 @@
       }
       size++;
    }
-   hypre_BoxLoop0End();
+   hypre_SerialBoxLoop0End();
 
    hypre_BoxDestroy(box);
 
diff -Nru hypre-2.11.2/src/struct_mv/headers hypre-2.13.0/src/struct_mv/headers
--- hypre-2.11.2/src/struct_mv/headers	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_mv/headers	2017-10-20 17:42:22.000000000 +0000
@@ -19,6 +19,9 @@
 
 cat > $INTERNAL_HEADER <<@
 
+/*** DO NOT EDIT THIS FILE DIRECTLY (use 'headers' to generate) ***/
+
+
 #ifndef hypre_STRUCT_MV_HEADER
 #define hypre_STRUCT_MV_HEADER
 
@@ -29,25 +32,56 @@
 #include "HYPRE_struct_mv.h"
 #include "_hypre_utilities.h"
 
+@
+
+
+cat >> $INTERNAL_HEADER <<@
+#if defined(HYPRE_USE_RAJA)
+@
+
+cat boxloop_raja.h                >> $INTERNAL_HEADER
+
+cat >> $INTERNAL_HEADER <<@
+#elif defined(HYPRE_USE_KOKKOS)
+@
+
+cat boxloop_kokkos.h              >> $INTERNAL_HEADER
+
+cat >> $INTERNAL_HEADER <<@
+#elif defined(HYPRE_USE_CUDA)
+@
+
+cat boxloop_cuda.h                >> $INTERNAL_HEADER
+
+cat >> $INTERNAL_HEADER <<@
+#else
+@
+
+cat boxloop.h                     >> $INTERNAL_HEADER
+
+cat >> $INTERNAL_HEADER <<@
+#endif
+@
+
+cat >> $INTERNAL_HEADER <<@
 #ifdef __cplusplus
 extern "C" {
 #endif
-
 @
 
 #===========================================================================
 # Structures and prototypes
 #===========================================================================
 
-cat box.h                  >> $INTERNAL_HEADER	
+cat box.h                  >> $INTERNAL_HEADER  
 cat assumed_part.h         >> $INTERNAL_HEADER
 cat box_manager.h          >> $INTERNAL_HEADER
-cat struct_grid.h          >> $INTERNAL_HEADER	
-cat struct_stencil.h       >> $INTERNAL_HEADER	
-cat struct_communication.h >> $INTERNAL_HEADER	
-cat computation.h          >> $INTERNAL_HEADER	
-cat struct_matrix.h        >> $INTERNAL_HEADER	
-cat struct_vector.h        >> $INTERNAL_HEADER	
+cat struct_grid.h          >> $INTERNAL_HEADER  
+cat struct_stencil.h       >> $INTERNAL_HEADER  
+cat struct_communication.h >> $INTERNAL_HEADER  
+cat computation.h          >> $INTERNAL_HEADER  
+cat struct_matrix.h        >> $INTERNAL_HEADER  
+cat struct_vector.h        >> $INTERNAL_HEADER  
 
 cat protos.h               >> $INTERNAL_HEADER
 
diff -Nru hypre-2.11.2/src/struct_mv/_hypre_struct_mv.h hypre-2.13.0/src/struct_mv/_hypre_struct_mv.h
--- hypre-2.11.2/src/struct_mv/_hypre_struct_mv.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_mv/_hypre_struct_mv.h	2017-10-20 17:42:22.000000000 +0000
@@ -1,3 +1,2125 @@
+
+/*** DO NOT EDIT THIS FILE DIRECTLY (use 'headers' to generate) ***/
+
+
+#ifndef hypre_STRUCT_MV_HEADER
+#define hypre_STRUCT_MV_HEADER
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "HYPRE_struct_mv.h"
+#include "_hypre_utilities.h"
+
+#if defined(HYPRE_USE_RAJA)
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+/******************************************************************************
+ *
+ * Header info for the BoxLoop
+ *
+ *****************************************************************************/
+
+/*--------------------------------------------------------------------------
+ * BoxLoop macros:
+ *--------------------------------------------------------------------------*/
+
+#ifndef HYPRE_NEWBOXLOOP_HEADER
+#define HYPRE_NEWBOXLOOP_HEADER
+
+extern "C++" {
+#include <RAJA/RAJA.hxx>
+}
+using namespace RAJA;
+
+typedef struct hypre_Boxloop_struct
+{
+	HYPRE_Int lsize0,lsize1,lsize2;
+	HYPRE_Int strides0,strides1,strides2;
+	HYPRE_Int bstart0,bstart1,bstart2;
+	HYPRE_Int bsize0,bsize1,bsize2;
+} hypre_Boxloop;
+
+#define BLOCKSIZE 256
+
+#if defined(HYPRE_MEMORY_GPU)
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#define AxCheckError(err) CheckError(err, __FUNCTION__, __LINE__)
+inline void CheckError(cudaError_t const err, char const* const fun, const HYPRE_Int line)
+{
+    if (err)
+    {
+        printf("CUDA Error Code[%d]: %s\n%s() Line:%d\n", err, cudaGetErrorString(err), fun, line);
+		HYPRE_Int *p = NULL; *p = 1;
+    }
+}
+
+#define hypre_exec_policy cuda_exec<BLOCKSIZE>
+#define hypre_reduce_policy  cuda_reduce_atomic<BLOCKSIZE>
+#define hypre_fence() \
+cudaError err = cudaGetLastError();\
+if ( cudaSuccess != err ) {\
+printf("\n ERROR zypre_newBoxLoop: %s in %s(%d) function %s\n",cudaGetErrorString(err),__FILE__,__LINE__,__FUNCTION__); \
+}\
+AxCheckError(cudaDeviceSynchronize());
+
+#elif defined(HYPRE_USE_OPENMP)
+   #define hypre_exec_policy      omp_for_exec
+   #define hypre_reduce_policy omp_reduce
+   #define hypre_fence() 
+#elif defined(HYPRE_USING_OPENMP_ACC)
+   #define hypre_exec_policy      omp_parallel_for_acc
+   #define hypre_reduce_policy omp_acc_reduce
+#else 
+   #define hypre_exec_policy   seq_exec
+   #define hypre_reduce_policy seq_reduce
+   #define hypre_fence()
+#endif
+
+#define zypre_BoxLoopIncK(k,box,i)					\
+{									\
+   HYPRE_Int idx = idx_local;						\
+   local_idx  = idx % box.lsize0;					\
+   idx        = idx / box.lsize0;					\
+   i += (local_idx*box.strides0 + box.bstart0) * hypre_boxD##k;		\
+   hypre_boxD##k *= hypre_max(0, box.bsize0 + 1);			\
+   local_idx  = idx % box.lsize1;					\
+   idx        = idx / box.lsize1;					\
+   i += (local_idx*box.strides1 + box.bstart1) * hypre_boxD##k;		\
+   hypre_boxD##k *= hypre_max(0, box.bsize1 + 1);			\
+   local_idx  = idx % box.lsize2;					\
+   idx  = idx / box.lsize2;					\
+   i += (local_idx*box.strides2 + box.bstart2) * hypre_boxD##k;		\
+   hypre_boxD##k *= hypre_max(0, box.bsize2 + 1);			\
+}
+
+
+#define zypre_BoxLoopCUDAInit(ndim,loop_size)				\
+  HYPRE_Int hypre__tot = 1;						\
+  for (HYPRE_Int i = 0;i < ndim;i ++)					\
+      hypre__tot *= loop_size[i];
+
+
+#define zypre_BoxLoopCUDADeclare()										\
+	HYPRE_Int local_idx;												\
+	HYPRE_Int idx_local = idx;
+
+#define zypre_newBoxLoop0Begin(ndim, loop_size)			\
+{									\
+   zypre_BoxLoopCUDAInit(ndim,loop_size);					\
+   forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+   {
+
+
+#define zypre_newBoxLoop0End()					\
+	});											\
+	hypre_fence();      \
+}
+
+#define zypre_BoxLoopDataDeclareK(k,ndim,loop_size,dbox,start,stride)	\
+	hypre_Boxloop databox##k;					\
+	databox##k.lsize0 = loop_size[0];				\
+	databox##k.strides0 = stride[0];				\
+	databox##k.bstart0  = start[0] - dbox->imin[0];			\
+	databox##k.bsize0   = dbox->imax[0]-dbox->imin[0];		\
+	if (ndim > 1)							\
+	{								\
+	    databox##k.lsize1 = loop_size[1];				\
+	    databox##k.strides1 = stride[1];				\
+	    databox##k.bstart1  = start[1] - dbox->imin[1];		\
+	    databox##k.bsize1   = dbox->imax[1]-dbox->imin[1];   	\
+	}								\
+	else						        	\
+	{							       	\
+		databox##k.lsize1 = 1;				       	\
+		databox##k.strides1 = 0;		       		\
+		databox##k.bstart1  = 0;	       			\
+		databox##k.bsize1   = 0;		       		\
+	}								\
+	if (ndim == 3)							\
+	{								\
+	    databox##k.lsize2 = loop_size[2];				\
+	    databox##k.strides2 = stride[2];				\
+	    databox##k.bstart2  = start[2] - dbox->imin[2];		\
+	    databox##k.bsize2   = dbox->imax[2]-dbox->imin[2];   	\
+	}								\
+	else								\
+	{								\
+	    databox##k.lsize2 = 1;					\
+	    databox##k.strides2 = 0;					\
+	    databox##k.bstart2  = 0;					\
+	    databox##k.bsize2   = 0;					\
+	}
+
+#define zypre_newBoxLoop1Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1)		\
+{    														\
+    zypre_BoxLoopCUDAInit(ndim,loop_size);						\
+    zypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+    forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+    {									\
+      zypre_BoxLoopCUDADeclare();					\
+      HYPRE_Int hypre_boxD1 = 1;					\
+      HYPRE_Int i1 = 0;							\
+      zypre_BoxLoopIncK(1,databox1,i1);
+
+      
+#define zypre_newBoxLoop1End(i1)				\
+	});											\
+    hypre_fence();\
+}
+	
+#define zypre_newBoxLoop2Begin(ndim, loop_size,				\
+                                dbox1, start1, stride1, i1,	\
+                                dbox2, start2, stride2, i2)	\
+{    														\
+    zypre_BoxLoopCUDAInit(ndim,loop_size);						\
+    zypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+    zypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+    forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+    {									\
+        zypre_BoxLoopCUDADeclare()					\
+        HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1;			\
+		HYPRE_Int i1 = 0, i2 = 0;							\
+	local_idx  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	local_idx  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	local_idx  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+	
+
+
+#define zypre_newBoxLoop2End(i1, i2)			\
+	});											\
+    hypre_fence();\
+}
+
+#define zypre_newBoxLoop3Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1,		\
+			       dbox2, start2, stride2, i2,		\
+			       dbox3, start3, stride3, i3)		\
+  {									\
+  zypre_BoxLoopCUDAInit(ndim,loop_size);						\
+        zypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \
+        zypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \
+        zypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3); \
+        forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+	{								\
+	  zypre_BoxLoopCUDADeclare();					\
+	  HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1; \
+	  HYPRE_Int i1 = 0, i2 = 0, i3 = 0;				\
+	  local_idx  = idx_local % databox1.lsize0;				\
+	  idx_local  = idx_local / databox1.lsize0;				\
+	  i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1;	\
+	  hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);			\
+	  i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2;	\
+	  hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);			\
+	  i3 += (local_idx*databox3.strides0 + databox3.bstart0) * hypre_boxD3;	\
+	  hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1);			\
+	  local_idx  = idx_local % databox1.lsize1;				\
+	  idx_local  = idx_local / databox1.lsize1;				\
+	  i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1;	\
+	  hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);			\
+	  i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2;	\
+	  hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);			\
+	  i3 += (local_idx*databox3.strides1 + databox3.bstart1) * hypre_boxD3;	\
+	  hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1);			\
+	  local_idx  = idx_local % databox1.lsize2;				\
+	  idx_local  = idx_local / databox1.lsize2;				\
+	  i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1;	\
+	  hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);			\
+	  i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2;	\
+	  hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);			\
+	  i3 += (local_idx*databox3.strides2 + databox3.bstart2) * hypre_boxD3;	\
+	  hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1);			\
+	  
+
+#define zypre_newBoxLoop3End(i1, i2, i3)			\
+	});											\
+	hypre_fence();							\
+}
+
+#define zypre_newBoxLoop4Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1,		\
+			       dbox2, start2, stride2, i2,		\
+			       dbox3, start3, stride3, i3,		\
+			       dbox4, start4, stride4, i4)		\
+{								       \
+ zypre_BoxLoopCUDAInit(ndim,loop_size);					       \
+     zypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \
+     zypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \
+     zypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3); \
+     zypre_BoxLoopDataDeclareK(4,ndim,loop_size,dbox4,start4,stride4); \
+     forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+     {									\
+         zypre_BoxLoopCUDADeclare();					\
+		 HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1,hypre_boxD4 = 1; \
+	 HYPRE_Int i1 = 0, i2 = 0, i3 = 0,i4 = 0;			\
+	 local_idx  = idx_local % databox1.lsize0;			\
+	 idx_local  = idx_local / databox1.lsize0;			\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	 i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	 i3 += (local_idx*databox3.strides0 + databox3.bstart0) * hypre_boxD3; \
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1);		\
+	 i4 += (local_idx*databox4.strides0 + databox4.bstart0) * hypre_boxD4; \
+	 hypre_boxD4 *= hypre_max(0, databox4.bsize0 + 1);		\
+	 local_idx  = idx_local % databox1.lsize1;			\
+	 idx_local  = idx_local / databox1.lsize1;			\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	 i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	 i3 += (local_idx*databox3.strides1 + databox3.bstart1) * hypre_boxD3; \
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1);		\
+	 i4 += (local_idx*databox4.strides1 + databox4.bstart1) * hypre_boxD4; \
+	 hypre_boxD4 *= hypre_max(0, databox4.bsize1 + 1);		\
+	 local_idx  = idx_local % databox1.lsize2;			\
+	 idx_local  = idx_local / databox1.lsize2;			\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	 i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+	 i3 += (local_idx*databox3.strides2 + databox3.bstart2) * hypre_boxD3; \
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1);		\
+	 i4 += (local_idx*databox4.strides2 + databox4.bstart2) * hypre_boxD4; \
+	 hypre_boxD4 *= hypre_max(0, databox4.bsize2 + 1);		\
+	 
+#define zypre_newBoxLoop4End(i1, i2, i3, i4)	\
+  });						\
+  hypre_fence();				\
+}
+
+#define MAX_BLOCK BLOCKSIZE
+
+extern "C++" {
+#if defined(HYPRE_MEMORY_GPU)
+template<class T>
+class ReduceMult   
+{
+public:
+  /*!
+   * \brief Constructor takes initial reduction value (default constructor
+   * is disabled).
+   *
+   * Note: Constructor only executes on the host.
+   */
+  explicit ReduceMult(T init_val)
+  {
+    m_is_copy_host = false;
+    m_myID = getCudaReductionId();
+    getCudaReductionTallyBlock(m_myID,
+                               (void **)&m_tally_host,
+                               (void **)&m_tally_device);
+    m_tally_host->tally = init_val;
+  }
+
+  /*!
+   * \brief Initialize shared memory on device, request shared memory on host.
+   *
+   * Copy constructor executes on both host and device.
+   * On host requests dynamic shared memory and gets offset into dynamic
+   * shared memory if in forall.
+   * On device initializes dynamic shared memory to appropriate value.
+   */
+  RAJA_HOST_DEVICE
+  ReduceMult(const ReduceMult<T> &other)
+  {
+    *this = other;
+#if defined(__CUDA_ARCH__)
+    m_is_copy_device = true;
+    m_finish_reduction = !other.m_is_copy_device;
+    extern __shared__ unsigned char sd_block[];
+    T *sd = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
+
+    HYPRE_Int threadId = threadIdx.x + blockDim.x * threadIdx.y
+                   + (blockDim.x * blockDim.y) * threadIdx.z;
+
+    // initialize shared memory
+    T val = static_cast<T>(0);
+    for (HYPRE_Int i = BLOCKSIZE / 2; i > 0; i /= 2) {
+      // this descends all the way to 1
+      if (threadId < i) {
+        sd[threadId + i] = val;
+      }
+    }
+    if (threadId < 1) {
+      sd[threadId] = val;
+    }
+
+    __syncthreads();
+#else
+    m_is_copy_host = true;
+    m_smem_offset = getCudaSharedmemOffset(m_myID, BLOCKSIZE, sizeof(T));
+#endif
+  }
+
+  /*!
+   * \brief Finish reduction on device and free memory on host.
+   *
+   * Destruction on host releases the device memory chunk for
+   * reduction id and id itself for others to use.
+   * Destruction on device completes the reduction.
+   *
+   * Note: destructor executes on both host and device.
+   */
+  RAJA_HOST_DEVICE ~ReduceMult<T>()
+  {
+#if defined(__CUDA_ARCH__)
+    if (m_finish_reduction) {
+      extern __shared__ unsigned char sd_block[];
+      T *sd = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
+
+      HYPRE_Int threadId = threadIdx.x + blockDim.x * threadIdx.y
+                     + (blockDim.x * blockDim.y) * threadIdx.z;
+
+      T temp = 1;
+      __syncthreads();
+
+      for (HYPRE_Int i = BLOCKSIZE / 2; i >= WARP_SIZE; i /= 2) {
+        if (threadId < i) {
+          sd[threadId] *= sd[threadId + i];
+        }
+        __syncthreads();
+      }
+
+      if (threadId < WARP_SIZE) {
+        temp = sd[threadId];
+        for (HYPRE_Int i = WARP_SIZE / 2; i > 0; i /= 2) {
+          temp *= HIDDEN::shfl_xor<T>(temp, i);
+        }
+      }
+
+      // one thread adds to tally
+      if (threadId == 0) {
+        _atomicAdd<T>(&(m_tally_device->tally), temp);
+      }
+    }
+#else
+    if (!m_is_copy_host) {
+      releaseCudaReductionTallyBlock(m_myID);
+      releaseCudaReductionId(m_myID);
+    }
+#endif
+
+    
+  }
+
+  /*!
+   * \brief Operator that returns reduced sum value.
+   *
+   * Note: accessor only executes on host.
+   */
+  operator T()
+  {
+    beforeCudaReadTallyBlock<true>(m_myID);
+    return m_tally_host->tally;
+  }
+
+  /*!
+   * \brief Operator that returns reduced sum value.
+   *
+   * Note: accessor only executes on host.
+   */
+  T get() { return operator T(); }
+
+  /*!
+   * \brief Operator that adds value to sum.
+   *
+   * Note: only operates on device.
+   */
+  RAJA_DEVICE ReduceMult<T> const &
+  operator*=(T val) const
+  {
+    extern __shared__ unsigned char sd_block[];
+    T *sd = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
+
+    HYPRE_Int threadId = threadIdx.x + blockDim.x * threadIdx.y
+                   + (blockDim.x * blockDim.y) * threadIdx.z;
+
+    sd[threadId] *= val;
+
+    return *this;
+  }
+
+private:
+  /*!
+   * \brief Default constructor is declared private and not implemented.
+   */
+  ReduceMult<T>();
+
+  /*!
+   * \brief Pointer to host tally block cache slot for this reduction variable.
+   */
+  CudaReductionTallyTypeAtomic<T> *m_tally_host = nullptr;
+
+  /*!
+   * \brief Pointer to device tally block slot for this reduction variable.
+   */
+  CudaReductionTallyTypeAtomic<T> *m_tally_device = nullptr;
+
+  /*!
+   * \brief My cuda reduction variable ID.
+   */
+  HYPRE_Int m_myID = -1;
+
+  /*!
+   * \brief Byte offset into dynamic shared memory.
+   */
+  HYPRE_Int m_smem_offset = -1;
+
+  /*!
+   * \brief If this variable is a copy or not; only original may release memory 
+   *        or perform finalization.
+   */
+  bool m_is_copy_host = false;
+  bool m_is_copy_device = false;
+  bool m_finish_reduction = false;
+
+  // Sanity checks for block size and template type size
+  static constexpr bool powerOfTwoCheck = (!(BLOCKSIZE & (BLOCKSIZE - 1)));
+  static constexpr bool reasonableRangeCheck =
+      ((BLOCKSIZE >= 32) && (BLOCKSIZE <= 1024));
+  static constexpr bool sizeofcheck =
+      ((sizeof(T) <= sizeof(CudaReductionDummyDataType))
+       && (sizeof(CudaReductionTallyType<T>)
+           <= sizeof(CudaReductionDummyTallyType))
+       && (sizeof(CudaReductionBlockType<T>)
+           <= sizeof(CudaReductionDummyBlockType)));
+  static_assert(powerOfTwoCheck, "Error: block sizes must be a power of 2");
+  static_assert(reasonableRangeCheck,
+                "Error: block sizes must be between 32 and 1024");
+  static_assert(sizeofcheck,
+      "Error: type must be of size <= " 
+      RAJA_STRINGIFY_MACRO(RAJA_CUDA_REDUCE_VAR_MAXSIZE));
+};
+#elif defined(HYPRE_USING_OPENMP)
+    template <typename T>
+    class ReduceMult
+    {
+        using my_type = ReduceMult;
+        
+    public:
+        //
+        // Constructor takes default value (default ctor is disabled).
+        //
+        explicit ReduceMult(T init_val, T initializer = 1)
+        : m_parent(NULL), m_val(init_val), m_custom_init(initializer)
+        {
+        }
+        
+        //
+        // Copy ctor.
+        //
+        ReduceMult(const ReduceMult& other) :
+        m_parent(other.m_parent ? other.m_parent : &other),
+        m_val(other.m_custom_init),
+        m_custom_init(other.m_custom_init)
+        {
+        }
+        
+        //
+        // Destruction releases the shared memory block chunk for reduction id
+        // and id itself for others to use.
+        //
+        ~ReduceMult()
+        {
+            if (m_parent) {
+#pragma omp critical
+                {
+                    *m_parent *= m_val;
+                }
+            }
+        }
+        
+        //
+        // Operator that returns reduced sum value.
+        //
+        operator T()
+        {
+            return m_val;
+        }
+        
+        //
+        // Method that returns sum value.
+        //
+        T get() { return operator T(); }
+        
+        //
+        // += operator that adds value to sum for current thread.
+        //
+        const ReduceMult& operator*=(T rhs) const
+        {
+            this->m_val *= rhs;
+            return *this;
+        }
+        
+        ReduceMult& operator*=(T rhs)
+        {
+            this->m_val *= rhs;
+            return *this;
+        }
+        
+    private:
+        //
+        // Default ctor is declared private and not implemented.
+        //
+        ReduceMult();
+        
+        const my_type * m_parent;
+        
+        mutable T m_val;
+        T m_custom_init;
+        
+    };
+#else
+    template <typename T>
+    class ReduceMult
+    {
+        using my_type = ReduceMult;
+        
+    public:
+        //
+        // Constructor takes default value (default ctor is disabled).
+        //
+        explicit ReduceMult(T init_m_val, T initializer = 1) :
+        m_parent(NULL),
+        m_val(init_m_val),
+        m_custom_init(initializer)
+        {
+        }
+        
+        //
+        // Copy ctor.
+        //
+        ReduceMult(const ReduceMult& other) :
+        m_parent(other.m_parent ? other.m_parent : &other),
+        m_val(other.m_custom_init),
+        m_custom_init(other.m_custom_init)
+        {
+        }
+        
+        //
+        // Destruction releases the shared memory block chunk for reduction id
+        // and id itself for others to use.
+        //
+        ~ReduceMult()
+        {
+            if (m_parent) {
+                *m_parent *= m_val;
+            }
+        }
+        
+        //
+        // Operator that returns reduced sum value.
+        //
+        operator T()
+        {
+            return m_val;
+        }
+        
+        //
+        // Method that returns reduced sum value.
+        //
+        T get() { return operator T(); }
+        
+        //
+        // += operator that adds value to sum.
+        //
+        ReduceMult& operator*=(T rhs)
+        {
+            this->m_val *= rhs;
+            return *this;
+        }
+        
+        const ReduceMult& operator*=(T rhs) const
+        {
+            this->m_val *= rhs;
+            return *this;
+        }
+        
+    private:
+        //
+        // Default ctor is declared private and not implemented.
+        //
+        ReduceMult();
+        
+        const my_type * m_parent;
+        
+        mutable T m_val;
+        T m_custom_init;
+    };
+#endif
+}
+
+
+#define zypre_newBoxLoop1ReductionBegin(ndim, loop_size,		\
+					dbox1, start1, stride1, i1,sum) \
+{									\
+   HYPRE_Real sum_tmp;							\
+   {									\
+      ReduceSum< hypre_reduce_policy, HYPRE_Real> sum(0.0);				\
+      zypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1,i1)	\
+      {
+
+#define zypre_newBoxLoop1ReductionEnd(i1,sum)				\
+      }									\
+      zypre_newBoxLoop1End(i1);					\
+      hypre_fence();						\
+      sum_tmp = (HYPRE_Real)(sum);				\
+   }								\
+   sum += sum_tmp; \
+}
+		    
+#define zypre_newBoxLoop2ReductionBegin(ndim, loop_size,		\
+					dbox1, start1, stride1, i1,	\
+					dbox2, start2, stride2, i2,sum)	\
+{									\
+   HYPRE_Real sum_tmp;							\
+   {									\
+      ReduceSum< hypre_reduce_policy, HYPRE_Real> sum(0.0);				\
+      zypre_newBoxLoop2Begin(ndim, loop_size, \
+			     dbox1, start1, stride1,i1,\
+			     dbox2, start2, stride2,i2)	\
+      {
+
+#define zypre_newBoxLoop2ReductionEnd(i1,i2,sum)			\
+      }									\
+      zypre_newBoxLoop2End(i1,i2);					\
+      hypre_fence();							\
+      sum_tmp = (HYPRE_Real)(sum);					\
+   }								\
+   sum += sum_tmp; \
+}
+
+#define zypre_newBoxLoop1ReductionMult(ndim, loop_size,				\
+				       dbox1, start1, stride1, i1,xp,sum) \
+{									\
+   ReduceMult<HYPRE_Real> local_result_raja(1.0);				\
+   zypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \
+   {									\
+       local_result_raja *= xp[i1];					\
+   }									\
+   zypre_newBoxLoop1End(i1)						\
+   hypre_fence();							\
+   sum *= (HYPRE_Real)(local_result_raja);				\
+}
+
+
+#define hypre_LoopBegin(size,idx)					\
+{									\
+   forall< hypre_exec_policy >(0, size, [=] RAJA_DEVICE (HYPRE_Int idx)	\
+   {
+
+#define hypre_LoopEnd()					\
+   });							\
+   hypre_fence();		\
+}
+  
+#define zypre_BoxBoundaryCopyBegin(ndim, loop_size, stride1, i1, idx) 	\
+{									\
+    zypre_BoxLoopCUDAInit(ndim,loop_size);				\
+    hypre_Boxloop databox1;						\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];					\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+    {									\
+        zypre_BoxLoopCUDADeclare()					\
+        HYPRE_Int i1 = 0;							\
+        local_idx  = idx_local % databox1.lsize0;			\
+        idx_local  = idx_local / databox1.lsize0;			\
+        i1 += local_idx*databox1.strides0;				\
+        local_idx  = idx_local % databox1.lsize1;			\
+        idx_local  = idx_local / databox1.lsize1;			\
+        i1 += local_idx*databox1.strides1;				\
+        local_idx  = idx_local % databox1.lsize2;			\
+        idx_local  = idx_local / databox1.lsize2;			\
+        i1 += local_idx*databox1.strides2;				\
+		
+#define zypre_BoxBoundaryCopyEnd()				\
+	});											\
+	hypre_fence();							\
+}
+
+#define zypre_BoxDataExchangeBegin(ndim, loop_size,				\
+                                   stride1, i1,	\
+                                   stride2, i2)	\
+{    														\
+    zypre_BoxLoopCUDAInit(ndim,loop_size);					\
+    hypre_Boxloop databox1,databox2;					\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];					\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    databox2.lsize0 = loop_size[0];					\
+    databox2.lsize1 = loop_size[1];					\
+    databox2.lsize2 = loop_size[2];					\
+    databox2.strides0 = stride2[0];					\
+    databox2.strides1 = stride2[1];					\
+    databox2.strides2 = stride2[2];					\
+    forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+    {									\
+        zypre_BoxLoopCUDADeclare()					\
+	HYPRE_Int i1 = 0, i2 = 0;					\
+	local_idx  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += local_idx*databox1.strides0;				\
+	i2 += local_idx*databox2.strides0;				\
+	local_idx  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += local_idx*databox1.strides1;				\
+	i2 += local_idx*databox2.strides1;				\
+	local_idx  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += local_idx*databox1.strides2;				\
+	i2 += local_idx*databox2.strides2;
+
+
+
+#define zypre_BoxDataExchangeEnd()				\
+	});											\
+	hypre_fence();							\
+}
+
+#define zypre_newBoxLoop0For()
+
+#define zypre_newBoxLoop1For(i1)
+
+#define zypre_newBoxLoop2For(i1, i2) 
+ 
+#define zypre_newBoxLoop3For(i1, i2, i3)
+
+#define zypre_newBoxLoop4For(i1, i2, i3, i4)
+
+#define zypre_newBoxLoopSetOneBlock()
+
+#define hypre_newBoxLoopGetIndex(index)					\
+  index[0] = hypre__i; index[1] = hypre__j; index[2] = hypre__k
+
+#define hypre_BoxLoopGetIndex    zypre_BoxLoopGetIndex
+#define hypre_BoxLoopSetOneBlock zypre_newBoxLoopSetOneBlock
+#define hypre_BoxLoopBlock()       0
+#define hypre_BoxLoop0Begin      zypre_newBoxLoop0Begin
+#define hypre_BoxLoop0For        zypre_newBoxLoop0For
+#define hypre_BoxLoop0End        zypre_newBoxLoop0End
+#define hypre_BoxLoop1Begin      zypre_newBoxLoop1Begin
+#define hypre_BoxLoop1For        zypre_newBoxLoop1For
+#define hypre_BoxLoop1End        zypre_newBoxLoop1End
+#define hypre_BoxLoop2Begin      zypre_newBoxLoop2Begin
+#define hypre_BoxLoop2For        zypre_newBoxLoop2For
+#define hypre_BoxLoop2End        zypre_newBoxLoop2End
+#define hypre_BoxLoop3Begin      zypre_newBoxLoop3Begin
+#define hypre_BoxLoop3For        zypre_newBoxLoop3For
+#define hypre_BoxLoop3End        zypre_newBoxLoop3End
+#define hypre_BoxLoop4Begin      zypre_newBoxLoop4Begin
+#define hypre_BoxLoop4For        zypre_newBoxLoop4For
+#define hypre_BoxLoop4End        zypre_newBoxLoop4End
+
+#define hypre_newBoxLoop1ReductionBegin zypre_newBoxLoop1ReductionBegin
+#define hypre_newBoxLoop1ReductionEnd   zypre_newBoxLoop1ReductionEnd
+#define hypre_newBoxLoop2ReductionBegin zypre_newBoxLoop2ReductionBegin
+#define hypre_newBoxLoop2ReductionEnd   zypre_newBoxLoop2ReductionEnd
+#define hypre_newBoxLoop1ReductionMult zypre_newBoxLoop1ReductionMult
+#define hypre_BoxBoundaryCopyBegin zypre_BoxBoundaryCopyBegin
+#define hypre_BoxBoundaryCopyEnd zypre_BoxBoundaryCopyEnd
+#define hypre_BoxDataExchangeBegin zypre_BoxDataExchangeBegin
+#define hypre_BoxDataExchangeEnd zypre_BoxDataExchangeEnd
+#endif
+#elif defined(HYPRE_USE_KOKKOS)
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+/******************************************************************************
+ *
+ * Header info for the BoxLoop
+ *
+ *****************************************************************************/
+
+/*--------------------------------------------------------------------------
+ * BoxLoop macros:
+ *--------------------------------------------------------------------------*/
+
+#ifndef HYPRE_NEWBOXLOOP_HEADER
+#define HYPRE_NEWBOXLOOP_HEADER
+extern "C++" {
+#include <Kokkos_Core.hpp>
+}
+#if defined( KOKKOS_HAVE_MPI )
+#include <mpi.h>
+#endif
+
+ typedef struct hypre_Boxloop_struct
+ {
+	 HYPRE_Int lsize0,lsize1,lsize2;
+	 HYPRE_Int strides0,strides1,strides2;
+	 HYPRE_Int bstart0,bstart1,bstart2;
+	 HYPRE_Int bsize0,bsize1,bsize2;
+ } hypre_Boxloop;
+
+ #if defined(HYPRE_MEMORY_GPU)
+ #include <cuda.h>
+ #include <cuda_runtime.h>
+ #define AxCheckError(err) CheckError(err, __FUNCTION__, __LINE__)
+ inline void CheckError(cudaError_t const err, char const* const fun, const HYPRE_Int line)
+ {
+     if (err)
+     {
+	 printf("CUDA Error Code[%d]: %s\n%s() Line:%d\n", err, cudaGetErrorString(err), fun, line);
+     }
+ }
+ #define BLOCKSIZE 256
+
+ #define hypre_fence() \
+ cudaError err = cudaGetLastError();\
+ if ( cudaSuccess != err ) {\
+ printf("\n ERROR hypre_newBoxLoop: %s in %s(%d) function %s\n",cudaGetErrorString(err),__FILE__,__LINE__,__FUNCTION__); \
+ }\
+ AxCheckError(cudaDeviceSynchronize());
+ #elif defined(HYPRE_USE_OPENMP)
+    #define hypre_fence() ;
+ #elif defined(HYPRE_USING_OPENMP_ACC)
+ #define hypre_fence()  
+ #else 
+    #define hypre_fence();
+ #endif
+
+ #define hypre_newBoxLoopInit(ndim,loop_size)					\
+	 HYPRE_Int hypre__tot = 1;											\
+	 for (HYPRE_Int i = 0;i < ndim;i ++)									\
+		 hypre__tot *= loop_size[i];
+
+
+ #define hypre_BoxLoopIncK(k,box,i)					\
+ {									\
+    HYPRE_Int idx = idx_local;						\
+    local_idx  = idx % box.lsize0;					\
+    idx        = idx / box.lsize0;					\
+    i += (local_idx*box.strides0 + box.bstart0) * hypre_boxD##k;		\
+    hypre_boxD##k *= hypre_max(0, box.bsize0 + 1);			\
+    local_idx  = idx % box.lsize1;					\
+    idx        = idx / box.lsize1;					\
+    i += (local_idx*box.strides1 + box.bstart1) * hypre_boxD##k;		\
+    hypre_boxD##k *= hypre_max(0, box.bsize1 + 1);			\
+    local_idx  = idx % box.lsize2;					\
+    idx  = idx / box.lsize2;					\
+    i += (local_idx*box.strides2 + box.bstart2) * hypre_boxD##k;		\
+    hypre_boxD##k *= hypre_max(0, box.bsize2 + 1);			\
+ }
+
+ #define hypre_BoxLoopDataDeclareK(k,ndim,loop_size,dbox,start,stride)	\
+	 hypre_Boxloop databox##k;     					\
+	 databox##k.lsize0 = loop_size[0];				\
+	 databox##k.strides0 = stride[0];				\
+	 databox##k.bstart0  = start[0] - dbox->imin[0];		\
+	 databox##k.bsize0   = dbox->imax[0]-dbox->imin[0];		\
+	 if (ndim > 1)							\
+	 {								\
+	    databox##k.lsize1 = loop_size[1];				\
+	    databox##k.strides1 = stride[1];				\
+	    databox##k.bstart1  = start[1] - dbox->imin[1];		\
+	    databox##k.bsize1   = dbox->imax[1]-dbox->imin[1];   	\
+	 }								\
+	 else						        	\
+	 {							       	\
+	    databox##k.lsize1 = 1;				       	\
+	    databox##k.strides1 = 0;					\
+	    databox##k.bstart1  = 0;					\
+	    databox##k.bsize1   = 0;					\
+	 }								\
+	 if (ndim == 3)							\
+	 {								\
+	    databox##k.lsize2 = loop_size[2];				\
+	    databox##k.strides2 = stride[2];				\
+	    databox##k.bstart2  = start[2] - dbox->imin[2];		\
+	    databox##k.bsize2   = dbox->imax[2]-dbox->imin[2];	\
+	 }								\
+	 else								\
+	 {								\
+	   databox##k.lsize2 = 1;					\
+	   databox##k.strides2 = 0;					\
+	   databox##k.bstart2  = 0;					\
+	   databox##k.bsize2   = 0;					\
+	 }
+
+ #define hypre_newBoxLoopDeclare()										\
+	 HYPRE_Int local_idx;												\
+	 HYPRE_Int idx_local = idx;
+
+ #define hypre_newBoxLoop0Begin(ndim, loop_size) 	\
+ {									\
+     hypre_newBoxLoopInit(ndim,loop_size);					\
+     Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)		\
+     {
+
+
+ #define hypre_newBoxLoop0End(i1)				\
+	 });											\
+ }
+
+
+ #define hypre_newBoxLoop1Begin(ndim, loop_size,				\
+				dbox1, start1, stride1, i1)		\
+ {									\
+     hypre_newBoxLoopInit(ndim,loop_size)						\
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+     Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)		\
+     {									\
+       hypre_newBoxLoopDeclare();						\
+       HYPRE_Int hypre_boxD1 = 1;					\
+       HYPRE_Int i1 = 0;							\
+       local_idx  = idx_local % databox1.lsize0;				\
+       idx_local  = idx_local / databox1.lsize0;				\
+       i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+       hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);			\
+       local_idx  = idx_local % databox1.lsize1;				\
+       idx_local  = idx_local / databox1.lsize1;				\
+       i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+       hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);			\
+       local_idx  = idx_local % databox1.lsize2;				\
+       idx_local  = idx_local / databox1.lsize2;				\
+       i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+       hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);
+
+
+ #define hypre_newBoxLoop1End(i1)					\
+     });									\
+     hypre_fence();							\
+ }
+
+
+ #define hypre_newBoxLoop2Begin(ndim, loop_size,				\
+				dbox1, start1, stride1, i1,		\
+				dbox2, start2, stride2, i2)		\
+ {    														\
+     hypre_newBoxLoopInit(ndim,loop_size);						\
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+     hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+     Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)		\
+     {									\
+	 hypre_newBoxLoopDeclare()					\
+	 HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1;			\
+	 HYPRE_Int i1 = 0, i2 = 0;					\
+	 local_idx  = idx_local % databox1.lsize0;			\
+	 idx_local  = idx_local / databox1.lsize0;			\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	 i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	 local_idx  = idx_local % databox1.lsize1;			\
+	 idx_local  = idx_local / databox1.lsize1;			\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	 i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	 local_idx  = idx_local % databox1.lsize2;			\
+	 idx_local  = idx_local / databox1.lsize2;			\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	 i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+
+ #define hypre_newBoxLoop2End(i1, i2)			\
+      });							\
+      hypre_fence();						\
+ }
+
+
+ #define hypre_newBoxLoop3Begin(ndim, loop_size,\
+				dbox1, start1, stride1, i1,		\
+				dbox2, start2, stride2, i2,		\
+				dbox3, start3, stride3, i3)		\
+ {																	\
+  hypre_newBoxLoopInit(ndim,loop_size);						\
+      hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+      hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+      hypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3);	\
+      Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)		\
+      {									\
+	 hypre_newBoxLoopDeclare();					\
+	 HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1; \
+	 HYPRE_Int i1 = 0, i2 = 0, i3 = 0;				\
+	 local_idx  = idx_local % databox1.lsize0;				\
+	 idx_local  = idx_local / databox1.lsize0;				\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1;	\
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);			\
+	 i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2;	\
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);			\
+	 i3 += (local_idx*databox3.strides0 + databox3.bstart0) * hypre_boxD3;	\
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1);			\
+	 local_idx  = idx_local % databox1.lsize1;				\
+	 idx_local  = idx_local / databox1.lsize1;				\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1;	\
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);			\
+	 i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2;	\
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);			\
+	 i3 += (local_idx*databox3.strides1 + databox3.bstart1) * hypre_boxD3;	\
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1);			\
+	 local_idx  = idx_local % databox1.lsize2;				\
+	 idx_local  = idx_local / databox1.lsize2;				\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1;	\
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);			\
+	 i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2;	\
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);			\
+	 i3 += (local_idx*databox3.strides2 +databox3.bstart2) * hypre_boxD3;	\
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1);
+
+ #define hypre_newBoxLoop3End(i1, i2, i3)			\
+     });							\
+     hypre_fence();					\
+ }
+
+ #define hypre_newBoxLoop4Begin(ndim, loop_size,\
+				dbox1, start1, stride1, i1,		\
+				dbox2, start2, stride2, i2,		\
+				dbox3, start3, stride3, i3,		\
+				dbox4, start4, stride4, i4)		\
+ {									\
+  hypre_newBoxLoopInit(ndim,loop_size);						\
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+     hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+     hypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3);	\
+     hypre_BoxLoopDataDeclareK(4,ndim,loop_size,dbox4,start4,stride4);	\
+     Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)		\
+     {									\
+	 hypre_newBoxLoopDeclare();					\
+	 HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1,hypre_boxD4 = 1; \
+	 HYPRE_Int i1 = 0, i2 = 0, i3 = 0,i4 = 0;			\
+	 local_idx  = idx_local % databox1.lsize0;			\
+	 idx_local  = idx_local / databox1.lsize0;			\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	 i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	 i3 += (local_idx*databox3.strides0 + databox3.bstart0) * hypre_boxD3; \
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1);		\
+	 i4 += (local_idx*databox4.strides0 + databox4.bstart0) * hypre_boxD4; \
+	 hypre_boxD4 *= hypre_max(0, databox4.bsize0 + 1);		\
+	 local_idx  = idx_local % databox1.lsize1;			\
+	 idx_local  = idx_local / databox1.lsize1;			\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	 i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	 i3 += (local_idx*databox3.strides1 + databox3.bstart1) * hypre_boxD3; \
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1);		\
+	 i4 += (local_idx*databox4.strides1 + databox4.bstart1) * hypre_boxD4; \
+	 hypre_boxD4 *= hypre_max(0, databox4.bsize1 + 1);		\
+	 local_idx  = idx_local % databox1.lsize2;			\
+	 idx_local  = idx_local / databox1.lsize2;			\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	 i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+	 i3 += (local_idx*databox3.strides2 + databox3.bstart2) * hypre_boxD3; \
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1);		\
+	 i4 += (local_idx*databox4.strides2 + databox4.bstart2) * hypre_boxD4; \
+	 hypre_boxD4 *= hypre_max(0, databox4.bsize2 + 1);		\
+
+
+ #define hypre_newBoxLoop4End(i1, i2, i3, i4)		\
+     });							\
+     hypre_fence();					\
+ }
+
+ #define hypre_newBoxLoop1ReductionBegin(ndim, loop_size,		\
+					 dbox1, start1, stride1, i1, sum) \
+ {									\
+     HYPRE_Real sum_tmp = sum;						\
+     sum = 0;								\
+     hypre_newBoxLoopInit(ndim,loop_size);					\
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+     Kokkos::parallel_reduce (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx,HYPRE_Real &sum) \
+     {									\
+	 hypre_newBoxLoopDeclare()					\
+	 HYPRE_Int hypre_boxD1 = 1;					\
+	 HYPRE_Int i1 = 0;						\
+	 local_idx  = idx_local % databox1.lsize0;			\
+	 idx_local  = idx_local / databox1.lsize0;			\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	 local_idx  = idx_local % databox1.lsize1;			\
+	 idx_local  = idx_local / databox1.lsize1;			\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	 local_idx  = idx_local % databox1.lsize2;			\
+	 idx_local  = idx_local / databox1.lsize2;			\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+
+
+
+ #define hypre_newBoxLoop1ReductionEnd(i1, sum)				\
+     },sum);								\
+     hypre_fence();							\
+     sum += sum_tmp;							\
+ }
+
+ #define hypre_newBoxLoop2ReductionBegin(ndim, loop_size,		\
+					 dbox1, start1, stride1, i1,	\
+					 dbox2, start2, stride2, i2, sum) \
+ {									\
+     HYPRE_Real sum_tmp = sum;						\
+     sum = 0.0;								\
+     hypre_newBoxLoopInit(ndim,loop_size);				\
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+     hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+     Kokkos::parallel_reduce (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx,HYPRE_Real &sum) \
+     {									\
+	 hypre_newBoxLoopDeclare()					\
+	 HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1;			\
+	 HYPRE_Int i1 = 0, i2 = 0;					\
+	 local_idx  = idx_local % databox1.lsize0;			\
+	 idx_local  = idx_local / databox1.lsize0;			\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	 i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	 local_idx  = idx_local % databox1.lsize1;			\
+	 idx_local  = idx_local / databox1.lsize1;			\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	 i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	 local_idx  = idx_local % databox1.lsize2;			\
+	 idx_local  = idx_local / databox1.lsize2;			\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	 i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+
+
+ #define hypre_newBoxLoop2ReductionEnd(i1, i2, sum)			\
+     },sum);								\
+     hypre_fence();							\
+     sum +=sum_tmp;							\
+ }
+
+ #define hypre_newBoxLoop1ReductionMult(ndim, loop_size,		\
+					dbox1, start1, stride1, i1, xp, sum) \
+ {									\
+     HYPRE_Real sum_tmp = sum;						\
+     sum = 1.0;								\
+     hypre_newBoxLoopInit(ndim,loop_size);						\
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+     Kokkos::parallel_reduce (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx,HYPRE_Real &sum) \
+     {									\
+	 hypre_newBoxLoopDeclare()					\
+	 HYPRE_Int hypre_boxD1 = 1;					\
+	 HYPRE_Int i1 = 0;						\
+	 local_idx  = idx_local % databox1.lsize0;			\
+	 idx_local  = idx_local / databox1.lsize0;			\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	 local_idx  = idx_local % databox1.lsize1;			\
+	 idx_local  = idx_local / databox1.lsize1;			\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	 local_idx  = idx_local % databox1.lsize2;			\
+	 idx_local  = idx_local / databox1.lsize2;			\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	 sum *= xp[i1];							\
+     },sum);								\
+     hypre_fence();							\
+     sum *=sum_tmp;								\
+}
+
+
+#define hypre_LoopBegin(size,idx)					\
+{    														\
+    Kokkos::parallel_for(size, KOKKOS_LAMBDA (HYPRE_Int idx)	\
+    {
+
+#define hypre_LoopEnd()							\
+    });									\
+    hypre_fence();							\
+}
+  
+#define hypre_BoxBoundaryCopyBegin(ndim, loop_size, stride1, i1, idx) 	\
+{    														\
+    HYPRE_Int hypre__tot = 1;											\
+    hypre_Boxloop databox1;						\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];					\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    for (HYPRE_Int d = 0;d < ndim;d ++)					\
+    {									\
+       hypre__tot *= loop_size[d];					\
+    }									\
+    Kokkos::parallel_for(hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)	\
+    {									\
+        hypre_newBoxLoopDeclare()					\
+        HYPRE_Int i1 = 0;						\
+	local_idx  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += local_idx*databox1.strides0;				\
+	local_idx  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += local_idx*databox1.strides1;				\
+	local_idx  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += local_idx*databox1.strides2;				\
+		
+#define hypre_BoxBoundaryCopyEnd()				\
+	});							\
+	hypre_fence();						\
+}
+
+#define hypre_BoxDataExchangeBegin(ndim, loop_size,				\
+                                   stride1, i1,	\
+                                   stride2, i2)	\
+{    														\
+    HYPRE_Int hypre__tot = 1;											\
+    hypre_Boxloop databox1,databox2;					\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];					\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    databox2.lsize0 = loop_size[0];					\
+    databox2.lsize1 = loop_size[1];					\
+    databox2.lsize2 = loop_size[2];					\
+    databox2.strides0 = stride2[0];					\
+    databox2.strides1 = stride2[1];					\
+    databox2.strides2 = stride2[2];					\
+    for (HYPRE_Int d = 0;d < ndim;d ++)					\
+      {									\
+	hypre__tot *= loop_size[d];					\
+      }									\
+    Kokkos::parallel_for(hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)	\
+    {									\
+        hypre_newBoxLoopDeclare()					\
+	HYPRE_Int i1 = 0, i2 = 0;					\
+	local_idx  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += local_idx*databox1.strides0;				\
+	i2 += local_idx*databox2.strides0;				\
+	local_idx  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += local_idx*databox1.strides1;				\
+	i2 += local_idx*databox2.strides1;				\
+	local_idx  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += local_idx*databox1.strides2;				\
+	i2 += local_idx*databox2.strides2;
+
+
+
+#define hypre_BoxDataExchangeEnd()				\
+	});											\
+	hypre_fence();							\
+}
+
+#define zypre_newBoxLoop0For()
+
+#define zypre_newBoxLoop1For(i1)
+
+#define zypre_newBoxLoop2For(i1, i2) 
+ 
+#define zypre_newBoxLoop3For(i1, i2, i3)
+
+#define zypre_newBoxLoop4For(i1, i2, i3, i4)
+ 
+#define hypre_newBoxLoopSetOneBlock() {}
+
+#define hypre_newBoxLoopGetIndex(index)					\
+  index[0] = hypre__i; index[1] = hypre__j; index[2] = hypre__k
+
+#define hypre_BoxLoopGetIndex    zypre_BoxLoopGetIndex
+#define hypre_BoxLoopSetOneBlock hypre_newBoxLoopSetOneBlock
+#define hypre_BoxLoopBlock()       0
+#define hypre_BoxLoop0Begin      hypre_newBoxLoop0Begin
+#define hypre_BoxLoop0For        hypre_newBoxLoop0For
+#define hypre_BoxLoop0End        hypre_newBoxLoop0End
+#define hypre_BoxLoop1Begin      hypre_newBoxLoop1Begin
+#define hypre_BoxLoop1For        hypre_newBoxLoop1For
+#define hypre_BoxLoop1End        hypre_newBoxLoop1End
+#define hypre_BoxLoop2Begin      hypre_newBoxLoop2Begin
+#define hypre_BoxLoop2For        hypre_newBoxLoop2For
+#define hypre_BoxLoop2End        hypre_newBoxLoop2End
+#define hypre_BoxLoop3Begin      hypre_newBoxLoop3Begin
+#define hypre_BoxLoop3For        hypre_newBoxLoop3For
+#define hypre_BoxLoop3End        hypre_newBoxLoop3End
+#define hypre_BoxLoop4Begin      hypre_newBoxLoop4Begin
+#define hypre_BoxLoop4For        hypre_newBoxLoop4For
+#define hypre_BoxLoop4End        hypre_newBoxLoop4End
+
+//#define hypre_newBoxLoop1ReductionBegin hypre_newBoxLoop1ReductionBegin
+//#define hypre_newBoxLoop1ReductionEnd   hypre_newBoxLoop1ReductionEnd
+//#define hypre_newBoxLoop2ReductionBegin hypre_newBoxLoop2ReductionBegin
+//#define hypre_newBoxLoop2ReductionEnd   hypre_newBoxLoop2ReductionEnd
+//#define hypre_newBoxLoop1ReductionMult hypre_newBoxLoop1ReductionMult
+//#define hypre_BoxBoundaryCopyBegin zypre_BoxBoundaryCopyBegin
+//#define hypre_BoxBoundaryCopyEnd zypre_BoxBoundaryCopyEnd
+//#define hypre_BoxDataExchangeBegin zypre_BoxDataExchangeBegin
+//#define hypre_BoxDataExchangeEnd zypre_BoxDataExchangeEnd
+
+#endif
+#elif defined(HYPRE_USE_CUDA)
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+/******************************************************************************
+ *
+ * Header info for the BoxLoop
+ *
+ *****************************************************************************/
+
+/*--------------------------------------------------------------------------
+ * BoxLoop macros:
+ *--------------------------------------------------------------------------*/
+
+#ifndef HYPRE_NEWBOXLOOP_HEADER
+#define HYPRE_NEWBOXLOOP_HEADER
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+struct cuda_traversal {HYPRE_Int cuda;};
+struct omp_traversal  {HYPRE_Int omp;};
+#define hypre_exec_policy cuda_traversal()
+#define HYPER_LAMBDA [=] __device__
+
+typedef struct hypre_Boxloop_struct
+{
+	HYPRE_Int lsize0,lsize1,lsize2;
+	HYPRE_Int strides0,strides1,strides2;
+	HYPRE_Int bstart0,bstart1,bstart2;
+	HYPRE_Int bsize0,bsize1,bsize2;
+} hypre_Boxloop;
+
+#define AxCheckError(err) CheckError(err, __FUNCTION__, __LINE__)
+inline void CheckError(cudaError_t const err, char const* const fun, const HYPRE_Int line)
+{
+    if (err)
+    {
+        printf("CUDA Error Code[%d]: %s\n%s() Line:%d\n", err, cudaGetErrorString(err), fun, line);
+    }
+}
+#define BLOCKSIZE 128
+
+#define hypre_fence() \
+  cudaError err = cudaGetLastError();		\
+if ( cudaSuccess != err )\
+{\
+   printf("\n ERROR hypre_newBoxLoop: %s in %s(%d) function %s\n",cudaGetErrorString(err),__FILE__,__LINE__,__FUNCTION__);\
+}									\
+AxCheckError(cudaDeviceSynchronize());
+
+extern "C++" {
+template <typename LOOP_BODY>
+__global__ void forall_kernel(LOOP_BODY loop_body, HYPRE_Int length)
+{
+	HYPRE_Int idx = blockDim.x * blockIdx.x + threadIdx.x;
+	if (idx < length)
+		loop_body(idx);
+}
+
+template<typename LOOP_BODY>
+void BoxLoopforall (cuda_traversal, HYPRE_Int length, LOOP_BODY loop_body)
+{	
+	size_t const blockSize = 128;
+	size_t gridSize  = (length + blockSize - 1) / blockSize;
+	if (gridSize == 0) gridSize = 1;
+	
+	//hypre_printf("length= %d, blocksize = %d, gridsize = %d\n",length,blockSize,gridSize);
+	forall_kernel<<<gridSize, blockSize>>>(loop_body,length);
+}
+
+template<typename LOOP_BODY>
+void BoxLoopforall (omp_traversal, HYPRE_Int length, LOOP_BODY loop_body)
+{
+
+#pragma omp parallel for schedule(static)
+	for (HYPRE_Int idx = 0;idx < length;idx++)
+		loop_body(idx);
+}
+
+#define zypre_BoxLoopIncK(k,box,i)					\
+{       								\
+HYPRE_Int idx = idx_local;						\
+local_idx  = idx % box.lsize0;					\
+idx        = idx / box.lsize0;					\
+i += (local_idx*box.strides0 + box.bstart0) * hypre_boxD##k;		\
+hypre_boxD##k *= hypre_max(0, box.bsize0 + 1);			\
+local_idx  = idx % box.lsize1;					\
+idx        = idx / box.lsize1;					\
+i += (local_idx*box.strides1 + box.bstart1) * hypre_boxD##k;		\
+hypre_boxD##k *= hypre_max(0, box.bsize1 + 1);			\
+local_idx  = idx % box.lsize2;					\
+idx  = idx / box.lsize2;					\
+i += (local_idx*box.strides2 + box.bstart2) * hypre_boxD##k;		\
+hypre_boxD##k *= hypre_max(0, box.bsize2 + 1);			\
+}
+
+
+template<class T>
+__global__ void reduction_mult (T * a, T * b, HYPRE_Int hypre__tot,
+				hypre_Boxloop box1)
+{
+    HYPRE_Int id = (blockIdx.x * blockDim.x) + threadIdx.x;
+    HYPRE_Int local_idx;
+    HYPRE_Int idx_local = id;
+    HYPRE_Int hypre_boxD1 = 1;
+    HYPRE_Int i1 = 0;
+    //// reducted output
+    __shared__ T shared_cache [BLOCKSIZE];
+    T sum = 1;
+    local_idx  = idx_local % box1.lsize0;
+    idx_local  = idx_local / box1.lsize0;
+    i1 += (local_idx*box1.strides0 + box1.bstart0) * hypre_boxD1;
+    hypre_boxD1 *= hypre_max(0, box1.bsize0 + 1);
+    local_idx  = idx_local % box1.lsize1;
+    idx_local  = idx_local / box1.lsize1;
+    i1 += (local_idx*box1.strides1 + box1.bstart1) * hypre_boxD1;
+    hypre_boxD1 *= hypre_max(0, box1.bsize1 + 1);	
+    local_idx  = idx_local % box1.lsize2;	      
+    idx_local  = idx_local / box1.lsize2;		      
+    i1 += (local_idx*box1.strides2 + box1.bstart2) * hypre_boxD1;
+    hypre_boxD1 *= hypre_max(0, box1.bsize2 + 1);	
+    if (id < hypre__tot)
+      sum = a[i1];
+    *(shared_cache + threadIdx.x) = sum;
+    
+    __syncthreads();
+    
+    ///////// sum of internal cache
+    
+    HYPRE_Int i;    
+    
+    for (i=(BLOCKSIZE /2); i>0 ; i= i/2){
+      if (threadIdx.x < i){
+	*(shared_cache + threadIdx.x) *= *(shared_cache + threadIdx.x + i);
+      }
+      __syncthreads();
+    }
+    
+    if ( threadIdx.x == 0){
+      *(b+ blockIdx.x) = shared_cache[0];
+    }
+}
+}
+
+#define hypre_BoxLoopInit(ndim,loop_size)					\
+	HYPRE_Int hypre__tot = 1;											\
+	for (HYPRE_Int i = 0;i < ndim;i ++)									\
+		hypre__tot *= loop_size[i];
+
+
+#define hypre_newBoxLoopDeclare()\
+	HYPRE_Int hypre__i,hypre__j,hypre__k;\
+	HYPRE_Int idx_local = idx;
+
+#define hypre_newBoxLoop0Begin(ndim, loop_size)				\
+{									\
+    hypre_BoxLoopInit(ndim,loop_size);						\
+    BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+    {
+
+#define hypre_newBoxLoop0End()					\
+    });									\
+    hypre_fence();							\
+}
+
+#define hypre_BoxLoopDataDeclareK(k,ndim,loop_size,dbox,start,stride)	\
+	hypre_Boxloop databox##k;											\
+	databox##k.lsize0 = loop_size[0];				\
+	databox##k.strides0 = stride[0];				\
+	databox##k.bstart0  = start[0] - dbox->imin[0];			\
+	databox##k.bsize0   = dbox->imax[0]-dbox->imin[0];		\
+	if (ndim > 1)							\
+	{								\
+	    databox##k.lsize1 = loop_size[1];				\
+	    databox##k.strides1 = stride[1];				\
+	    databox##k.bstart1  = start[1] - dbox->imin[1];		\
+	    databox##k.bsize1   = dbox->imax[1]-dbox->imin[1];	\
+	}								\
+	else						        	\
+	{							       	\
+		databox##k.lsize1 = 1;				       	\
+		databox##k.strides1 = 0;		       		\
+		databox##k.bstart1  = 0;	       			\
+		databox##k.bsize1   = 0;		       		\
+	}								\
+	if (ndim == 3)							\
+	{							      	\
+	      databox##k.lsize2 = loop_size[2];				\
+	      databox##k.strides2 = stride[2];				\
+	      databox##k.bstart2  = start[2] - dbox->imin[2];		\
+	      databox##k.bsize2   = dbox->imax[2]-dbox->imin[2];	\
+	}				                        	\
+	else						        	\
+	{							       	\
+		databox##k.lsize2 = 1;				       	\
+		databox##k.strides2 = 0;		       		\
+		databox##k.bstart2  = 0;	       			\
+		databox##k.bsize2   = 0;		       		\
+	}
+
+#define hypre_newBoxLoop1Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1)		\
+{									\
+    hypre_BoxLoopInit(ndim,loop_size);				\
+    hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+    BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+    {									\
+      hypre_newBoxLoopDeclare();					\
+      HYPRE_Int hypre_boxD1 = 1;					\
+      HYPRE_Int i1 = 0;							\
+      hypre__i  = idx_local % databox1.lsize0;				\
+      idx_local = idx_local / databox1.lsize0;				\
+      i1 += (hypre__i*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+      hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);			\
+      hypre__j  = idx_local % databox1.lsize1;				\
+      idx_local = idx_local / databox1.lsize1;				\
+      i1 += (hypre__j*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+      hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);			\
+      hypre__k  = idx_local % databox1.lsize2;				\
+      idx_local = idx_local / databox1.lsize2;				\
+      i1 += (hypre__k*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+      hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);
+      
+#define hypre_newBoxLoop1End(i1)				\
+    });									\
+    hypre_fence();							\
+}
+	
+#define hypre_newBoxLoop2Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1,		\
+			       dbox2, start2, stride2, i2)		\
+{									\
+    hypre_BoxLoopInit(ndim,loop_size);						\
+    hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+    hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+    BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+    {									\
+        hypre_newBoxLoopDeclare()					\
+        HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1;			\
+	HYPRE_Int i1 = 0, i2 = 0;					\
+	hypre__i  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += (hypre__i*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	i2 += (hypre__i*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	hypre__j  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += (hypre__j*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	i2 += (hypre__j*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	hypre__k  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += (hypre__k*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	i2 += (hypre__k*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+
+#define hypre_newBoxLoop2End(i1, i2)			\
+    });							\
+    hypre_fence();					\
+}
+
+#define hypre_newBoxLoop3Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1,		\
+			       dbox2, start2, stride2, i2,		\
+			       dbox3, start3, stride3, i3)		\
+{									\
+    hypre_BoxLoopInit(ndim,loop_size);						\
+    hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+    hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+    hypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3);	\
+    BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+    {									\
+	hypre_newBoxLoopDeclare();					\
+	HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1; \
+	HYPRE_Int i1 = 0, i2 = 0, i3 = 0;				\
+	hypre__i  = idx_local % databox1.lsize0;				\
+	idx_local  = idx_local / databox1.lsize0;				\
+	i1 += (hypre__i*databox1.strides0 + databox1.bstart0) * hypre_boxD1;	\
+	hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);			\
+	i2 += (hypre__i*databox2.strides0 + databox2.bstart0) * hypre_boxD2;	\
+	hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);			\
+	i3 += (hypre__i*databox3.strides0 + databox3.bstart0) * hypre_boxD3;	\
+	hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1);			\
+	hypre__j   = idx_local % databox1.lsize1;				\
+	idx_local  = idx_local / databox1.lsize1;				\
+	i1 += (hypre__j*databox1.strides1 + databox1.bstart1) * hypre_boxD1;	\
+	hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);			\
+	i2 += (hypre__j*databox2.strides1 + databox2.bstart1) * hypre_boxD2;	\
+	hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);			\
+	i3 += (hypre__j*databox3.strides1 + databox3.bstart1) * hypre_boxD3;	\
+	hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1);			\
+	hypre__k  = idx_local % databox1.lsize2;				\
+	idx_local  = idx_local / databox1.lsize2;				\
+	i1 += (hypre__k*databox1.strides2 + databox1.bstart2) * hypre_boxD1;	\
+	hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);			\
+	i2 += (hypre__k*databox2.strides2 + databox2.bstart2) * hypre_boxD2;	\
+	hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);			\
+	i3 += (hypre__k*databox3.strides2 +databox3.bstart2) * hypre_boxD3;	\
+	hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1);			\
+	
+
+#define hypre_newBoxLoop3End(i1, i2,i3)			\
+    });									\
+    hypre_fence();							\
+}
+
+#define hypre_newBoxLoop4Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1,		\
+			       dbox2, start2, stride2, i2,		\
+			       dbox3, start3, stride3, i3,		\
+			       dbox4, start4, stride4, i4)		\
+{								       \
+     hypre_BoxLoopInit(ndim,loop_size);			       \
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \
+     hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \
+     hypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3); \
+     hypre_BoxLoopDataDeclareK(4,ndim,loop_size,dbox4,start4,stride4); \
+     BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+     {									\
+        hypre_newBoxLoopDeclare();					\
+	HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1,hypre_boxD4 = 1; \
+	HYPRE_Int i1 = 0, i2 = 0, i3 = 0,i4 = 0;			\
+	hypre__i  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += (hypre__i*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	i2 += (hypre__i*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	i3 += (hypre__i*databox3.strides0 + databox3.bstart0) * hypre_boxD3; \
+	hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1);		\
+	i4 += (hypre__i*databox4.strides0 + databox4.bstart0) * hypre_boxD4; \
+	hypre_boxD4 *= hypre_max(0, databox4.bsize0 + 1);		\
+	hypre__j  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += (hypre__j*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	i2 += (hypre__j*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	i3 += (hypre__j*databox3.strides1 + databox3.bstart1) * hypre_boxD3; \
+	hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1);		\
+	i4 += (hypre__j*databox4.strides1 + databox4.bstart1) * hypre_boxD4; \
+	hypre_boxD4 *= hypre_max(0, databox4.bsize1 + 1);		\
+	hypre__k  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += (hypre__k*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	i2 += (hypre__k*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+	i3 += (hypre__k*databox3.strides2 + databox3.bstart2) * hypre_boxD3; \
+	hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1);		\
+	i4 += (hypre__k*databox4.strides2 + databox4.bstart2) * hypre_boxD4; \
+	hypre_boxD4 *= hypre_max(0, databox4.bsize2 + 1);		\
+		
+#define hypre_newBoxLoop4End(i1, i2, i3, i4)	\
+    });						\
+    hypre_fence();				\
+}
+
+#define MAX_BLOCK 512
+
+extern "C++" {
+template<class T>
+__inline__ __device__
+HYPRE_Int fake_shfl_down(T val, HYPRE_Int offset, HYPRE_Int width=32) {
+  static __shared__ T shared[MAX_BLOCK];
+  HYPRE_Int lane=threadIdx.x%32;
+
+  shared[threadIdx.x]=val;
+  __syncthreads();
+
+  val = (lane+offset<width) ? shared[threadIdx.x+offset] : 0;
+  __syncthreads();
+
+  return val;
+}
+
+template<class T>  
+__inline__ __device__
+HYPRE_Real warpReduceSum (T val) {
+  for (HYPRE_Int offset = warpSize/2; offset > 0; offset /= 2)
+    val += __shfl_down(val,offset);
+  return val;
+}
+
+
+template<class T> 
+__inline__ __device__
+HYPRE_Real blockReduceSum(T val) {
+  static __shared__ T shared[32];
+  HYPRE_Int lane=threadIdx.x%warpSize;
+  HYPRE_Int wid=threadIdx.x/warpSize;
+  val=warpReduceSum<T>(val);
+
+  //write reduced value to shared memory
+  if(lane==0) shared[wid]=val;
+  __syncthreads();
+
+  //ensure we only grab a value from shared memory if that warp existed
+  val = (threadIdx.x<blockDim.x/warpSize) ? shared[lane] : HYPRE_Int(0);
+  if(wid==0) val=warpReduceSum<T>(val);
+
+  return val;
+}
+
+template<class T>
+__global__ void hypre_device_reduce_stable_kernel(T*a, T*b, T* out, HYPRE_Int N,
+						  hypre_Boxloop box1,hypre_Boxloop box2) {
+  HYPRE_Int local_idx;
+  HYPRE_Int idx_local;
+  HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1;
+  HYPRE_Int i1 = 0, i2 = 0;
+  T sum=T(0);
+  HYPRE_Int i;
+  
+  for(i=blockIdx.x*blockDim.x+threadIdx.x;i<N;i+=blockDim.x*gridDim.x)
+  {
+    idx_local = i;
+    local_idx  = idx_local % box1.lsize0;
+    idx_local  = idx_local / box1.lsize0;
+    i1 += (local_idx*box1.strides0 + box1.bstart0) * hypre_boxD1;
+    hypre_boxD1 *= hypre_max(0, box1.bsize0 + 1);
+    i2 += (local_idx*box2.strides0 + box2.bstart0) * hypre_boxD2;
+    hypre_boxD2 *= hypre_max(0, box2.bsize0 + 1);
+    local_idx  = idx_local % box1.lsize1;
+    idx_local  = idx_local / box1.lsize1;
+    i1 += (local_idx*box1.strides1 + box1.bstart1) * hypre_boxD1;
+    hypre_boxD1 *= hypre_max(0, box1.bsize1 + 1);
+    i2 += (local_idx*box2.strides1 + box2.bstart1) * hypre_boxD2;   
+    hypre_boxD2 *= hypre_max(0, box2.bsize1 + 1);	
+    local_idx  = idx_local % box1.lsize2;	      
+    idx_local  = idx_local / box1.lsize2;		      
+    i1 += (local_idx*box1.strides2 + box1.bstart2) * hypre_boxD1;
+    hypre_boxD1 *= hypre_max(0, box1.bsize2 + 1);	
+    i2 += (local_idx*box2.strides2 + box2.bstart2) * hypre_boxD2;
+    hypre_boxD2 *= hypre_max(0, box2.bsize2 + 1);
+    sum += a[i1] * hypre_conj(b[i2]);
+  }
+  sum=blockReduceSum<T>(sum);
+  if(threadIdx.x==0)
+    out[blockIdx.x]=sum;
+}
+
+template<class T>       
+__global__ void hypre_device_reduce_stable_kernel2(T *in, T* out, HYPRE_Int N) {
+  T sum=T(0);
+  for(HYPRE_Int i=blockIdx.x*blockDim.x+threadIdx.x;i<N;i+=blockDim.x*gridDim.x) {
+    sum+=in[i];
+  }
+  sum=blockReduceSum<T>(sum);
+  if(threadIdx.x==0)
+    out[blockIdx.x]=sum;
+}
+
+template<class T>   
+void hypre_device_reduce_stable(T*a,T*b, T* out, HYPRE_Int N,
+				hypre_Boxloop box1,hypre_Boxloop box2) {
+  HYPRE_Int threads=512;
+  HYPRE_Int blocks=min((N+threads-1)/threads,1024);
+
+  hypre_device_reduce_stable_kernel<T><<<blocks,threads>>>(a,b,out,N,box1,box2);
+  hypre_device_reduce_stable_kernel2<T><<<1,1024>>>(out,out,blocks); 
+}
+
+}
+
+extern "C++" {
+template <typename LOOP_BODY>
+__global__ void hypre_device_reduction_kernel(HYPRE_Real* out,
+					      HYPRE_Int N,hypre_Boxloop box1,hypre_Boxloop box2,
+					      LOOP_BODY loop_body)
+{
+    HYPRE_Int local_idx;
+    HYPRE_Int idx_local;
+    HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1;
+    HYPRE_Int i1 = 0, i2 = 0;
+    HYPRE_Real sum = HYPRE_Real(0);
+    HYPRE_Int i;
+    
+    for(i=blockIdx.x*blockDim.x+threadIdx.x;i<N;i+=blockDim.x*gridDim.x)
+      {
+	idx_local = i;
+	local_idx  = idx_local % box1.lsize0;
+	idx_local  = idx_local / box1.lsize0;
+	i1 += (local_idx*box1.strides0 + box1.bstart0) * hypre_boxD1;
+	hypre_boxD1 *= hypre_max(0, box1.bsize0 + 1);
+	i2 += (local_idx*box2.strides0 + box2.bstart0) * hypre_boxD2;
+	hypre_boxD2 *= hypre_max(0, box2.bsize0 + 1);
+	local_idx  = idx_local % box1.lsize1;
+	idx_local  = idx_local / box1.lsize1;
+	i1 += (local_idx*box1.strides1 + box1.bstart1) * hypre_boxD1;
+	hypre_boxD1 *= hypre_max(0, box1.bsize1 + 1);
+	i2 += (local_idx*box2.strides1 + box2.bstart1) * hypre_boxD2;   
+	hypre_boxD2 *= hypre_max(0, box2.bsize1 + 1);	
+	local_idx  = idx_local % box1.lsize2;	      
+	idx_local  = idx_local / box1.lsize2;		      
+	i1 += (local_idx*box1.strides2 + box1.bstart2) * hypre_boxD1;
+	hypre_boxD1 *= hypre_max(0, box1.bsize2 + 1);	
+	i2 += (local_idx*box2.strides2 + box2.bstart2) * hypre_boxD2;
+	hypre_boxD2 *= hypre_max(0, box2.bsize2 + 1);
+	sum = loop_body(i1,i2,sum);
+      }
+    sum=blockReduceSum<HYPRE_Real>(sum);
+    if(threadIdx.x==0)
+      out[blockIdx.x]=sum;
+}
+
+template<typename LOOP_BODY>
+void hypre_device_reduction (HYPRE_Real* out,
+			     HYPRE_Int N,hypre_Boxloop box1,hypre_Boxloop box2,
+			     LOOP_BODY loop_body)
+{	
+  HYPRE_Int threads=512;
+  HYPRE_Int blocks=min((N+threads-1)/threads,1024);
+
+  hypre_device_reduction_kernel<<<blocks,threads>>>(out,N,box1,box2,loop_body);
+  hypre_device_reduce_stable_kernel2<HYPRE_Real><<<1,1024>>>(out,out,blocks);
+
+}
+}
+
+#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size,		\
+					dbox1, start1, stride1, i1, sum) \
+{    									   \
+   HYPRE_Real sum_old = sum;						\
+   sum = 0.0;								\
+   hypre_BoxLoopInit(ndim,loop_size);					\
+   hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+   HYPRE_Real *d_c;							\
+   cudaMalloc((void**) &d_c, 1024 * sizeof(HYPRE_Real));		\
+   hypre_device_reduction(d_c,hypre__tot,databox1,databox1,HYPER_LAMBDA(HYPRE_Int i1, HYPRE_Int i2, HYPRE_Real sum) \
+   {
+
+#define hypre_newBoxLoop1ReductionEnd(i1, sum)			\
+       return sum;								\
+   });									\
+  cudaMemcpy(&sum,d_c,sizeof(HYPRE_Real),cudaMemcpyDeviceToHost);	\
+  sum += sum_old;							\
+  cudaFree(d_c);							\
+}
+
+#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size,		\
+					dbox1, start1, stride1, i1,	\
+					dbox2, start2, stride2, i2,sum) \
+{    									   \
+   HYPRE_Real sum_old = sum;						\
+   sum = 0.0;								\
+   hypre_BoxLoopInit(ndim,loop_size);					\
+   hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+   hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+   HYPRE_Real *d_c;							\
+   cudaMalloc((void**) &d_c, 1024 * sizeof(HYPRE_Real));		\
+   hypre_device_reduction(d_c,hypre__tot,databox1,databox2,HYPER_LAMBDA(HYPRE_Int i1, HYPRE_Int i2, HYPRE_Real sum) \
+   {
+
+#define hypre_newBoxLoop2ReductionEnd(i1, i2, sum)			\
+      return sum;								\
+   });									\
+  cudaMemcpy(&sum,d_c,sizeof(HYPRE_Real),cudaMemcpyDeviceToHost);	\
+  sum += sum_old;							\
+  cudaFree(d_c);							\
+}
+
+
+
+#define hypre_newBoxLoop1ReductionMult(ndim, loop_size,			  \
+				       dbox1, start1, stride1, i1,xp,sum) \
+{    									  \
+   HYPRE_Real sum_old = sum;\
+   sum = 1.0;\
+   hypre_BoxLoopInit(ndim,loop_size);				  \
+   hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	  \
+   HYPRE_Int n_blocks = (hypre__tot+BLOCKSIZE-1)/BLOCKSIZE;		  \
+   HYPRE_Real *d_b;				        		  \
+   HYPRE_Real * b = new HYPRE_Real[n_blocks];				  \
+   cudaMalloc((void**) &d_b, n_blocks * sizeof(HYPRE_Real));			\
+   reduction_mult<HYPRE_Real><<< n_blocks ,BLOCKSIZE>>>(xp,d_b,hypre__tot,databox1);		\
+   hypre_fence();							\
+   for (HYPRE_Int j = 0 ; j< n_blocks ; ++j){								\
+     sum *= b[j];							\
+   }	  		\
+   delete [] b;		\
+   sum *=sum_old;\
+}
+
+#define hypre_LoopBegin(size,idx)					\
+{    														\
+	BoxLoopforall(hypre_exec_policy,size,HYPER_LAMBDA (HYPRE_Int idx) \
+	{
+
+#define hypre_LoopEnd()					\
+	});											\
+        hypre_fence();\
+}
+
+#define hypre_BoxBoundaryCopyBegin(ndim, loop_size, stride1, i1, idx) 	\
+{    														\
+    HYPRE_Int hypre__tot = 1;											\
+    hypre_Boxloop databox1;						\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];					\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    for (HYPRE_Int d = 0;d < ndim;d ++)					\
+    {									\
+	hypre__tot *= loop_size[d];					\
+    }									\
+    BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+    {									\
+	    hypre_newBoxLoopDeclare()											\
+	    HYPRE_Int i1 = 0;											\
+	    hypre__i  = idx_local % databox1.lsize0;			\
+	    idx_local  = idx_local / databox1.lsize0;			\
+	    i1 += hypre__i*databox1.strides0;				\
+	    hypre__j  = idx_local % databox1.lsize1;			\
+	    idx_local  = idx_local / databox1.lsize1;			\
+	    i1 += hypre__j*databox1.strides1;				\
+	    hypre__k  = idx_local % databox1.lsize2;			\
+	    idx_local  = idx_local / databox1.lsize2;			\
+	    i1 += hypre__k*databox1.strides2;				\
+		
+#define hypre_BoxBoundaryCopyEnd()				\
+    });									\
+    hypre_fence();							\
+}
+
+#define hypre_BoxDataExchangeBegin(ndim, loop_size,				\
+                                   stride1, i1,	\
+                                   stride2, i2)	\
+{    														\
+    HYPRE_Int hypre__tot = 1;											\
+    hypre_Boxloop databox1,databox2;					\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];									\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    databox2.lsize0 = loop_size[0];					\
+    databox2.lsize1 = loop_size[1];									\
+    databox2.lsize2 = loop_size[2];					\
+    databox2.strides0 = stride2[0];					\
+    databox2.strides1 = stride2[1];					\
+    databox2.strides2 = stride2[2];					\
+    for (HYPRE_Int d = 0;d < ndim;d ++)					\
+      {									\
+	hypre__tot *= loop_size[d];					\
+      }									\
+    BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+    {									\
+        hypre_newBoxLoopDeclare()					\
+	HYPRE_Int i1 = 0, i2 = 0;					\
+	hypre__i  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += hypre__i*databox1.strides0;				\
+	i2 += hypre__i*databox2.strides0;				\
+	hypre__j  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += hypre__j*databox1.strides1;				\
+	i2 += hypre__j*databox2.strides1;				\
+	hypre__k  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += hypre__k*databox1.strides2;				\
+	i2 += hypre__k*databox2.strides2;
+
+
+#define hypre_BoxDataExchangeEnd()				\
+     });								\
+     hypre_fence();							\
+}
+  
+#define hypre_newBoxLoop0For()
+
+#define hypre_newBoxLoop1For(i1)
+
+#define hypre_newBoxLoop2For(i1, i2) 
+ 
+#define hypre_newBoxLoop3For(i1, i2, i3)
+
+#define hypre_newBoxLoop4For(i1, i2, i3, i4)
+
+#define hypre_newBoxLoopGetIndex(index)					\
+  index[0] = hypre__i; index[1] = hypre__j; index[2] = hypre__k
+  
+#define hypre_BoxLoopGetIndex    zypre_BoxLoopGetIndex  
+#define hypre_BoxLoopSetOneBlock() ; 
+#define hypre_BoxLoopBlock()       0
+
+#define hypre_BoxLoop0Begin      hypre_newBoxLoop0Begin
+#define hypre_BoxLoop0For        hypre_newBoxLoop0For
+#define hypre_BoxLoop0End        hypre_newBoxLoop0End
+#define hypre_BoxLoop1Begin      hypre_newBoxLoop1Begin
+#define hypre_BoxLoop1For        hypre_newBoxLoop1For
+#define hypre_BoxLoop1End        hypre_newBoxLoop1End
+#define hypre_BoxLoop2Begin      hypre_newBoxLoop2Begin
+#define hypre_BoxLoop2For        hypre_newBoxLoop2For
+#define hypre_BoxLoop2End        hypre_newBoxLoop2End
+#define hypre_BoxLoop3Begin      hypre_newBoxLoop3Begin
+#define hypre_BoxLoop3For        hypre_newBoxLoop3For
+#define hypre_BoxLoop3End        hypre_newBoxLoop3End
+#define hypre_BoxLoop4Begin      hypre_newBoxLoop4Begin
+#define hypre_BoxLoop4For        hypre_newBoxLoop4For
+#define hypre_BoxLoop4End        hypre_newBoxLoop4End
+#endif
+#else
 /*BHEADER**********************************************************************
  * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
  * Produced at the Lawrence Livermore National Laboratory.
@@ -10,20 +2132,386 @@
  * $Revision$
  ***********************************************************************EHEADER*/
 
-#ifndef hypre_STRUCT_MV_HEADER
-#define hypre_STRUCT_MV_HEADER
+/******************************************************************************
+ *
+ * Header info for the BoxLoop
+ *
+ *****************************************************************************/
+
+/*--------------------------------------------------------------------------
+ * BoxLoop macros:
+ *--------------------------------------------------------------------------*/
+
+#ifndef HYPRE_NEWBOXLOOP_HEADER
+#define HYPRE_NEWBOXLOOP_HEADER
+
+#ifdef HYPRE_USING_OPENMP
+#ifdef WIN32
+#define Pragma(x) __pragma(#x)
+#else
+#define Pragma(x) _Pragma(#x)
+#endif
+#define OMP1 Pragma(omp parallel for private(HYPRE_BOX_PRIVATE,HYPRE_BOX_PRIVATE_VAR) HYPRE_SMP_SCHEDULE)
+#define OMPREDUCTION() Pragma(omp parallel for private(HYPRE_BOX_PRIVATE,HYPRE_BOX_PRIVATE_VAR) HYPRE_BOX_REDUCTION HYPRE_SMP_SCHEDULE)
+#else
+#define OMP1
+#define OMPREDUCTION() ;
+#endif
+
+typedef struct hypre_Boxloop_struct
+  {
+    HYPRE_Int lsize0,lsize1,lsize2;
+    HYPRE_Int strides0,strides1,strides2;
+    HYPRE_Int bstart0,bstart1,bstart2;
+    HYPRE_Int bsize0,bsize1,bsize2;
+  }hypre_Boxloop;
+
+#define zypre_newBoxLoop0Begin(ndim, loop_size)				\
+{\
+   zypre_BoxLoopDeclare();									\
+   zypre_BoxLoopInit(ndim, loop_size);						\
+   OMP1\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
+   {\
+      zypre_BoxLoopSet();\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
+
+#define zypre_newBoxLoop0End()\
+         }\
+         zypre_BoxLoopInc1();\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
+
+#define zypre_newBoxLoop1Begin(ndim, loop_size,				\
+                               dbox1, start1, stride1, i1)              \
+	{														\
+	zypre_BoxLoopDeclare();									\
+	zypre_BoxLoopDeclareK(1);								\
+	zypre_BoxLoopInit(ndim, loop_size);						\
+	zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);					\
+	OMP1\
+	for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \
+	{																	\
+		zypre_BoxLoopSet();												\
+		zypre_BoxLoopSetK(1, i1);										\
+		for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)			\
+		{																\
+			for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)		\
+			{
+
+#define zypre_newBoxLoop1End(i1)				\
+	             i1 += hypre__i0inc1;						\
+		    }											\
+			zypre_BoxLoopInc1();					\
+	        i1 += hypre__ikinc1[hypre__d];				\
+	        zypre_BoxLoopInc2();						\
+		}											\
+	}											\
+}
+
+
+#define zypre_newBoxLoop2Begin(ndim, loop_size,\
+							   dbox1, start1, stride1, i1,	\
+							   dbox2, start2, stride2, i2)	\
+{\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopDeclareK(2);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\
+   zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);\
+   OMP1\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)	\
+   {\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      zypre_BoxLoopSetK(2, i2);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
+
+#define zypre_newBoxLoop2End(i1, i2)\
+            i1 += hypre__i0inc1;\
+            i2 += hypre__i0inc2;\
+         }\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         i2 += hypre__ikinc2[hypre__d];\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
+
+
+#define zypre_newBoxLoop3Begin(ndim, loop_size,\
+							   dbox1, start1, stride1, i1,	\
+							   dbox2, start2, stride2, i2,	\
+							   dbox3, start3, stride3, i3)	\
+{														\
+   zypre_BoxLoopDeclare();									\
+   zypre_BoxLoopDeclareK(1);								\
+   zypre_BoxLoopDeclareK(2);								\
+   zypre_BoxLoopDeclareK(3);								\
+   zypre_BoxLoopInit(ndim, loop_size);						\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);		\
+   zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);		\
+   zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3);		\
+   OMP1\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)	\
+   {\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      zypre_BoxLoopSetK(2, i2);\
+      zypre_BoxLoopSetK(3, i3);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
+
+#define zypre_newBoxLoop3End(i1, i2, i3)\
+            i1 += hypre__i0inc1;\
+            i2 += hypre__i0inc2;\
+            i3 += hypre__i0inc3;\
+         }\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         i2 += hypre__ikinc2[hypre__d];\
+         i3 += hypre__ikinc3[hypre__d];\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
+
+#define zypre_newBoxLoop4Begin(ndim, loop_size,\
+                            dbox1, start1, stride1, i1,\
+                            dbox2, start2, stride2, i2,\
+                            dbox3, start3, stride3, i3,\
+                            dbox4, start4, stride4, i4)\
+{\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopDeclareK(2);\
+   zypre_BoxLoopDeclareK(3);\
+   zypre_BoxLoopDeclareK(4);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\
+   zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);\
+   zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3);\
+   zypre_BoxLoopInitK(4, dbox4, start4, stride4, i4);\
+   OMP1\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
+   {\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      zypre_BoxLoopSetK(2, i2);\
+      zypre_BoxLoopSetK(3, i3);\
+      zypre_BoxLoopSetK(4, i4);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
+
+#define zypre_newBoxLoop4End(i1, i2, i3, i4)\
+            i1 += hypre__i0inc1;\
+            i2 += hypre__i0inc2;\
+            i3 += hypre__i0inc3;\
+            i4 += hypre__i0inc4;\
+         }\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         i2 += hypre__ikinc2[hypre__d];\
+         i3 += hypre__ikinc3[hypre__d];\
+         i4 += hypre__ikinc4[hypre__d];\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
+
+#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size,		\
+					dbox1, start1, stride1, i1,	\
+                                        sum)				\
+{									\
+   zypre_BoxLoopDeclare();						\
+   zypre_BoxLoopDeclareK(1);						\
+   zypre_BoxLoopInit(ndim, loop_size);					\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);			\
+   OMPREDUCTION()							\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \
+   {\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
 
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
+#define hypre_newBoxLoop1ReductionEnd(i1, sum)\
+            i1 += hypre__i0inc1;\
+         }\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
 
-#include "HYPRE_struct_mv.h"
-#include "_hypre_utilities.h"
+#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size,				\
+					dbox1, start1, stride1, i1,	\
+					dbox2, start2, stride2, i2,	\
+                                        sum)							\
+{\
+   HYPRE_Int i1,i2;				\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopDeclareK(2);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\
+   zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);\
+   OMPREDUCTION()														\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)	\
+   {\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      zypre_BoxLoopSetK(2, i2);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
+
+#define hypre_newBoxLoop2ReductionEnd(i1, i2, sum)\
+            i1 += hypre__i0inc1;\
+            i2 += hypre__i0inc2;\
+         }\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         i2 += hypre__ikinc2[hypre__d];\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
+
+#define hypre_LoopBegin(size,idx)			\
+{									\
+   HYPRE_Int idx;							\
+   for (idx = 0;idx < size;idx ++)					\
+   {
+
+#define hypre_LoopEnd()					\
+  }							\
+}
+
+#define hypre_BoxBoundaryCopyBegin(ndim, loop_size, stride1, i1, idx) 	\
+{									\
+    HYPRE_Int hypre__tot = 1;						\
+    hypre_Boxloop databox1;						\
+    HYPRE_Int d,idx;							\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];					\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    for (d = 0;d < ndim;d ++)						\
+    {									\
+	hypre__tot *= loop_size[d];					\
+    }									\
+    for (idx = 0;idx < hypre__tot;idx++)				\
+      {									\
+	  HYPRE_Int local_idx;						\
+	  HYPRE_Int idx_local = idx;					\
+	  HYPRE_Int i1 = 0;						\
+	  local_idx  = idx_local % databox1.lsize0;			\
+	  idx_local  = idx_local / databox1.lsize0;			\
+	  i1 += local_idx*databox1.strides0;				\
+	  local_idx  = idx_local % databox1.lsize1;			\
+	  idx_local  = idx_local / databox1.lsize1;			\
+	  i1 += local_idx*databox1.strides1;				\
+	  local_idx  = idx_local % databox1.lsize2;			\
+	  idx_local  = idx_local / databox1.lsize2;			\
+	  i1 += local_idx*databox1.strides2;				\
+
+
+#define hypre_BoxBoundaryCopyEnd()					\
+  }									\
+}
+
+#define hypre_BoxDataExchangeBegin(ndim, loop_size,			\
+                                   stride1, i1,				\
+                                   stride2, i2)				\
+{									\
+   HYPRE_Int hypre__tot = 1,idx;					\
+   hypre_Boxloop databox1,databox2;					\
+   HYPRE_Int d;								\
+   databox1.lsize0 = loop_size[0];					\
+   databox1.lsize1 = loop_size[1];					\
+   databox1.lsize2 = loop_size[2];					\
+   databox1.strides0 = stride1[0];					\
+   databox1.strides1 = stride1[1];					\
+   databox1.strides2 = stride1[2];					\
+   databox2.lsize0 = loop_size[0];					\
+   databox2.lsize1 = loop_size[1];					\
+   databox2.lsize2 = loop_size[2];					\
+   databox2.strides0 = stride2[0];					\
+   databox2.strides1 = stride2[1];					\
+   databox2.strides2 = stride2[2];					\
+   for (d = 0;d < ndim;d ++)						\
+   {									\
+      hypre__tot *= loop_size[d];					\
+   }									\
+   for (idx = 0;idx < hypre__tot;idx++)					\
+   {									\
+      HYPRE_Int local_idx;						\
+      HYPRE_Int idx_local = idx;					\
+      HYPRE_Int i1 = 0, i2 = 0;						\
+      local_idx  = idx_local % databox1.lsize0;				\
+      idx_local  = idx_local / databox1.lsize0;				\
+      i1 += local_idx*databox1.strides0;				\
+      i2 += local_idx*databox2.strides0;				\
+      local_idx  = idx_local % databox1.lsize1;				\
+      idx_local  = idx_local / databox1.lsize1;				\
+      i1 += local_idx*databox1.strides1;				\
+      i2 += local_idx*databox2.strides1;				\
+      local_idx  = idx_local % databox1.lsize2;				\
+      idx_local  = idx_local / databox1.lsize2;				\
+      i1 += local_idx*databox1.strides2;				\
+      i2 += local_idx*databox2.strides2;
+
+#define hypre_BoxDataExchangeEnd()					\
+   }                                                                    \
+}
+
+#define hypre_newBoxLoopGetIndex zypre_BoxLoopGetIndex  
+#define hypre_BoxLoopGetIndex    zypre_BoxLoopGetIndex
+#define hypre_BoxLoopSetOneBlock zypre_BoxLoopSetOneBlock
+#define hypre_BoxLoopBlock       zypre_BoxLoopBlock
+#define hypre_BoxLoop0Begin      zypre_BoxLoop0Begin
+#define hypre_BoxLoop0For        zypre_BoxLoop0For
+#define hypre_BoxLoop0End        zypre_BoxLoop0End
+#define hypre_BoxLoop1Begin      zypre_BoxLoop1Begin
+#define hypre_BoxLoop1For        zypre_BoxLoop1For
+#define hypre_BoxLoop1End        zypre_BoxLoop1End
+#define hypre_BoxLoop2Begin      zypre_BoxLoop2Begin
+#define hypre_BoxLoop2For        zypre_BoxLoop2For
+#define hypre_BoxLoop2End        zypre_BoxLoop2End
+#define hypre_BoxLoop3Begin      zypre_BoxLoop3Begin
+#define hypre_BoxLoop3For        zypre_BoxLoop3For
+#define hypre_BoxLoop3End        zypre_BoxLoop3End
+#define hypre_BoxLoop4Begin      zypre_BoxLoop4Begin
+#define hypre_BoxLoop4For        zypre_BoxLoop4For
+#define hypre_BoxLoop4End        zypre_BoxLoop4End
+#define hypre_BasicBoxLoop2Begin zypre_BasicBoxLoop2Begin
 
+#endif
+#endif
 #ifdef __cplusplus
 extern "C" {
 #endif
-
 /*BHEADER**********************************************************************
  * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
  * Produced at the Lawrence Livermore National Laboratory.
@@ -196,329 +2684,155 @@
  * BoxLoop macros:
  *--------------------------------------------------------------------------*/
 
-#if 0 /* set to 0 to use the new box loops */
+#ifdef HYPRE_USE_RAJA
+#define hypre_Reductioninit(local_result)\
+HYPRE_Real       local_result;\
+local_result = 0.0;
+//ReduceSum< cuda_reduce<BLOCKSIZE>, HYPRE_Real> local_result(0.0);
+#else
+#define hypre_Reductioninit(local_result)\
+HYPRE_Real       local_result;\
+local_result = 0.0;
+#endif
 
-#define HYPRE_BOX_PRIVATE hypre__nx,hypre__ny,hypre__nz,hypre__i,hypre__j,hypre__k
+#if defined(HYPRE_MEMORY_GPU)
 
-#define hypre_BoxLoopDeclareS(dbox, stride, sx, sy, sz) \
-HYPRE_Int  sx = (hypre_IndexX(stride));\
-HYPRE_Int  sy = (hypre_IndexY(stride)*hypre_BoxSizeX(dbox));\
-HYPRE_Int  sz = (hypre_IndexZ(stride)*\
-           hypre_BoxSizeX(dbox)*hypre_BoxSizeY(dbox))
-
-#define hypre_BoxLoopDeclareN(loop_size) \
-HYPRE_Int  hypre__i, hypre__j, hypre__k;\
-HYPRE_Int  hypre__nx = hypre_IndexX(loop_size);\
-HYPRE_Int  hypre__ny = hypre_IndexY(loop_size);\
-HYPRE_Int  hypre__nz = hypre_IndexZ(loop_size);\
-HYPRE_Int  hypre__mx = hypre__nx;\
-HYPRE_Int  hypre__my = hypre__ny;\
-HYPRE_Int  hypre__mz = hypre__nz;\
-HYPRE_Int  hypre__dir, hypre__max;\
-HYPRE_Int  hypre__div, hypre__mod;\
-HYPRE_Int  hypre__block, hypre__num_blocks;\
-hypre__dir = 0;\
-hypre__max = hypre__nx;\
-if (hypre__ny > hypre__max)\
+#define hypre_MatrixIndexMove(A, stencil_size, i, cdir,size)\
+HYPRE_Int * indices_d;\
+HYPRE_Int indices_h[stencil_size];\
+HYPRE_Int * stencil_shape_d;\
+HYPRE_Int  stencil_shape_h[size*stencil_size];\
+HYPRE_Complex * data_A = hypre_StructMatrixData(A);\
+indices_d = hypre_DeviceTAlloc(HYPRE_Int, stencil_size);\
+stencil_shape_d = hypre_DeviceTAlloc(HYPRE_Int, size*stencil_size);\
+for (HYPRE_Int ii = 0; ii < stencil_size; ii++)\
 {\
-   hypre__dir = 1;\
-   hypre__max = hypre__ny;\
+   HYPRE_Int jj = 0;\
+   indices_h[ii]       = hypre_StructMatrixDataIndices(A)[i][ii];\
+   if (size > 1) cdir = 0;\
+   stencil_shape_h[ii] = hypre_IndexD(stencil_shape[ii], cdir);\
+   for (jj = 1;jj < size;jj++)\
+      stencil_shape_h[jj*stencil_size+ii] = hypre_IndexD(stencil_shape[ii], jj);\
 }\
-if (hypre__nz > hypre__max)\
-{\
-   hypre__dir = 2;\
-   hypre__max = hypre__nz;\
-}\
-hypre__num_blocks = hypre_NumThreads();\
-if (hypre__max < hypre__num_blocks)\
-{\
-   hypre__num_blocks = hypre__max;\
-}\
-if (hypre__num_blocks > 0)\
-{\
-   hypre__div = hypre__max / hypre__num_blocks;\
-   hypre__mod = hypre__max % hypre__num_blocks;\
-}
+hypre_DataCopyToData(indices_h,indices_d,HYPRE_Int,stencil_size);\
+hypre_DataCopyToData(stencil_shape_h,stencil_shape_d,HYPRE_Int,size*stencil_size);\
 
-#define hypre_BoxLoopSet(i, j, k) \
-i = 0;\
-j = 0;\
-k = 0;\
-hypre__nx = hypre__mx;\
-hypre__ny = hypre__my;\
-hypre__nz = hypre__mz;\
-if (hypre__num_blocks > 1)\
-{\
-   if (hypre__dir == 0)\
-   {\
-      i = hypre__block * hypre__div + hypre_min(hypre__mod, hypre__block);\
-      hypre__nx = hypre__div + ((hypre__mod > hypre__block) ? 1 : 0);\
-   }\
-   else if (hypre__dir == 1)\
-   {\
-      j = hypre__block * hypre__div + hypre_min(hypre__mod, hypre__block);\
-      hypre__ny = hypre__div + ((hypre__mod > hypre__block) ? 1 : 0);\
-   }\
-   else if (hypre__dir == 2)\
-   {\
-      k = hypre__block * hypre__div + hypre_min(hypre__mod, hypre__block);\
-      hypre__nz = hypre__div + ((hypre__mod > hypre__block) ? 1 : 0);\
-   }\
-}
+#define hypre_StructGetMatrixBoxData(A, i, si)  (data_A + indices_d[si])
 
-#define hypre_BoxLoopGetIndex(index) \
-index[0] = hypre__i; index[1] = hypre__j; index[2] = hypre__k
+#define hypre_StructGetIndexD(index,i,index_d) (index_d)
 
-/* Use this before the For macros below to force only one block */
-#define hypre_BoxLoopSetOneBlock() hypre__num_blocks = 1
-
-/* Use this to get the block iteration inside a BoxLoop */
-#define hypre_BoxLoopBlock() hypre__block
-
-/*-----------------------------------*/
-
-#define hypre_BoxLoop0Begin(ndim, loop_size)\
-{\
-   hypre_BoxLoopDeclareN(loop_size);
+#define hypre_StructCleanIndexD()\
+hypre_DeviceTFree(indices_d);\
+hypre_DeviceTFree(stencil_shape_d);
 
-#define hypre_BoxLoop0For()\
-   hypre__BoxLoop0For(hypre__i, hypre__j, hypre__k)
-#define hypre__BoxLoop0For(i, j, k)\
-   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
-   {\
-   hypre_BoxLoopSet(i, j, k);\
-   for (k = 0; k < hypre__nz; k++)\
-   {\
-      for (j = 0; j < hypre__ny; j++)\
-      {\
-         for (i = 0; i < hypre__nx; i++)\
-         {
+#define hypre_StructPreparePrint()\
+HYPRE_Int tot_size = num_values*hypre_BoxVolume(hypre_BoxArrayBox(data_space, hypre_BoxArraySize(box_array)-1));\
+data_host = hypre_CTAlloc(HYPRE_Complex, tot_size);\
+hypre_DataCopyFromData(data_host,data,HYPRE_Complex,tot_size);
 
-#define hypre_BoxLoop0End()\
-         }\
-      }\
-   }\
-   }\
-}
-  
-/*-----------------------------------*/
+#define hypre_StructPostPrint() hypre_TFree(data_host)
 
-#define hypre_BoxLoop1Begin(ndim, loop_size,\
-                            dbox1, start1, stride1, i1)\
-{\
-   HYPRE_Int  hypre__i1start = hypre_BoxIndexRank(dbox1, start1);\
-   hypre_BoxLoopDeclareS(dbox1, stride1, hypre__sx1, hypre__sy1, hypre__sz1);\
-   hypre_BoxLoopDeclareN(loop_size);
+#else
 
-#define hypre_BoxLoop1For(i1)\
-   hypre__BoxLoop1For(hypre__i, hypre__j, hypre__k, i1)
-#define hypre__BoxLoop1For(i, j, k, i1)\
-   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
-   {\
-   hypre_BoxLoopSet(i, j, k);\
-   i1 = hypre__i1start + i*hypre__sx1 + j*hypre__sy1 + k*hypre__sz1;\
-   for (k = 0; k < hypre__nz; k++)\
-   {\
-      for (j = 0; j < hypre__ny; j++)\
-      {\
-         for (i = 0; i < hypre__nx; i++)\
-         {
+#define hypre_MatrixIndexMove(A, stencil_size, i, cdir,size)
+#define hypre_StructGetMatrixBoxData(A, i, si) hypre_StructMatrixBoxData(A,i,si)
+#define hypre_StructGetIndexD(index,i,index_d) hypre_IndexD(index,i)
+#define hypre_StructCleanIndexD() {;}
+#define hypre_StructPreparePrint() data_host = data;
+#define hypre_StructPostPrint() {;}
 
-#define hypre_BoxLoop1End(i1)\
-            i1 += hypre__sx1;\
-         }\
-         i1 += hypre__sy1 - hypre__nx*hypre__sx1;\
-      }\
-      i1 += hypre__sz1 - hypre__ny*hypre__sy1;\
-   }\
-   }\
-}
+#endif
   
-/*-----------------------------------*/
-
-#define hypre_BoxLoop2Begin(ndim,loop_size,\
-                            dbox1, start1, stride1, i1,\
-                            dbox2, start2, stride2, i2)\
+#define hypre_SerialBoxLoop0Begin(ndim, loop_size)\
 {\
-   HYPRE_Int  hypre__i1start = hypre_BoxIndexRank(dbox1, start1);\
-   HYPRE_Int  hypre__i2start = hypre_BoxIndexRank(dbox2, start2);\
-   hypre_BoxLoopDeclareS(dbox1, stride1, hypre__sx1, hypre__sy1, hypre__sz1);\
-   hypre_BoxLoopDeclareS(dbox2, stride2, hypre__sx2, hypre__sy2, hypre__sz2);\
-   hypre_BoxLoopDeclareN(loop_size);
-
-#define hypre_BoxLoop2For(i1, i2)\
-   hypre__BoxLoop2For(hypre__i, hypre__j, hypre__k, i1, i2)
-#define hypre__BoxLoop2For(i, j, k, i1, i2)\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   hypre_BoxLoopSetOneBlock();\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
-   hypre_BoxLoopSet(i, j, k);\
-   i1 = hypre__i1start + i*hypre__sx1 + j*hypre__sy1 + k*hypre__sz1;\
-   i2 = hypre__i2start + i*hypre__sx2 + j*hypre__sy2 + k*hypre__sz2;\
-   for (k = 0; k < hypre__nz; k++)\
-   {\
-      for (j = 0; j < hypre__ny; j++)\
+      zypre_BoxLoopSet();\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
       {\
-         for (i = 0; i < hypre__nx; i++)\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
          {
 
-#define hypre_BoxLoop2End(i1, i2)\
-            i1 += hypre__sx1;\
-            i2 += hypre__sx2;\
+#define hypre_SerialBoxLoop0End()\
          }\
-         i1 += hypre__sy1 - hypre__nx*hypre__sx1;\
-         i2 += hypre__sy2 - hypre__nx*hypre__sx2;\
+         zypre_BoxLoopInc1();\
+         zypre_BoxLoopInc2();\
       }\
-      i1 += hypre__sz1 - hypre__ny*hypre__sy1;\
-      i2 += hypre__sz2 - hypre__ny*hypre__sy2;\
-   }\
    }\
 }
 
-/*-----------------------------------*/
-
-#define hypre_BoxLoop3Begin(ndim, loop_size,\
-                            dbox1, start1, stride1, i1,\
-                            dbox2, start2, stride2, i2,\
-                            dbox3, start3, stride3, i3)\
+#define hypre_SerialBoxLoop1Begin(ndim, loop_size,\
+                                  dbox1, start1, stride1, i1)\
 {\
-   HYPRE_Int  hypre__i1start = hypre_BoxIndexRank(dbox1, start1);\
-   HYPRE_Int  hypre__i2start = hypre_BoxIndexRank(dbox2, start2);\
-   HYPRE_Int  hypre__i3start = hypre_BoxIndexRank(dbox3, start3);\
-   hypre_BoxLoopDeclareS(dbox1, stride1, hypre__sx1, hypre__sy1, hypre__sz1);\
-   hypre_BoxLoopDeclareS(dbox2, stride2, hypre__sx2, hypre__sy2, hypre__sz2);\
-   hypre_BoxLoopDeclareS(dbox3, stride3, hypre__sx3, hypre__sy3, hypre__sz3);\
-   hypre_BoxLoopDeclareN(loop_size);
-
-#define hypre_BoxLoop3For(i1, i2, i3)\
-   hypre__BoxLoop3For(hypre__i, hypre__j, hypre__k, i1, i2, i3)
-#define hypre__BoxLoop3For(i, j, k, i1, i2, i3)\
+   HYPRE_Int i1;\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\
+   zypre_BoxLoopSetOneBlock();\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
-   hypre_BoxLoopSet(i, j, k);\
-   i1 = hypre__i1start + i*hypre__sx1 + j*hypre__sy1 + k*hypre__sz1;\
-   i2 = hypre__i2start + i*hypre__sx2 + j*hypre__sy2 + k*hypre__sz2;\
-   i3 = hypre__i3start + i*hypre__sx3 + j*hypre__sy3 + k*hypre__sz3;\
-   for (k = 0; k < hypre__nz; k++)\
-   {\
-      for (j = 0; j < hypre__ny; j++)\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
       {\
-         for (i = 0; i < hypre__nx; i++)\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
          {
 
-#define hypre_BoxLoop3End(i1, i2, i3)\
-            i1 += hypre__sx1;\
-            i2 += hypre__sx2;\
-            i3 += hypre__sx3;\
+#define hypre_SerialBoxLoop1End(i1)\
+            i1 += hypre__i0inc1;\
          }\
-         i1 += hypre__sy1 - hypre__nx*hypre__sx1;\
-         i2 += hypre__sy2 - hypre__nx*hypre__sx2;\
-         i3 += hypre__sy3 - hypre__nx*hypre__sx3;\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         zypre_BoxLoopInc2();\
       }\
-      i1 += hypre__sz1 - hypre__ny*hypre__sy1;\
-      i2 += hypre__sz2 - hypre__ny*hypre__sy2;\
-      i3 += hypre__sz3 - hypre__ny*hypre__sy3;\
-   }\
    }\
 }
 
-/*-----------------------------------*/
-
-#define hypre_BoxLoop4Begin(ndim, loop_size,\
-                            dbox1, start1, stride1, i1,\
-                            dbox2, start2, stride2, i2,\
-                            dbox3, start3, stride3, i3,\
-                            dbox4, start4, stride4, i4)\
+#define hypre_SerialBoxLoop2Begin(ndim, loop_size,\
+                                  dbox1, start1, stride1, i1,\
+                                  dbox2, start2, stride2, i2)\
 {\
-   HYPRE_Int  hypre__i1start = hypre_BoxIndexRank(dbox1, start1);\
-   HYPRE_Int  hypre__i2start = hypre_BoxIndexRank(dbox2, start2);\
-   HYPRE_Int  hypre__i3start = hypre_BoxIndexRank(dbox3, start3);\
-   HYPRE_Int  hypre__i4start = hypre_BoxIndexRank(dbox4, start4);\
-   hypre_BoxLoopDeclareS(dbox1, stride1, hypre__sx1, hypre__sy1, hypre__sz1);\
-   hypre_BoxLoopDeclareS(dbox2, stride2, hypre__sx2, hypre__sy2, hypre__sz2);\
-   hypre_BoxLoopDeclareS(dbox3, stride3, hypre__sx3, hypre__sy3, hypre__sz3);\
-   hypre_BoxLoopDeclareS(dbox4, stride4, hypre__sx4, hypre__sy4, hypre__sz4);\
-   hypre_BoxLoopDeclareN(loop_size);
-
-#define hypre_BoxLoop4For(i1, i2, i3, i4)\
-   hypre__BoxLoop4For(hypre__i, hypre__j, hypre__k, i1, i2, i3, i4)
-#define hypre__BoxLoop4For(i, j, k, i1, i2, i3, i4)\
+   HYPRE_Int i1,i2;\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopDeclareK(2);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\
+   zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);\
+   zypre_BoxLoopSetOneBlock();\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
-   hypre_BoxLoopSet(i, j, k);\
-   i1 = hypre__i1start + i*hypre__sx1 + j*hypre__sy1 + k*hypre__sz1;\
-   i2 = hypre__i2start + i*hypre__sx2 + j*hypre__sy2 + k*hypre__sz2;\
-   i3 = hypre__i3start + i*hypre__sx3 + j*hypre__sy3 + k*hypre__sz3;\
-   i4 = hypre__i4start + i*hypre__sx4 + j*hypre__sy4 + k*hypre__sz4;\
-   for (k = 0; k < hypre__nz; k++)\
-   {\
-      for (j = 0; j < hypre__ny; j++)\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      zypre_BoxLoopSetK(2, i2);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
       {\
-         for (i = 0; i < hypre__nx; i++)\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
          {
 
-#define hypre_BoxLoop4End(i1, i2, i3, i4)\
-            i1 += hypre__sx1;\
-            i2 += hypre__sx2;\
-            i3 += hypre__sx3;\
-            i4 += hypre__sx4;\
+#define hypre_SerialBoxLoop2End(i1, i2)\
+            i1 += hypre__i0inc1;\
+            i2 += hypre__i0inc2;\
          }\
-         i1 += hypre__sy1 - hypre__nx*hypre__sx1;\
-         i2 += hypre__sy2 - hypre__nx*hypre__sx2;\
-         i3 += hypre__sy3 - hypre__nx*hypre__sx3;\
-         i4 += hypre__sy4 - hypre__nx*hypre__sx4;\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         i2 += hypre__ikinc2[hypre__d];\
+         zypre_BoxLoopInc2();\
       }\
-      i1 += hypre__sz1 - hypre__ny*hypre__sy1;\
-      i2 += hypre__sz2 - hypre__ny*hypre__sy2;\
-      i3 += hypre__sz3 - hypre__ny*hypre__sy3;\
-      i4 += hypre__sz4 - hypre__ny*hypre__sy4;\
-   }\
    }\
 }
 
-/*-----------------------------------*/
-
+#if defined (HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS)
+#define HYPRE_BOX_PRIVATE hypre__global_error
 #else
-
-#define HYPRE_BOX_PRIVATE        ZYPRE_BOX_PRIVATE
-
-#define hypre_BoxLoopGetIndex    zypre_BoxLoopGetIndex
-#define hypre_BoxLoopSetOneBlock zypre_BoxLoopSetOneBlock
-#define hypre_BoxLoopBlock       zypre_BoxLoopBlock
-#define hypre_BoxLoop0Begin      zypre_BoxLoop0Begin
-#define hypre_BoxLoop0For        zypre_BoxLoop0For
-#define hypre_BoxLoop0End        zypre_BoxLoop0End
-#define hypre_BoxLoop1Begin      zypre_BoxLoop1Begin
-#define hypre_BoxLoop1For        zypre_BoxLoop1For
-#define hypre_BoxLoop1End        zypre_BoxLoop1End
-#define hypre_BoxLoop2Begin      zypre_BoxLoop2Begin
-#define hypre_BoxLoop2For        zypre_BoxLoop2For
-#define hypre_BoxLoop2End        zypre_BoxLoop2End
-#define hypre_BoxLoop3Begin      zypre_BoxLoop3Begin
-#define hypre_BoxLoop3For        zypre_BoxLoop3For
-#define hypre_BoxLoop3End        zypre_BoxLoop3End
-#define hypre_BoxLoop4Begin      zypre_BoxLoop4Begin
-#define hypre_BoxLoop4For        zypre_BoxLoop4For
-#define hypre_BoxLoop4End        zypre_BoxLoop4End
-
-#endif /* end if 1 */
-
+#define HYPRE_BOX_PRIVATE ZYPRE_BOX_PRIVATE
 #endif
-
-/******************************************************************************
- *
- * NEW BoxLoop STUFF
- *
- *****************************************************************************/
-
-#ifndef hypre_ZBOX_HEADER
-#define hypre_ZBOX_HEADER
-
 #define ZYPRE_BOX_PRIVATE hypre__IN,hypre__JN,hypre__I,hypre__J,hypre__d,hypre__i
 
-/*--------------------------------------------------------------------------
- * BoxLoop macros:
- *--------------------------------------------------------------------------*/
-
 #define zypre_BoxLoopDeclare() \
 HYPRE_Int  hypre__tot, hypre__div, hypre__mod;\
 HYPRE_Int  hypre__block, hypre__num_blocks;\
@@ -655,6 +2969,7 @@
 #define zypre_BoxLoop1Begin(ndim, loop_size,\
                             dbox1, start1, stride1, i1)\
 {\
+   HYPRE_Int i1;\
    zypre_BoxLoopDeclare();\
    zypre_BoxLoopDeclareK(1);\
    zypre_BoxLoopInit(ndim, loop_size);\
@@ -663,6 +2978,7 @@
 #define zypre_BoxLoop1For(i1)\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
+      HYPRE_Int i1;\
       zypre_BoxLoopSet();\
       zypre_BoxLoopSetK(1, i1);\
       for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
@@ -686,6 +3002,7 @@
                             dbox1, start1, stride1, i1,\
                             dbox2, start2, stride2, i2)\
 {\
+   HYPRE_Int i1,i2;\
    zypre_BoxLoopDeclare();\
    zypre_BoxLoopDeclareK(1);\
    zypre_BoxLoopDeclareK(2);\
@@ -696,6 +3013,7 @@
 #define zypre_BoxLoop2For(i1, i2)\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
+      HYPRE_Int i1,i2;\
       zypre_BoxLoopSet();\
       zypre_BoxLoopSetK(1, i1);\
       zypre_BoxLoopSetK(2, i2);\
@@ -723,6 +3041,7 @@
                             dbox2, start2, stride2, i2,\
                             dbox3, start3, stride3, i3)\
 {\
+   HYPRE_Int i1,i2,i3;\
    zypre_BoxLoopDeclare();\
    zypre_BoxLoopDeclareK(1);\
    zypre_BoxLoopDeclareK(2);\
@@ -735,6 +3054,7 @@
 #define zypre_BoxLoop3For(i1, i2, i3)\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
+      HYPRE_Int i1,i2,i3;\
       zypre_BoxLoopSet();\
       zypre_BoxLoopSetK(1, i1);\
       zypre_BoxLoopSetK(2, i2);\
@@ -766,6 +3086,7 @@
                             dbox3, start3, stride3, i3,\
                             dbox4, start4, stride4, i4)\
 {\
+   HYPRE_Int i1,i2,i3,i4;\
    zypre_BoxLoopDeclare();\
    zypre_BoxLoopDeclareK(1);\
    zypre_BoxLoopDeclareK(2);\
@@ -780,6 +3101,7 @@
 #define zypre_BoxLoop4For(i1, i2, i3, i4)\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
+      HYPRE_Int i1,i2,i3,i4;\
       zypre_BoxLoopSet();\
       zypre_BoxLoopSetK(1, i1);\
       zypre_BoxLoopSetK(2, i2);\
@@ -808,6 +3130,32 @@
 
 /*-----------------------------------*/
 
+#define zypre_BasicBoxLoopInitK(k, stridek) \
+hypre__sk##k[0] = stridek[0];\
+hypre__ikinc##k[0] = 0;\
+for (hypre__d = 1; hypre__d < hypre__ndim; hypre__d++)\
+{\
+   hypre__sk##k[hypre__d] = stridek[hypre__d];\
+   hypre__ikinc##k[hypre__d] = hypre__ikinc##k[hypre__d-1] +\
+      hypre__sk##k[hypre__d] - hypre__n[hypre__d-1]*hypre__sk##k[hypre__d-1];\
+}\
+hypre__i0inc##k = hypre__sk##k[0];\
+hypre__ikinc##k[hypre__ndim] = 0;\
+hypre__ikstart##k = 0
+
+#define zypre_BasicBoxLoop2Begin(ndim, loop_size,\
+                                 stride1, i1,\
+                                 stride2, i2)\
+{\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopDeclareK(2);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BasicBoxLoopInitK(1, stride1);\
+   zypre_BasicBoxLoopInitK(2, stride2);
+
+/*-----------------------------------*/
+
 #endif
 
 
@@ -1469,6 +3817,9 @@
    HYPRE_Complex **send_buffers;
    HYPRE_Complex **recv_buffers;
 
+   HYPRE_Complex      **send_buffers_data;
+   HYPRE_Complex      **recv_buffers_data;
+	
    /* set = 0, add = 1 */
    HYPRE_Int       action;
 
@@ -1576,6 +3927,8 @@
 #define hypre_CommHandleSendBuffers(comm_handle) (comm_handle -> send_buffers)
 #define hypre_CommHandleRecvBuffers(comm_handle) (comm_handle -> recv_buffers)
 #define hypre_CommHandleAction(comm_handle)      (comm_handle -> action)
+#define hypre_CommHandleSendBuffersDevice(comm_handle)    (comm_handle -> send_buffers_data)
+#define hypre_CommHandleRecvBuffersDevice(comm_handle)    (comm_handle -> recv_buffers_data)
 
 #endif
 /*BHEADER**********************************************************************
@@ -2028,7 +4381,6 @@
 HYPRE_Int HYPRE_StructVectorGetMigrateCommPkg ( HYPRE_StructVector from_vector , HYPRE_StructVector to_vector , HYPRE_CommPkg *comm_pkg );
 HYPRE_Int HYPRE_StructVectorMigrate ( HYPRE_CommPkg comm_pkg , HYPRE_StructVector from_vector , HYPRE_StructVector to_vector );
 HYPRE_Int HYPRE_CommPkgDestroy ( HYPRE_CommPkg comm_pkg );
-HYPRE_Int HYPRE_StructVectorClone ( HYPRE_StructVector x, HYPRE_StructVector *y_ptr );
 
 /* project.c */
 HYPRE_Int hypre_ProjectBox ( hypre_Box *box , hypre_Index index , hypre_Index stride );
@@ -2141,7 +4493,7 @@
 HYPRE_Int hypre_StructVectorAssemble ( hypre_StructVector *vector );
 HYPRE_Int hypre_StructVectorCopy ( hypre_StructVector *x , hypre_StructVector *y );
 HYPRE_Int hypre_StructVectorSetConstantValues ( hypre_StructVector *vector , HYPRE_Complex values );
-HYPRE_Int hypre_StructVectorSetFunctionValues ( hypre_StructVector *vector , HYPRE_Complex (*fcn )(HYPRE_Int, HYPRE_Int, HYPRE_Int));
+HYPRE_Int hypre_StructVectorSetFunctionValues ( hypre_StructVector *vector , HYPRE_Complex (*fcn )());
 HYPRE_Int hypre_StructVectorClearGhostValues ( hypre_StructVector *vector );
 HYPRE_Int hypre_StructVectorClearBoundGhostValues ( hypre_StructVector *vector , HYPRE_Int force );
 HYPRE_Int hypre_StructVectorScaleValues ( hypre_StructVector *vector , HYPRE_Complex factor );
@@ -2152,7 +4504,6 @@
 HYPRE_Int hypre_StructVectorMaxValue ( hypre_StructVector *vector , HYPRE_Real *max_value , HYPRE_Int *max_index , hypre_Index max_xyz_index );
 hypre_StructVector *hypre_StructVectorClone ( hypre_StructVector *vector );
 
-
 #ifdef __cplusplus
 }
 #endif
diff -Nru hypre-2.11.2/src/struct_mv/protos.h hypre-2.13.0/src/struct_mv/protos.h
--- hypre-2.11.2/src/struct_mv/protos.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_mv/protos.h	2017-10-20 17:42:22.000000000 +0000
@@ -304,4 +304,4 @@
 HYPRE_Int hypre_StructVectorPrint ( const char *filename , hypre_StructVector *vector , HYPRE_Int all );
 hypre_StructVector *hypre_StructVectorRead ( MPI_Comm comm , const char *filename , HYPRE_Int *num_ghost );
 HYPRE_Int hypre_StructVectorMaxValue ( hypre_StructVector *vector , HYPRE_Real *max_value , HYPRE_Int *max_index , hypre_Index max_xyz_index );
-
+hypre_StructVector *hypre_StructVectorClone ( hypre_StructVector *vector );
diff -Nru hypre-2.11.2/src/struct_mv/struct_axpy.c hypre-2.13.0/src/struct_mv/struct_axpy.c
--- hypre-2.11.2/src/struct_mv/struct_axpy.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_mv/struct_axpy.c	2017-10-20 17:42:22.000000000 +0000
@@ -29,9 +29,6 @@
 {
    hypre_Box        *x_data_box;
    hypre_Box        *y_data_box;
-                 
-   HYPRE_Int         xi;
-   HYPRE_Int         yi;
                     
    HYPRE_Complex    *xp;
    HYPRE_Complex    *yp;
@@ -59,12 +56,17 @@
       yp = hypre_StructVectorBoxData(y, i);
 
       hypre_BoxGetSize(box, loop_size);
-
+	  
+#ifdef HYPRE_BOX_PRIVATE_VAR
+#undef HYPRE_BOX_PRIVATE_VAR
+#endif
+#define HYPRE_BOX_PRIVATE_VAR xi,yi
+	  
       hypre_BoxLoop2Begin(hypre_StructVectorNDim(x), loop_size,
-                          x_data_box, start, unit_stride, xi,
-                          y_data_box, start, unit_stride, yi);
+			  x_data_box, start, unit_stride, xi,
+			  y_data_box, start, unit_stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,yi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(xi, yi)
       {
diff -Nru hypre-2.11.2/src/struct_mv/struct_communication.c hypre-2.13.0/src/struct_mv/struct_communication.c
--- hypre-2.11.2/src/struct_mv/struct_communication.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_mv/struct_communication.c	2017-10-20 17:42:22.000000000 +0000
@@ -19,6 +19,14 @@
 FILE      *file;
 #endif
 
+/* This is needed to do communication in the GPU case */
+#if defined(HYPRE_MEMORY_GPU)
+static HYPRE_Complex* global_recv_buffer;
+static HYPRE_Complex* global_send_buffer;
+static HYPRE_Int      global_recv_size = 0;
+static HYPRE_Int      global_send_size = 0;
+#endif
+
 /* this computes a (large enough) size (in doubles) for the message prefix */
 #define hypre_CommPrefixSize(ne)                                        \
    ( (((1+ne)*sizeof(HYPRE_Int) + ne*sizeof(hypre_Box))/sizeof(HYPRE_Complex)) + 1 )
@@ -82,7 +90,7 @@
    HYPRE_Int            *send_order;
 
    HYPRE_Int             i, j, k, p, m, size, p_old, my_proc;
-                        
+
    /*------------------------------------------------------
     *------------------------------------------------------*/
 
@@ -624,7 +632,7 @@
    hypre_Box            *box;
    hypre_Box            *data_box;
    HYPRE_Int             i, j;
-                
+
    for (j = 0; j < num_entries; j++)
    {
       i = boxnums[j];
@@ -751,10 +759,10 @@
    hypre_CommEntryTypeOffset(comm_entry) = offset;
    hypre_CommEntryTypeDim(comm_entry) = dim;
    hypre_CommEntryTypeOrder(comm_entry) = order;
- 
+
    return hypre_error_flag;
 }
- 
+
 /*--------------------------------------------------------------------------
  * Initialize a non-blocking communication exchange.
  *
@@ -781,19 +789,22 @@
    HYPRE_Int            num_sends  = hypre_CommPkgNumSends(comm_pkg);
    HYPRE_Int            num_recvs  = hypre_CommPkgNumRecvs(comm_pkg);
    MPI_Comm             comm       = hypre_CommPkgComm(comm_pkg);
-                     
+
    HYPRE_Int            num_requests;
    hypre_MPI_Request         *requests;
    hypre_MPI_Status          *status;
    HYPRE_Complex      **send_buffers;
    HYPRE_Complex      **recv_buffers;
 
+   HYPRE_Complex      **send_buffers_data;
+   HYPRE_Complex      **recv_buffers_data;
+
    hypre_CommType      *comm_type, *from_type, *to_type;
    hypre_CommEntryType *comm_entry;
    HYPRE_Int            num_entries;
 
    HYPRE_Int           *length_array;
-   HYPRE_Int           *stride_array;
+   HYPRE_Int           *stride_array, unitst_array[HYPRE_MAXDIM+1];
    HYPRE_Int           *order;
 
    HYPRE_Complex       *dptr, *kptr, *lptr;
@@ -801,7 +812,7 @@
 
    HYPRE_Int            i, j, d, ll;
    HYPRE_Int            size;
-                      
+
    /*--------------------------------------------------------------------
     * allocate requests and status
     *--------------------------------------------------------------------*/
@@ -828,6 +839,31 @@
       }
    }
 
+   /* Prepare send buffers */
+#if defined(HYPRE_MEMORY_GPU)
+   send_buffers_data = hypre_TAlloc(HYPRE_Complex *, num_sends);
+   if (num_sends > 0)
+   {
+      size = hypre_CommPkgSendBufsize(comm_pkg);
+      if (size > global_send_size)
+      {
+         if (global_send_size > 0)
+            hypre_DeviceTFree(global_send_buffer);
+         global_send_buffer = hypre_DeviceCTAlloc(HYPRE_Complex, 5*size);
+         global_send_size   = 5*size;
+      }
+      send_buffers_data[0] = global_send_buffer;
+      for (i = 1; i < num_sends; i++)
+      {
+         comm_type = hypre_CommPkgSendType(comm_pkg, i-1);
+         size = hypre_CommTypeBufsize(comm_type);
+         send_buffers_data[i] = send_buffers_data[i-1] + size;
+      }
+   }
+#else
+   send_buffers_data = send_buffers;
+#endif
+
    /* allocate recv buffers */
    recv_buffers = hypre_TAlloc(HYPRE_Complex *, num_recvs);
    if (num_recvs > 0)
@@ -842,6 +878,31 @@
       }
    }
 
+   /* Prepare recv buffers */
+#if defined(HYPRE_MEMORY_GPU)
+   recv_buffers_data = hypre_TAlloc(HYPRE_Complex *, num_recvs);
+   if (num_recvs > 0)
+   {
+      size = hypre_CommPkgRecvBufsize(comm_pkg);
+      if (size > global_recv_size)
+      {
+         if (global_recv_size > 0)
+            hypre_DeviceTFree(global_recv_buffer);
+         global_recv_buffer = hypre_DeviceCTAlloc(HYPRE_Complex, 5*size);
+         global_recv_size   = 5*size;
+      }
+      recv_buffers_data[0] = global_recv_buffer;
+      for (i = 1; i < num_recvs; i++)
+      {
+         comm_type = hypre_CommPkgRecvType(comm_pkg, i-1);
+         size = hypre_CommTypeBufsize(comm_type);
+         recv_buffers_data[i] = recv_buffers_data[i-1] + size;
+      }
+   }
+#else
+   recv_buffers_data = recv_buffers;
+#endif
+
    /*--------------------------------------------------------------------
     * pack send buffers
     *--------------------------------------------------------------------*/
@@ -851,22 +912,9 @@
       comm_type = hypre_CommPkgSendType(comm_pkg, i);
       num_entries = hypre_CommTypeNumEntries(comm_type);
 
-      dptr = (HYPRE_Complex *) send_buffers[i];
-
+      dptr = (HYPRE_Complex *) send_buffers_data[i];
       if ( hypre_CommPkgFirstComm(comm_pkg) )
       {
-         qptr = (HYPRE_Int *) send_buffers[i];
-         *qptr = num_entries;
-         qptr ++;
-         memcpy(qptr, hypre_CommTypeRemBoxnums(comm_type),
-                num_entries*sizeof(HYPRE_Int));
-         qptr += num_entries;
-         memcpy(qptr, hypre_CommTypeRemBoxes(comm_type),
-                num_entries*sizeof(hypre_Box));
-
-         hypre_CommTypeRemBoxnums(comm_type) = NULL;
-         hypre_CommTypeRemBoxes(comm_type)   = NULL;
-
          dptr += hypre_CommPrefixSize(num_entries);
       }
 
@@ -876,6 +924,11 @@
          length_array = hypre_CommEntryTypeLengthArray(comm_entry);
          stride_array = hypre_CommEntryTypeStrideArray(comm_entry);
          order = hypre_CommEntryTypeOrder(comm_entry);
+         unitst_array[0] = 1;
+         for (d = 1; d <= ndim; d++)
+         {
+            unitst_array[d] = unitst_array[d-1]*length_array[d-1];
+         }
 
          lptr = send_data + hypre_CommEntryTypeOffset(comm_entry);
          for (ll = 0; ll < num_values; ll++)
@@ -884,50 +937,46 @@
             {
                kptr = lptr + order[ll]*stride_array[ndim];
 
+#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS) || defined(HYPRE_USE_CUDA)
                /* This is based on "Idea 2" in box.h */
                {
-                  HYPRE_Int      i[HYPRE_MAXDIM+1];
                   HYPRE_Int      n[HYPRE_MAXDIM+1];
                   HYPRE_Int      s[HYPRE_MAXDIM+1];
-                  HYPRE_Complex *p[HYPRE_MAXDIM+1];
-                  HYPRE_Int      I, N;
+                  HYPRE_Int      N;
 
                   /* Initialize */
                   N = 1;
                   for (d = 0; d < ndim; d++)
                   {
-                     i[d] = 0;
                      n[d] = length_array[d];
                      s[d] = stride_array[d];
-                     p[d] = kptr;
                      N *= n[d];
                   }
-                  i[ndim] = 0;
                   n[ndim] = 2;
                   s[ndim] = 0;
-                  p[ndim] = kptr;
 
                   /* Emulate ndim nested for loops */
-                  d = 0;
-                  for (I = 0; I < N; I++)
+                  hypre_BoxBoundaryCopyBegin(ndim, n, s, i, idx)
                   {
-                     dptr[I] = *p[0];
-
-                     while ( (i[d]+2) > n[d] )
-                     {
-                        d++;
-                     }
-                     i[d]++;
-                     p[d] += s[d];
-                     while ( d > 0 )
-                     {
-                        d--;
-                        i[d] = 0;
-                        p[d] = p[d+1];
-                     }
+                     dptr[idx] = kptr[i];
                   }
-                  dptr += N;
+                  hypre_BoxBoundaryCopyEnd();
                }
+#else
+               hypre_BasicBoxLoop2Begin(ndim, length_array,
+                                        stride_array, ki,
+                                        unitst_array, di);
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
+#endif
+               hypre_BoxLoop2For(ki, di)
+               {
+                  dptr[di] = kptr[ki];
+               }
+               hypre_BoxLoop2End(ki, di);
+#endif
+
+               dptr += unitst_array[ndim];
             }
             else
             {
@@ -936,13 +985,47 @@
                {
                   size *= length_array[d];
                }
-               memset(dptr, 0, size*sizeof(HYPRE_Complex));
+               hypre_DeviceMemset(dptr, 0, HYPRE_Complex, size);
+
                dptr += size;
             }
          }
       }
    }
 
+   /* Copy buffer data from Device to Host */
+#if defined(HYPRE_MEMORY_GPU)
+   if (num_sends > 0)
+   {
+      HYPRE_Complex  *dptr_host;
+      size = hypre_CommPkgSendBufsize(comm_pkg);
+      dptr_host = (HYPRE_Complex *) send_buffers[0];
+      dptr      = (HYPRE_Complex *) send_buffers_data[0];
+      hypre_DataCopyFromData(dptr_host, dptr, HYPRE_Complex, size);
+   }
+#endif
+
+   for (i = 0; i < num_sends; i++)
+   {
+      comm_type = hypre_CommPkgSendType(comm_pkg, i);
+      num_entries = hypre_CommTypeNumEntries(comm_type);
+
+      dptr = (HYPRE_Complex *) send_buffers[i];
+      if ( hypre_CommPkgFirstComm(comm_pkg) )
+      {
+         qptr = (HYPRE_Int *) send_buffers[i];
+         *qptr = num_entries;
+         qptr ++;
+         memcpy(qptr, hypre_CommTypeRemBoxnums(comm_type),
+                num_entries*sizeof(HYPRE_Int));
+         qptr += num_entries;
+         memcpy(qptr, hypre_CommTypeRemBoxes(comm_type),
+                num_entries*sizeof(hypre_Box));
+         hypre_CommTypeRemBoxnums(comm_type) = NULL;
+         hypre_CommTypeRemBoxes(comm_type)   = NULL;
+      }
+   }
+
    /*--------------------------------------------------------------------
     * post receives and initiate sends
     *--------------------------------------------------------------------*/
@@ -1020,6 +1103,8 @@
    hypre_CommHandleSendBuffers(comm_handle) = send_buffers;
    hypre_CommHandleRecvBuffers(comm_handle) = recv_buffers;
    hypre_CommHandleAction(comm_handle)      = action;
+   hypre_CommHandleSendBuffersDevice(comm_handle) = send_buffers_data;
+   hypre_CommHandleRecvBuffersDevice(comm_handle) = recv_buffers_data;
 
    *comm_handle_ptr = comm_handle;
 
@@ -1041,7 +1126,7 @@
    HYPRE_Complex      **send_buffers = hypre_CommHandleSendBuffers(comm_handle);
    HYPRE_Complex      **recv_buffers = hypre_CommHandleRecvBuffers(comm_handle);
    HYPRE_Int            action       = hypre_CommHandleAction(comm_handle);
-                      
+
    HYPRE_Int            ndim         = hypre_CommPkgNDim(comm_pkg);
    HYPRE_Int            num_values   = hypre_CommPkgNumValues(comm_pkg);
    HYPRE_Int            num_sends    = hypre_CommPkgNumSends(comm_pkg);
@@ -1052,7 +1137,7 @@
    HYPRE_Int            num_entries;
 
    HYPRE_Int           *length_array;
-   HYPRE_Int           *stride_array;
+   HYPRE_Int           *stride_array, unitst_array[HYPRE_MAXDIM+1];
 
    HYPRE_Complex       *kptr, *lptr;
    HYPRE_Complex       *dptr;
@@ -1063,6 +1148,11 @@
 
    HYPRE_Int            i, j, d, ll;
 
+#if defined(HYPRE_MEMORY_GPU)
+   HYPRE_Complex      **send_buffers_data = hypre_CommHandleSendBuffersDevice(comm_handle);
+#endif
+   HYPRE_Complex      **recv_buffers_data = hypre_CommHandleRecvBuffersDevice(comm_handle);
+
    /*--------------------------------------------------------------------
     * finish communications
     *--------------------------------------------------------------------*/
@@ -1123,12 +1213,36 @@
     * unpack receive buffer data
     *--------------------------------------------------------------------*/
 
+   /* Copy buffer data from Host to Device */
+#if defined(HYPRE_MEMORY_GPU)
+   if (num_recvs > 0)
+   {
+      HYPRE_Complex  *dptr_host;
+      HYPRE_Int       size;
+      size = 0;
+      for (i = 0; i < num_recvs; i++)
+      {
+         comm_type = hypre_CommPkgRecvType(comm_pkg, i);
+         num_entries = hypre_CommTypeNumEntries(comm_type);
+         size += hypre_CommTypeBufsize(comm_type);
+         if ( hypre_CommPkgFirstComm(comm_pkg) )
+         {
+            size += hypre_CommPrefixSize(num_entries);
+         }
+      }
+      dptr_host = (HYPRE_Complex *) recv_buffers[0];
+      dptr      = (HYPRE_Complex *) recv_buffers_data[0];
+      hypre_DataCopyToData(dptr_host, dptr, HYPRE_Complex, size);
+   }
+#endif
+
    for (i = 0; i < num_recvs; i++)
    {
       comm_type = hypre_CommPkgRecvType(comm_pkg, i);
       num_entries = hypre_CommTypeNumEntries(comm_type);
 
-      dptr = (HYPRE_Complex *) recv_buffers[i];
+      dptr = (HYPRE_Complex *) recv_buffers_data[i];
+
       if ( hypre_CommPkgFirstComm(comm_pkg) )
       {
          dptr += hypre_CommPrefixSize(num_entries);
@@ -1139,6 +1253,11 @@
          comm_entry = hypre_CommTypeEntry(comm_type, j);
          length_array = hypre_CommEntryTypeLengthArray(comm_entry);
          stride_array = hypre_CommEntryTypeStrideArray(comm_entry);
+         unitst_array[0] = 1;
+         for (d = 1; d <= ndim; d++)
+         {
+            unitst_array[d] = unitst_array[d-1]*length_array[d-1];
+         }
 
          lptr = hypre_CommHandleRecvData(comm_handle) +
             hypre_CommEntryTypeOffset(comm_entry);
@@ -1146,59 +1265,60 @@
          {
             kptr = lptr + ll*stride_array[ndim];
 
+#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS)|| defined(HYPRE_USE_CUDA)
             /* This is based on "Idea 2" in box.h */
             {
-               HYPRE_Int      i[HYPRE_MAXDIM+1];
                HYPRE_Int      n[HYPRE_MAXDIM+1];
                HYPRE_Int      s[HYPRE_MAXDIM+1];
-               HYPRE_Complex *p[HYPRE_MAXDIM+1];
-               HYPRE_Int      I, N;
+               HYPRE_Int      N;
 
                /* Initialize */
                N = 1;
                for (d = 0; d < ndim; d++)
                {
-                  i[d] = 0;
                   n[d] = length_array[d];
                   s[d] = stride_array[d];
-                  p[d] = kptr;
                   N *= n[d];
                }
-               i[ndim] = 0;
                n[ndim] = 2;
                s[ndim] = 0;
-               p[ndim] = kptr;
 
                /* Emulate ndim nested for loops */
-               d = 0;
-               for (I = 0; I < N; I++)
+               hypre_BoxBoundaryCopyBegin(ndim, n, s, i, idx)
                {
                   if (action > 0)
                   {
-                     /* add the data to existing values in memory */
-                     *p[0] += dptr[I];
+                     kptr[i] += dptr[idx];
                   }
                   else
                   {
-                     /* copy the data over existing values in memory */
-                     *p[0] = dptr[I];
-                  }
-
-                  while ( (i[d]+2) > n[d] )
-                  {
-                     d++;
-                  }
-                  i[d]++;
-                  p[d] += s[d];
-                  while ( d > 0 )
-                  {
-                     d--;
-                     i[d] = 0;
-                     p[d] = p[d+1];
+                     kptr[i] = dptr[idx];
                   }
                }
-               dptr += N;
+               hypre_BoxBoundaryCopyEnd();
+            }
+#else
+            hypre_BasicBoxLoop2Begin(ndim, length_array,
+                                     stride_array, ki,
+                                     unitst_array, di);
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
+#endif
+            hypre_BoxLoop2For(ki, di)
+            {
+               if (action > 0)
+               {
+                  kptr[ki] += dptr[di];
+               }
+               else
+               {
+                  kptr[ki] = dptr[di];
+               }
             }
+            hypre_BoxLoop2End(ki, di);
+#endif
+
+            dptr += unitst_array[ndim];
          }
       }
    }
@@ -1223,9 +1343,18 @@
    {
       hypre_SharedTFree(recv_buffers[0]);
    }
+
+   hypre_TFree(comm_handle);
+
+#if defined(HYPRE_MEMORY_GPU)
    hypre_TFree(send_buffers);
    hypre_TFree(recv_buffers);
-   hypre_TFree(comm_handle);
+   hypre_TFree(send_buffers_data);
+   hypre_TFree(recv_buffers_data);
+#else
+   hypre_TFree(send_buffers);
+   hypre_TFree(recv_buffers);
+#endif
 
    return hypre_error_flag;
 }
@@ -1251,9 +1380,10 @@
    HYPRE_Int           *fr_stride_array;
    HYPRE_Complex       *to_dp;
    HYPRE_Int           *to_stride_array;
-                      
+   HYPRE_Complex       *fr_dpl, *to_dpl;
+
    HYPRE_Int           *length_array;
-   HYPRE_Int            i, d, ll;
+   HYPRE_Int            i, ll;
 
    HYPRE_Int           *order;
 
@@ -1285,17 +1415,20 @@
          {
             if (order[ll] > -1)
             {
+               fr_dpl = fr_dp + (order[ll])*fr_stride_array[ndim];
+               to_dpl = to_dp + (      ll )*to_stride_array[ndim];
+
+#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS) || defined(HYPRE_USE_CUDA)
                /* This is based on "Idea 2" in box.h */
                {
-                  HYPRE_Int      i[HYPRE_MAXDIM+1];
+                  //HYPRE_Int      i[HYPRE_MAXDIM+1];
                   HYPRE_Int      n[HYPRE_MAXDIM+1];
                   HYPRE_Int      fs[HYPRE_MAXDIM+1],  ts[HYPRE_MAXDIM+1];
                   HYPRE_Complex *fp[HYPRE_MAXDIM+1], *tp[HYPRE_MAXDIM+1];
-                  HYPRE_Int      I, N;
+                  HYPRE_Int      N,d;
 
                   /* Initialize */
                   N = 1;
-                  i[ndim]  = 0;
                   n[ndim]  = 2;
                   fs[ndim] = 0;
                   ts[ndim] = 0;
@@ -1303,7 +1436,6 @@
                   tp[ndim] = to_dp + (      ll )*to_stride_array[ndim];
                   for (d = 0; d < ndim; d++)
                   {
-                     i[d]  = 0;
                      n[d]  = length_array[d];
                      fs[d] = fr_stride_array[d];
                      ts[d] = to_stride_array[d];
@@ -1313,36 +1445,43 @@
                   }
 
                   /* Emulate ndim nested for loops */
-                  d = 0;
-                  for (I = 0; I < N; I++)
+                  hypre_BoxDataExchangeBegin(ndim, n, fs, i1, ts, i2)
                   {
                      if (action > 0)
                      {
                         /* add the data to existing values in memory */
-                        *tp[0] += *fp[0];
+                        to_dpl[i2] += fr_dpl[i1];
                      }
                      else
                      {
                         /* copy the data over existing values in memory */
-                        *tp[0] = *fp[0];
-                     }
-
-                     while ( (i[d]+2) > n[d] )
-                     {
-                        d++;
-                     }
-                     i[d]++;
-                     fp[d] += fs[d];
-                     tp[d] += ts[d];
-                     while ( d > 0 )
-                     {
-                        d--;
-                        i[d] = 0;
-                        fp[d] = fp[d+1];
-                        tp[d] = tp[d+1];
+                        to_dpl[i2] = fr_dpl[i1];
                      }
                   }
+                  hypre_BoxDataExchangeEnd();
+               }
+#else
+               hypre_BasicBoxLoop2Begin(ndim, length_array,
+                                        fr_stride_array, fi,
+                                        to_stride_array, ti);
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
+#endif
+               hypre_BoxLoop2For(fi, ti)
+               {
+                  if (action > 0)
+                  {
+                     /* add the data to existing values in memory */
+                     to_dpl[ti] += fr_dpl[fi];
+                  }
+                  else
+                  {
+                     /* copy the data over existing values in memory */
+                     to_dpl[ti] = fr_dpl[fi];
+                  }
                }
+               hypre_BoxLoop2End(fi, ti);
+#endif
             }
          }
       }
diff -Nru hypre-2.11.2/src/struct_mv/struct_communication.h hypre-2.13.0/src/struct_mv/struct_communication.h
--- hypre-2.11.2/src/struct_mv/struct_communication.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_mv/struct_communication.h	2017-10-20 17:42:22.000000000 +0000
@@ -141,6 +141,9 @@
    HYPRE_Complex **send_buffers;
    HYPRE_Complex **recv_buffers;
 
+   HYPRE_Complex      **send_buffers_data;
+   HYPRE_Complex      **recv_buffers_data;
+	
    /* set = 0, add = 1 */
    HYPRE_Int       action;
 
@@ -248,5 +251,7 @@
 #define hypre_CommHandleSendBuffers(comm_handle) (comm_handle -> send_buffers)
 #define hypre_CommHandleRecvBuffers(comm_handle) (comm_handle -> recv_buffers)
 #define hypre_CommHandleAction(comm_handle)      (comm_handle -> action)
+#define hypre_CommHandleSendBuffersDevice(comm_handle)    (comm_handle -> send_buffers_data)
+#define hypre_CommHandleRecvBuffersDevice(comm_handle)    (comm_handle -> recv_buffers_data)
 
 #endif
diff -Nru hypre-2.11.2/src/struct_mv/struct_copy.c hypre-2.13.0/src/struct_mv/struct_copy.c
--- hypre-2.11.2/src/struct_mv/struct_copy.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_mv/struct_copy.c	2017-10-20 17:42:22.000000000 +0000
@@ -29,9 +29,6 @@
    hypre_Box       *x_data_box;
    hypre_Box       *y_data_box;
                    
-   HYPRE_Int        xi;
-   HYPRE_Int        yi;
-                   
    HYPRE_Complex   *xp;
    HYPRE_Complex   *yp;
                    
@@ -63,7 +60,7 @@
                           x_data_box, start, unit_stride, xi,
                           y_data_box, start, unit_stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,yi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(xi, yi)
       {
@@ -89,9 +86,6 @@
    hypre_Box       *x_data_box;
    hypre_Box       *y_data_box;
 
-   HYPRE_Int        xi;
-   HYPRE_Int        yi;
-
    HYPRE_Complex   *xp;
    HYPRE_Complex   *yp;
 
@@ -127,7 +121,7 @@
                              x_data_box, start, unit_stride, xi,
                              y_data_box, start, unit_stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,yi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop2For(xi, yi)
          {
diff -Nru hypre-2.11.2/src/struct_mv/struct_grid.c hypre-2.13.0/src/struct_mv/struct_grid.c
--- hypre-2.11.2/src/struct_mv/struct_grid.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_mv/struct_grid.c	2017-10-20 17:42:22.000000000 +0000
@@ -324,9 +324,7 @@
    {
       p = 1;
       hypre_BoxGetSize(box, loop_size);
-      hypre_BoxLoop0Begin(ndim, loop_size);
-      hypre_BoxLoopSetOneBlock();
-      hypre_BoxLoop0For()
+      hypre_SerialBoxLoop0Begin(ndim, loop_size);
       {
          pshift = pshifts[p];
          hypre_BoxLoopGetIndex(pshift);
@@ -345,7 +343,7 @@
             p++;
          }
       }
-      hypre_BoxLoop0End();
+      hypre_SerialBoxLoop0End();
    }
    hypre_BoxDestroy(box);
    
diff -Nru hypre-2.11.2/src/struct_mv/struct_innerprod.c hypre-2.13.0/src/struct_mv/struct_innerprod.c
--- hypre-2.11.2/src/struct_mv/struct_innerprod.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_mv/struct_innerprod.c	2017-10-20 17:42:22.000000000 +0000
@@ -27,15 +27,11 @@
                        hypre_StructVector *y )
 {
    HYPRE_Real       final_innerprod_result;
-   HYPRE_Real       local_result;
    HYPRE_Real       process_result;
                    
    hypre_Box       *x_data_box;
    hypre_Box       *y_data_box;
                    
-   HYPRE_Int        xi;
-   HYPRE_Int        yi;
-                   
    HYPRE_Complex   *xp;
    HYPRE_Complex   *yp;
                    
@@ -44,42 +40,46 @@
    hypre_Index      loop_size;
    hypre_IndexRef   start;
    hypre_Index      unit_stride;
-                   
+    
+   HYPRE_Int         ndim = hypre_StructVectorNDim(x);               
    HYPRE_Int        i;
 
-   local_result = 0.0;
-   process_result = 0.0;
-
+   hypre_Reductioninit(local_result);
+   
    hypre_SetIndex(unit_stride, 1);
-
+   
    boxes = hypre_StructGridBoxes(hypre_StructVectorGrid(y));
    hypre_ForBoxI(i, boxes)
    {
       box   = hypre_BoxArrayBox(boxes, i);
       start = hypre_BoxIMin(box);
-
+     
       x_data_box = hypre_BoxArrayBox(hypre_StructVectorDataSpace(x), i);
       y_data_box = hypre_BoxArrayBox(hypre_StructVectorDataSpace(y), i);
-
+     
       xp = hypre_StructVectorBoxData(x, i);
       yp = hypre_StructVectorBoxData(y, i);
-
+     
       hypre_BoxGetSize(box, loop_size);
 
-      hypre_BoxLoop2Begin(hypre_StructVectorNDim(x), loop_size,
-                          x_data_box, start, unit_stride, xi,
-                          y_data_box, start, unit_stride, yi);
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,yi) reduction(+:local_result) HYPRE_SMP_SCHEDULE
+#ifdef HYPRE_BOX_PRIVATE_VAR
+#undef HYPRE_BOX_PRIVATE_VAR
 #endif
-      hypre_BoxLoop2For(xi, yi)
+#define HYPRE_BOX_PRIVATE_VAR xi,yi
+#ifdef HYPRE_BOX_REDUCTION
+#undef HYPRE_BOX_REDUCTION
+#endif
+#define HYPRE_BOX_REDUCTION reduction(+:local_result)
+      hypre_newBoxLoop2ReductionBegin(ndim, loop_size,
+                                      x_data_box, start, unit_stride, xi,
+                                      y_data_box, start, unit_stride, yi,local_result);
       {
-         local_result += xp[xi] * hypre_conj(yp[yi]);
+         local_result += xp[xi] * hypre_conj(yp[yi]);		 
       }
-      hypre_BoxLoop2End(xi, yi);
+      hypre_newBoxLoop2ReductionEnd(xi, yi, local_result);
    }
    process_result = local_result;
-
+   
    hypre_MPI_Allreduce(&process_result, &final_innerprod_result, 1,
                        HYPRE_MPI_REAL, hypre_MPI_SUM, hypre_StructVectorComm(x));
 
diff -Nru hypre-2.11.2/src/struct_mv/struct_io.c hypre-2.13.0/src/struct_mv/struct_io.c
--- hypre-2.11.2/src/struct_mv/struct_io.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_mv/struct_io.c	2017-10-20 17:42:22.000000000 +0000
@@ -34,7 +34,6 @@
    hypre_Box       *data_box;
                    
    HYPRE_Int        data_box_volume;
-   HYPRE_Int        datai;
                    
    hypre_Index      loop_size;
    hypre_IndexRef   start;
@@ -43,11 +42,12 @@
                    
    HYPRE_Int        i, j, d;
    HYPRE_Complex    value;
-
+   HYPRE_Complex *data_host;
    /*----------------------------------------
     * Print data
     *----------------------------------------*/
-
+   hypre_StructPreparePrint();
+   
    hypre_SetIndex(stride, 1);
 
    hypre_ForBoxI(i, box_array)
@@ -59,10 +59,9 @@
       data_box_volume = hypre_BoxVolume(data_box);
 
       hypre_BoxGetSize(box, loop_size);
-
-      hypre_BoxLoop1Begin(dim, loop_size,
-                          data_box, start, stride, datai);
-      hypre_BoxLoop1For(datai)
+ 
+      hypre_SerialBoxLoop1Begin(dim, loop_size,
+                                data_box, start, stride, datai);
       {
          /* Print lines of the form: "%d: (%d, %d, %d; %d) %.14e\n" */
          hypre_BoxLoopGetIndex(index);
@@ -75,7 +74,7 @@
                hypre_fprintf(file, ", %d",
                              hypre_IndexD(start, d) + hypre_IndexD(index, d));
             }
-            value = data[datai + j*data_box_volume];
+            value = data_host[datai + j*data_box_volume];
 #ifdef HYPRE_COMPLEX
             hypre_fprintf(file, "; %d) %.14e , %.14e\n",
                           j, hypre_creal(value), hypre_cimag(value));
@@ -84,11 +83,13 @@
 #endif
          }
       }
-      hypre_BoxLoop1End(datai);
+      hypre_SerialBoxLoop1End(datai);
 
-      data += num_values*data_box_volume;
+      data_host += num_values*data_box_volume;
    }
 
+   hypre_StructPostPrint();
+   
    return hypre_error_flag;
 }
 
@@ -112,7 +113,7 @@
    hypre_Box       *box;
    hypre_Box       *data_box;
                    
-   HYPRE_Int        data_box_volume, datai;
+   HYPRE_Int        data_box_volume;
                    
    hypre_Index      loop_size;
    hypre_IndexRef   start;
@@ -156,9 +157,8 @@
 
       hypre_BoxGetSize(box, loop_size);
 
-      hypre_BoxLoop1Begin(dim, loop_size,
-                          data_box, start, stride, datai);
-      hypre_BoxLoop1For(datai)
+      hypre_SerialBoxLoop1Begin(dim, loop_size,
+				data_box, start, stride, datai);
       {
          /* Print line of the form: "%d: (%d, %d, %d; %d) %.14e\n" */
          hypre_BoxLoopGetIndex(index);
@@ -177,7 +177,7 @@
          hypre_fprintf(file, "; %d) %.14e\n", center_rank, value);
 #endif
       }
-      hypre_BoxLoop1End(datai);
+      hypre_SerialBoxLoop1End(datai);
       data += data_box_volume;
    }
 
@@ -242,7 +242,6 @@
    hypre_Box       *data_box;
                    
    HYPRE_Int        data_box_volume;
-   HYPRE_Int        datai;
                    
    hypre_Index      loop_size;
    hypre_IndexRef   start;
@@ -266,9 +265,8 @@
 
       hypre_BoxGetSize(box, loop_size);
 
-      hypre_BoxLoop1Begin(dim, loop_size,
-                          data_box, start, stride, datai);
-      hypre_BoxLoop1For(datai)
+      hypre_SerialBoxLoop1Begin(dim, loop_size,
+                                data_box, start, stride, datai);
       {
          /* Read lines of the form: "%d: (%d, %d, %d; %d) %le\n" */
          for (j = 0; j < num_values; j++)
@@ -282,7 +280,7 @@
                          &idummy, &data[datai + j*data_box_volume]);
          }
       }
-      hypre_BoxLoop1End(datai);
+      hypre_SerialBoxLoop1End(datai);
 
       data += num_values*data_box_volume;
    }
@@ -308,7 +306,6 @@
    hypre_Box       *data_box;
                    
    HYPRE_Int        data_box_volume, constant_stencil_size;
-   HYPRE_Int        datai;
                    
    hypre_Index      loop_size;
    hypre_IndexRef   start;
@@ -348,9 +345,8 @@
 
       if ( constant_coefficient==2 )
       {
-         hypre_BoxLoop1Begin(dim, loop_size,
-                             data_box, start, stride, datai);
-         hypre_BoxLoop1For(datai)
+         hypre_SerialBoxLoop1Begin(dim, loop_size,
+                                   data_box, start, stride, datai);
          {
             /* Read line of the form: "%d: (%d, %d, %d; %d) %.14e\n" */
             hypre_fscanf(file, "%d: (%d", &idummy, &idummy);
@@ -360,7 +356,7 @@
             }
             hypre_fscanf(file, "; %d) %le\n", &idummy, &data[datai]);
          }
-         hypre_BoxLoop1End(datai);
+         hypre_SerialBoxLoop1End(datai);
          data += data_box_volume;
       }
 
diff -Nru hypre-2.11.2/src/struct_mv/struct_matrix.c hypre-2.13.0/src/struct_mv/struct_matrix.c
--- hypre-2.11.2/src/struct_mv/struct_matrix.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_mv/struct_matrix.c	2017-10-20 17:42:22.000000000 +0000
@@ -103,12 +103,18 @@
       {
          if (hypre_StructMatrixDataAlloced(matrix))
          {
-            hypre_SharedTFree(hypre_StructMatrixData(matrix));
+            hypre_DeviceTFree(hypre_StructMatrixData(matrix));
          }
          hypre_CommPkgDestroy(hypre_StructMatrixCommPkg(matrix));
          
          hypre_ForBoxI(i, hypre_StructMatrixDataSpace(matrix))
-            hypre_TFree(hypre_StructMatrixDataIndices(matrix)[i]);
+	 {    
+            if (hypre_StructMatrixConstantCoefficient(matrix) < 2)
+               hypre_TFree(hypre_StructMatrixDataIndices(matrix)[i]);
+            else
+               hypre_UMTFree(hypre_StructMatrixDataIndices(matrix)[i]); 
+	 }
+	 
          hypre_TFree(hypre_StructMatrixDataIndices(matrix));
          
          hypre_BoxArrayDestroy(hypre_StructMatrixDataSpace(matrix));
@@ -340,7 +346,7 @@
             data_box = hypre_BoxArrayBox(data_space, i);
             data_box_volume  = hypre_BoxVolume(data_box);
 
-            data_indices[i] = hypre_CTAlloc(HYPRE_Int, stencil_size);
+            data_indices[i] = hypre_UMCTAlloc(HYPRE_Int, stencil_size);
 
             /* set pointers for "stored" coefficients */
             for (j = 0; j < stencil_size; j++)
@@ -423,11 +429,17 @@
 hypre_StructMatrixInitialize( hypre_StructMatrix *matrix )
 {
    HYPRE_Complex *data;
-
+   HYPRE_Int constant_coefficient;
+   constant_coefficient = hypre_StructMatrixConstantCoefficient(matrix);
    hypre_StructMatrixInitializeShell(matrix);
 
-   data = hypre_StructMatrixData(matrix);
-   data = hypre_SharedCTAlloc(HYPRE_Complex, hypre_StructMatrixDataSize(matrix));
+   //data = hypre_SharedCTAlloc(HYPRE_Complex, hypre_StructMatrixDataSize(matrix));
+   
+   if (constant_coefficient == 0)
+      data = hypre_DeviceCTAlloc(HYPRE_Complex, hypre_StructMatrixDataSize(matrix));
+   else
+      data = hypre_UMCTAlloc(HYPRE_Complex, hypre_StructMatrixDataSize(matrix));
+   
    hypre_StructMatrixInitializeData(matrix, data);
    hypre_StructMatrixDataAlloced(matrix) = 1;
 
@@ -1007,7 +1019,6 @@
    hypre_Box           *data_box;
    hypre_IndexRef       data_start;
    hypre_Index          data_stride;
-   HYPRE_Int            datai;
    HYPRE_Complex       *datap;
                    
    hypre_Index          loop_size;
@@ -1074,7 +1085,7 @@
                hypre_BoxLoop1Begin(hypre_StructMatrixNDim(matrix), loop_size,
                                    data_box,data_start,data_stride,datai);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,datai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop1For(datai)
                {
@@ -1129,7 +1140,7 @@
    hypre_IndexRef         start;
    hypre_Index            stride;
    HYPRE_Complex         *datap;
-   HYPRE_Int              i, j, ei, datai;
+   HYPRE_Int              i, j, ei;
    HYPRE_Int              num_entries;
    /* End - variables for ghost layer identity code below */
 
@@ -1206,7 +1217,7 @@
                hypre_BoxLoop1Begin(hypre_StructMatrixNDim(matrix), loop_size,
                                    data_box, start, stride, datai);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,datai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop1For(datai)
                {
@@ -1403,7 +1414,6 @@
    HYPRE_Int             ndim = hypre_StructMatrixNDim(matrix);
    hypre_Box            *m_data_box;
                         
-   HYPRE_Int             mi;
    HYPRE_Complex        *mp;
 
    hypre_StructStencil  *stencil;
@@ -1451,7 +1461,7 @@
                hypre_BoxLoop1Begin(hypre_StructMatrixNDim(matrix), loop_size,
                                    m_data_box, start, unit_stride, mi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,mi ) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop1For(mi)
                {
@@ -1832,7 +1842,7 @@
    hypre_StructStencil *stencil;
    hypre_BoxArray      *boundary;
 
-   HYPRE_Int           i, i2, ixyz, j;
+   HYPRE_Int           i, i2, j;
 
    /*-----------------------------------------------------------------------
     * Set the matrix coefficients
@@ -1857,7 +1867,7 @@
             data_box = hypre_BoxArrayBox(data_space, i);
             boundary = hypre_BoxArrayCreate( 0, ndim );
             hypre_GeneralBoxBoundaryIntersect(grid_box, grid, stencil_element,
-                boundary);
+                                              boundary);
             data = hypre_StructMatrixBoxData(matrix, i, j);
             hypre_ForBoxI(i2, boundary)
             {
@@ -1866,7 +1876,7 @@
                start = hypre_BoxIMin(tmp_box);
                hypre_BoxLoop1Begin(ndim, loop_size, data_box, start, stride, ixyz);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ixyz) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop1For(ixyz)
                {
diff -Nru hypre-2.11.2/src/struct_mv/struct_matvec.c hypre-2.13.0/src/struct_mv/struct_matvec.c
--- hypre-2.11.2/src/struct_mv/struct_matvec.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_mv/struct_matvec.c	2017-10-20 17:42:22.000000000 +0000
@@ -109,8 +109,6 @@
    hypre_BoxArrayArray     *compute_box_aa;
    hypre_Box               *y_data_box;
                           
-   HYPRE_Int                yi;
-                          
    HYPRE_Complex           *xp;
    HYPRE_Complex           *yp;
                           
@@ -158,7 +156,7 @@
          hypre_BoxLoop1Begin(hypre_StructVectorNDim(x), loop_size,
                              y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop1For(yi)
          {
@@ -222,7 +220,7 @@
                      hypre_BoxLoop1Begin(hypre_StructVectorNDim(x), loop_size,
                                          y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                      hypre_BoxLoop1For(yi)
                      {
@@ -237,7 +235,7 @@
                      hypre_BoxLoop1Begin(hypre_StructVectorNDim(x), loop_size,
                                          y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                      hypre_BoxLoop1For(yi)
                      {
@@ -321,8 +319,6 @@
    HYPRE_Int                xoff4;
    HYPRE_Int                xoff5;
    HYPRE_Int                xoff6;
-   HYPRE_Int                Ai;
-   HYPRE_Int                xi;
    hypre_BoxArray          *compute_box_a;
    hypre_Box               *compute_box;
                           
@@ -338,7 +334,6 @@
    HYPRE_Int                depth;
    hypre_Index              loop_size;
    hypre_IndexRef           start;
-   HYPRE_Int                yi;
    HYPRE_Int                ndim;
 
    stencil       = hypre_StructMatrixStencil(A);
@@ -399,7 +394,7 @@
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop3For(Ai, xi, yi)
                   {
@@ -442,7 +437,7 @@
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop3For(Ai, xi, yi)
                   {
@@ -481,7 +476,7 @@
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop3For(Ai, xi, yi)
                   {
@@ -516,7 +511,7 @@
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop3For(Ai, xi, yi)
                   {
@@ -547,7 +542,7 @@
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop3For(Ai, xi, yi)
                   {
@@ -574,7 +569,7 @@
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop3For(Ai, xi, yi)
                   {
@@ -597,7 +592,7 @@
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop3For(Ai, xi, yi)
                   {
@@ -666,7 +661,7 @@
    HYPRE_Int                xoff5;
    HYPRE_Int                xoff6;
    HYPRE_Int                Ai;
-   HYPRE_Int                xi;
+
    hypre_BoxArray          *compute_box_a;
    hypre_Box               *compute_box;
                           
@@ -681,7 +676,6 @@
    HYPRE_Int                depth;
    hypre_Index              loop_size;
    hypre_IndexRef           start;
-   HYPRE_Int                yi;
    HYPRE_Int                ndim;
 
    stencil       = hypre_StructMatrixStencil(A);
@@ -749,7 +743,7 @@
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -796,7 +790,7 @@
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -838,7 +832,7 @@
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -875,7 +869,7 @@
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -907,7 +901,7 @@
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -934,7 +928,7 @@
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -956,7 +950,7 @@
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -1010,8 +1004,7 @@
    HYPRE_Int                xoff6;
    HYPRE_Int                si_center, center_rank;
    hypre_Index              center_index;
-   HYPRE_Int                Ai, Ai_CC;
-   HYPRE_Int                xi;
+   HYPRE_Int                Ai_CC;
    hypre_BoxArray          *compute_box_a;
    hypre_Box               *compute_box;
                           
@@ -1027,7 +1020,6 @@
    HYPRE_Int                depth;
    hypre_Index              loop_size;
    hypre_IndexRef           start;
-   HYPRE_Int                yi;
    HYPRE_Int                ndim;
 
    stencil       = hypre_StructMatrixStencil(A);
@@ -1120,7 +1112,7 @@
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -1180,7 +1172,7 @@
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -1233,7 +1225,7 @@
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -1280,7 +1272,7 @@
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -1321,7 +1313,7 @@
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -1356,7 +1348,7 @@
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -1382,7 +1374,7 @@
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -1405,7 +1397,7 @@
                                 x_data_box, start, stride, xi,
                                 y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop3For(Ai, xi, yi)
             {
@@ -1421,7 +1413,7 @@
                                 x_data_box, start, stride, xi,
                                 y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop3For(Ai, xi, yi)
             {
diff -Nru hypre-2.11.2/src/struct_mv/struct_scale.c hypre-2.13.0/src/struct_mv/struct_scale.c
--- hypre-2.11.2/src/struct_mv/struct_scale.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_mv/struct_scale.c	2017-10-20 17:42:22.000000000 +0000
@@ -28,7 +28,6 @@
 {
    hypre_Box       *y_data_box;
                    
-   HYPRE_Int        yi;
    HYPRE_Complex   *yp;
                    
    hypre_BoxArray  *boxes;
@@ -55,7 +54,7 @@
       hypre_BoxLoop1Begin(hypre_StructVectorNDim(y), loop_size,
                           y_data_box, start, unit_stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop1For(yi)
       {
diff -Nru hypre-2.11.2/src/struct_mv/struct_vector.c hypre-2.13.0/src/struct_mv/struct_vector.c
--- hypre-2.11.2/src/struct_mv/struct_vector.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/struct_mv/struct_vector.c	2017-10-20 17:42:22.000000000 +0000
@@ -70,7 +70,7 @@
       {
          if (hypre_StructVectorDataAlloced(vector))
          {
-            hypre_SharedTFree(hypre_StructVectorData(vector));
+            hypre_DeviceTFree(hypre_StructVectorData(vector));
          }
          hypre_TFree(hypre_StructVectorDataIndices(vector));
          hypre_BoxArrayDestroy(hypre_StructVectorDataSpace(vector));
@@ -186,7 +186,8 @@
 
    hypre_StructVectorInitializeShell(vector);
 
-   data = hypre_SharedCTAlloc(HYPRE_Complex, hypre_StructVectorDataSize(vector));
+   data = hypre_DeviceCTAlloc(HYPRE_Complex, hypre_StructVectorDataSize(vector));
+
    hypre_StructVectorInitializeData(vector, data);
    hypre_StructVectorDataAlloced(vector) = 1;
 
@@ -294,13 +295,11 @@
    hypre_Box          *data_box;
    hypre_IndexRef      data_start;
    hypre_Index         data_stride;
-   HYPRE_Int           datai;
    HYPRE_Complex      *datap;
 
    hypre_Box          *dval_box;
    hypre_Index         dval_start;
    hypre_Index         dval_stride;
-   HYPRE_Int           dvali;
 
    hypre_Index         loop_size;
 
@@ -364,7 +363,7 @@
                                 data_box,data_start,data_stride,datai,
                                 dval_box,dval_start,dval_stride,dvali);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,datai,dvali) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(datai, dvali)
             {
@@ -480,7 +479,6 @@
    hypre_Box          *data_box;
    hypre_IndexRef      data_start;
    hypre_Index         data_stride;
-   HYPRE_Int           datai;
    HYPRE_Complex      *datap;
 
    hypre_Index         loop_size;
@@ -539,7 +537,7 @@
          hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size,
                              data_box,data_start,data_stride,datai);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,datai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop1For(datai)
          {
@@ -562,15 +560,26 @@
 {
    HYPRE_Complex *data      = hypre_StructVectorData(vector);
    HYPRE_Int      data_size = hypre_StructVectorDataSize(vector);
-   HYPRE_Int      i;
+   hypre_Index    imin, imax;
+   hypre_Box     *box;
+
+   box = hypre_BoxCreate(1);
+   hypre_IndexD(imin, 0) = 1;
+   hypre_IndexD(imax, 0) = data_size;
+   hypre_BoxSetExtents(box, imin, imax);
 
+   hypre_BoxLoop1Begin(1, imax,
+                       box, imin, imin, datai);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-   for (i = 0; i < data_size; i++)
+   hypre_BoxLoop1For(datai)
    {
-      data[i] = 0.0;
+      data[datai] = 0.0;
    }
+   hypre_BoxLoop1End(datai);
+   
+   hypre_BoxDestroy(box);
 
    return hypre_error_flag;
 }
@@ -616,7 +625,6 @@
 {
    hypre_Box          *x_data_box;
                     
-   HYPRE_Int           vi;
    HYPRE_Complex      *xp, *yp;
 
    hypre_BoxArray     *boxes;
@@ -649,7 +657,7 @@
       hypre_BoxLoop1Begin(hypre_StructVectorNDim(x), loop_size,
                           x_data_box, start, unit_stride, vi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,vi ) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop1For(vi)
       {
@@ -670,7 +678,6 @@
 {
    hypre_Box          *v_data_box;
                     
-   HYPRE_Int           vi;
    HYPRE_Complex      *vp;
 
    hypre_BoxArray     *boxes;
@@ -702,7 +709,7 @@
       hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size,
                           v_data_box, start, unit_stride, vi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,vi ) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop1For(vi)
       {
@@ -727,7 +734,6 @@
 {
    hypre_Box          *v_data_box;
                     
-   HYPRE_Int           vi;
    HYPRE_Complex      *vp;
 
    hypre_BoxArray     *boxes;
@@ -756,27 +762,28 @@
  
       hypre_BoxGetSize(box, loop_size);
 
-      hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size,
-                          v_data_box, start, unit_stride, vi);
       i = hypre_IndexD(start, 0);
       j = hypre_IndexD(start, 1);
       k = hypre_IndexD(start, 2);
+      
+      hypre_SerialBoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size,
+				v_data_box, start, unit_stride, vi);
+
 /* RDF: This won't work as written with threading on */
+       
 #if 0
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,vi ) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE ) HYPRE_SMP_SCHEDULE
 #endif
 #else
-      hypre_BoxLoopSetOneBlock();
 #endif
-      hypre_BoxLoop1For(vi)
       {
          vp[vi] = fcn(i, j, k);
          i++;
          j++;
          k++;
       }
-      hypre_BoxLoop1End(vi);
+      hypre_SerialBoxLoop1End(vi);
    }
 
    return hypre_error_flag;
@@ -791,7 +798,6 @@
    HYPRE_Int           ndim = hypre_StructVectorNDim(vector);
    hypre_Box          *v_data_box;
                     
-   HYPRE_Int           vi;
    HYPRE_Complex      *vp;
 
    hypre_BoxArray     *boxes;
@@ -830,7 +836,7 @@
          hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size,
                              v_data_box, start, unit_stride, vi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,vi ) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop1For(vi)
          {
@@ -853,7 +859,6 @@
                                          HYPRE_Int           force )
 {
    HYPRE_Int           ndim = hypre_StructVectorNDim(vector);
-   HYPRE_Int           vi;
    HYPRE_Complex      *vp;
    hypre_BoxArray     *boxes;
    hypre_Box          *box;
@@ -905,7 +910,7 @@
             hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size,
                                 v_data_box, start, stride, vi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,vi ) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE ) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop1For(vi)
             {
@@ -930,7 +935,6 @@
 HYPRE_Int 
 hypre_StructVectorScaleValues( hypre_StructVector *vector, HYPRE_Complex factor )
 {
-   HYPRE_Int         datai;
    HYPRE_Complex    *data;
 
    hypre_Index       imin;
@@ -953,7 +957,7 @@
    hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size,
                        box, imin, imin, datai);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,datai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
    hypre_BoxLoop1For(datai)
    {
@@ -1180,7 +1184,6 @@
    A hypre_Index corresponding to max_index is put in max_xyz_index.
    We assume that there is only one box to deal with. */
 {
-   HYPRE_Int         datai;
    HYPRE_Real       *data;
 
    hypre_Index       imin;
@@ -1211,16 +1214,16 @@
       data = hypre_StructVectorBoxData(vector, i);
       hypre_BoxGetSize(box, loop_size);
       hypre_CopyIndex( hypre_BoxIMin(box), imin );
-
-      hypre_BoxLoop1Begin(ndim, loop_size,
-                          box, imin, unit_stride, datai);
       maxindex = hypre_BoxIndexRank( box, imin );
       maxvalue = data[maxindex];
       hypre_SetIndex(max_xyz_index, 0);
+/*FIXME: must run sequentially*/
+      zypre_BoxLoop1Begin(ndim, loop_size,
+                          box, imin, unit_stride, datai);      
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,datai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-      hypre_BoxLoop1For(datai)
+      zypre_BoxLoop1For(datai)
       {
          if ( data[datai] > maxvalue )
          {
@@ -1229,7 +1232,7 @@
             hypre_BoxLoopGetIndex(max_xyz_index);
          }
       }
-      hypre_BoxLoop1End(datai);
+      zypre_BoxLoop1End(datai);
       hypre_AddIndexes(max_xyz_index, imin, ndim, max_xyz_index);
    }
 
@@ -1245,7 +1248,7 @@
  *--------------------------------------------------------------------------*/
 hypre_StructVector *
 hypre_StructVectorClone(
-	hypre_StructVector *x)
+   hypre_StructVector *x)
 {
    MPI_Comm		comm = hypre_StructVectorComm(x);
    hypre_StructGrid    *grid = hypre_StructVectorGrid(x);
@@ -1259,11 +1262,12 @@
 
    hypre_StructVectorDataSize(y) = data_size;
    hypre_StructVectorDataSpace(y) = hypre_BoxArrayDuplicate(data_space);
-   hypre_StructVectorData(y) = hypre_CTAlloc(HYPRE_Complex,data_size);
+   hypre_StructVectorData(y) = hypre_DeviceCTAlloc(HYPRE_Complex, data_size);
+   
    hypre_StructVectorDataIndices(y) = hypre_CTAlloc(HYPRE_Int, data_space_size);
 
    for (i=0; i < data_space_size; i++)
-       hypre_StructVectorDataIndices(y)[i] = data_indices[i];
+      hypre_StructVectorDataIndices(y)[i] = data_indices[i];
 
    hypre_StructVectorCopy( x, y );
 
diff -Nru hypre-2.11.2/src/test/ams_driver.c hypre-2.13.0/src/test/ams_driver.c
--- hypre-2.11.2/src/test/ams_driver.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/test/ams_driver.c	2017-10-20 17:42:22.000000000 +0000
@@ -128,7 +128,7 @@
    hypre_MPI_Init(&argc, &argv);
    hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs);
    hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid);
-
+   hypre_GPUInit(-1);
    /* Set defaults */
    solver_id = 3;
    maxit = 100;
@@ -752,7 +752,7 @@
 
    if (zero_cond)
       HYPRE_ParVectorDestroy(interior_nodes);
-
+   hypre_GPUFinalize();
    hypre_MPI_Finalize();
 
    if (HYPRE_GetError() && !myid)
diff -Nru hypre-2.11.2/src/test/for_maxwell.c hypre-2.13.0/src/test/for_maxwell.c
--- hypre-2.11.2/src/test/for_maxwell.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/test/for_maxwell.c	2017-10-20 17:42:22.000000000 +0000
@@ -1935,7 +1935,7 @@
    for (j = 0; j < data.max_boxsize; j++)
    {
       values[j]= sin((HYPRE_Real)(j+1));
-      values[j]= (HYPRE_Real) rand()/RAND_MAX;
+      values[j]= (HYPRE_Real) hypre_Rand();
       values[j]= (HYPRE_Real) j;
    }
    for (part = 0; part < data.nparts; part++)
diff -Nru hypre-2.11.2/src/test/ij.c hypre-2.13.0/src/test/ij.c
--- hypre-2.11.2/src/test/ij.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/test/ij.c	2017-10-20 17:42:22.000000000 +0000
@@ -44,6 +44,9 @@
 #include "multivector.h"
 #include "HYPRE_MatvecFunctions.h"
 
+/* max dt */
+#define DT_INF 1.0e30
+
 HYPRE_Int
 BuildParIsoLaplacian( HYPRE_Int argc, char** argv, HYPRE_ParCSRMatrix *A_ptr );
 
@@ -58,7 +61,7 @@
 
 HYPRE_Int BuildParLaplacian (HYPRE_Int argc , char *argv [], HYPRE_Int arg_index , HYPRE_ParCSRMatrix *A_ptr );
 HYPRE_Int BuildParSysLaplacian (HYPRE_Int argc , char *argv [], HYPRE_Int arg_index , HYPRE_ParCSRMatrix *A_ptr );
-HYPRE_Int BuildParDifConv (HYPRE_Int argc , char *argv [], HYPRE_Int arg_index , HYPRE_ParCSRMatrix *A_ptr );
+HYPRE_Int BuildParDifConv (HYPRE_Int argc , char *argv [], HYPRE_Int arg_index , HYPRE_ParCSRMatrix *A_ptr);
 HYPRE_Int BuildParFromOneFile (HYPRE_Int argc , char *argv [], HYPRE_Int arg_index , HYPRE_Int num_functions , HYPRE_ParCSRMatrix *A_ptr );
 HYPRE_Int BuildFuncsFromFiles (HYPRE_Int argc , char *argv [], HYPRE_Int arg_index , HYPRE_ParCSRMatrix A , HYPRE_Int **dof_func_ptr );
 HYPRE_Int BuildFuncsFromOneFile (HYPRE_Int argc , char *argv [], HYPRE_Int arg_index , HYPRE_ParCSRMatrix A , HYPRE_Int **dof_func_ptr );
@@ -132,7 +135,7 @@
    HYPRE_IJVector      *ij_rbm;
 
    HYPRE_ParCSRMatrix  parcsr_A;
-   HYPRE_ParVector     b;
+   HYPRE_ParVector     b = NULL;
    HYPRE_ParVector     x;
    HYPRE_ParVector     *interp_vecs = NULL;
 
@@ -173,7 +176,7 @@
    HYPRE_Int Q_max = 0;
    HYPRE_Real Q_trunc = 0;
 
-   const HYPRE_Real dt_inf = 1.e40;
+   const HYPRE_Real dt_inf = DT_INF;
    HYPRE_Real dt = dt_inf;
 
    /* parameters for BoomerAMG */
@@ -213,6 +216,7 @@
    HYPRE_Int additive = -1;
    HYPRE_Int mult_add = -1;
    HYPRE_Int simple = -1;
+   HYPRE_Int add_last_lvl = -1;
    HYPRE_Int add_P_max_elmts = 0;
    HYPRE_Real add_trunc_factor = 0;
 
@@ -228,6 +232,9 @@
    HYPRE_Real   max_row_sum = 1.;
 
    HYPRE_Int cheby_order = 2;
+   HYPRE_Int cheby_eig_est = 10;
+   HYPRE_Int cheby_variant = 0;
+   HYPRE_Int cheby_scale = 1;
    HYPRE_Real cheby_fraction = .3;
 
    /* for CGC BM Aug 25, 2006 */
@@ -323,6 +330,20 @@
 
    /* end lobpcg */
 
+   /* mgr options */
+   HYPRE_Int mgr_bsize = 1;
+   HYPRE_Int mgr_nlevels = 0;
+   HYPRE_Int mgr_num_reserved_nodes = 0;
+   HYPRE_Int mgr_non_c_to_f = 1;
+   HYPRE_Int mgr_frelax_method = 0;
+   HYPRE_Int *mgr_num_cindexes = NULL; 
+   HYPRE_Int **mgr_cindexes = NULL;
+   HYPRE_Int *mgr_reserved_coarse_indexes = NULL;
+   HYPRE_Int mgr_relax_type = 0;
+   HYPRE_Int mgr_num_relax_sweeps = 2;
+   HYPRE_Int mgr_interp_type = 2;
+   HYPRE_Int mgr_num_interp_sweeps = 2;
+   /* end mgr options */
 
    HYPRE_Real     *nongalerk_tol = NULL;
    HYPRE_Int       nongalerk_num_tol = 0;
@@ -342,6 +363,8 @@
 
    hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs );
    hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid );
+   hypre_GPUInit(-1);
+   //nvtxDomainHandle_t domain = nvtxDomainCreateA("Domain_A");
 /*
   hypre_InitMemoryDebug(myid);
 */
@@ -848,6 +871,33 @@
           pcgMode = atoi(argv[arg_index++]);
         }
       /* end lobpcg */
+      /* begin mgr options*/
+      else if ( strcmp(argv[arg_index], "-mgr_bsize") == 0 )
+      {                /* mgr block size */
+         arg_index++;        
+         mgr_bsize = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-mgr_nlevels") == 0 )
+      {                /* mgr number of coarsening levels */
+         arg_index++;        
+         mgr_nlevels = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-mgr_non_c_to_f") == 0 )
+      {                /* mgr intermediate coarse grid strategy */
+         arg_index++;        
+         mgr_non_c_to_f = atoi(argv[arg_index++]);
+      }      
+      else if ( strcmp(argv[arg_index], "-mgr_num_reserved_nodes") == 0 )
+      {                /* mgr number of reserved nodes to be put on coarsest grid */
+         arg_index++;        
+         mgr_num_reserved_nodes = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-mgr_frelax_method") == 0 )
+      {                /* mgr F-relaxation strategy: single/ multi level */
+         arg_index++;        
+         mgr_frelax_method = atoi(argv[arg_index++]);
+      }
+      /* end mgr options */
       else
       {
          arg_index++;
@@ -886,7 +936,8 @@
    /* defaults for BoomerAMG */
    if (solver_id == 0 || solver_id == 1 || solver_id == 3 || solver_id == 5
        || solver_id == 9 || solver_id == 13 || solver_id == 14
-       || solver_id == 15 || solver_id == 20 || solver_id == 51 || solver_id == 61)
+       || solver_id == 15 || solver_id == 20 || solver_id == 51 || solver_id == 61 
+       || solver_id == 70 || solver_id == 71 || solver_id == 72)
    {
       strong_threshold = 0.25;
       trunc_factor = 0.;
@@ -1207,6 +1258,21 @@
          arg_index++;
          cheby_order = atoi(argv[arg_index++]);
       }
+      else if ( strcmp(argv[arg_index], "-cheby_eig_est") == 0 )
+      {
+         arg_index++;
+         cheby_eig_est = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-cheby_variant") == 0 )
+      {
+         arg_index++;
+         cheby_variant = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-cheby_scale") == 0 )
+      {
+         arg_index++;
+         cheby_scale = atoi(argv[arg_index++]);
+      }
       else if ( strcmp(argv[arg_index], "-cheby_fraction") == 0 )
       {
          arg_index++;
@@ -1227,6 +1293,11 @@
          arg_index++;
          simple  = atoi(argv[arg_index++]);
       }
+      else if ( strcmp(argv[arg_index], "-add_end") == 0 )
+      {
+         arg_index++;
+         add_last_lvl  = atoi(argv[arg_index++]);
+      }
       else if ( strcmp(argv[arg_index], "-add_Pmx") == 0 )
       {
          arg_index++;
@@ -1291,289 +1362,313 @@
     * Print usage info
     *-----------------------------------------------------------*/
  
-   if ( (print_usage) && (myid == 0) )
+   if ( print_usage )
    {
-      hypre_printf("\n");
-      hypre_printf("Usage: %s [<options>]\n", argv[0]);
-      hypre_printf("\n");
-      hypre_printf("  -fromfile <filename>       : ");
-      hypre_printf("matrix read from multiple files (IJ format)\n");
-      hypre_printf("  -fromparcsrfile <filename> : ");
-      hypre_printf("matrix read from multiple files (ParCSR format)\n");
-      hypre_printf("  -fromonecsrfile <filename> : ");
-      hypre_printf("matrix read from a single file (CSR format)\n");
-      hypre_printf("\n");
-      hypre_printf("  -laplacian [<options>] : build 5pt 2D laplacian problem (default) \n");
-      hypre_printf("  -sysL <num functions>  : build SYSTEMS laplacian 7pt operator\n");
-      hypre_printf("  -9pt [<opts>]          : build 9pt 2D laplacian problem\n");
-      hypre_printf("  -27pt [<opts>]         : build 27pt 3D laplacian problem\n");
-      hypre_printf("  -difconv [<opts>]      : build convection-diffusion problem\n");
-      hypre_printf("    -n <nx> <ny> <nz>    : total problem size \n");
-      hypre_printf("    -P <Px> <Py> <Pz>    : processor topology\n");
-      hypre_printf("    -c <cx> <cy> <cz>    : diffusion coefficients\n");
-      hypre_printf("    -a <ax> <ay> <az>    : convection coefficients\n");
-      hypre_printf("\n");
-      hypre_printf("  -exact_size            : inserts immediately into ParCSR structure\n");
-      hypre_printf("  -storage_low           : allocates not enough storage for aux struct\n");
-      hypre_printf("  -concrete_parcsr       : use parcsr matrix type as concrete type\n");
-      hypre_printf("\n");
-      hypre_printf("  -rhsfromfile           : ");
-      hypre_printf("rhs read from multiple files (IJ format)\n");
-      hypre_printf("  -rhsfromonefile        : ");
-      hypre_printf("rhs read from a single file (CSR format)\n");
-      hypre_printf("  -rhsparcsrfile        :  ");
-      hypre_printf("rhs read from multiple files (ParCSR format)\n");
-      hypre_printf("  -rhsrand               : rhs is random vector\n");
-      hypre_printf("  -rhsisone              : rhs is vector with unit components (default)\n");
-      hypre_printf("  -xisone                : solution of all ones\n");
-      hypre_printf("  -rhszero               : rhs is zero vector\n");
-      hypre_printf("\n");
-      hypre_printf("  -dt <val>              : specify finite backward Euler time step\n");
-      hypre_printf("                         :    -rhsfromfile, -rhsfromonefile, -rhsrand,\n");
-      hypre_printf("                         :    -rhsrand, or -xisone will be ignored\n");
-      hypre_printf("  -srcfromfile           : ");
-      hypre_printf("backward Euler source read from multiple files (IJ format)\n");
-      hypre_printf("  -srcfromonefile        : ");
-      hypre_printf("backward Euler source read from a single file (IJ format)\n");
-      hypre_printf("  -srcrand               : ");
-      hypre_printf("backward Euler source is random vector with components in range 0 - 1\n");
-      hypre_printf("  -srcisone              : ");
-      hypre_printf("backward Euler source is vector with unit components (default)\n");
-      hypre_printf("  -srczero               : ");
-      hypre_printf("backward Euler source is zero-vector\n");
-      hypre_printf("\n");
-      hypre_printf("  -solver <ID>           : solver ID\n");
-      hypre_printf("       0=AMG               1=AMG-PCG        \n");
-      hypre_printf("       2=DS-PCG            3=AMG-GMRES      \n");
-      hypre_printf("       4=DS-GMRES          5=AMG-CGNR       \n");     
-      hypre_printf("       6=DS-CGNR           7=PILUT-GMRES    \n");     
-      hypre_printf("       8=ParaSails-PCG     9=AMG-BiCGSTAB   \n");
-      hypre_printf("       10=DS-BiCGSTAB     11=PILUT-BiCGSTAB \n");
-      hypre_printf("       12=Schwarz-PCG     13=GSMG           \n");     
-      hypre_printf("       14=GSMG-PCG        15=GSMG-GMRES\n");     
-      hypre_printf("       18=ParaSails-GMRES\n");     
-      hypre_printf("       20=Hybrid solver/ DiagScale, AMG \n");
-      hypre_printf("       43=Euclid-PCG      44=Euclid-GMRES   \n");
-      hypre_printf("       45=Euclid-BICGSTAB\n");
-      hypre_printf("       50=DS-LGMRES         51=AMG-LGMRES     \n");
-      hypre_printf("       60=DS-FlexGMRES         61=AMG-FlexGMRES     \n");
-      hypre_printf("\n");
-      hypre_printf("  -cljp                 : CLJP coarsening \n");
-      hypre_printf("  -cljp1                : CLJP coarsening, fixed random \n");
-      hypre_printf("  -cgc                  : CGC coarsening \n");
-      hypre_printf("  -cgce                 : CGC-E coarsening \n");
-      hypre_printf("  -pmis                 : PMIS coarsening \n");
-      hypre_printf("  -pmis1                : PMIS coarsening, fixed random \n");
-      hypre_printf("  -hmis                 : HMIS coarsening (default)\n");
-      hypre_printf("  -ruge                 : Ruge-Stueben coarsening (local)\n");
-      hypre_printf("  -ruge1p               : Ruge-Stueben coarsening 1st pass only(local)\n");
-      hypre_printf("  -ruge3                : third pass on boundary\n");
-      hypre_printf("  -ruge3c               : third pass on boundary, keep c-points\n");
-      hypre_printf("  -falgout              : local Ruge_Stueben followed by CLJP\n");
-      hypre_printf("  -gm                   : use global measures\n");
-      hypre_printf("\n");
-      hypre_printf("  -interptype  <val>    : set interpolation type\n");
-      hypre_printf("       0=Classical modified interpolation  \n");
-      hypre_printf("       1=least squares interpolation (for GSMG only)  \n");
-      hypre_printf("       0=Classical modified interpolation for hyperbolic PDEs \n");
-      hypre_printf("       3=direct interpolation with separation of weights  \n");
-      hypre_printf("       4=multipass interpolation  \n");
-      hypre_printf("       5=multipass interpolation with separation of weights  \n");
-      hypre_printf("       6=extended classical modified interpolation (default) \n");
-      hypre_printf("       7=extended (only if no common C neighbor) interpolation  \n");
-      hypre_printf("       8=standard interpolation  \n");
-      hypre_printf("       9=standard interpolation with separation of weights  \n");
-      hypre_printf("      12=FF interpolation  \n");
-      hypre_printf("      13=FF1 interpolation  \n");
- 
-      hypre_printf("      16=use modified unknown interpolation for a system (w/unknown or hybrid approach) \n");
-      hypre_printf("      17=use non-systems interp = 6 for a system (w/unknown or hybrid approach) \n");
-      hypre_printf("      18=use non-systems interp = 8 for a system (w/unknown or hybrid approach) \n");
-      hypre_printf("      19=use non-systems interp = 0 for a system (w/unknown or hybrid approach) \n");
-      
+      if ( myid == 0 )
+      {
+         hypre_printf("\n");
+         hypre_printf("Usage: %s [<options>]\n", argv[0]);
+         hypre_printf("\n");
+         hypre_printf("  -fromfile <filename>       : ");
+         hypre_printf("matrix read from multiple files (IJ format)\n");
+         hypre_printf("  -fromparcsrfile <filename> : ");
+         hypre_printf("matrix read from multiple files (ParCSR format)\n");
+         hypre_printf("  -fromonecsrfile <filename> : ");
+         hypre_printf("matrix read from a single file (CSR format)\n");
+         hypre_printf("\n");
+         hypre_printf("  -laplacian [<options>] : build 5pt 2D laplacian problem (default) \n");
+         hypre_printf("  -sysL <num functions>  : build SYSTEMS laplacian 7pt operator\n");
+         hypre_printf("  -9pt [<opts>]          : build 9pt 2D laplacian problem\n");
+         hypre_printf("  -27pt [<opts>]         : build 27pt 3D laplacian problem\n");
+         hypre_printf("  -difconv [<opts>]      : build convection-diffusion problem\n");
+         hypre_printf("    -n <nx> <ny> <nz>    : total problem size \n");
+         hypre_printf("    -P <Px> <Py> <Pz>    : processor topology\n");
+         hypre_printf("    -c <cx> <cy> <cz>    : diffusion coefficients\n");
+         hypre_printf("    -a <ax> <ay> <az>    : convection coefficients\n");
+         hypre_printf("    -atype <type>        : FD scheme for convection \n");
+         hypre_printf("           0=Forward (default)       1=Backward\n");
+         hypre_printf("           2=Centered                3=Upwind\n");
+         hypre_printf("\n");
+         hypre_printf("  -exact_size            : inserts immediately into ParCSR structure\n");
+         hypre_printf("  -storage_low           : allocates not enough storage for aux struct\n");
+         hypre_printf("  -concrete_parcsr       : use parcsr matrix type as concrete type\n");
+         hypre_printf("\n");
+         hypre_printf("  -rhsfromfile           : ");
+         hypre_printf("rhs read from multiple files (IJ format)\n");
+         hypre_printf("  -rhsfromonefile        : ");
+         hypre_printf("rhs read from a single file (CSR format)\n");
+         hypre_printf("  -rhsparcsrfile        :  ");
+         hypre_printf("rhs read from multiple files (ParCSR format)\n");
+         hypre_printf("  -rhsrand               : rhs is random vector\n");
+         hypre_printf("  -rhsisone              : rhs is vector with unit components (default)\n");
+         hypre_printf("  -xisone                : solution of all ones\n");
+         hypre_printf("  -rhszero               : rhs is zero vector\n");
+         hypre_printf("\n");
+         hypre_printf("  -dt <val>              : specify finite backward Euler time step\n");
+         hypre_printf("                         :    -rhsfromfile, -rhsfromonefile, -rhsrand,\n");
+         hypre_printf("                         :    -rhsrand, or -xisone will be ignored\n");
+         hypre_printf("  -srcfromfile           : ");
+         hypre_printf("backward Euler source read from multiple files (IJ format)\n");
+         hypre_printf("  -srcfromonefile        : ");
+         hypre_printf("backward Euler source read from a single file (IJ format)\n");
+         hypre_printf("  -srcrand               : ");
+         hypre_printf("backward Euler source is random vector with components in range 0 - 1\n");
+         hypre_printf("  -srcisone              : ");
+         hypre_printf("backward Euler source is vector with unit components (default)\n");
+         hypre_printf("  -srczero               : ");
+         hypre_printf("backward Euler source is zero-vector\n");
+         hypre_printf("\n");
+         hypre_printf("  -solver <ID>           : solver ID\n");
+         hypre_printf("       0=AMG               1=AMG-PCG        \n");
+         hypre_printf("       2=DS-PCG            3=AMG-GMRES      \n");
+         hypre_printf("       4=DS-GMRES          5=AMG-CGNR       \n");     
+         hypre_printf("       6=DS-CGNR           7=PILUT-GMRES    \n");     
+         hypre_printf("       8=ParaSails-PCG     9=AMG-BiCGSTAB   \n");
+         hypre_printf("       10=DS-BiCGSTAB     11=PILUT-BiCGSTAB \n");
+         hypre_printf("       12=Schwarz-PCG     13=GSMG           \n");     
+         hypre_printf("       14=GSMG-PCG        15=GSMG-GMRES\n");     
+         hypre_printf("       18=ParaSails-GMRES\n");     
+         hypre_printf("       20=Hybrid solver/ DiagScale, AMG \n");
+         hypre_printf("       43=Euclid-PCG      44=Euclid-GMRES   \n");
+         hypre_printf("       45=Euclid-BICGSTAB\n");
+         hypre_printf("       50=DS-LGMRES         51=AMG-LGMRES     \n");
+         hypre_printf("       60=DS-FlexGMRES         61=AMG-FlexGMRES     \n");
+	 hypre_printf("       70=MGR             71=MGR-PCG   72=MGR-FlexGMRES    \n");
+         hypre_printf("\n");
+         hypre_printf("  -cljp                 : CLJP coarsening \n");
+         hypre_printf("  -cljp1                : CLJP coarsening, fixed random \n");
+         hypre_printf("  -cgc                  : CGC coarsening \n");
+         hypre_printf("  -cgce                 : CGC-E coarsening \n");
+         hypre_printf("  -pmis                 : PMIS coarsening \n");
+         hypre_printf("  -pmis1                : PMIS coarsening, fixed random \n");
+         hypre_printf("  -hmis                 : HMIS coarsening (default)\n");
+         hypre_printf("  -ruge                 : Ruge-Stueben coarsening (local)\n");
+         hypre_printf("  -ruge1p               : Ruge-Stueben coarsening 1st pass only(local)\n");
+         hypre_printf("  -ruge3                : third pass on boundary\n");
+         hypre_printf("  -ruge3c               : third pass on boundary, keep c-points\n");
+         hypre_printf("  -falgout              : local Ruge_Stueben followed by CLJP\n");
+         hypre_printf("  -gm                   : use global measures\n");
+         hypre_printf("\n");
+         hypre_printf("  -interptype  <val>    : set interpolation type\n");
+         hypre_printf("       0=Classical modified interpolation  \n");
+         hypre_printf("       1=least squares interpolation (for GSMG only)  \n");
+         hypre_printf("       0=Classical modified interpolation for hyperbolic PDEs \n");
+         hypre_printf("       3=direct interpolation with separation of weights  \n");
+         hypre_printf("       4=multipass interpolation  \n");
+         hypre_printf("       5=multipass interpolation with separation of weights  \n");
+         hypre_printf("       6=extended classical modified interpolation (default) \n");
+         hypre_printf("       7=extended (only if no common C neighbor) interpolation  \n");
+         hypre_printf("       8=standard interpolation  \n");
+         hypre_printf("       9=standard interpolation with separation of weights  \n");
+         hypre_printf("      12=FF interpolation  \n");
+         hypre_printf("      13=FF1 interpolation  \n");
+
+         hypre_printf("      16=use modified unknown interpolation for a system (w/unknown or hybrid approach) \n");
+         hypre_printf("      17=use non-systems interp = 6 for a system (w/unknown or hybrid approach) \n");
+         hypre_printf("      18=use non-systems interp = 8 for a system (w/unknown or hybrid approach) \n");
+         hypre_printf("      19=use non-systems interp = 0 for a system (w/unknown or hybrid approach) \n");
+
+
+         hypre_printf("      10=classical block interpolation for nodal systems AMG\n");
+         hypre_printf("      11=classical block interpolation with diagonal blocks for nodal systems AMG\n");
+         hypre_printf("      20=same as 10, but don't add weak connect. to diag \n");
+         hypre_printf("      21=same as 11, but don't add weak connect. to diag \n");
+         hypre_printf("      22=classical block interpolation w/Ruge's variant for nodal systems AMG \n");
+         hypre_printf("      23=same as 22, but use row sums for diag scaling matrices,for nodal systems AMG \n");
+         hypre_printf("      24=direct block interpolation for nodal systems AMG\n");
 
-      hypre_printf("      10=classical block interpolation for nodal systems AMG\n");
-      hypre_printf("      11=classical block interpolation with diagonal blocks for nodal systems AMG\n");
-      hypre_printf("      20=same as 10, but don't add weak connect. to diag \n");
-      hypre_printf("      21=same as 11, but don't add weak connect. to diag \n");
-      hypre_printf("      22=classical block interpolation w/Ruge's variant for nodal systems AMG \n");
-      hypre_printf("      23=same as 22, but use row sums for diag scaling matrices,for nodal systems AMG \n");
-      hypre_printf("      24=direct block interpolation for nodal systems AMG\n");
 
 
-     
-      hypre_printf("\n");
-      hypre_printf("  -rlx  <val>            : relaxation type\n");
-      hypre_printf("       0=Weighted Jacobi  \n");
-      hypre_printf("       1=Gauss-Seidel (very slow!)  \n");
-      hypre_printf("       3=Hybrid Gauss-Seidel  \n");
-      hypre_printf("       4=Hybrid backward Gauss-Seidel  \n");
-      hypre_printf("       6=Hybrid symmetric Gauss-Seidel  \n");
-      hypre_printf("       8= symmetric L1-Gauss-Seidel  \n");
-      hypre_printf("       13= forward L1-Gauss-Seidel  \n");
-      hypre_printf("       14= backward L1-Gauss-Seidel  \n");
-      hypre_printf("       15=CG  \n");
-      hypre_printf("       16=Chebyshev  \n");
-      hypre_printf("       17=FCF-Jacobi  \n");
-      hypre_printf("       18=L1-Jacobi (may be used with -CF) \n");
-      hypre_printf("       9=Gauss elimination (use for coarsest grid only)  \n");
-      hypre_printf("       99=Gauss elimination with pivoting (use for coarsest grid only)  \n");
-      hypre_printf("       20= Nodal Weighted Jacobi (for systems only) \n");
-      hypre_printf("       23= Nodal Hybrid Jacobi/Gauss-Seidel (for systems only) \n");
-      hypre_printf("       26= Nodal Hybrid Symmetric Gauss-Seidel  (for systems only)\n");
-      hypre_printf("       29= Nodal Gauss elimination (use for coarsest grid only)  \n");
-      hypre_printf("  -rlx_coarse  <val>       : set relaxation type for coarsest grid\n");
-      hypre_printf("  -rlx_down    <val>       : set relaxation type for down cycle\n");
-      hypre_printf("  -rlx_up      <val>       : set relaxation type for up cycle\n");
-      hypre_printf("  -cheby_order  <val> : set order (1-4) for Chebyshev poly. smoother (default is 2)\n");
-      hypre_printf("  -cheby_fraction <val> : fraction of the spectrum for Chebyshev poly. smoother (default is .3)\n");
-      hypre_printf("  -nodal  <val>            : nodal system type\n");
-      hypre_printf("       0 = Unknown approach \n");
-      hypre_printf("       1 = Frobenius norm  \n");
-      hypre_printf("       2 = Sum of Abs.value of elements  \n");
-      hypre_printf("       3 = Largest magnitude element (includes its sign)  \n");
-      hypre_printf("       4 = Inf. norm  \n");
-      hypre_printf("       5 = One norm  (note: use with block version only) \n");
-      hypre_printf("       6 = Sum of all elements in block  \n");
-      hypre_printf("  -nodal_diag <val>        :how to treat diag elements\n");
-      hypre_printf("       0 = no special treatment \n");
-      hypre_printf("       1 = make diag = neg.sum of the off_diag  \n");
-      hypre_printf("       2 = make diag = neg. of diag \n");
-      hypre_printf("  -ns <val>              : Use <val> sweeps on each level\n");
-      hypre_printf("                           (default C/F down, F/C up, F/C fine\n");
-      hypre_printf("  -ns_coarse  <val>       : set no. of sweeps for coarsest grid\n");
-      hypre_printf("  -ns_down    <val>       : set no. of sweeps for down cycle\n");
-      hypre_printf("  -ns_up      <val>       : set no. of sweeps for up cycle\n");
-      hypre_printf("\n"); 
-      hypre_printf("  -mu   <val>            : set AMG cycles (1=V, 2=W, etc.)\n"); 
-      hypre_printf("  -th   <val>            : set AMG threshold Theta = val \n");
-      hypre_printf("  -tr   <val>            : set AMG interpolation truncation factor = val \n");
-      hypre_printf("  -Pmx  <val>            : set maximal no. of elmts per row for AMG interpolation (default: 4)\n");
-      hypre_printf("  -jtr  <val>            : set truncation threshold for Jacobi interpolation = val \n");
-      hypre_printf("  -Ssw  <val>            : set S-commpkg-switch = val \n");
-      hypre_printf("  -mxrs <val>            : set AMG maximum row sum threshold for dependency weakening \n");
-      hypre_printf("  -nf <val>              : set number of functions for systems AMG\n");
-      hypre_printf("  -numsamp <val>         : set number of sample vectors for GSMG\n");
-    
-      hypre_printf("  -postinterptype <val>  : invokes <val> no. of Jacobi interpolation steps after main interpolation\n");
-      hypre_printf("\n");
-      hypre_printf("  -cgcitr <val>          : set maximal number of coarsening iterations for CGC\n");
-      hypre_printf("  -solver_type <val>     : sets solver within Hybrid solver\n");
-      hypre_printf("                         : 1  PCG  (default)\n");
-      hypre_printf("                         : 2  GMRES\n");
-      hypre_printf("                         : 3  BiCGSTAB\n");
-     
-      hypre_printf("  -w   <val>             : set Jacobi relax weight = val\n");
-      hypre_printf("  -k   <val>             : dimension Krylov space for GMRES\n");
-      hypre_printf("  -aug   <val>           : number of augmentation vectors for LGMRES (-k indicates total approx space size)\n");
-
-      hypre_printf("  -mxl  <val>            : maximum number of levels (AMG, ParaSAILS)\n");
-      hypre_printf("  -tol  <val>            : set solver convergence tolerance = val\n");
-      hypre_printf("  -atol  <val>           : set solver absolute convergence tolerance = val\n");
-      hypre_printf("  -max_iter  <val>       : set max iterations\n");
-      hypre_printf("  -mg_max_iter  <val>    : set max iterations for mg solvers\n");
-      hypre_printf("  -agg_nl  <val>         : set number of aggressive coarsening levels (default:0)\n");
-      hypre_printf("  -np  <val>             : set number of paths of length 2 for aggr. coarsening\n");
-      hypre_printf("\n");
-      hypre_printf("  -sai_th   <val>        : set ParaSAILS threshold = val \n");
-      hypre_printf("  -sai_filt <val>        : set ParaSAILS filter = val \n");
-      hypre_printf("\n");
-      hypre_printf("  -level   <val>         : set k in ILU(k) for Euclid \n");
-      hypre_printf("  -bj <val>              : enable block Jacobi ILU for Euclid \n");
-      hypre_printf("  -ilut <val>            : set drop tolerance for ILUT in Euclid\n");
-      hypre_printf("                           Note ILUT is sequential only!\n");
-      hypre_printf("  -sparseA <val>         : set drop tolerance in ILU(k) for Euclid \n");
-      hypre_printf("  -rowScale <val>        : enable row scaling in Euclid \n");
-      hypre_printf("\n");  
-      hypre_printf("  -drop_tol  <val>       : set threshold for dropping in PILUT\n");
-      hypre_printf("  -nonzeros_to_keep <val>: number of nonzeros in each row to keep\n");
-      hypre_printf("\n");  
-      hypre_printf("  -iout <val>            : set output flag\n");
-      hypre_printf("       0=no output    1=matrix stats\n"); 
-      hypre_printf("       2=cycle stats  3=matrix & cycle stats\n"); 
-      hypre_printf("\n");  
-      hypre_printf("  -dbg <val>             : set debug flag\n");
-      hypre_printf("       0=no debugging\n       1=internal timing\n       2=interpolation truncation\n       3=more detailed timing in coarsening routine\n");
-      hypre_printf("\n");
-      hypre_printf("  -print                 : print out the system\n");
-      hypre_printf("\n");
-      /* begin lobpcg */
+         hypre_printf("\n");
+         hypre_printf("  -rlx  <val>            : relaxation type\n");
+         hypre_printf("       0=Weighted Jacobi  \n");
+         hypre_printf("       1=Gauss-Seidel (very slow!)  \n");
+         hypre_printf("       3=Hybrid Gauss-Seidel  \n");
+         hypre_printf("       4=Hybrid backward Gauss-Seidel  \n");
+         hypre_printf("       6=Hybrid symmetric Gauss-Seidel  \n");
+         hypre_printf("       8= symmetric L1-Gauss-Seidel  \n");
+         hypre_printf("       13= forward L1-Gauss-Seidel  \n");
+         hypre_printf("       14= backward L1-Gauss-Seidel  \n");
+         hypre_printf("       15=CG  \n");
+         hypre_printf("       16=Chebyshev  \n");
+         hypre_printf("       17=FCF-Jacobi  \n");
+         hypre_printf("       18=L1-Jacobi (may be used with -CF) \n");
+         hypre_printf("       9=Gauss elimination (use for coarsest grid only)  \n");
+         hypre_printf("       99=Gauss elimination with pivoting (use for coarsest grid only)  \n");
+         hypre_printf("       20= Nodal Weighted Jacobi (for systems only) \n");
+         hypre_printf("       23= Nodal Hybrid Jacobi/Gauss-Seidel (for systems only) \n");
+         hypre_printf("       26= Nodal Hybrid Symmetric Gauss-Seidel  (for systems only)\n");
+         hypre_printf("       29= Nodal Gauss elimination (use for coarsest grid only)  \n");
+         hypre_printf("  -rlx_coarse  <val>       : set relaxation type for coarsest grid\n");
+         hypre_printf("  -rlx_down    <val>       : set relaxation type for down cycle\n");
+         hypre_printf("  -rlx_up      <val>       : set relaxation type for up cycle\n");
+         hypre_printf("  -cheby_order  <val> : set order (1-4) for Chebyshev poly. smoother (default is 2)\n");
+         hypre_printf("  -cheby_fraction <val> : fraction of the spectrum for Chebyshev poly. smoother (default is .3)\n");
+         hypre_printf("  -nodal  <val>            : nodal system type\n");
+         hypre_printf("       0 = Unknown approach \n");
+         hypre_printf("       1 = Frobenius norm  \n");
+         hypre_printf("       2 = Sum of Abs.value of elements  \n");
+         hypre_printf("       3 = Largest magnitude element (includes its sign)  \n");
+         hypre_printf("       4 = Inf. norm  \n");
+         hypre_printf("       5 = One norm  (note: use with block version only) \n");
+         hypre_printf("       6 = Sum of all elements in block  \n");
+         hypre_printf("  -nodal_diag <val>        :how to treat diag elements\n");
+         hypre_printf("       0 = no special treatment \n");
+         hypre_printf("       1 = make diag = neg.sum of the off_diag  \n");
+         hypre_printf("       2 = make diag = neg. of diag \n");
+         hypre_printf("  -ns <val>              : Use <val> sweeps on each level\n");
+         hypre_printf("                           (default C/F down, F/C up, F/C fine\n");
+         hypre_printf("  -ns_coarse  <val>       : set no. of sweeps for coarsest grid\n");
+         /*hypre_printf("  -ns_down    <val>       : set no. of sweeps for down cycle\n");
+           hypre_printf("  -ns_up      <val>       : set no. of sweeps for up cycle\n");*/
+         hypre_printf("\n"); 
+         hypre_printf("  -mu   <val>            : set AMG cycles (1=V, 2=W, etc.)\n"); 
+         hypre_printf("  -th   <val>            : set AMG threshold Theta = val \n");
+         hypre_printf("  -tr   <val>            : set AMG interpolation truncation factor = val \n");
+         hypre_printf("  -Pmx  <val>            : set maximal no. of elmts per row for AMG interpolation (default: 4)\n");
+         hypre_printf("  -jtr  <val>            : set truncation threshold for Jacobi interpolation = val \n");
+         hypre_printf("  -Ssw  <val>            : set S-commpkg-switch = val \n");
+         hypre_printf("  -mxrs <val>            : set AMG maximum row sum threshold for dependency weakening \n");
+         hypre_printf("  -nf <val>              : set number of functions for systems AMG\n");
+         hypre_printf("  -numsamp <val>         : set number of sample vectors for GSMG\n");
 
-      hypre_printf("LOBPCG options:\n");
-      hypre_printf("\n");
-      hypre_printf("  -lobpcg                 : run LOBPCG instead of PCG\n");
-      hypre_printf("\n");
-      hypre_printf("  -gen                    : solve generalized EVP with B = Laplacian\n");
-      hypre_printf("\n");
-      hypre_printf("  -con                    : solve constrained EVP using 'vectors.*.*'\n");
-      hypre_printf("                            as constraints (see -vout 1 below)\n");
-      hypre_printf("\n");
-      hypre_printf("  -solver none            : no HYPRE preconditioner is used\n");
-      hypre_printf("\n");
-      hypre_printf("  -itr <val>              : maximal number of LOBPCG iterations\n");
-      hypre_printf("                            (default 100);\n");
-      hypre_printf("\n");
-      hypre_printf("  -vrand <val>            : compute <val> eigenpairs using random\n");
-      hypre_printf("                            initial vectors (default 1)\n");
-      hypre_printf("\n");
-      hypre_printf("  -seed <val>             : use <val> as the seed for the random\n");
-      hypre_printf("                            number generator(default seed is based\n");
-      hypre_printf("                            on the time of the run)\n");
-      hypre_printf("\n");
-      hypre_printf("  -vfromfile              : read initial vectors from files\n");
-      hypre_printf("                            vectors.i.j where i is vector number\n");
-      hypre_printf("                            and j is processor number\n");
-      hypre_printf("\n");
-      hypre_printf("  -orthchk                : check eigenvectors for orthonormality\n");
-      hypre_printf("\n");
-      hypre_printf("  -verb <val>             : verbosity level\n");
-      hypre_printf("  -verb 0                 : no print\n");
-      hypre_printf("  -verb 1                 : print initial eigenvalues and residuals,\n");
-      hypre_printf("                            the iteration number, the number of\n");
-      hypre_printf("                            non-convergent eigenpairs and final\n");
-      hypre_printf("                            eigenvalues and residuals (default)\n");
-      hypre_printf("  -verb 2                 : print eigenvalues and residuals on each\n");
-      hypre_printf("                            iteration\n");
-      hypre_printf("\n");
-      hypre_printf("  -pcgitr <val>           : maximal number of inner PCG iterations\n");
-      hypre_printf("                            for preconditioning (default 1);\n");
-      hypre_printf("                            if <val> = 0 then the preconditioner\n");
-      hypre_printf("                            is applied directly\n");
-      hypre_printf("\n");
-      hypre_printf("  -pcgtol <val>           : residual tolerance for inner iterations\n");
-      hypre_printf("                            (default 0.01)\n");
-      hypre_printf("\n");
-      hypre_printf("  -vout <val>             : file output level\n");
-      hypre_printf("  -vout 0                 : no files created (default)\n");
-      hypre_printf("  -vout 1                 : write eigenvalues to values.txt, residuals\n");
-      hypre_printf("                            to residuals.txt and eigenvectors to \n");
-      hypre_printf("                            vectors.i.j where i is vector number\n");
-      hypre_printf("                            and j is processor number\n");
-      hypre_printf("  -vout 2                 : in addition to the above, write the\n");
-      hypre_printf("                            eigenvalues history (the matrix whose\n");
-      hypre_printf("                            i-th column contains eigenvalues at\n");
-      hypre_printf("                            (i+1)-th iteration) to val_hist.txt and\n");
-      hypre_printf("                            residuals history to res_hist.txt\n");
-      hypre_printf("\nNOTE: in this test driver LOBPCG only works with solvers 1, 2, 8, 12, 14 and 43\n");
-      hypre_printf("\ndefault solver is 1\n");
-      hypre_printf("\n");
+         hypre_printf("  -postinterptype <val>  : invokes <val> no. of Jacobi interpolation steps after main interpolation\n");
+         hypre_printf("\n");
+         hypre_printf("  -cgcitr <val>          : set maximal number of coarsening iterations for CGC\n");
+         hypre_printf("  -solver_type <val>     : sets solver within Hybrid solver\n");
+         hypre_printf("                         : 1  PCG  (default)\n");
+         hypre_printf("                         : 2  GMRES\n");
+         hypre_printf("                         : 3  BiCGSTAB\n");
+
+         hypre_printf("  -w   <val>             : set Jacobi relax weight = val\n");
+         hypre_printf("  -k   <val>             : dimension Krylov space for GMRES\n");
+         hypre_printf("  -aug   <val>           : number of augmentation vectors for LGMRES (-k indicates total approx space size)\n");
+
+         hypre_printf("  -mxl  <val>            : maximum number of levels (AMG, ParaSAILS)\n");
+         hypre_printf("  -tol  <val>            : set solver convergence tolerance = val\n");
+         hypre_printf("  -atol  <val>           : set solver absolute convergence tolerance = val\n");
+         hypre_printf("  -max_iter  <val>       : set max iterations\n");
+         hypre_printf("  -mg_max_iter  <val>    : set max iterations for mg solvers\n");
+         hypre_printf("  -agg_nl  <val>         : set number of aggressive coarsening levels (default:0)\n");
+         hypre_printf("  -np  <val>             : set number of paths of length 2 for aggr. coarsening\n");
+         hypre_printf("\n");
+         hypre_printf("  -sai_th   <val>        : set ParaSAILS threshold = val \n");
+         hypre_printf("  -sai_filt <val>        : set ParaSAILS filter = val \n");
+         hypre_printf("\n");
+         hypre_printf("  -level   <val>         : set k in ILU(k) for Euclid \n");
+         hypre_printf("  -bj <val>              : enable block Jacobi ILU for Euclid \n");
+         hypre_printf("  -ilut <val>            : set drop tolerance for ILUT in Euclid\n");
+         hypre_printf("                           Note ILUT is sequential only!\n");
+         hypre_printf("  -sparseA <val>         : set drop tolerance in ILU(k) for Euclid \n");
+         hypre_printf("  -rowScale <val>        : enable row scaling in Euclid \n");
+         hypre_printf("\n");  
+         hypre_printf("  -drop_tol  <val>       : set threshold for dropping in PILUT\n");
+         hypre_printf("  -nonzeros_to_keep <val>: number of nonzeros in each row to keep\n");
+         hypre_printf("\n");  
+         hypre_printf("  -iout <val>            : set output flag\n");
+         hypre_printf("       0=no output    1=matrix stats\n"); 
+         hypre_printf("       2=cycle stats  3=matrix & cycle stats\n"); 
+         hypre_printf("\n");  
+         hypre_printf("  -dbg <val>             : set debug flag\n");
+         hypre_printf("       0=no debugging\n       1=internal timing\n       2=interpolation truncation\n       3=more detailed timing in coarsening routine\n");
+         hypre_printf("\n");
+         hypre_printf("  -print                 : print out the system\n");
+         hypre_printf("\n");
+         /* begin lobpcg */
 
-      /* end lobpcg */
+         hypre_printf("LOBPCG options:\n");
+         hypre_printf("\n");
+         hypre_printf("  -lobpcg                 : run LOBPCG instead of PCG\n");
+         hypre_printf("\n");
+         hypre_printf("  -gen                    : solve generalized EVP with B = Laplacian\n");
+         hypre_printf("\n");
+         hypre_printf("  -con                    : solve constrained EVP using 'vectors.*.*'\n");
+         hypre_printf("                            as constraints (see -vout 1 below)\n");
+         hypre_printf("\n");
+         hypre_printf("  -solver none            : no HYPRE preconditioner is used\n");
+         hypre_printf("\n");
+         hypre_printf("  -itr <val>              : maximal number of LOBPCG iterations\n");
+         hypre_printf("                            (default 100);\n");
+         hypre_printf("\n");
+         hypre_printf("  -vrand <val>            : compute <val> eigenpairs using random\n");
+         hypre_printf("                            initial vectors (default 1)\n");
+         hypre_printf("\n");
+         hypre_printf("  -seed <val>             : use <val> as the seed for the random\n");
+         hypre_printf("                            number generator(default seed is based\n");
+         hypre_printf("                            on the time of the run)\n");
+         hypre_printf("\n");
+         hypre_printf("  -vfromfile              : read initial vectors from files\n");
+         hypre_printf("                            vectors.i.j where i is vector number\n");
+         hypre_printf("                            and j is processor number\n");
+         hypre_printf("\n");
+         hypre_printf("  -orthchk                : check eigenvectors for orthonormality\n");
+         hypre_printf("\n");
+         hypre_printf("  -verb <val>             : verbosity level\n");
+         hypre_printf("  -verb 0                 : no print\n");
+         hypre_printf("  -verb 1                 : print initial eigenvalues and residuals,\n");
+         hypre_printf("                            the iteration number, the number of\n");
+         hypre_printf("                            non-convergent eigenpairs and final\n");
+         hypre_printf("                            eigenvalues and residuals (default)\n");
+         hypre_printf("  -verb 2                 : print eigenvalues and residuals on each\n");
+         hypre_printf("                            iteration\n");
+         hypre_printf("\n");
+         hypre_printf("  -pcgitr <val>           : maximal number of inner PCG iterations\n");
+         hypre_printf("                            for preconditioning (default 1);\n");
+         hypre_printf("                            if <val> = 0 then the preconditioner\n");
+         hypre_printf("                            is applied directly\n");
+         hypre_printf("\n");
+         hypre_printf("  -pcgtol <val>           : residual tolerance for inner iterations\n");
+         hypre_printf("                            (default 0.01)\n");
+         hypre_printf("\n");
+         hypre_printf("  -vout <val>             : file output level\n");
+         hypre_printf("  -vout 0                 : no files created (default)\n");
+         hypre_printf("  -vout 1                 : write eigenvalues to values.txt, residuals\n");
+         hypre_printf("                            to residuals.txt and eigenvectors to \n");
+         hypre_printf("                            vectors.i.j where i is vector number\n");
+         hypre_printf("                            and j is processor number\n");
+         hypre_printf("  -vout 2                 : in addition to the above, write the\n");
+         hypre_printf("                            eigenvalues history (the matrix whose\n");
+         hypre_printf("                            i-th column contains eigenvalues at\n");
+         hypre_printf("                            (i+1)-th iteration) to val_hist.txt and\n");
+         hypre_printf("                            residuals history to res_hist.txt\n");
+         hypre_printf("\nNOTE: in this test driver LOBPCG only works with solvers 1, 2, 8, 12, 14 and 43\n");
+         hypre_printf("\ndefault solver is 1\n");
+         hypre_printf("\n");
 
-      hypre_printf("  -plot_grids            : print out information for plotting the grids\n");
-      hypre_printf("  -plot_file_name <val>  : file name for plotting output\n");
-      hypre_printf("\n");
-      hypre_printf("  -smtype <val>      :smooth type\n");
-      hypre_printf("  -smlv <val>        :smooth num levels\n");
-      hypre_printf("  -ov <val>          :over lap:\n");
-      hypre_printf("  -dom <val>         :domain type\n");
-      hypre_printf("  -use_ns            : use non-symm schwarz smoother\n");
-      hypre_printf("  -var <val>         : schwarz smoother variant (0-3) \n");
-      hypre_printf("  -blk_sm <val>      : same as '-smtype 6 -ov 0 -dom 1 -smlv <val>'\n");
-      hypre_printf("  -nongalerk_tol <val> <list>    : specify the NonGalerkin drop tolerance\n");
-      hypre_printf("                                   and list contains the values, where last value\n");
-      hypre_printf("                                   in list is repeated if val < num_levels in AMG\n");
-      exit(1);
+         /* end lobpcg */
+
+         hypre_printf("  -plot_grids            : print out information for plotting the grids\n");
+         hypre_printf("  -plot_file_name <val>  : file name for plotting output\n");
+         hypre_printf("\n");
+         hypre_printf("  -smtype <val>      :smooth type\n");
+         hypre_printf("  -smlv <val>        :smooth num levels\n");
+         hypre_printf("  -ov <val>          :over lap:\n");
+         hypre_printf("  -dom <val>         :domain type\n");
+         hypre_printf("  -use_ns            : use non-symm schwarz smoother\n");
+         hypre_printf("  -var <val>         : schwarz smoother variant (0-3) \n");
+         hypre_printf("  -blk_sm <val>      : same as '-smtype 6 -ov 0 -dom 1 -smlv <val>'\n");
+         hypre_printf("  -nongalerk_tol <val> <list>    : specify the NonGalerkin drop tolerance\n");
+         hypre_printf("                                   and list contains the values, where last value\n");
+         hypre_printf("                                   in list is repeated if val < num_levels in AMG\n");
+
+         /* MGR options */
+         hypre_printf("  -mgr_bsize   <val>               : set block size = val\n");      
+         hypre_printf("  -mgr_nlevels   <val>             : set number of coarsening levels = val\n");      
+         hypre_printf("  -mgr_num_reserved_nodes   <val>  : set number of reserved nodes \n");      
+         hypre_printf("                                     to be kept till the coarsest grid = val\n");      
+         hypre_printf("  -mgr_non_c_to_f   <val>          : set strategy for intermediate coarse grid \n");      
+         hypre_printf("  -mgr_non_c_to_f   0              : Allow some non Cpoints to be labeled \n");
+         hypre_printf("                                     Cpoints on intermediate grid \n");      
+         hypre_printf("  -mgr_non_c_to_f   1              : set non Cpoints strictly to Fpoints \n");            
+         hypre_printf("  -mgr_frelax_method   <val>       : set F-relaxation strategy \n");      
+         hypre_printf("  -mgr_frelax_method   0           : Use 'single-level smoother' strategy \n");
+         hypre_printf("                                     for F-relaxation \n");      
+         hypre_printf("  -mgr_frelax_method   1           : Use a 'multi-level smoother' strategy \n");  
+         hypre_printf("                                     for F-relaxation \n");      
+         /* end MGR options */
+      }
+
+      goto final;
    }
 
    /*-----------------------------------------------------------
@@ -1636,7 +1731,8 @@
    else if ( build_matrix_type == 6 )
    {
       BuildParVarDifConv(argc, argv, build_matrix_arg_index, &parcsr_A, &b);
-      /*HYPRE_ParCSRMatrixPrint(parcsr_A,"mat100");*/
+      build_rhs_type      = 6;
+      build_src_type      = 5;
    }
    else if ( build_matrix_type == 7 )
    {
@@ -2163,6 +2259,10 @@
       ierr = HYPRE_IJVectorGetObject( ij_x, &object );
       x = (HYPRE_ParVector) object;
    }
+   else if ( build_rhs_type == 6) 
+   {
+      ij_b = NULL;
+   }
 
    if ( build_src_type == 0 )
    {
@@ -2328,6 +2428,33 @@
       ierr = HYPRE_IJVectorGetObject( ij_x, &object );
       x = (HYPRE_ParVector) object;
    }
+   else if ( build_src_type == 5 )
+   {
+      if (myid == 0)
+      {
+         hypre_printf("  Initial guess is random \n");
+      }
+
+      /* Initial guess */
+      HYPRE_IJVectorCreate(hypre_MPI_COMM_WORLD, first_local_col, last_local_col, &ij_x);
+      HYPRE_IJVectorSetObjectType(ij_x, HYPRE_PARCSR);
+      HYPRE_IJVectorInitialize(ij_x);
+
+      /* For backward Euler the previous backward Euler iterate (assumed
+         random in 0 - 1 here) is usually used as the initial guess */
+      values = hypre_CTAlloc(HYPRE_Real, local_num_cols);
+      /* hypre_SeedRand(myid+2747); */
+      hypre_SeedRand(myid);
+      for (i = 0; i < local_num_cols; i++)
+      {
+         values[i] = hypre_Rand();
+      }
+      HYPRE_IJVectorSetValues(ij_x, local_num_cols, NULL, values);
+      hypre_TFree(values);
+
+      ierr = HYPRE_IJVectorGetObject( ij_x, &object );
+      x = (HYPRE_ParVector) object;
+   }
 
    hypre_EndTiming(time_index);
    hypre_PrintTiming("IJ Vector Setup", hypre_MPI_COMM_WORLD);
@@ -2359,7 +2486,14 @@
    if (print_system)
    {
       HYPRE_IJMatrixPrint(ij_A, "IJ.out.A");
-      HYPRE_IJVectorPrint(ij_b, "IJ.out.b");
+      if (ij_b)
+      {
+         HYPRE_IJVectorPrint(ij_b, "IJ.out.b");
+      }
+      else if (b)
+      {
+         HYPRE_ParVectorPrint(b, "ParVec.out.b");
+      }
       HYPRE_IJVectorPrint(ij_x, "IJ.out.x0");
 
       /* HYPRE_ParCSRMatrixPrint( parcsr_A, "new_mat.A" );*/
@@ -2497,6 +2631,9 @@
       HYPRE_BoomerAMGSetAddRelaxWt(amg_solver, add_relax_wt);
       HYPRE_BoomerAMGSetChebyOrder(amg_solver, cheby_order);
       HYPRE_BoomerAMGSetChebyFraction(amg_solver, cheby_fraction);
+      HYPRE_BoomerAMGSetChebyEigEst(amg_solver, cheby_eig_est);
+      HYPRE_BoomerAMGSetChebyVariant(amg_solver, cheby_variant);
+      HYPRE_BoomerAMGSetChebyScale(amg_solver, cheby_scale);
       HYPRE_BoomerAMGSetRelaxOrder(amg_solver, relax_order);
       HYPRE_BoomerAMGSetRelaxWt(amg_solver, relax_wt);
       HYPRE_BoomerAMGSetOuterWt(amg_solver, outer_wt);
@@ -2536,6 +2673,7 @@
       HYPRE_BoomerAMGSetAdditive(amg_solver, additive);
       HYPRE_BoomerAMGSetMultAdditive(amg_solver, mult_add);
       HYPRE_BoomerAMGSetSimple(amg_solver, simple);
+      HYPRE_BoomerAMGSetAddLastLvl(amg_solver, add_last_lvl);
       HYPRE_BoomerAMGSetMultAddPMaxElmts(amg_solver, add_P_max_elmts);
       HYPRE_BoomerAMGSetMultAddTruncFactor(amg_solver, add_trunc_factor);
 
@@ -2653,6 +2791,9 @@
       HYPRE_BoomerAMGSetAddRelaxWt(amg_solver, add_relax_wt);
       HYPRE_BoomerAMGSetChebyOrder(amg_solver, cheby_order);
       HYPRE_BoomerAMGSetChebyFraction(amg_solver, cheby_fraction);
+      HYPRE_BoomerAMGSetChebyEigEst(amg_solver, cheby_eig_est);
+      HYPRE_BoomerAMGSetChebyVariant(amg_solver, cheby_variant);
+      HYPRE_BoomerAMGSetChebyScale(amg_solver, cheby_scale);
       HYPRE_BoomerAMGSetRelaxOrder(amg_solver, relax_order);
       HYPRE_BoomerAMGSetRelaxWt(amg_solver, relax_wt);
       HYPRE_BoomerAMGSetOuterWt(amg_solver, outer_wt);
@@ -2690,6 +2831,7 @@
       HYPRE_BoomerAMGSetAdditive(amg_solver, additive);
       HYPRE_BoomerAMGSetMultAdditive(amg_solver, mult_add);
       HYPRE_BoomerAMGSetSimple(amg_solver, simple);
+      HYPRE_BoomerAMGSetAddLastLvl(amg_solver, add_last_lvl);
       HYPRE_BoomerAMGSetMultAddPMaxElmts(amg_solver, add_P_max_elmts);
       HYPRE_BoomerAMGSetMultAddTruncFactor(amg_solver, add_trunc_factor);
       HYPRE_BoomerAMGSetRAP2(amg_solver, rap2);
@@ -2760,7 +2902,7 @@
 
    /* begin lobpcg */
    if (!lobpcgFlag && (solver_id == 1 || solver_id == 2 || solver_id == 8 || 
-       solver_id == 12 || solver_id == 14 || solver_id == 43))
+       solver_id == 12 || solver_id == 14 || solver_id == 43 || solver_id == 71))
    /*end lobpcg */
    {
       time_index = hypre_InitializeTiming("PCG Setup");
@@ -2817,6 +2959,9 @@
          HYPRE_BoomerAMGSetAddRelaxWt(pcg_precond, add_relax_wt);
          HYPRE_BoomerAMGSetChebyOrder(pcg_precond, cheby_order);
          HYPRE_BoomerAMGSetChebyFraction(pcg_precond, cheby_fraction);
+         HYPRE_BoomerAMGSetChebyEigEst(pcg_precond, cheby_eig_est);
+         HYPRE_BoomerAMGSetChebyVariant(pcg_precond, cheby_variant);
+         HYPRE_BoomerAMGSetChebyScale(pcg_precond, cheby_scale);
          HYPRE_BoomerAMGSetRelaxOrder(pcg_precond, relax_order);
          HYPRE_BoomerAMGSetRelaxWt(pcg_precond, relax_wt);
          HYPRE_BoomerAMGSetOuterWt(pcg_precond, outer_wt);
@@ -2855,6 +3000,7 @@
          HYPRE_BoomerAMGSetAdditive(pcg_precond, additive);
          HYPRE_BoomerAMGSetMultAdditive(pcg_precond, mult_add);
          HYPRE_BoomerAMGSetSimple(pcg_precond, simple);
+         HYPRE_BoomerAMGSetAddLastLvl(pcg_precond, add_last_lvl);
          HYPRE_BoomerAMGSetMultAddPMaxElmts(pcg_precond, add_P_max_elmts);
          HYPRE_BoomerAMGSetMultAddTruncFactor(pcg_precond, add_trunc_factor);
          HYPRE_BoomerAMGSetRAP2(pcg_precond, rap2);
@@ -2972,6 +3118,9 @@
          HYPRE_BoomerAMGSetRelaxOrder(pcg_precond, relax_order);
          HYPRE_BoomerAMGSetChebyOrder(pcg_precond, cheby_order);
          HYPRE_BoomerAMGSetChebyFraction(pcg_precond, cheby_fraction);
+         HYPRE_BoomerAMGSetChebyEigEst(pcg_precond, cheby_eig_est);
+         HYPRE_BoomerAMGSetChebyVariant(pcg_precond, cheby_variant);
+         HYPRE_BoomerAMGSetChebyScale(pcg_precond, cheby_scale);
          HYPRE_BoomerAMGSetRelaxWt(pcg_precond, relax_wt);
          HYPRE_BoomerAMGSetOuterWt(pcg_precond, outer_wt);
          if (level_w > -1)
@@ -3008,6 +3157,7 @@
          HYPRE_BoomerAMGSetAdditive(pcg_precond, additive);
          HYPRE_BoomerAMGSetMultAdditive(pcg_precond, mult_add);
          HYPRE_BoomerAMGSetSimple(pcg_precond, simple);
+         HYPRE_BoomerAMGSetAddLastLvl(pcg_precond, add_last_lvl);
          HYPRE_BoomerAMGSetMultAddPMaxElmts(pcg_precond, add_P_max_elmts);
          HYPRE_BoomerAMGSetMultAddTruncFactor(pcg_precond, add_trunc_factor);
          HYPRE_BoomerAMGSetRAP2(pcg_precond, rap2);
@@ -3051,6 +3201,91 @@
                              (HYPRE_PtrToSolverFcn) HYPRE_EuclidSetup,
                              pcg_precond);
       }
+      else if( solver_id == 71 )
+      {
+         /* use MGR preconditioning */
+         if (myid == 0) hypre_printf("Solver:  MGR-PCG\n");
+      
+         HYPRE_MGRCreate(&pcg_precond);
+
+         mgr_num_cindexes = hypre_CTAlloc(HYPRE_Int, mgr_nlevels);
+         for(i=0; i<mgr_nlevels; i++)
+         { /* assume 1 coarse index per level */
+            mgr_num_cindexes[i] = 1;
+         }
+         mgr_cindexes = hypre_CTAlloc(HYPRE_Int*, mgr_nlevels);
+         for(i=0; i<mgr_nlevels; i++)
+         {
+            mgr_cindexes[i] = hypre_CTAlloc(HYPRE_Int, mgr_num_cindexes[i]);
+         }
+         for(i=0; i<mgr_nlevels; i++)
+         { /* assume coarse point is at index 0 */
+            mgr_cindexes[i][0] = 0;
+         }
+         mgr_reserved_coarse_indexes = hypre_CTAlloc(HYPRE_Int, mgr_num_reserved_nodes);
+         for(i=0; i<mgr_num_reserved_nodes; i++)
+         { /* generate artificial reserved nodes */
+            mgr_reserved_coarse_indexes[i] = last_local_row-i;//2*i+1;
+         }
+      
+         /* set MGR data by block */
+         HYPRE_MGRSetCpointsByBlock( pcg_precond, mgr_bsize, mgr_nlevels, mgr_num_cindexes,mgr_cindexes);
+         /* set reserved coarse nodes */
+         if(mgr_num_reserved_nodes)HYPRE_MGRSetReservedCoarseNodes(pcg_precond, mgr_num_reserved_nodes, mgr_reserved_coarse_indexes);
+      
+         /* set intermediate coarse grid strategy */
+         HYPRE_MGRSetNonCpointsToFpoints(pcg_precond, mgr_non_c_to_f);
+         /* set F relaxation strategy */
+         HYPRE_MGRSetFRelaxMethod(pcg_precond, mgr_frelax_method);
+         /* set relax type for single level F-relaxation and post-relaxation */
+         HYPRE_MGRSetRelaxType(pcg_precond, 0);
+         HYPRE_MGRSetNumRelaxSweeps(pcg_precond, 2);
+         /* set interpolation type */
+         HYPRE_MGRSetInterpType(pcg_precond, 2);
+         HYPRE_MGRSetNumInterpSweeps(pcg_precond, 2);
+         /* set print level */
+         HYPRE_MGRSetPrintLevel(pcg_precond, 1);
+         /* set max iterations */
+         HYPRE_MGRSetMaxIter(pcg_precond, 1);
+         HYPRE_MGRSetTol(pcg_precond, pc_tol);
+      
+         /* create AMG coarse grid solver */
+      
+         HYPRE_BoomerAMGCreate(&amg_solver); 
+         /* BM Aug 25, 2006 */
+         HYPRE_BoomerAMGSetCGCIts(amg_solver, cgcits);
+         HYPRE_BoomerAMGSetInterpType(amg_solver, 0);
+         HYPRE_BoomerAMGSetPostInterpType(amg_solver, post_interp_type);
+         HYPRE_BoomerAMGSetCoarsenType(amg_solver, 6);
+         HYPRE_BoomerAMGSetPMaxElmts(amg_solver, 0);
+         /* note: log is written to standard output, not to file */
+         HYPRE_BoomerAMGSetPrintLevel(amg_solver, 1);
+         HYPRE_BoomerAMGSetCycleType(amg_solver, cycle_type);
+         HYPRE_BoomerAMGSetNumSweeps(amg_solver, num_sweeps);
+         HYPRE_BoomerAMGSetRelaxType(amg_solver, 3);
+         if (relax_down > -1)
+            HYPRE_BoomerAMGSetCycleRelaxType(amg_solver, relax_down, 1);
+         if (relax_up > -1)
+            HYPRE_BoomerAMGSetCycleRelaxType(amg_solver, relax_up, 2);
+         if (relax_coarse > -1)
+            HYPRE_BoomerAMGSetCycleRelaxType(amg_solver, relax_coarse, 3);
+         HYPRE_BoomerAMGSetRelaxOrder(amg_solver, 1);
+         HYPRE_BoomerAMGSetMaxLevels(amg_solver, max_levels);
+         HYPRE_BoomerAMGSetSmoothType(amg_solver, smooth_type);
+         HYPRE_BoomerAMGSetSmoothNumSweeps(amg_solver, smooth_num_sweeps);
+         HYPRE_BoomerAMGSetMaxIter(amg_solver, 1);
+         HYPRE_BoomerAMGSetTol(amg_solver, 0.0);
+
+         /* set the MGR coarse solver. Comment out to use default CG solver in MGR */
+         HYPRE_MGRSetCoarseSolver( pcg_precond, HYPRE_BoomerAMGSolve, HYPRE_BoomerAMGSetup, amg_solver);
+      
+         /* setup MGR-PCG solver */         
+         HYPRE_PCGSetPrecond(pcg_solver,
+                             (HYPRE_PtrToSolverFcn) HYPRE_MGRSolve,
+                             (HYPRE_PtrToSolverFcn) HYPRE_MGRSetup,
+                             pcg_precond);
+      
+      }
  
       HYPRE_PCGGetPrecond(pcg_solver, &pcg_precond_gotten);
       if (pcg_precond_gotten !=  pcg_precond)
@@ -3064,7 +3299,7 @@
 
       HYPRE_PCGSetup(pcg_solver, (HYPRE_Matrix)parcsr_A, 
                      (HYPRE_Vector)b, (HYPRE_Vector)x);
- 
+
       hypre_EndTiming(time_index);
       hypre_PrintTiming("Setup phase times", hypre_MPI_COMM_WORLD);
       hypre_FinalizeTiming(time_index);
@@ -3115,6 +3350,31 @@
       {
          HYPRE_EuclidDestroy(pcg_precond);
       }
+      else if(solver_id == 71)
+      {     
+         /* free memory */
+         if(mgr_num_cindexes)
+            hypre_TFree(mgr_num_cindexes);
+         mgr_num_cindexes = NULL; 
+      
+         if(mgr_reserved_coarse_indexes)
+            hypre_TFree(mgr_reserved_coarse_indexes);
+         mgr_reserved_coarse_indexes = NULL; 
+      
+         if(mgr_cindexes)
+         {
+            for( i=0; i<mgr_nlevels; i++)
+            {
+            if(mgr_cindexes[i])
+               hypre_TFree(mgr_cindexes[i]);
+            }
+            hypre_TFree(mgr_cindexes);
+            mgr_cindexes = NULL;
+         }
+
+         HYPRE_BoomerAMGDestroy(amg_solver);
+         HYPRE_MGRDestroy(pcg_precond);
+      }
 
       if (myid == 0)
       {
@@ -4060,6 +4320,9 @@
          HYPRE_BoomerAMGSetAddRelaxWt(pcg_precond, add_relax_wt);
          HYPRE_BoomerAMGSetChebyOrder(pcg_precond, cheby_order);
          HYPRE_BoomerAMGSetChebyFraction(pcg_precond, cheby_fraction);
+         HYPRE_BoomerAMGSetChebyEigEst(pcg_precond, cheby_eig_est);
+         HYPRE_BoomerAMGSetChebyVariant(pcg_precond, cheby_variant);
+         HYPRE_BoomerAMGSetChebyScale(pcg_precond, cheby_scale);
          HYPRE_BoomerAMGSetRelaxOrder(pcg_precond, relax_order);
          HYPRE_BoomerAMGSetRelaxWt(pcg_precond, relax_wt);
          HYPRE_BoomerAMGSetOuterWt(pcg_precond, outer_wt);
@@ -4098,6 +4361,7 @@
          HYPRE_BoomerAMGSetAdditive(pcg_precond, additive);
          HYPRE_BoomerAMGSetMultAdditive(pcg_precond, mult_add);
          HYPRE_BoomerAMGSetSimple(pcg_precond, simple);
+         HYPRE_BoomerAMGSetAddLastLvl(pcg_precond, add_last_lvl);
          HYPRE_BoomerAMGSetMultAddPMaxElmts(pcg_precond, add_P_max_elmts);
          HYPRE_BoomerAMGSetMultAddTruncFactor(pcg_precond, add_trunc_factor);
          HYPRE_BoomerAMGSetRAP2(pcg_precond, rap2);
@@ -4203,6 +4467,9 @@
          HYPRE_BoomerAMGSetAddRelaxWt(pcg_precond, add_relax_wt);
          HYPRE_BoomerAMGSetChebyOrder(pcg_precond, cheby_order);
          HYPRE_BoomerAMGSetChebyFraction(pcg_precond, cheby_fraction);
+         HYPRE_BoomerAMGSetChebyEigEst(pcg_precond, cheby_eig_est);
+         HYPRE_BoomerAMGSetChebyVariant(pcg_precond, cheby_variant);
+         HYPRE_BoomerAMGSetChebyScale(pcg_precond, cheby_scale);
          HYPRE_BoomerAMGSetRelaxOrder(pcg_precond, relax_order);
          HYPRE_BoomerAMGSetRelaxWt(pcg_precond, relax_wt);
          HYPRE_BoomerAMGSetOuterWt(pcg_precond, outer_wt);
@@ -4241,6 +4508,7 @@
          HYPRE_BoomerAMGSetAdditive(pcg_precond, additive);
          HYPRE_BoomerAMGSetMultAdditive(pcg_precond, mult_add);
          HYPRE_BoomerAMGSetSimple(pcg_precond, simple);
+         HYPRE_BoomerAMGSetAddLastLvl(pcg_precond, add_last_lvl);
          HYPRE_BoomerAMGSetMultAddPMaxElmts(pcg_precond, add_P_max_elmts);
          HYPRE_BoomerAMGSetMultAddTruncFactor(pcg_precond, add_trunc_factor);
          HYPRE_BoomerAMGSetRAP2(pcg_precond, rap2);
@@ -4423,6 +4691,9 @@
          HYPRE_BoomerAMGSetAddRelaxWt(pcg_precond, add_relax_wt);
          HYPRE_BoomerAMGSetChebyOrder(pcg_precond, cheby_order);
          HYPRE_BoomerAMGSetChebyFraction(pcg_precond, cheby_fraction);
+         HYPRE_BoomerAMGSetChebyEigEst(pcg_precond, cheby_eig_est);
+         HYPRE_BoomerAMGSetChebyVariant(pcg_precond, cheby_variant);
+         HYPRE_BoomerAMGSetChebyScale(pcg_precond, cheby_scale);
          HYPRE_BoomerAMGSetRelaxOrder(pcg_precond, relax_order);
          HYPRE_BoomerAMGSetRelaxWt(pcg_precond, relax_wt);
          HYPRE_BoomerAMGSetOuterWt(pcg_precond, outer_wt);
@@ -4461,6 +4732,7 @@
          HYPRE_BoomerAMGSetAdditive(pcg_precond, additive);
          HYPRE_BoomerAMGSetMultAdditive(pcg_precond, mult_add);
          HYPRE_BoomerAMGSetSimple(pcg_precond, simple);
+         HYPRE_BoomerAMGSetAddLastLvl(pcg_precond, add_last_lvl);
          HYPRE_BoomerAMGSetMultAddPMaxElmts(pcg_precond, add_P_max_elmts);
          HYPRE_BoomerAMGSetMultAddTruncFactor(pcg_precond, add_trunc_factor);
          HYPRE_BoomerAMGSetRAP2(pcg_precond, rap2);
@@ -4540,7 +4812,7 @@
     * Solve the system using FlexGMRES 
     *-----------------------------------------------------------*/
 
-   if (solver_id == 60 || solver_id == 61 )
+   if (solver_id == 60 || solver_id == 61 || solver_id == 72)
    {
       time_index = hypre_InitializeTiming("FlexGMRES Setup");
       hypre_BeginTiming(time_index);
@@ -4596,6 +4868,9 @@
          HYPRE_BoomerAMGSetAddRelaxWt(pcg_precond, add_relax_wt);
          HYPRE_BoomerAMGSetChebyOrder(pcg_precond, cheby_order);
          HYPRE_BoomerAMGSetChebyFraction(pcg_precond, cheby_fraction);
+         HYPRE_BoomerAMGSetChebyEigEst(pcg_precond, cheby_eig_est);
+         HYPRE_BoomerAMGSetChebyVariant(pcg_precond, cheby_variant);
+         HYPRE_BoomerAMGSetChebyScale(pcg_precond, cheby_scale);
          HYPRE_BoomerAMGSetRelaxOrder(pcg_precond, relax_order);
          HYPRE_BoomerAMGSetRelaxWt(pcg_precond, relax_wt);
          HYPRE_BoomerAMGSetOuterWt(pcg_precond, outer_wt);
@@ -4634,6 +4909,7 @@
          HYPRE_BoomerAMGSetAdditive(pcg_precond, additive);
          HYPRE_BoomerAMGSetMultAdditive(pcg_precond, mult_add);
          HYPRE_BoomerAMGSetSimple(pcg_precond, simple);
+         HYPRE_BoomerAMGSetAddLastLvl(pcg_precond, add_last_lvl);
          HYPRE_BoomerAMGSetMultAddPMaxElmts(pcg_precond, add_P_max_elmts);
          HYPRE_BoomerAMGSetMultAddTruncFactor(pcg_precond, add_trunc_factor);
          HYPRE_BoomerAMGSetRAP2(pcg_precond, rap2);
@@ -4650,6 +4926,91 @@
                                    (HYPRE_PtrToSolverFcn) HYPRE_BoomerAMGSetup,
                                    pcg_precond);
       }
+      else if( solver_id == 72 )
+      {
+         /* use MGR preconditioning */
+         if (myid == 0) hypre_printf("Solver:  MGR-FlexGMRES\n");
+      
+         HYPRE_MGRCreate(&pcg_precond);
+
+         mgr_num_cindexes = hypre_CTAlloc(HYPRE_Int, mgr_nlevels);
+         for(i=0; i<mgr_nlevels; i++)
+         { /* assume 1 coarse index per level */
+            mgr_num_cindexes[i] = 1;
+         }
+         mgr_cindexes = hypre_CTAlloc(HYPRE_Int*, mgr_nlevels);
+         for(i=0; i<mgr_nlevels; i++)
+         {
+            mgr_cindexes[i] = hypre_CTAlloc(HYPRE_Int, mgr_num_cindexes[i]);
+         }
+         for(i=0; i<mgr_nlevels; i++)
+         { /* assume coarse point is at index 0 */
+            mgr_cindexes[i][0] = 0;
+         }
+         mgr_reserved_coarse_indexes = hypre_CTAlloc(HYPRE_Int, mgr_num_reserved_nodes);
+         for(i=0; i<mgr_num_reserved_nodes; i++)
+         { /* generate artificial reserved nodes */
+            mgr_reserved_coarse_indexes[i] = last_local_row-i;//2*i+1;
+         }
+      
+         /* set MGR data by block */
+         HYPRE_MGRSetCpointsByBlock( pcg_precond, mgr_bsize, mgr_nlevels, mgr_num_cindexes,mgr_cindexes);
+         /* set reserved coarse nodes */
+         if(mgr_num_reserved_nodes)HYPRE_MGRSetReservedCoarseNodes(pcg_precond, mgr_num_reserved_nodes, mgr_reserved_coarse_indexes);
+      
+         /* set intermediate coarse grid strategy */
+         HYPRE_MGRSetNonCpointsToFpoints(pcg_precond, mgr_non_c_to_f);
+         /* set F relaxation strategy */
+         HYPRE_MGRSetFRelaxMethod(pcg_precond, mgr_frelax_method);
+         /* set relax type for single level F-relaxation and post-relaxation */
+         HYPRE_MGRSetRelaxType(pcg_precond, 0);
+         HYPRE_MGRSetNumRelaxSweeps(pcg_precond, 2);
+         /* set interpolation type */
+         HYPRE_MGRSetInterpType(pcg_precond, 2);
+         HYPRE_MGRSetNumInterpSweeps(pcg_precond, 2);
+         /* set print level */
+         HYPRE_MGRSetPrintLevel(pcg_precond, 1);
+         /* set max iterations */
+         HYPRE_MGRSetMaxIter(pcg_precond, 1);
+         HYPRE_MGRSetTol(pcg_precond, pc_tol);
+      
+         /* create AMG coarse grid solver */
+      
+         HYPRE_BoomerAMGCreate(&amg_solver); 
+         /* BM Aug 25, 2006 */
+         HYPRE_BoomerAMGSetCGCIts(amg_solver, cgcits);
+         HYPRE_BoomerAMGSetInterpType(amg_solver, 0);
+         HYPRE_BoomerAMGSetPostInterpType(amg_solver, post_interp_type);
+         HYPRE_BoomerAMGSetCoarsenType(amg_solver, 6);
+         HYPRE_BoomerAMGSetPMaxElmts(amg_solver, 0);
+         /* note: log is written to standard output, not to file */
+         HYPRE_BoomerAMGSetPrintLevel(amg_solver, 1);
+         HYPRE_BoomerAMGSetCycleType(amg_solver, cycle_type);
+         HYPRE_BoomerAMGSetNumSweeps(amg_solver, num_sweeps);
+         HYPRE_BoomerAMGSetRelaxType(amg_solver, 3);
+         if (relax_down > -1)
+            HYPRE_BoomerAMGSetCycleRelaxType(amg_solver, relax_down, 1);
+         if (relax_up > -1)
+            HYPRE_BoomerAMGSetCycleRelaxType(amg_solver, relax_up, 2);
+         if (relax_coarse > -1)
+            HYPRE_BoomerAMGSetCycleRelaxType(amg_solver, relax_coarse, 3);
+         HYPRE_BoomerAMGSetRelaxOrder(amg_solver, 1);
+         HYPRE_BoomerAMGSetMaxLevels(amg_solver, max_levels);
+         HYPRE_BoomerAMGSetSmoothType(amg_solver, smooth_type);
+         HYPRE_BoomerAMGSetSmoothNumSweeps(amg_solver, smooth_num_sweeps);
+         HYPRE_BoomerAMGSetMaxIter(amg_solver, 1);
+         HYPRE_BoomerAMGSetTol(amg_solver, 0.0);
+
+         /* set the MGR coarse solver. Comment out to use default CG solver in MGR */
+         HYPRE_MGRSetCoarseSolver( pcg_precond, HYPRE_BoomerAMGSolve, HYPRE_BoomerAMGSetup, amg_solver);
+      
+         /* setup MGR-PCG solver */
+	 HYPRE_FlexGMRESSetMaxIter(pcg_solver, mg_max_iter);
+         HYPRE_FlexGMRESSetPrecond(pcg_solver,
+				   (HYPRE_PtrToSolverFcn) HYPRE_MGRSolve,
+				   (HYPRE_PtrToSolverFcn) HYPRE_MGRSetup,
+                                   pcg_precond);
+      }
       else if (solver_id == 60)
       {
          /* use diagonal scaling as preconditioner */
@@ -4774,6 +5135,9 @@
          HYPRE_BoomerAMGSetAddRelaxWt(pcg_precond, add_relax_wt);
          HYPRE_BoomerAMGSetChebyOrder(pcg_precond, cheby_order);
          HYPRE_BoomerAMGSetChebyFraction(pcg_precond, cheby_fraction);
+         HYPRE_BoomerAMGSetChebyEigEst(pcg_precond, cheby_eig_est);
+         HYPRE_BoomerAMGSetChebyVariant(pcg_precond, cheby_variant);
+         HYPRE_BoomerAMGSetChebyScale(pcg_precond, cheby_scale);
          HYPRE_BoomerAMGSetRelaxOrder(pcg_precond, relax_order);
          HYPRE_BoomerAMGSetRelaxWt(pcg_precond, relax_wt);
          HYPRE_BoomerAMGSetOuterWt(pcg_precond, outer_wt);
@@ -4813,6 +5177,7 @@
          HYPRE_BoomerAMGSetAdditive(pcg_precond, additive);
          HYPRE_BoomerAMGSetMultAdditive(pcg_precond, mult_add);
          HYPRE_BoomerAMGSetSimple(pcg_precond, simple);
+         HYPRE_BoomerAMGSetAddLastLvl(pcg_precond, add_last_lvl);
          HYPRE_BoomerAMGSetMultAddPMaxElmts(pcg_precond, add_P_max_elmts);
          HYPRE_BoomerAMGSetMultAddTruncFactor(pcg_precond, add_trunc_factor);
          HYPRE_BoomerAMGSetRAP2(pcg_precond, rap2);
@@ -4996,6 +5361,9 @@
          HYPRE_BoomerAMGSetAddRelaxWt(pcg_precond, add_relax_wt);
          HYPRE_BoomerAMGSetChebyOrder(pcg_precond, cheby_order);
          HYPRE_BoomerAMGSetChebyFraction(pcg_precond, cheby_fraction);
+         HYPRE_BoomerAMGSetChebyEigEst(pcg_precond, cheby_eig_est);
+         HYPRE_BoomerAMGSetChebyVariant(pcg_precond, cheby_variant);
+         HYPRE_BoomerAMGSetChebyScale(pcg_precond, cheby_scale);
          HYPRE_BoomerAMGSetRelaxOrder(pcg_precond, relax_order);
          HYPRE_BoomerAMGSetRelaxWt(pcg_precond, relax_wt);
          HYPRE_BoomerAMGSetOuterWt(pcg_precond, outer_wt);
@@ -5027,6 +5395,7 @@
          HYPRE_BoomerAMGSetAdditive(pcg_precond, additive);
          HYPRE_BoomerAMGSetMultAdditive(pcg_precond, mult_add);
          HYPRE_BoomerAMGSetSimple(pcg_precond, simple);
+         HYPRE_BoomerAMGSetAddLastLvl(pcg_precond, add_last_lvl);
          HYPRE_BoomerAMGSetMultAddPMaxElmts(pcg_precond, add_P_max_elmts);
          HYPRE_BoomerAMGSetMultAddTruncFactor(pcg_precond, add_trunc_factor);
          HYPRE_BoomerAMGSetRAP2(pcg_precond, rap2);
@@ -5113,12 +5482,164 @@
    }
 
    /*-----------------------------------------------------------
+    * Solve the system using MGR
+    *-----------------------------------------------------------*/   
+
+   if (solver_id == 70)
+   {
+      if (myid == 0) hypre_printf("Solver:  MGR\n");
+      time_index = hypre_InitializeTiming("MGR Setup");
+      hypre_BeginTiming(time_index);
+      
+      HYPRE_Solver mgr_solver;
+      HYPRE_MGRCreate(&mgr_solver);
+
+      mgr_num_cindexes = hypre_CTAlloc(HYPRE_Int, mgr_nlevels);
+      for(i=0; i<mgr_nlevels; i++)
+      { /* assume 1 coarse index per level */
+         mgr_num_cindexes[i] = 1;
+      }
+      mgr_cindexes = hypre_CTAlloc(HYPRE_Int*, mgr_nlevels);
+      for(i=0; i<mgr_nlevels; i++)
+      {
+         mgr_cindexes[i] = hypre_CTAlloc(HYPRE_Int, mgr_num_cindexes[i]);
+      }
+      for(i=0; i<mgr_nlevels; i++)
+      { /* assume coarse point is at index 0 */
+         mgr_cindexes[i][0] = 0;
+      }
+      mgr_reserved_coarse_indexes = hypre_CTAlloc(HYPRE_Int, mgr_num_reserved_nodes);
+      for(i=0; i<mgr_num_reserved_nodes; i++)
+      { /* generate artificial reserved nodes */
+         mgr_reserved_coarse_indexes[i] = last_local_row-i;//2*i+1;
+      }
+      
+      /* set MGR data by block */
+      HYPRE_MGRSetCpointsByBlock( mgr_solver, mgr_bsize, mgr_nlevels, mgr_num_cindexes,mgr_cindexes);
+      /* set reserved coarse nodes */
+      if(mgr_num_reserved_nodes)HYPRE_MGRSetReservedCoarseNodes(mgr_solver, mgr_num_reserved_nodes, mgr_reserved_coarse_indexes);
+      
+      /* set intermediate coarse grid strategy */
+      HYPRE_MGRSetNonCpointsToFpoints(mgr_solver, mgr_non_c_to_f);
+      /* set F relaxation strategy */
+      HYPRE_MGRSetFRelaxMethod(mgr_solver, mgr_frelax_method);
+      /* set relax type for single level F-relaxation and post-relaxation */
+      HYPRE_MGRSetRelaxType(mgr_solver, mgr_relax_type);
+      HYPRE_MGRSetNumRelaxSweeps(mgr_solver, mgr_num_relax_sweeps);
+      /* set interpolation type */
+      HYPRE_MGRSetInterpType(mgr_solver, mgr_interp_type);
+      HYPRE_MGRSetNumInterpSweeps(mgr_solver, mgr_num_interp_sweeps);
+      /* set print level */
+      HYPRE_MGRSetPrintLevel(mgr_solver, 3);
+      /* set max iterations */
+      HYPRE_MGRSetMaxIter(mgr_solver, max_iter);
+      HYPRE_MGRSetTol(mgr_solver, tol);
+      
+      /* create AMG coarse grid solver */
+      
+      HYPRE_BoomerAMGCreate(&amg_solver); 
+      /* BM Aug 25, 2006 */
+      HYPRE_BoomerAMGSetCGCIts(amg_solver, cgcits);
+      HYPRE_BoomerAMGSetInterpType(amg_solver, 0);
+      HYPRE_BoomerAMGSetPostInterpType(amg_solver, post_interp_type);
+      HYPRE_BoomerAMGSetCoarsenType(amg_solver, 6);
+      HYPRE_BoomerAMGSetTol(amg_solver, tol);
+      HYPRE_BoomerAMGSetPMaxElmts(amg_solver, 0);
+      HYPRE_BoomerAMGSetCycleType(amg_solver, cycle_type);
+      HYPRE_BoomerAMGSetNumSweeps(amg_solver, num_sweeps);
+      HYPRE_BoomerAMGSetRelaxType(amg_solver, 3);
+      if (relax_down > -1)
+         HYPRE_BoomerAMGSetCycleRelaxType(amg_solver, relax_down, 1);
+      if (relax_up > -1)
+         HYPRE_BoomerAMGSetCycleRelaxType(amg_solver, relax_up, 2);
+      if (relax_coarse > -1)
+         HYPRE_BoomerAMGSetCycleRelaxType(amg_solver, relax_coarse, 3);
+      HYPRE_BoomerAMGSetRelaxOrder(amg_solver, 1);
+      HYPRE_BoomerAMGSetMaxLevels(amg_solver, max_levels);
+      HYPRE_BoomerAMGSetSmoothType(amg_solver, smooth_type);
+      HYPRE_BoomerAMGSetSmoothNumSweeps(amg_solver, smooth_num_sweeps);
+      if(mgr_nlevels < 1 || mgr_bsize < 2)
+      {
+         HYPRE_BoomerAMGSetMaxIter(amg_solver, max_iter);
+         HYPRE_BoomerAMGSetPrintLevel(amg_solver, 3);
+      }
+      else
+      {
+         HYPRE_BoomerAMGSetMaxIter(amg_solver, 1);
+         HYPRE_BoomerAMGSetTol(amg_solver, 0.0);
+         HYPRE_BoomerAMGSetPrintLevel(amg_solver, 1);
+      }
+      /* set the MGR coarse solver. Comment out to use default CG solver in MGR */
+      HYPRE_MGRSetCoarseSolver( mgr_solver, HYPRE_BoomerAMGSolve, HYPRE_BoomerAMGSetup, amg_solver);
+      
+      /* setup MGR solver */
+      HYPRE_MGRSetup(mgr_solver, parcsr_A, b, x);
+
+      hypre_EndTiming(time_index);
+      hypre_PrintTiming("Setup phase times", hypre_MPI_COMM_WORLD);
+      hypre_FinalizeTiming(time_index);
+      hypre_ClearTiming();
+ 
+      time_index = hypre_InitializeTiming("MGR Solve");
+      hypre_BeginTiming(time_index);
+      
+      /* MGR solve */
+      HYPRE_MGRSolve(mgr_solver, parcsr_A, b, x);
+
+      hypre_EndTiming(time_index);
+      hypre_PrintTiming("Solve phase times", hypre_MPI_COMM_WORLD);
+      hypre_FinalizeTiming(time_index);
+      hypre_ClearTiming();
+
+      HYPRE_MGRGetNumIterations(mgr_solver, &num_iterations);
+      HYPRE_MGRGetFinalRelativeResidualNorm(mgr_solver, &final_res_norm);
+
+      if (myid == 0)
+      {
+         hypre_printf("\n");
+         hypre_printf("MGR Iterations = %d\n", num_iterations);
+         hypre_printf("Final Relative Residual Norm = %e\n", final_res_norm);
+         hypre_printf("\n");
+      }
+
+#if SECOND_TIME
+      /* run a second time to check for memory leaks */
+      HYPRE_ParVectorSetRandomValues(x, 775);
+      HYPRE_MGRSetup(mgr_solver, parcsr_A, b, x);
+      HYPRE_MGRSolve(mgr_solver, parcsr_A, b, x);
+#endif
+      
+      /* free memory */
+      if(mgr_num_cindexes)
+         hypre_TFree(mgr_num_cindexes);
+      mgr_num_cindexes = NULL; 
+      
+      if(mgr_reserved_coarse_indexes)
+         hypre_TFree(mgr_reserved_coarse_indexes);
+      mgr_reserved_coarse_indexes = NULL; 
+      
+      if(mgr_cindexes)
+      {
+         for( i=0; i<mgr_nlevels; i++)
+         {
+            if(mgr_cindexes[i])
+               hypre_TFree(mgr_cindexes[i]);
+         }
+         hypre_TFree(mgr_cindexes);
+         mgr_cindexes = NULL;
+      }
+
+      HYPRE_BoomerAMGDestroy(amg_solver);      
+      HYPRE_MGRDestroy(mgr_solver);
+   }
+
+   /*-----------------------------------------------------------
     * Print the solution and other info
     *-----------------------------------------------------------*/
 
    /* RDF: Why is this here? */
-   if (!(build_rhs_type ==1 || build_rhs_type ==7))
-      HYPRE_IJVectorGetObjectType(ij_b, &j);
+   /*if (!(build_rhs_type ==1 || build_rhs_type ==7))
+      HYPRE_IJVectorGetObjectType(ij_b, &j);*/
 
    if (print_system)
    {
@@ -5133,7 +5654,7 @@
    else HYPRE_ParCSRMatrixDestroy(parcsr_A);
 
    /* for build_rhs_type = 1 or 7, we did not create ij_b  - just b*/
-   if (build_rhs_type ==1 || build_rhs_type ==7)
+   if (build_rhs_type ==1 || build_rhs_type ==7 || build_rhs_type==6)
       HYPRE_ParVectorDestroy(b);
    else
       HYPRE_IJVectorDestroy(ij_b);
@@ -5152,7 +5673,9 @@
 /*
   hypre_FinalizeMemoryDebug();
 */
+final: 
 
+   hypre_GPUFinalize();
    hypre_MPI_Finalize();
 
    return (0);
@@ -5762,6 +6285,17 @@
 }
 
 /*----------------------------------------------------------------------
+ * returns the sign of a real number
+ *  1 : positive
+ *  0 : zero
+ * -1 : negative
+ *----------------------------------------------------------------------*/
+static inline HYPRE_Int sign_double(HYPRE_Real a)
+{
+   return ( (0.0 < a) - (0.0 > a) );
+}
+
+/*----------------------------------------------------------------------
  * Build standard 7-point convection-diffusion operator 
  * Parameters given in command line.
  * Operator:
@@ -5774,18 +6308,19 @@
 BuildParDifConv( HYPRE_Int                  argc,
                  char                *argv[],
                  HYPRE_Int                  arg_index,
-                 HYPRE_ParCSRMatrix  *A_ptr     )
+                 HYPRE_ParCSRMatrix  *A_ptr)
 {
-   HYPRE_Int                 nx, ny, nz;
-   HYPRE_Int                 P, Q, R;
+   HYPRE_Int           nx, ny, nz;
+   HYPRE_Int           P, Q, R;
    HYPRE_Real          cx, cy, cz;
-   HYPRE_Real          ax, ay, az;
+   HYPRE_Real          ax, ay, az, atype;
    HYPRE_Real          hinx,hiny,hinz;
+   HYPRE_Int           sign_prod;
 
    HYPRE_ParCSRMatrix  A;
 
-   HYPRE_Int                 num_procs, myid;
-   HYPRE_Int                 p, q, r;
+   HYPRE_Int           num_procs, myid;
+   HYPRE_Int           p, q, r;
    HYPRE_Real         *values;
 
    /*-----------------------------------------------------------
@@ -5815,6 +6350,8 @@
    ay = 1.;
    az = 1.;
 
+   atype = 0;
+
    /*-----------------------------------------------------------
     * Parse command line
     *-----------------------------------------------------------*/
@@ -5849,6 +6386,11 @@
          ay = atof(argv[arg_index++]);
          az = atof(argv[arg_index++]);
       }
+      else if ( strcmp(argv[arg_index], "-atype") == 0 )
+      {
+         arg_index++;
+         atype = atoi(argv[arg_index++]);
+      }
       else
       {
          arg_index++;
@@ -5895,28 +6437,146 @@
    /*-----------------------------------------------------------
     * Generate the matrix 
     *-----------------------------------------------------------*/
- 
+   /* values[7]:
+    *    [0]: center
+    *    [1]: X-
+    *    [2]: Y-
+    *    [3]: Z-
+    *    [4]: X+
+    *    [5]: Y+
+    *    [6]: Z+
+    */    
    values = hypre_CTAlloc(HYPRE_Real, 7);
 
-   values[1] = -cx/(hinx*hinx);
-   values[2] = -cy/(hiny*hiny);
-   values[3] = -cz/(hinz*hinz);
-   values[4] = -cx/(hinx*hinx) + ax/hinx;
-   values[5] = -cy/(hiny*hiny) + ay/hiny;
-   values[6] = -cz/(hinz*hinz) + az/hinz;
-
    values[0] = 0.;
-   if (nx > 1)
+
+   if (0 == atype) /* forward scheme for conv */
    {
-      values[0] += 2.0*cx/(hinx*hinx) - 1.*ax/hinx;
+      values[1] = -cx/(hinx*hinx);
+      values[2] = -cy/(hiny*hiny);
+      values[3] = -cz/(hinz*hinz);
+      values[4] = -cx/(hinx*hinx) + ax/hinx;
+      values[5] = -cy/(hiny*hiny) + ay/hiny;
+      values[6] = -cz/(hinz*hinz) + az/hinz;
+
+      if (nx > 1)
+      {
+         values[0] += 2.0*cx/(hinx*hinx) - 1.*ax/hinx;
+      }
+      if (ny > 1)
+      {
+         values[0] += 2.0*cy/(hiny*hiny) - 1.*ay/hiny;
+      }
+      if (nz > 1)
+      {
+         values[0] += 2.0*cz/(hinz*hinz) - 1.*az/hinz;
+      }
+   } 
+   else if (1 == atype) /* backward scheme for conv */
+   {
+      values[1] = -cx/(hinx*hinx) - ax/hinx;
+      values[2] = -cy/(hiny*hiny) - ay/hiny;
+      values[3] = -cz/(hinz*hinz) - az/hinz;
+      values[4] = -cx/(hinx*hinx);
+      values[5] = -cy/(hiny*hiny);
+      values[6] = -cz/(hinz*hinz);
+
+      if (nx > 1)
+      {
+         values[0] += 2.0*cx/(hinx*hinx) + 1.*ax/hinx;
+      }
+      if (ny > 1)
+      {
+         values[0] += 2.0*cy/(hiny*hiny) + 1.*ay/hiny;
+      }
+      if (nz > 1)
+      {
+         values[0] += 2.0*cz/(hinz*hinz) + 1.*az/hinz;
+      }
    }
-   if (ny > 1)
+   else if (3 == atype) /* upwind scheme */
    {
-      values[0] += 2.0*cy/(hiny*hiny) - 1.*ay/hiny;
+      sign_prod = sign_double(cx) * sign_double(ax);
+      if (sign_prod == 1) /* same sign use back scheme */
+      {
+         values[1] = -cx/(hinx*hinx) - ax/hinx;
+         values[4] = -cx/(hinx*hinx);
+         if (nx > 1)
+         {
+            values[0] += 2.0*cx/(hinx*hinx) + 1.*ax/hinx;
+         }
+      }
+      else /* diff sign use forward scheme */
+      {
+         values[1] = -cx/(hinx*hinx);
+         values[4] = -cx/(hinx*hinx) + ax/hinx;
+         if (nx > 1)
+         {
+            values[0] += 2.0*cx/(hinx*hinx) - 1.*ax/hinx;
+         }
+      }
+
+      sign_prod = sign_double(cy) * sign_double(ay);
+      if (sign_prod == 1) /* same sign use back scheme */
+      {
+         values[2] = -cy/(hiny*hiny) - ay/hiny;
+         values[5] = -cy/(hiny*hiny);
+         if (ny > 1)
+         {
+            values[0] += 2.0*cy/(hiny*hiny) + 1.*ay/hiny;
+         }
+      }
+      else /* diff sign use forward scheme */
+      {
+         values[2] = -cy/(hiny*hiny);
+         values[5] = -cy/(hiny*hiny) + ay/hiny;
+         if (ny > 1)
+         {
+            values[0] += 2.0*cy/(hiny*hiny) - 1.*ay/hiny;
+         }
+      }
+
+      sign_prod = sign_double(cz) * sign_double(az);
+      if (sign_prod == 1) /* same sign use back scheme */
+      {
+         values[3] = -cz/(hinz*hinz) - az/hinz;
+         values[6] = -cz/(hinz*hinz);
+         if (nz > 1)
+         {
+            values[0] += 2.0*cz/(hinz*hinz) + 1.*az/hinz;
+         }
+      }
+      else /* diff sign use forward scheme */
+      {
+         values[3] = -cz/(hinz*hinz);
+         values[6] = -cz/(hinz*hinz) + az/hinz;
+         if (nz > 1)
+         {
+            values[0] += 2.0*cz/(hinz*hinz) - 1.*az/hinz;
+         }
+      }
    }
-   if (nz > 1)
+   else /* centered difference scheme */
    {
-      values[0] += 2.0*cz/(hinz*hinz) - 1.*az/hinz;
+      values[1] = -cx/(hinx*hinx) - ax/(2.*hinx);
+      values[2] = -cy/(hiny*hiny) - ay/(2.*hiny);
+      values[3] = -cz/(hinz*hinz) - az/(2.*hinz);
+      values[4] = -cx/(hinx*hinx) + ax/(2.*hinx);
+      values[5] = -cy/(hiny*hiny) + ay/(2.*hiny);
+      values[6] = -cz/(hinz*hinz) + az/(2.*hinz);
+
+      if (nx > 1)
+      {
+         values[0] += 2.0*cx/(hinx*hinx);
+      }
+      if (ny > 1)
+      {
+         values[0] += 2.0*cy/(hiny*hiny);
+      }
+      if (nz > 1)
+      {
+         values[0] += 2.0*cz/(hinz*hinz);
+      }
    }
 
    A = (HYPRE_ParCSRMatrix) GenerateDifConv(hypre_MPI_COMM_WORLD,
@@ -6580,8 +7240,9 @@
    HYPRE_ParCSRMatrix  A;
    HYPRE_ParVector  rhs;
 
-   HYPRE_Int                 num_procs, myid;
-   HYPRE_Int                 p, q, r;
+   HYPRE_Int           num_procs, myid;
+   HYPRE_Int           p, q, r;
+   HYPRE_Int           type;
    HYPRE_Real          eps;
 
    /*-----------------------------------------------------------
@@ -6601,6 +7262,11 @@
    P  = 1;
    Q  = num_procs;
    R  = 1;
+   eps = 1.0;
+
+   /* type: 0   : default FD;
+    *       1-3 : FD and examples 1-3 in Ruge-Stuben paper */
+   type = 0;
 
    /*-----------------------------------------------------------
     * Parse command line
@@ -6627,6 +7293,11 @@
          arg_index++;
          eps  = atof(argv[arg_index++]);
       }
+      else if ( strcmp(argv[arg_index], "-vardifconvRS") == 0 )
+      {
+         arg_index++;
+         type = atoi(argv[arg_index++]);
+      }
       else
       {
          arg_index++;
@@ -6667,8 +7338,17 @@
     * Generate the matrix
     *-----------------------------------------------------------*/
 
-   A = (HYPRE_ParCSRMatrix) GenerateVarDifConv(hypre_MPI_COMM_WORLD,
-                                               nx, ny, nz, P, Q, R, p, q, r, eps, &rhs);
+   if (0 == type)
+   {
+      A = (HYPRE_ParCSRMatrix) GenerateVarDifConv(hypre_MPI_COMM_WORLD,
+                                                  nx, ny, nz, P, Q, R, p, q, r, eps, &rhs);
+   }
+   else
+   {
+      A = (HYPRE_ParCSRMatrix) GenerateRSVarDifConv(hypre_MPI_COMM_WORLD,
+                                                    nx, ny, nz, P, Q, R, p, q, r, eps, &rhs,
+                                                    type);
+   }
 
    *A_ptr = A;
    *rhs_ptr = rhs;
diff -Nru hypre-2.11.2/src/test/Makefile hypre-2.13.0/src/test/Makefile
--- hypre-2.11.2/src/test/Makefile	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/test/Makefile	2017-10-20 17:42:22.000000000 +0000
@@ -10,10 +10,12 @@
 # $Revision$
 #EHEADER**********************************************************************
 
+default:all
 
 include ../config/Makefile.config
 
 CINCLUDES = ${INCLUDES} ${MPIINCLUDE}
+
 CDEFS = -DHYPRE_TIMING -DHYPRE_FORTRAN
 CXXDEFS = -DNOFEI -DHYPRE_TIMING -DMPICH_SKIP_MPICXX
 
@@ -162,6 +164,10 @@
 	@echo  "Building" $@ "... "
 	${LINK_CC} -o $@ $@.o ${LFLAGS}
 
+struct_newboxloop: struct_newboxloop.o $(KOKKOS_LINK_DEPENDS)
+	@echo  "Building" $@ "... "
+	${LINK_CC} -o $@ $@.o ${LFLAGS}
+
 # RDF: Keep these for now
 
 hypre_set_precond: hypre_set_precond.o
diff -Nru hypre-2.11.2/src/test/maxwell_unscaled.c hypre-2.13.0/src/test/maxwell_unscaled.c
--- hypre-2.11.2/src/test/maxwell_unscaled.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/test/maxwell_unscaled.c	2017-10-20 17:42:22.000000000 +0000
@@ -1336,6 +1336,8 @@
    hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs);
    hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid);
 
+   hypre_GPUInit(-1);
+
    hypre_InitMemoryDebug(myid);
 
    /*-----------------------------------------------------------
@@ -1741,7 +1743,7 @@
    for (j = 0; j < data.max_boxsize; j++)
    {
       values[j]= sin((HYPRE_Real)(j+1));
-      values[j]= (HYPRE_Real) rand()/RAND_MAX;
+      values[j]= (HYPRE_Real) hypre_Rand();
       values[j]= (HYPRE_Real) j;
    }
    for (part = 0; part < data.nparts; part++)
@@ -1900,6 +1902,7 @@
    hypre_FinalizeMemoryDebug();
 
    /* Finalize MPI */
+   hypre_GPUFinalize();
    hypre_MPI_Finalize();
 
    return (0);
diff -Nru hypre-2.11.2/src/test/runtest.sh hypre-2.13.0/src/test/runtest.sh
--- hypre-2.11.2/src/test/runtest.sh	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/test/runtest.sh	2017-10-20 17:42:22.000000000 +0000
@@ -82,7 +82,7 @@
          # RunString="${RunString} -nodes $POE_NUM_NODES $MY_ARGS"
          RunString="poe $MY_ARGS -rmpool pdebug -procs $POE_NUM_PROCS -nodes $POE_NUM_NODES"
          ;;
-      rzzeus*|rzmerl*|ansel*|aztec*|cab*|sierra*|vulcan*)
+      rztopaz*|aztec*|cab*|quartz*|sierra*|syrah*|vulcan*)
          shift
          if [ $NumThreads -gt 0 ] ; then
             export OMP_NUM_THREADS=$NumThreads
diff -Nru hypre-2.11.2/src/test/sstruct.c hypre-2.13.0/src/test/sstruct.c
--- hypre-2.11.2/src/test/sstruct.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/test/sstruct.c	2017-10-20 17:42:22.000000000 +0000
@@ -466,7 +466,6 @@
          sdata_line = fgets((sdata + sdata_size), maxline, file);
       }
    }
-
    /* broadcast the data size */
    hypre_MPI_Bcast(&sdata_size, 1, HYPRE_MPI_INT, 0, hypre_MPI_COMM_WORLD);
 
@@ -2419,10 +2418,15 @@
 
    /* Initialize MPI */
    hypre_MPI_Init(&argc, &argv);
+#if defined(HYPRE_USE_KOKKOS)
+   Kokkos::InitArguments args;
+   args.num_threads = 10;
+   Kokkos::initialize (args);
+#endif
 
    hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs);
    hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid);
-
+   hypre_GPUInit(-1);
    hypre_InitMemoryDebug(myid);
 
    /*-----------------------------------------------------------
@@ -5734,6 +5738,10 @@
    hypre_FinalizeMemoryDebug();
 
    /* Finalize MPI */
+   hypre_GPUFinalize();
+#if defined(HYPRE_USE_KOKKOS)
+   Kokkos::finalize ();
+#endif
    hypre_MPI_Finalize();
 
    return (0);
diff -Nru hypre-2.11.2/src/test/struct.c hypre-2.13.0/src/test/struct.c
--- hypre-2.11.2/src/test/struct.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/test/struct.c	2017-10-20 17:42:22.000000000 +0000
@@ -188,7 +188,12 @@
 
    /* Initialize MPI */
    hypre_MPI_Init(&argc, &argv);
-
+#if defined(HYPRE_USE_KOKKOS)
+   Kokkos::InitArguments args;
+   args.num_threads = 10;
+   Kokkos::initialize (args);
+#endif
+   
    hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs );
    hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid );
 
@@ -839,7 +844,7 @@
        * Set up the stencil structure needed for matrix creation
        * which is always the case for read_fromfile_param == 0
        *-----------------------------------------------------------*/
- 
+      
       HYPRE_StructStencilCreate(dim, (2-sym)*dim + 1, &stencil);
       for (s = 0; s < (2-sym)*dim + 1; s++)
       {
@@ -884,7 +889,6 @@
            (read_rhsfromfile_param ==1) 
          )
       {
-         hypre_printf("\nreading linear system from files: matrix, rhs and x0\n");
          /* ghost selection for reading the matrix and vectors */
          for (i = 0; i < dim; i++)
          {
@@ -970,12 +974,14 @@
          HYPRE_StructGridSetPeriodic(grid, periodic);
          HYPRE_StructGridSetNumGhost(grid, num_ghost);
          HYPRE_StructGridAssemble(grid);
-
+	 
+	 
          /*-----------------------------------------------------------
           * Set up the matrix structure
           *-----------------------------------------------------------*/
-
+	 
          HYPRE_StructMatrixCreate(hypre_MPI_COMM_WORLD, grid, stencil, &A);
+
          if ( solver_id == 3 || solver_id == 4 ||
               solver_id == 13 || solver_id == 14 )
          {
@@ -1008,13 +1014,14 @@
                constant_coefficient = 2;
             }
          }
+
          HYPRE_StructMatrixSetSymmetric(A, sym);
          HYPRE_StructMatrixInitialize(A);
 
          /*-----------------------------------------------------------
           * Fill in the matrix elements
           *-----------------------------------------------------------*/
-   
+
          AddValuesMatrix(A,grid,cx,cy,cz,conx,cony,conz);
 
          /* Zero out stencils reaching to real boundary */
@@ -1022,7 +1029,6 @@
 
          if ( constant_coefficient == 0 ) SetStencilBndry(A,grid,periodic); 
          HYPRE_StructMatrixAssemble(A);
-
          /*-----------------------------------------------------------
           * Set up the linear system
           *-----------------------------------------------------------*/
@@ -1041,7 +1047,7 @@
 
          HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, grid, &x);
          HYPRE_StructVectorInitialize(x);
-    
+
          AddValuesVector(grid,x,periodx0,0.0);
          HYPRE_StructVectorAssemble(x);
 
@@ -2765,6 +2771,9 @@
    }
 
    /* Finalize MPI */
+#if defined(HYPRE_USE_KOKKOS)
+   Kokkos::finalize ();
+#endif
    hypre_MPI_Finalize();
 
    return (0);
@@ -2799,8 +2808,8 @@
    {
       box      = hypre_BoxArrayBox(gridboxes, ib);
       volume   =  hypre_BoxVolume(box);
-      values   = hypre_CTAlloc(HYPRE_Real, volume);
-
+      values   = hypre_UMCTAlloc(HYPRE_Real, volume);
+      
       /*-----------------------------------------------------------
        * For periodic b.c. in all directions, need rhs to satisfy 
        * compatibility condition. Achieved by setting a source and
@@ -2827,8 +2836,10 @@
 
       ilower = hypre_BoxIMin(box);
       iupper = hypre_BoxIMax(box);
+	  
       HYPRE_StructVectorSetBoxValues(zvector, ilower, iupper, values);
-      hypre_TFree(values);
+      hypre_UMTFree(values);
+      
 
    }
 
@@ -2900,8 +2911,8 @@
          {
             box      = hypre_BoxArrayBox(gridboxes, bi);
             volume   =  hypre_BoxVolume(box);
-            values   = hypre_CTAlloc(HYPRE_Real, stencil_size*volume);
-
+	    values     = hypre_UMCTAlloc(HYPRE_Real, stencil_size*volume);
+	    
             for (i = 0; i < stencil_size*volume; i += stencil_size)
             {
                switch (dim)
@@ -2925,14 +2936,16 @@
             }
             ilower = hypre_BoxIMin(box);
             iupper = hypre_BoxIMax(box);
+	    
             HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, stencil_size,
                                            stencil_indices, values);
-            hypre_TFree(values);
+	    hypre_UMTFree(values);
+	    
          }
       }
       else if ( constant_coefficient==1 )
       {
-         values   = hypre_CTAlloc(HYPRE_Real, stencil_size);
+	 values   = hypre_CTAlloc(HYPRE_Real, stencil_size);
          switch (dim)
          {
             case 1:
@@ -2963,7 +2976,7 @@
          hypre_assert( constant_coefficient==2 );
 
          /* stencil index for the center equals dim, so it's easy to leave out */
-         values   = hypre_CTAlloc(HYPRE_Real, stencil_size-1);
+	 values   = hypre_UMCTAlloc(HYPRE_Real, stencil_size-1);
          switch (dim)
          {
             case 1:
@@ -2984,14 +2997,13 @@
             HYPRE_StructMatrixSetConstantValues(A, stencil_size-1,
                                                 stencil_indices, values);
          }
-         hypre_TFree(values);
+	 hypre_UMTFree(values);
 
          hypre_ForBoxI(bi, gridboxes)
          {
             box      = hypre_BoxArrayBox(gridboxes, bi);
             volume   =  hypre_BoxVolume(box);
-            values   = hypre_CTAlloc(HYPRE_Real, volume);
-
+	    values   = hypre_UMCTAlloc(HYPRE_Real, volume);
             for ( i=0; i < volume; ++i )
             {
                values[i] = center;
@@ -3000,7 +3012,7 @@
             iupper = hypre_BoxIMax(box);
             HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1,
                                            stencil_indices+dim, values);
-            hypre_TFree(values);
+	    hypre_UMTFree(values);
          }
       }
    }
@@ -3043,7 +3055,7 @@
          {
             box      = hypre_BoxArrayBox(gridboxes, bi);
             volume   =  hypre_BoxVolume(box);
-            values   = hypre_CTAlloc(HYPRE_Real, stencil_size*volume);
+            values   = hypre_UMCTAlloc(HYPRE_Real, stencil_size*volume);
 
             for (i = 0; i < stencil_size*volume; i += stencil_size)
             {
@@ -3077,7 +3089,7 @@
             HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, stencil_size,
                                            stencil_indices, values);
 
-            hypre_TFree(values);
+            hypre_UMTFree(values);
          }
       }
       else if ( constant_coefficient==1 )
@@ -3120,7 +3132,7 @@
       else
       {
          hypre_assert( constant_coefficient==2 );
-         values = hypre_CTAlloc( HYPRE_Real, stencil_size-1 );
+         values = hypre_UMCTAlloc( HYPRE_Real, stencil_size-1 );
          switch (dim)
          {  /* no center in stencil_indices and values */
             case 1:
@@ -3160,7 +3172,7 @@
             HYPRE_StructMatrixSetConstantValues(A, stencil_size,
                                                 stencil_indices, values);
          }
-         hypre_TFree(values);
+         hypre_UMTFree(values);
 
 
          /* center is variable */
@@ -3169,7 +3181,7 @@
          {
             box      = hypre_BoxArrayBox(gridboxes, bi);
             volume   =  hypre_BoxVolume(box);
-            values   = hypre_CTAlloc(HYPRE_Real, volume);
+            values   = hypre_UMCTAlloc(HYPRE_Real, volume);
 
             for ( i=0; i < volume; ++i )
             {
@@ -3179,7 +3191,7 @@
             iupper = hypre_BoxIMax(box);
             HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1,
                                            stencil_indices, values);
-            hypre_TFree(values);
+            hypre_UMTFree(values);
          }
       }
    }
@@ -3260,7 +3272,7 @@
       {
          for (ib = 0; ib < size; ib++)
          {
-            values = hypre_CTAlloc(HYPRE_Real, vol[ib]);
+	    values = hypre_UMCTAlloc(HYPRE_Real, vol[ib]);
         
             for (i = 0; i < vol[ib]; i++)
             {
@@ -3286,7 +3298,7 @@
                                               1, stencil_indices, values);
                ilower[ib][d] = j;
             }
-            hypre_TFree(values);
+	    hypre_UMTFree(values);
          }
       }
    }
diff -Nru hypre-2.11.2/src/test/struct_migrate.c hypre-2.13.0/src/test/struct_migrate.c
--- hypre-2.11.2/src/test/struct_migrate.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/test/struct_migrate.c	2017-10-20 17:42:22.000000000 +0000
@@ -65,7 +65,11 @@
 
    /* Initialize MPI */
    hypre_MPI_Init(&argc, &argv);
-
+#if defined(HYPRE_USE_KOKKOS)
+   Kokkos::InitArguments args;
+   args.num_threads = 10;
+   Kokkos::initialize (args);
+#endif
    hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs );
    hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid );
 
@@ -362,7 +366,7 @@
 
    if (myid == 0)
    {
-      printf("\nCheck = %1.0f (success = 0)\n\n", check);
+      hypre_printf("\nCheck = %1.0f (success = 0)\n\n", check);
    }
 
    /*-----------------------------------------------------------
@@ -397,6 +401,9 @@
    HYPRE_StructVectorDestroy(check_vector);
 
    /* Finalize MPI */
+#if defined(HYPRE_USE_KOKKOS)
+   Kokkos::finalize ();
+#endif
    hypre_MPI_Finalize();
 
    return (0);
@@ -413,7 +420,7 @@
 {
    HYPRE_Int          ierr = 0;
    hypre_BoxArray    *gridboxes;
-   HYPRE_Int          i,ib;
+   HYPRE_Int          ib;
    hypre_IndexRef     ilower;
    hypre_IndexRef     iupper;
    hypre_Box         *box;
@@ -426,18 +433,19 @@
    hypre_ForBoxI(ib, gridboxes)
    {
       box      = hypre_BoxArrayBox(gridboxes, ib);
-      volume   =  hypre_BoxVolume(box);
-      values   = hypre_CTAlloc(HYPRE_Real, volume);
+      volume   = hypre_BoxVolume(box);
+      values   = hypre_DeviceCTAlloc(HYPRE_Real, volume);
 
-      for (i = 0; i < volume; i++)
+      hypre_LoopBegin(volume,i)
       {
          values[i] = value;
       }
-
+      hypre_LoopEnd();
+	
       ilower = hypre_BoxIMin(box);
       iupper = hypre_BoxIMax(box);
       HYPRE_StructVectorSetBoxValues(vector, ilower, iupper, values);
-      hypre_TFree(values);
+      hypre_DeviceTFree(values);
    }
 
    return ierr;
diff -Nru hypre-2.11.2/src/test/struct_newboxloop.c hypre-2.13.0/src/test/struct_newboxloop.c
--- hypre-2.11.2/src/test/struct_newboxloop.c	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/test/struct_newboxloop.c	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,1956 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include <sys/time.h>
+
+#include "_hypre_utilities.h"
+#include "HYPRE_struct_ls.h"
+#include "HYPRE_krylov.h"
+
+#if defined( KOKKOS_HAVE_MPI )
+#include <mpi.h>
+#endif
+
+#define HYPRE_MFLOPS 0
+#if HYPRE_MFLOPS
+#include "_hypre_struct_mv.h"
+#endif
+
+/* RDF: Why is this include here? */
+#include "_hypre_struct_mv.h"
+
+#ifdef HYPRE_DEBUG
+#include <cegdb.h>
+#endif
+
+/* begin lobpcg */
+
+#define NO_SOLVER -9198
+
+#include <time.h>
+ 
+#include "fortran_matrix.h"
+#include "HYPRE_lobpcg.h"
+#include "interpreter.h"
+#include "multivector.h"
+#include "HYPRE_MatvecFunctions.h"
+
+/* end lobpcg */
+
+HYPRE_Int  SetStencilBndry(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix,HYPRE_Int* period);
+
+HYPRE_Int  AddValuesMatrix(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix,
+                           HYPRE_Real        cx,
+                           HYPRE_Real        cy,
+                           HYPRE_Real        cz,
+                           HYPRE_Real        conx,
+                           HYPRE_Real        cony,
+                           HYPRE_Real        conz) ;
+
+HYPRE_Int AddValuesVector( hypre_StructGrid  *gridvector,
+                           hypre_StructVector *zvector,
+                           HYPRE_Int          *period, 
+                           HYPRE_Real         value  )  ;
+
+/*--------------------------------------------------------------------------
+ * Test driver for structured matrix interface (structured storage)
+ *--------------------------------------------------------------------------*/
+ 
+/*----------------------------------------------------------------------
+ * Standard 7-point laplacian in 3D with grid and anisotropy determined
+ * as command line arguments.  Do `driver -help' for usage info.
+ *----------------------------------------------------------------------*/
+
+hypre_int
+main( hypre_int argc,
+      char *argv[] )
+{
+   HYPRE_Int           arg_index;
+   HYPRE_Int           print_usage;
+   HYPRE_Int           nx, ny, nz;
+   HYPRE_Int           P, Q, R;
+   HYPRE_Int           bx, by, bz;
+   HYPRE_Int           px, py, pz;
+   HYPRE_Real          cx, cy, cz;
+   HYPRE_Real          conx, cony, conz;
+   HYPRE_Int           solver_id;
+   HYPRE_Int           solver_type;
+
+   /*HYPRE_Real          dxyz[3];*/
+
+   HYPRE_Int           A_num_ghost[6] = {0, 0, 0, 0, 0, 0};
+   HYPRE_Int           v_num_ghost[3] = {0,0,0};
+                     
+   HYPRE_StructMatrix  A;
+   HYPRE_StructVector  b;
+   HYPRE_StructVector  x;
+
+   HYPRE_StructSolver  solver;
+   HYPRE_StructSolver  precond;
+   HYPRE_Int           num_iterations;
+   HYPRE_Int           time_index;
+   HYPRE_Real          final_res_norm;
+   HYPRE_Real          cf_tol;
+
+   HYPRE_Int           num_procs, myid;
+
+   HYPRE_Int           p, q, r;
+   HYPRE_Int           dim;
+   HYPRE_Int           n_pre, n_post;
+   HYPRE_Int           nblocks ;
+   HYPRE_Int           skip;
+   HYPRE_Int           sym;
+   HYPRE_Int           rap;
+   HYPRE_Int           relax;
+   HYPRE_Real          jacobi_weight;
+   HYPRE_Int           usr_jacobi_weight;
+   HYPRE_Int           jump;
+   HYPRE_Int           rep, reps;
+
+   HYPRE_Int         **iupper;
+   HYPRE_Int         **ilower;
+
+   HYPRE_Int           istart[3];
+   HYPRE_Int           periodic[3];
+   HYPRE_Int         **offsets;
+   HYPRE_Int           constant_coefficient = 0;
+   HYPRE_Int          *stencil_entries;
+   HYPRE_Int           stencil_size;
+   HYPRE_Int           diag_rank;
+   hypre_Index         diag_index;
+
+   HYPRE_StructGrid    grid;
+   HYPRE_StructGrid    readgrid;
+   HYPRE_StructStencil stencil;
+
+   HYPRE_Int           i, s;
+   HYPRE_Int           ix, iy, iz, ib;
+
+   HYPRE_Int           read_fromfile_param;
+   HYPRE_Int           read_fromfile_index;
+   HYPRE_Int           read_rhsfromfile_param;
+   HYPRE_Int           read_rhsfromfile_index;
+   HYPRE_Int           read_x0fromfile_param;
+   HYPRE_Int           read_x0fromfile_index;
+   HYPRE_Int           periodx0[3] = {0,0,0};
+   HYPRE_Int          *readperiodic;
+   HYPRE_Int           sum;
+   HYPRE_Int           inner;
+
+   HYPRE_Int           print_system = 0;
+
+   /* begin lobpcg */
+   
+   HYPRE_Int lobpcgFlag = 0;
+   HYPRE_Int lobpcgSeed = 0;
+   HYPRE_Int blockSize = 1;
+   HYPRE_Int verbosity = 1;
+   HYPRE_Int iterations;
+   HYPRE_Int maxIterations = 100;
+   HYPRE_Int checkOrtho = 0;
+   HYPRE_Int printLevel = 0;
+   HYPRE_Int pcgIterations = 0;
+   HYPRE_Int pcgMode = 0;
+   HYPRE_Real tol = 1e-6;
+   HYPRE_Real pcgTol = 1e-2;
+   HYPRE_Real nonOrthF;
+
+   FILE* filePtr;
+
+   mv_MultiVectorPtr eigenvectors = NULL;
+   mv_MultiVectorPtr constrains = NULL;
+   HYPRE_Real* eigenvalues = NULL;
+
+   HYPRE_Real* residuals;
+   utilities_FortranMatrix* residualNorms;
+   utilities_FortranMatrix* residualNormsHistory;
+   utilities_FortranMatrix* eigenvaluesHistory;
+   utilities_FortranMatrix* printBuffer;
+   utilities_FortranMatrix* gramXX;
+   utilities_FortranMatrix* identity;
+
+   HYPRE_StructSolver        lobpcg_solver;
+
+   mv_InterfaceInterpreter* interpreter;
+   HYPRE_MatvecFunctions matvec_fn;
+   /* end lobpcg */
+
+   /*-----------------------------------------------------------
+    * Initialize some stuff
+    *-----------------------------------------------------------*/
+
+   /* Initialize MPI */
+   hypre_MPI_Init(&argc, &argv);
+   
+#if defined(HYPRE_USE_KOKKOS)
+   Kokkos::InitArguments args;
+   args.num_threads = 12;
+   Kokkos::initialize (args);
+#endif
+   
+   hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs );
+   hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid );
+
+
+#ifdef HYPRE_DEBUG
+   cegdb(&argc, &argv, myid);
+#endif
+
+   hypre_InitMemoryDebug(myid);
+
+   /*-----------------------------------------------------------
+    * Set defaults
+    *-----------------------------------------------------------*/
+ 
+   dim = 3;
+
+   skip  = 0;
+   sym  = 1;
+   rap = 0;
+   relax = 1;
+   usr_jacobi_weight= 0;
+   jump  = 0;
+   reps = 1;
+
+   nx = 10;
+   ny = 10;
+   nz = 10;
+
+   P  = num_procs;
+   Q  = 1;
+   R  = 1;
+
+   bx = 1;
+   by = 1;
+   bz = 1;
+
+   cx = 1.0;
+   cy = 1.0;
+   cz = 1.0;
+   conx = 0.0;
+   cony = 0.0;
+   conz = 0.0;
+
+   n_pre  = 1;
+   n_post = 1;
+
+   solver_id = 0;
+   solver_type = 1;
+
+   istart[0] = -3;
+   istart[1] = -3;
+   istart[2] = -3;
+
+   px = 0;
+   py = 0;
+   pz = 0;
+
+   cf_tol = 0.90;
+
+   /* setting defaults for the reading parameters    */
+   read_fromfile_param = 0;
+   read_fromfile_index = argc;
+   read_rhsfromfile_param = 0;
+   read_rhsfromfile_index = argc;
+   read_x0fromfile_param = 0;
+   read_x0fromfile_index = argc;
+   sum = 0;
+
+   /* ghosts for the building of matrix: default  */
+   for (i = 0; i < dim; i++)
+   {
+      A_num_ghost[2*i] = 1;
+      A_num_ghost[2*i + 1] = 1;
+   }
+
+   /*-----------------------------------------------------------
+    * Parse command line
+    *-----------------------------------------------------------*/
+ 
+   print_usage = 0;
+   arg_index = 1;
+   while (arg_index < argc)
+   {
+      if ( strcmp(argv[arg_index], "-n") == 0 )
+      {
+         arg_index++;
+         nx = atoi(argv[arg_index++]);
+         ny = atoi(argv[arg_index++]);
+         nz = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-istart") == 0 )
+      {
+         arg_index++;
+         istart[0] = atoi(argv[arg_index++]);
+         istart[1] = atoi(argv[arg_index++]);
+         istart[2] = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-P") == 0 )
+      {
+         arg_index++;
+         P  = atoi(argv[arg_index++]);
+         Q  = atoi(argv[arg_index++]);
+         R  = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-b") == 0 )
+      {
+         arg_index++;
+         bx = atoi(argv[arg_index++]);
+         by = atoi(argv[arg_index++]);
+         bz = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-p") == 0 )
+      {
+         arg_index++;
+         px = atoi(argv[arg_index++]);
+         py = atoi(argv[arg_index++]);
+         pz = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-c") == 0 )
+      {
+         arg_index++;
+         cx = atof(argv[arg_index++]);
+         cy = atof(argv[arg_index++]);
+         cz = atof(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-convect") == 0 )
+      {
+         arg_index++;
+         conx = atof(argv[arg_index++]);
+         cony = atof(argv[arg_index++]);
+         conz = atof(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-d") == 0 )
+      {
+         arg_index++;
+         dim = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-fromfile") == 0 )
+      {
+         arg_index++;
+         read_fromfile_param = 1;
+         read_fromfile_index = arg_index;
+      }
+      else if ( strcmp(argv[arg_index], "-rhsfromfile") == 0 )
+      {
+         arg_index++;
+         read_rhsfromfile_param = 1;
+         read_rhsfromfile_index = arg_index;
+      }
+      else if ( strcmp(argv[arg_index], "-x0fromfile") == 0 )
+      {
+         arg_index++;
+         read_x0fromfile_param = 1;
+         read_x0fromfile_index = arg_index;
+      }
+      else if (strcmp(argv[arg_index], "-repeats") == 0 )
+      {
+         arg_index++;
+         reps = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-solver") == 0 )
+      {
+         arg_index++;
+
+	 /* begin lobpcg */
+	 if ( strcmp(argv[arg_index], "none") == 0 ) {
+            solver_id = NO_SOLVER;
+            arg_index++;
+	 }
+	 else /* end lobpcg */
+            solver_id = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-v") == 0 )
+      {
+         arg_index++;
+         n_pre = atoi(argv[arg_index++]);
+         n_post = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-rap") == 0 )
+      {
+         arg_index++;
+         rap = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-relax") == 0 )
+      {
+         arg_index++;
+         relax = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-w") == 0 )
+      {
+         arg_index++;
+         jacobi_weight= atof(argv[arg_index++]);
+         usr_jacobi_weight= 1; /* flag user weight */
+      }
+      else if ( strcmp(argv[arg_index], "-sym") == 0 )
+      {
+         arg_index++;
+         sym = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-skip") == 0 )
+      {
+         arg_index++;
+         skip = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-jump") == 0 )
+      {
+         arg_index++;
+         jump = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-solver_type") == 0 )
+      {
+         arg_index++;
+         solver_type = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-cf") == 0 )
+      {
+         arg_index++;
+         cf_tol = atof(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-print") == 0 )
+      {
+         arg_index++;
+         print_system = 1;
+      }
+      else if ( strcmp(argv[arg_index], "-help") == 0 )
+      {
+         print_usage = 1;
+         break;
+      }
+      /* begin lobpcg */
+      else if ( strcmp(argv[arg_index], "-lobpcg") == 0 ) 
+      {				         /* use lobpcg */
+         arg_index++;
+	 lobpcgFlag = 1;
+      }
+      else if ( strcmp(argv[arg_index], "-orthchk") == 0 )
+      {			/* lobpcg: check orthonormality */
+         arg_index++;
+	 checkOrtho = 1;
+      }
+      else if ( strcmp(argv[arg_index], "-verb") == 0 ) 
+      {			  /* lobpcg: verbosity level */
+         arg_index++;
+         verbosity = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-vrand") == 0 ) 
+      {                         /* lobpcg: block size */
+         arg_index++;
+         blockSize = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-seed") == 0 )
+      {		           /* lobpcg: seed for srand */
+         arg_index++;
+         lobpcgSeed = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-itr") == 0 ) 
+      {		     /* lobpcg: max # of iterations */
+         arg_index++;
+         maxIterations = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-tol") == 0 ) 
+      {		               /* lobpcg: tolerance */
+         arg_index++;
+         tol = atof(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-pcgitr") == 0 ) 
+      {		   /* lobpcg: max inner pcg iterations */
+         arg_index++;
+         pcgIterations = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-pcgtol") == 0 ) 
+      {	     /* lobpcg: inner pcg iterations tolerance */
+         arg_index++;
+         pcgTol = atof(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-pcgmode") == 0 ) 
+      {		 /* lobpcg: initial guess for inner pcg */
+         arg_index++;	      /* 0: zero, otherwise rhs */
+         pcgMode = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-vout") == 0 )
+      {			      /* lobpcg: print level */
+         arg_index++;
+         printLevel = atoi(argv[arg_index++]);
+      }
+      /* end lobpcg */
+      else
+      {
+         arg_index++;
+      }
+   }
+
+   /* begin lobpcg */
+
+   if ( solver_id == 0 && lobpcgFlag )
+      solver_id = 10;
+
+   /*end lobpcg */
+
+   sum = read_x0fromfile_param + read_rhsfromfile_param + read_fromfile_param; 
+
+   /*-----------------------------------------------------------
+    * Print usage info
+    *-----------------------------------------------------------*/
+ 
+   if ( (print_usage) && (myid == 0) )
+   {
+      hypre_printf("\n");
+      hypre_printf("Usage: %s [<options>]\n", argv[0]);
+      hypre_printf("\n");
+      hypre_printf("  -n <nx> <ny> <nz>   : problem size per block\n");
+      hypre_printf("  -istart <istart[0]> <istart[1]> <istart[2]> : start of box\n");
+      hypre_printf("  -P <Px> <Py> <Pz>   : processor topology\n");
+      hypre_printf("  -b <bx> <by> <bz>   : blocking per processor\n");
+      hypre_printf("  -p <px> <py> <pz>   : periodicity in each dimension\n");
+      hypre_printf("  -c <cx> <cy> <cz>   : diffusion coefficients\n");
+      hypre_printf("  -convect <x> <y> <z>: convection coefficients\n");
+      hypre_printf("  -d <dim>            : problem dimension (2 or 3)\n");
+      hypre_printf("  -fromfile <name>    : prefix name for matrixfiles\n");
+      hypre_printf("  -rhsfromfile <name> : prefix name for rhsfiles\n");
+      hypre_printf("  -x0fromfile <name>  : prefix name for firstguessfiles\n");
+      hypre_printf("  -repeats <reps>     : number of times to repeat the run, default 1.\n");
+      hypre_printf("  -solver <ID>        : solver ID\n");
+      hypre_printf("                        0  - axpy\n");
+      hypre_printf("                        1  - spMV\n");
+      hypre_printf("                        2  - inner product\n");
+	  hypre_printf("                        3  - spMV with constant coeffs\n");
+	  hypre_printf("                        4  - spMV with constant coeffs var diag\n");
+      hypre_printf("                        8  - Jacobi\n");
+      hypre_printf("  -sym <s>            : symmetric storage (1) or not (0)\n");
+      hypre_printf("  -jump <num>         : num levels to jump in SparseMSG\n");
+      hypre_printf("\n");
+   }
+
+   if ( print_usage )
+   {
+      exit(1);
+   }
+
+   /*-----------------------------------------------------------
+    * Check a few things
+    *-----------------------------------------------------------*/
+
+   if ((P*Q*R) > num_procs)
+   {
+      if (myid == 0)
+      {
+         hypre_printf("Error: PxQxR is more than the number of processors\n");
+      }
+      exit(1);
+   }
+   else if ((P*Q*R) < num_procs)
+   {
+      if (myid == 0)
+      {
+         hypre_printf("Warning: PxQxR is less than the number of processors\n");
+      }
+   }
+
+   if ((conx != 0.0 || cony !=0 || conz != 0) && sym == 1 )
+   {
+      if (myid == 0)
+      {
+         hypre_printf("Warning: Convection produces non-symmetric matrix\n");
+      }
+      sym = 0;
+   }
+
+   /*-----------------------------------------------------------
+    * Print driver parameters
+    *-----------------------------------------------------------*/
+ 
+   if (myid == 0 && sum == 0)
+   {
+#ifdef HYPRE_USE_DEFAULT
+	   hypre_printf("Running with openMP macro\n");
+#endif
+#ifdef HYPRE_USE_KOKKOS
+	   hypre_printf("Running with Kokkos macro\n");
+#endif
+#ifdef HYPRE_USE_CUDA
+	   hypre_printf("Running with CUDA macro\n");
+#endif
+#ifdef HYPRE_USE_RAJA
+	   hypre_printf("Running with CUDA macro\n");
+#endif
+#ifdef  HYPRE_USE_KOKKOS_CUDA
+	   hypre_printf("Running kokkos with CUDA macro\n");
+#endif
+	   
+      hypre_printf("Running with these driver parameters:\n");
+      hypre_printf("  (nx, ny, nz)    = (%d, %d, %d)\n", nx, ny, nz);
+      hypre_printf("  (istart[0],istart[1],istart[2]) = (%d, %d, %d)\n", \
+                   istart[0],istart[1],istart[2]);
+      hypre_printf("  (Px, Py, Pz)    = (%d, %d, %d)\n", P,  Q,  R);
+      hypre_printf("  (bx, by, bz)    = (%d, %d, %d)\n", bx, by, bz);
+      hypre_printf("  (px, py, pz)    = (%d, %d, %d)\n", px, py, pz);
+      hypre_printf("  (cx, cy, cz)    = (%f, %f, %f)\n", cx, cy, cz);
+      hypre_printf("  (conx,cony,conz)= (%f, %f, %f)\n", conx, cony, conz);
+      hypre_printf("  (n_pre, n_post) = (%d, %d)\n", n_pre, n_post);
+      hypre_printf("  dim             = %d\n", dim);
+      hypre_printf("  skip            = %d\n", skip);
+      hypre_printf("  sym             = %d\n", sym);
+      hypre_printf("  rap             = %d\n", rap);
+      hypre_printf("  relax           = %d\n", relax);
+      hypre_printf("  jump            = %d\n", jump);
+      hypre_printf("  solver ID       = %d\n", solver_id);
+   }
+
+   if (myid == 0 && sum > 0)
+   {
+      hypre_printf("Running with these driver parameters:\n");
+      hypre_printf("  (cx, cy, cz)    = (%f, %f, %f)\n", cx, cy, cz);
+      hypre_printf("  (conx,cony,conz)= (%f, %f, %f)\n", conx, cony, conz);
+      hypre_printf("  (n_pre, n_post) = (%d, %d)\n", n_pre, n_post);
+      hypre_printf("  dim             = %d\n", dim);
+      hypre_printf("  skip            = %d\n", skip);
+      hypre_printf("  sym             = %d\n", sym);
+      hypre_printf("  rap             = %d\n", rap);
+      hypre_printf("  relax           = %d\n", relax);
+      hypre_printf("  jump            = %d\n", jump);
+      hypre_printf("  solver ID       = %d\n", solver_id);
+      hypre_printf("  the grid is read from  file \n");
+	     
+   }
+  
+   /*-----------------------------------------------------------
+	* Set up the stencil structure (7 points) when matrix is NOT read from file
+	* Set up the grid structure used when NO files are read
+	*-----------------------------------------------------------*/
+   
+   switch (dim)
+   {
+   case 1:
+	   nblocks = bx;
+	   if(sym)
+	   {
+		   offsets = hypre_CTAlloc(HYPRE_Int*, 2);
+		   offsets[0] = hypre_CTAlloc(HYPRE_Int, 1);
+		   offsets[0][0] = -1; 
+		   offsets[1] = hypre_CTAlloc(HYPRE_Int, 1);
+		   offsets[1][0] = 0; 
+	   }
+	   else
+	   {
+		   offsets = hypre_CTAlloc(HYPRE_Int*, 3);
+		   offsets[0] = hypre_CTAlloc(HYPRE_Int, 1);
+		   offsets[0][0] = -1;
+		   offsets[1] = hypre_CTAlloc(HYPRE_Int, 1);
+		   offsets[1][0] = 0;
+		   offsets[2] = hypre_CTAlloc(HYPRE_Int, 1);
+		   offsets[2][0] = 1;
+	   }
+	   /* compute p from P and myid */
+	   p = myid % P;
+	   break;
+	   
+   case 2:
+	   nblocks = bx*by;
+	   if(sym)
+	   {
+		   offsets = hypre_CTAlloc(HYPRE_Int*, 3);
+		   offsets[0] = hypre_CTAlloc(HYPRE_Int, 2);
+		   offsets[0][0] = -1; 
+		   offsets[0][1] = 0; 
+		   offsets[1] = hypre_CTAlloc(HYPRE_Int, 2);
+		   offsets[1][0] = 0; 
+		   offsets[1][1] = -1; 
+		   offsets[2] = hypre_CTAlloc(HYPRE_Int, 2);
+		   offsets[2][0] = 0; 
+		   offsets[2][1] = 0; 
+	   }
+	   else
+	   {
+		   offsets = hypre_CTAlloc(HYPRE_Int*, 5);
+		   offsets[0] = hypre_CTAlloc(HYPRE_Int, 2);
+		   offsets[0][0] = -1; 
+		   offsets[0][1] = 0; 
+		   offsets[1] = hypre_CTAlloc(HYPRE_Int, 2);
+		   offsets[1][0] = 0; 
+		   offsets[1][1] = -1; 
+		   offsets[2] = hypre_CTAlloc(HYPRE_Int, 2);
+		   offsets[2][0] = 0; 
+		   offsets[2][1] = 0; 
+		   offsets[3] = hypre_CTAlloc(HYPRE_Int, 2);
+		   offsets[3][0] = 1; 
+		   offsets[3][1] = 0; 
+		   offsets[4] = hypre_CTAlloc(HYPRE_Int, 2);
+		   offsets[4][0] = 0; 
+		   offsets[4][1] = 1; 
+	   }
+	   /* compute p,q from P,Q and myid */
+	   p = myid % P;
+	   q = (( myid - p)/P) % Q;
+	   break;
+	   
+   case 3:
+	   nblocks = bx*by*bz;
+	   if(sym)
+	   {
+		   offsets = hypre_CTAlloc(HYPRE_Int*, 4);
+		   offsets[0] = hypre_CTAlloc(HYPRE_Int, 3);
+		   offsets[0][0] = -1; 
+		   offsets[0][1] = 0; 
+		   offsets[0][2] = 0; 
+		   offsets[1] = hypre_CTAlloc(HYPRE_Int, 3);
+		   offsets[1][0] = 0; 
+		   offsets[1][1] = -1; 
+		   offsets[1][2] = 0; 
+		   offsets[2] = hypre_CTAlloc(HYPRE_Int, 3);
+		   offsets[2][0] = 0; 
+		   offsets[2][1] = 0; 
+		   offsets[2][2] = -1; 
+		   offsets[3] = hypre_CTAlloc(HYPRE_Int, 3);
+		   offsets[3][0] = 0; 
+		   offsets[3][1] = 0; 
+		   offsets[3][2] = 0; 
+	   }
+	   else
+	   {
+		   offsets = hypre_CTAlloc(HYPRE_Int*, 7);
+		   offsets[0] = hypre_CTAlloc(HYPRE_Int, 3);
+		   offsets[0][0] = -1; 
+		   offsets[0][1] = 0; 
+		   offsets[0][2] = 0; 
+		   offsets[1] = hypre_CTAlloc(HYPRE_Int, 3);
+		   offsets[1][0] = 0; 
+		   offsets[1][1] = -1; 
+		   offsets[1][2] = 0; 
+		   offsets[2] = hypre_CTAlloc(HYPRE_Int, 3);
+		   offsets[2][0] = 0; 
+		   offsets[2][1] = 0; 
+		   offsets[2][2] = -1; 
+		   offsets[3] = hypre_CTAlloc(HYPRE_Int, 3);
+		   offsets[3][0] = 0; 
+		   offsets[3][1] = 0; 
+		   offsets[3][2] = 0; 
+		   offsets[4] = hypre_CTAlloc(HYPRE_Int, 3);
+		   offsets[4][0] = 1; 
+		   offsets[4][1] = 0; 
+		   offsets[4][2] = 0; 
+		   offsets[5] = hypre_CTAlloc(HYPRE_Int, 3);
+		   offsets[5][0] = 0; 
+		   offsets[5][1] = 1; 
+		   offsets[5][2] = 0; 
+		   offsets[6] = hypre_CTAlloc(HYPRE_Int, 3);
+		   offsets[6][0] = 0; 
+		   offsets[6][1] = 0; 
+		   offsets[6][2] = 1; 
+	   }
+	   /* compute p,q,r from P,Q,R and myid */
+	   p = myid % P;
+	   q = (( myid - p)/P) % Q;
+	   r = ( myid - p - P*q)/( P*Q );
+	   break;
+   }
+   
+   if (myid >= (P*Q*R))
+   {
+	   /* My processor has no data on it */
+	   nblocks = bx = by = bz = 0;
+   }
+   
+   /*-----------------------------------------------------------
+	* Set up the stencil structure needed for matrix creation
+	* which is always the case for read_fromfile_param == 0
+	*-----------------------------------------------------------*/
+   
+   HYPRE_StructStencilCreate(dim, (2-sym)*dim + 1, &stencil);
+   for (s = 0; s < (2-sym)*dim + 1; s++)
+   {
+	   HYPRE_StructStencilSetElement(stencil, s, offsets[s]);
+   }
+   
+   /*-----------------------------------------------------------
+	* Set up periodic
+	*-----------------------------------------------------------*/
+   
+   periodic[0] = px;
+   periodic[1] = py;
+   periodic[2] = pz;
+   
+   /*-----------------------------------------------------------
+	* Set up dxyz for PFMG solver
+	*-----------------------------------------------------------*/
+
+   /* We do the extreme cases first reading everything from files => sum = 3
+	* building things from scratch (grid,stencils,extents) sum = 0 */
+
+   if ( (read_fromfile_param ==1) &&
+		(read_x0fromfile_param ==1) &&
+		(read_rhsfromfile_param ==1) 
+	   )
+   {
+	   hypre_printf("\nreading linear system from files: matrix, rhs and x0\n");
+	   /* ghost selection for reading the matrix and vectors */
+	   for (i = 0; i < dim; i++)
+	   {
+		   A_num_ghost[2*i] = 1;
+		   A_num_ghost[2*i + 1] = 1;
+		   v_num_ghost[2*i] = 1;
+		   v_num_ghost[2*i + 1] = 1;
+	   }
+	   
+	   A = (HYPRE_StructMatrix)
+		   hypre_StructMatrixRead(hypre_MPI_COMM_WORLD,
+								  argv[read_fromfile_index],A_num_ghost);
+	   
+	   b = (HYPRE_StructVector)
+		   hypre_StructVectorRead(hypre_MPI_COMM_WORLD,
+								  argv[read_rhsfromfile_index],v_num_ghost);
+	   
+	   x = (HYPRE_StructVector)
+		   hypre_StructVectorRead(hypre_MPI_COMM_WORLD,
+								  argv[read_x0fromfile_index],v_num_ghost);
+   }
+   
+   /* beginning of sum == 0  */
+   if (sum == 0)    /* no read from any file */
+   {
+	   /*-----------------------------------------------------------
+		* prepare space for the extents
+		*-----------------------------------------------------------*/
+	   
+	   ilower = hypre_CTAlloc(HYPRE_Int*, nblocks);
+	   iupper = hypre_CTAlloc(HYPRE_Int*, nblocks);
+	   for (i = 0; i < nblocks; i++)
+	   {
+		   ilower[i] = hypre_CTAlloc(HYPRE_Int, dim);
+		   iupper[i] = hypre_CTAlloc(HYPRE_Int, dim);
+	   }
+	   
+	   /* compute ilower and iupper from (p,q,r), (bx,by,bz), and (nx,ny,nz) */
+	   ib = 0;
+	   switch (dim)
+	   {
+	   case 1:
+		   for (ix = 0; ix < bx; ix++)
+		   {
+			   ilower[ib][0] = istart[0]+ nx*(bx*p+ix);
+			   iupper[ib][0] = istart[0]+ nx*(bx*p+ix+1) - 1;
+			   ib++;
+		   }
+		   break;
+	   case 2:
+		   for (iy = 0; iy < by; iy++)
+			   for (ix = 0; ix < bx; ix++)
+			   {
+				   ilower[ib][0] = istart[0]+ nx*(bx*p+ix);
+				   iupper[ib][0] = istart[0]+ nx*(bx*p+ix+1) - 1;
+				   ilower[ib][1] = istart[1]+ ny*(by*q+iy);
+				   iupper[ib][1] = istart[1]+ ny*(by*q+iy+1) - 1;
+				   ib++;
+			   }
+		   break;
+	   case 3:
+		   for (iz = 0; iz < bz; iz++)
+			   for (iy = 0; iy < by; iy++)
+				   for (ix = 0; ix < bx; ix++)
+				   {
+					   ilower[ib][0] = istart[0]+ nx*(bx*p+ix);
+					   iupper[ib][0] = istart[0]+ nx*(bx*p+ix+1) - 1;
+					   ilower[ib][1] = istart[1]+ ny*(by*q+iy);
+					   iupper[ib][1] = istart[1]+ ny*(by*q+iy+1) - 1;
+					   ilower[ib][2] = istart[2]+ nz*(bz*r+iz);
+					   iupper[ib][2] = istart[2]+ nz*(bz*r+iz+1) - 1;
+					   ib++;
+				   }
+		   break;
+	   }
+	   
+	   HYPRE_StructGridCreate(hypre_MPI_COMM_WORLD, dim, &grid);
+	   for (ib = 0; ib < nblocks; ib++)
+	   {
+		   /* Add to the grid a new box defined by ilower[ib], iupper[ib]...*/
+		   HYPRE_StructGridSetExtents(grid, ilower[ib], iupper[ib]);
+	   }
+	   HYPRE_StructGridSetPeriodic(grid, periodic);
+	   HYPRE_StructGridAssemble(grid);
+	   
+	   /*-----------------------------------------------------------
+		* Set up the matrix structure
+		*-----------------------------------------------------------*/
+	   
+	   for (i = 0; i < dim; i++)
+	   {
+		   A_num_ghost[2*i] = 1;
+		   A_num_ghost[2*i + 1] = 1;
+	   }
+	   
+	   HYPRE_StructMatrixCreate(hypre_MPI_COMM_WORLD, grid, stencil, &A);
+	   if ( solver_id == 3 || solver_id == 4 ||
+			solver_id == 13 || solver_id == 14 )
+	   {
+		   stencil_size  = hypre_StructStencilSize(stencil);
+		   stencil_entries = hypre_CTAlloc(HYPRE_Int, stencil_size);
+		   if ( solver_id == 3 || solver_id == 13)
+		   {
+               for ( i=0; i<stencil_size; ++i ) stencil_entries[i]=i;
+               hypre_StructMatrixSetConstantEntries(
+				   A, stencil_size, stencil_entries );
+               /* ... note: SetConstantEntries is where the constant_coefficient
+                  flag is set in A */
+               hypre_TFree( stencil_entries );
+               constant_coefficient = 1;
+		   }
+		   if ( solver_id == 4 || solver_id == 14)
+		   {
+               hypre_SetIndex3(diag_index, 0, 0, 0);
+               diag_rank = hypre_StructStencilElementRank( stencil, diag_index );
+               hypre_assert( stencil_size>=1 );
+               if ( diag_rank==0 ) stencil_entries[diag_rank]=1;
+               else stencil_entries[diag_rank]=0;
+               for ( i=0; i<stencil_size; ++i )
+               {
+                  if ( i!= diag_rank ) stencil_entries[i]=i;
+               }
+               hypre_StructMatrixSetConstantEntries(
+                  A, stencil_size, stencil_entries );
+               hypre_TFree( stencil_entries );
+               constant_coefficient = 2;
+		   }
+	   }
+	   HYPRE_StructMatrixSetSymmetric(A, sym);
+	   HYPRE_StructMatrixSetNumGhost(A, A_num_ghost);
+	   HYPRE_StructMatrixInitialize(A);
+	   
+	   /*-----------------------------------------------------------
+		* Fill in the matrix elements
+		*-----------------------------------------------------------*/
+	   
+	   AddValuesMatrix(A,grid,cx,cy,cz,conx,cony,conz);
+	   
+	   /* Zero out stencils reaching to real boundary */
+	   /* But in constant coefficient case, no special stencils! */
+	   
+	   if ( constant_coefficient == 0 ) SetStencilBndry(A,grid,periodic); 
+	   HYPRE_StructMatrixAssemble(A);
+	   
+	   /*-----------------------------------------------------------
+		* Set up the linear system
+		*-----------------------------------------------------------*/
+	   
+	   HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, grid, &b);
+	   HYPRE_StructVectorInitialize(b);
+	   
+	   /*-----------------------------------------------------------
+		* For periodic b.c. in all directions, need rhs to satisfy 
+		* compatibility condition. Achieved by setting a source and
+		*  sink of equal strength.  All other problems have rhs = 1.
+		*-----------------------------------------------------------*/
+	   
+	   AddValuesVector(grid,b,periodic,1.0);
+	   HYPRE_StructVectorAssemble(b);
+	   
+	   HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, grid, &x);
+	   HYPRE_StructVectorInitialize(x);
+	   
+	   AddValuesVector(grid,x,periodx0,1.0);
+	   HYPRE_StructVectorAssemble(x);
+	   
+	   HYPRE_StructGridDestroy(grid);
+	   
+	   for (i = 0; i < nblocks; i++)
+	   {
+		   hypre_TFree(iupper[i]);
+		   hypre_TFree(ilower[i]);
+	   }
+	   hypre_TFree(ilower);
+	   hypre_TFree(iupper);
+   }
+   
+   /* the grid will be read from file.  */
+   if ( (sum > 0 ) && (sum < 3))
+   {
+	   /* the grid will come from rhs or from x0 */
+	   if (read_fromfile_param == 0)
+	   {
+		   
+		   if ((read_rhsfromfile_param > 0) && (read_x0fromfile_param == 0))
+		   {                     
+               /* read right hand side, extract grid, construct matrix,
+                  construct x0 */
+			   
+               hypre_printf("\ninitial rhs from file prefix :%s\n",
+                            argv[read_rhsfromfile_index]);
+			   
+               /* ghost selection for vector  */
+               for (i = 0; i < dim; i++)
+               {
+				   v_num_ghost[2*i] = 1;
+				   v_num_ghost[2*i + 1] = 1;
+               }
+			   
+               b = (HYPRE_StructVector)
+                  hypre_StructVectorRead(hypre_MPI_COMM_WORLD,
+                                         argv[read_rhsfromfile_index],
+                                         v_num_ghost);
+           
+               readgrid = hypre_StructVectorGrid(b) ;
+               readperiodic = hypre_StructGridPeriodic(readgrid);  
+           
+               HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, readgrid, &x);
+               HYPRE_StructVectorInitialize(x);
+           
+               AddValuesVector(readgrid,x,periodx0,0.0);
+               HYPRE_StructVectorAssemble(x);
+           
+               HYPRE_StructMatrixCreate(hypre_MPI_COMM_WORLD,
+                                        readgrid, stencil, &A);
+               HYPRE_StructMatrixSetSymmetric(A, 1);
+               HYPRE_StructMatrixSetNumGhost(A, A_num_ghost);
+               HYPRE_StructMatrixInitialize(A);
+
+               /*-----------------------------------------------------------
+                * Fill in the matrix elements
+                *-----------------------------------------------------------*/
+   
+               AddValuesMatrix(A,readgrid,cx,cy,cz,conx,cony,conz);
+			   
+               /* Zero out stencils reaching to real boundary */
+           
+               if ( constant_coefficient==0 )
+				   SetStencilBndry(A,readgrid,readperiodic); 
+               HYPRE_StructMatrixAssemble(A);
+		   }   
+		   /* done with one case rhs=1 x0 = 0 */
+		   
+		   /* case when rhs=0 and read x0=1 */
+		   if ((read_rhsfromfile_param == 0) && (read_x0fromfile_param > 0))
+		   {                     
+               /* read right hand side, extract grid, construct matrix,
+                  construct x0 */
+			   
+               hypre_printf("\ninitial x0 from file prefix :%s\n",
+                            argv[read_x0fromfile_index]);
+			   
+               /* ghost selection for vector  */
+               for (i = 0; i < dim; i++)
+               {
+				   v_num_ghost[2*i] = 1;
+				   v_num_ghost[2*i + 1] = 1;
+               }
+			   
+               x = (HYPRE_StructVector)
+				   hypre_StructVectorRead(hypre_MPI_COMM_WORLD,
+										  argv[read_x0fromfile_index],v_num_ghost);
+			   
+               readgrid = hypre_StructVectorGrid(x) ;
+               readperiodic = hypre_StructGridPeriodic(readgrid);  
+
+               HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, readgrid, &b);
+               HYPRE_StructVectorInitialize(b);
+               AddValuesVector(readgrid,b,readperiodic,1.0);
+
+               HYPRE_StructVectorAssemble(b);
+
+               HYPRE_StructMatrixCreate(hypre_MPI_COMM_WORLD,
+                                        readgrid, stencil, &A);
+               HYPRE_StructMatrixSetSymmetric(A, 1);
+               HYPRE_StructMatrixSetNumGhost(A, A_num_ghost);
+               HYPRE_StructMatrixInitialize(A);
+
+               /*-----------------------------------------------------------
+                * Fill in the matrix elements
+                *-----------------------------------------------------------*/
+   
+               AddValuesMatrix(A,readgrid,cx,cy,cz,conx,cony,conz);
+
+               /* Zero out stencils reaching to real boundary */
+			   
+               if ( constant_coefficient == 0 )
+				   SetStencilBndry(A,readgrid,readperiodic); 
+               HYPRE_StructMatrixAssemble(A);
+		   }
+		   /* done with one case rhs=0 x0 = 1  */
+		   
+		   /* the other case when read rhs > 0 and read x0 > 0  */
+		   if ((read_rhsfromfile_param > 0) && (read_x0fromfile_param > 0))
+		   {                    
+               /* read right hand side, extract grid, construct matrix,
+                  construct x0 */
+			   
+               hypre_printf("\ninitial rhs  from file prefix :%s\n",
+                            argv[read_rhsfromfile_index]);
+               hypre_printf("\ninitial x0  from file prefix :%s\n",
+                            argv[read_x0fromfile_index]);
+			   
+               /* ghost selection for vector  */
+               for (i = 0; i < dim; i++)
+               {
+				   v_num_ghost[2*i] = 1;
+				   v_num_ghost[2*i + 1] = 1;
+               }
+			   
+               b = (HYPRE_StructVector)
+				   hypre_StructVectorRead(hypre_MPI_COMM_WORLD,
+                                         argv[read_rhsfromfile_index],
+                                         v_num_ghost);
+
+               x = (HYPRE_StructVector)
+                  hypre_StructVectorRead(hypre_MPI_COMM_WORLD,
+                                         argv[read_x0fromfile_index],
+                                         v_num_ghost);
+
+               readgrid= hypre_StructVectorGrid(b) ;
+               readperiodic = hypre_StructGridPeriodic(readgrid); 
+
+               HYPRE_StructMatrixCreate(hypre_MPI_COMM_WORLD,
+                                        readgrid, stencil, &A);
+               HYPRE_StructMatrixSetSymmetric(A, 1);
+               HYPRE_StructMatrixSetNumGhost(A, A_num_ghost);
+               HYPRE_StructMatrixInitialize(A);
+
+               /*-----------------------------------------------------------
+                * Fill in the matrix elements
+                *-----------------------------------------------------------*/
+   
+               AddValuesMatrix(A,readgrid,cx,cy,cz,conx,cony,conz);
+
+               /* Zero out stencils reaching to real boundary */
+
+               if ( constant_coefficient == 0 )
+                  SetStencilBndry(A,readgrid,readperiodic); 
+               HYPRE_StructMatrixAssemble(A);
+		   }
+		   /* done with one case rhs=1 x0 = 1  */
+	   }
+	   /* done with the case where you no read matrix  */
+       
+	   if (read_fromfile_param == 1)  /* still sum > 0  */
+	   {   
+		   hypre_printf("\nreading matrix from file:%s\n",
+						argv[read_fromfile_index]);
+		   /* ghost selection for reading the matrix  */
+		   for (i = 0; i < dim; i++)
+		   {
+               A_num_ghost[2*i] = 1;
+               A_num_ghost[2*i + 1] = 1;
+		   }
+		   
+		   A = (HYPRE_StructMatrix)
+               hypre_StructMatrixRead(hypre_MPI_COMM_WORLD,
+                                      argv[read_fromfile_index], A_num_ghost);
+		   
+		   readgrid = hypre_StructMatrixGrid(A);
+		   readperiodic  =  hypre_StructGridPeriodic(readgrid);  
+		   
+		   if ((read_rhsfromfile_param > 0) && (read_x0fromfile_param == 0))
+		   {                
+               /* read right hand side ,construct x0 */
+               hypre_printf("\ninitial rhs from file prefix :%s\n",
+                            argv[read_rhsfromfile_index]);
+			   
+               /* ghost selection for vector  */
+               for (i = 0; i < dim; i++)
+               {
+				   v_num_ghost[2*i] = 1;
+				   v_num_ghost[2*i + 1] = 1;
+               }
+			   
+               b = (HYPRE_StructVector)
+				   hypre_StructVectorRead(hypre_MPI_COMM_WORLD,
+										  argv[read_rhsfromfile_index],
+										  v_num_ghost);
+			   
+               HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, readgrid,&x);
+               HYPRE_StructVectorInitialize(x);
+               AddValuesVector(readgrid,x,periodx0,0.0);
+               HYPRE_StructVectorAssemble(x);
+		   }
+		   
+		   if ((read_rhsfromfile_param == 0) && (read_x0fromfile_param > 0))
+		   {                   
+               /* read x0, construct rhs*/
+               hypre_printf("\ninitial x0 from file prefix :%s\n",
+                            argv[read_x0fromfile_index]);
+
+               /* ghost selection for vector  */
+               for (i = 0; i < dim; i++)
+               {
+                  v_num_ghost[2*i] = 1;
+                  v_num_ghost[2*i + 1] = 1;
+               }
+  
+               x = (HYPRE_StructVector)
+                  hypre_StructVectorRead(hypre_MPI_COMM_WORLD,
+                                         argv[read_x0fromfile_index],
+                                         v_num_ghost);
+
+               HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, readgrid, &b);
+               HYPRE_StructVectorInitialize(b);
+               AddValuesVector(readgrid,b,readperiodic,1.0);
+               HYPRE_StructVectorAssemble(b);
+            }
+
+            if ((read_rhsfromfile_param == 0) && (read_x0fromfile_param == 0))
+            {                    
+               /* construct x0 , construct b*/
+               HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, readgrid, &b);
+               HYPRE_StructVectorInitialize(b);
+               AddValuesVector(readgrid,b,readperiodic,1.0);
+               HYPRE_StructVectorAssemble(b);
+
+
+               HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, readgrid, &x);
+               HYPRE_StructVectorInitialize(x);
+               AddValuesVector(readgrid,x,periodx0,0.0);
+               HYPRE_StructVectorAssemble(x); 
+            }   
+	   }
+	   /* finish the read of matrix  */
+   }
+   /* finish the sum > 0 case   */
+   
+   /*-----------------------------------------------------------
+	* Print out the system and initial guess
+	*-----------------------------------------------------------*/
+   
+   if (print_system)
+   {
+	   HYPRE_StructMatrixPrint("struct.out.A", A, 0);
+	   HYPRE_StructVectorPrint("struct.out.b", b, 0);
+	   HYPRE_StructVectorPrint("struct.out.x0", x, 0);
+   }
+   
+   /*-----------------------------------------------------------
+	* axpy
+	*-----------------------------------------------------------*/
+   
+#if !HYPRE_MFLOPS
+
+   hypre_MPI_Barrier(hypre_MPI_COMM_WORLD);
+   
+   if (solver_id == 0)
+   {
+     //timeval tstart,tstop;
+	   time_index = hypre_InitializeTiming("axpy");
+	   hypre_BeginTiming(time_index);
+	   //gettimeofday(&tstart,NULL);
+
+	   for ( rep=0; rep<reps; ++rep )
+	   {
+		   hypre_StructAxpy(2.0, b, x);
+	   }
+	   //gettimeofday(&tstop,NULL);
+	   hypre_EndTiming(time_index);
+
+	   //HYPRE_Real telapsed = (tstop.tv_sec - tstart.tv_sec) + (tstop.tv_usec - tstart.tv_usec)/1e6 ;
+	   //hypre_printf("axpy, \t %d, \t %f\n",nx*ny*nz,telapsed/reps);
+	   
+	   hypre_PrintTiming("Time for axpy", hypre_MPI_COMM_WORLD);
+	   hypre_FinalizeTiming(time_index);
+	   hypre_ClearTiming();
+   }
+   
+   /*-----------------------------------------------------------
+	* sparse matrix vector multiplication
+	*-----------------------------------------------------------*/
+   
+   else if ( solver_id == 1 || solver_id == 3 || solver_id == 4 )
+   {
+     //timeval tstart,tstop;
+	   void *matvec_data;
+
+	   matvec_data = hypre_StructMatvecCreate();
+	   hypre_StructMatvecSetup(matvec_data, A, x);
+	   
+	   time_index = hypre_InitializeTiming("Mat-Vec");
+	   hypre_BeginTiming(time_index);
+	   //gettimeofday(&tstart,NULL);
+
+	   for ( rep=0; rep<reps; ++rep )
+	   {
+		   hypre_StructMatvecCompute(matvec_data, 1.0, A, x, 1.0, b);
+	   }
+	   //gettimeofday(&tstop,NULL);
+	   hypre_EndTiming(time_index);
+
+	   //HYPRE_Real telapsed = (tstop.tv_sec - tstart.tv_sec) + (tstop.tv_usec - tstart.tv_usec)/1e6 ;
+	   //hypre_printf("Mat-Vec, \t %d, \t %f\n",nx*ny*nz,telapsed/reps);
+
+	   hypre_PrintTiming("Time for Mat-Vec", hypre_MPI_COMM_WORLD);
+	   hypre_FinalizeTiming(time_index);
+	   hypre_ClearTiming();
+   }
+
+   /*-----------------------------------------------------------
+	* inner product
+	*-----------------------------------------------------------*/
+
+   else if (solver_id == 2)
+   {
+     //timeval tstart,tstop;
+	   time_index = hypre_InitializeTiming("inner");
+	   hypre_BeginTiming(time_index);
+	   //gettimeofday(&tstart,NULL);
+	   for ( rep=0; rep<reps; ++rep )
+	   {
+		   inner = hypre_StructInnerProd(x,b);
+	   }
+	   //gettimeofday(&tstop,NULL);
+	   hypre_EndTiming(time_index);
+	   //HYPRE_Real telapsed = (tstop.tv_sec - tstart.tv_sec) + (tstop.tv_usec - tstart.tv_usec)/1e6 ;
+	   //hypre_printf("inner, \t %d, \t %f\n",nx*ny*nz,telapsed/reps);
+
+	   hypre_PrintTiming("Time for inner product", hypre_MPI_COMM_WORLD);
+	   hypre_FinalizeTiming(time_index);
+	   hypre_ClearTiming();
+   }
+
+   /*-----------------------------------------------------------
+	* Solve the system using Jacobi
+	*-----------------------------------------------------------*/
+   
+   else if ( solver_id == 8 )
+   {
+	   time_index = hypre_InitializeTiming("Jacobi Setup");
+	   hypre_BeginTiming(time_index);
+	   
+	   HYPRE_StructJacobiCreate(hypre_MPI_COMM_WORLD, &solver);
+	   HYPRE_StructJacobiSetMaxIter(solver, 100);
+	   HYPRE_StructJacobiSetTol(solver, 1.0e-06);
+	   HYPRE_StructJacobiSetup(solver, A, b, x);
+	   
+	   hypre_EndTiming(time_index);
+	   hypre_PrintTiming("Setup phase times", hypre_MPI_COMM_WORLD);
+	   hypre_FinalizeTiming(time_index);
+	   hypre_ClearTiming();
+	   
+	   time_index = hypre_InitializeTiming("Jacobi Solve");
+	   hypre_BeginTiming(time_index);
+	   
+	   HYPRE_StructJacobiSolve(solver, A, b, x);
+	   
+	   hypre_EndTiming(time_index);
+	   hypre_PrintTiming("Solve phase times", hypre_MPI_COMM_WORLD);
+	   hypre_FinalizeTiming(time_index);
+	   hypre_ClearTiming();
+	   
+	   HYPRE_StructJacobiGetNumIterations(solver, &num_iterations);
+	   HYPRE_StructJacobiGetFinalRelativeResidualNorm(solver, &final_res_norm);
+	   HYPRE_StructJacobiDestroy(solver);
+   }
+
+   /*-----------------------------------------------------------
+	* Solve the system using CG
+	*-----------------------------------------------------------*/
+   
+   if ((solver_id > 9) && (solver_id < 20))
+   {
+   }
+
+   /*-----------------------------------------------------------
+	* Print the solution and other info
+	*-----------------------------------------------------------*/
+
+   if (print_system)
+   {
+	   HYPRE_StructVectorPrint("struct.out.x", x, 0);
+   }
+#endif
+   /*-----------------------------------------------------------
+	* Compute MFLOPs for Matvec
+	*-----------------------------------------------------------*/
+
+#if HYPRE_MFLOPS
+   {
+	   void *matvec_data;
+	   HYPRE_Int   i, imax, N;
+	   
+	   /* compute imax */
+	   N = (P*nx)*(Q*ny)*(R*nz);
+	   imax = (5*1000000) / N;
+	   
+	   matvec_data = hypre_StructMatvecCreate();
+	   hypre_StructMatvecSetup(matvec_data, A, x);
+	   
+	   time_index = hypre_InitializeTiming("Matvec");
+	   hypre_BeginTiming(time_index);
+	   
+	   for (i = 0; i < imax; i++)
+	   {
+		   hypre_StructMatvecCompute(matvec_data, 1.0, A, x, 1.0, b);
+	   }
+	   /* this counts mult-adds */
+	   hypre_IncFLOPCount(7*N*imax);
+	   
+	   hypre_EndTiming(time_index);
+	   hypre_PrintTiming("Matvec time", hypre_MPI_COMM_WORLD);
+	   hypre_FinalizeTiming(time_index);
+	   hypre_ClearTiming();
+	   
+	   hypre_StructMatvecDestroy(matvec_data);
+   }
+#endif
+   
+   /*-----------------------------------------------------------
+	* Finalize things
+	*-----------------------------------------------------------*/
+   
+   HYPRE_StructStencilDestroy(stencil);
+   HYPRE_StructMatrixDestroy(A);
+   HYPRE_StructVectorDestroy(b);
+   HYPRE_StructVectorDestroy(x);
+   
+   for ( i = 0; i < (dim + 1); i++)
+	   hypre_TFree(offsets[i]);
+   hypre_TFree(offsets);
+   
+   hypre_FinalizeMemoryDebug();
+   
+   /* Finalize MPI */
+   hypre_MPI_Finalize();
+#if defined(HYPRE_USE_KOKKOS)
+   Kokkos::finalize ();
+#endif
+   return (0);
+}
+
+/*-------------------------------------------------------------------------
+ * add constant values to a vector. Need to pass the initialized vector, grid,
+ * period of grid and the constant value.
+ *-------------------------------------------------------------------------*/
+
+HYPRE_Int
+AddValuesVector( hypre_StructGrid  *gridvector,
+                 hypre_StructVector *zvector,
+                 HYPRE_Int          *period, 
+                 HYPRE_Real         value  )
+{
+/* #include  "_hypre_struct_mv.h" */
+   HYPRE_Int ierr = 0;
+   hypre_BoxArray     *gridboxes;
+   HYPRE_Int          i,ib;
+   hypre_IndexRef     ilower;
+   hypre_IndexRef     iupper;
+   hypre_Box          *box;
+   HYPRE_Real         *values;
+   HYPRE_Int          volume,dim;
+
+   gridboxes =  hypre_StructGridBoxes(gridvector);
+   dim       =  hypre_StructGridNDim(gridvector);
+
+   ib=0;
+   hypre_ForBoxI(ib, gridboxes)
+   {
+      box      = hypre_BoxArrayBox(gridboxes, ib);
+      volume   =  hypre_BoxVolume(box);
+      values   = hypre_CTAlloc(HYPRE_Real, volume);
+
+      /*-----------------------------------------------------------
+       * For periodic b.c. in all directions, need rhs to satisfy 
+       * compatibility condition. Achieved by setting a source and
+       *  sink of equal strength.  All other problems have rhs = 1.
+       *-----------------------------------------------------------*/
+
+      if ((dim == 2 && period[0] != 0 && period[1] != 0) ||
+          (dim == 3 && period[0] != 0 && period[1] != 0 && period[2] != 0))
+      {
+         for (i = 0; i < volume; i++)
+         {
+            values[i] = 0.0;
+         }
+         values[0]         =  value;
+         values[volume - 1] = -value;
+      }
+      else
+      {
+         for (i = 0; i < volume; i++)
+         {
+            values[i] = value;
+         }
+      }
+
+      ilower = hypre_BoxIMin(box);
+      iupper = hypre_BoxIMax(box);
+      HYPRE_StructVectorSetBoxValues(zvector, ilower, iupper, values);
+      hypre_TFree(values);
+
+   }
+
+   return ierr;
+}
+
+/******************************************************************************
+ * Adds values to matrix based on a 7 point (3d) 
+ * symmetric stencil for a convection-diffusion problem.
+ * It need an initialized matrix, an assembled grid, and the constants
+ * that determine the 7 point (3d) convection-diffusion.
+ ******************************************************************************/
+
+HYPRE_Int
+AddValuesMatrix(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix,
+                HYPRE_Real        cx,
+                HYPRE_Real        cy,
+                HYPRE_Real        cz,
+                HYPRE_Real        conx,
+                HYPRE_Real        cony,
+                HYPRE_Real        conz)
+{
+
+   HYPRE_Int ierr=0;
+   hypre_BoxArray     *gridboxes;
+   HYPRE_Int           i,s,bi;
+   hypre_IndexRef      ilower;
+   hypre_IndexRef      iupper;
+   hypre_Box          *box;
+   HYPRE_Real         *values;
+   HYPRE_Real          east,west;
+   HYPRE_Real          north,south;
+   HYPRE_Real          top,bottom;
+   HYPRE_Real          center;
+   HYPRE_Int           volume,dim,sym;
+   HYPRE_Int          *stencil_indices;
+   HYPRE_Int           stencil_size;
+   HYPRE_Int           constant_coefficient;
+
+   gridboxes =  hypre_StructGridBoxes(gridmatrix);
+   dim       =  hypre_StructGridNDim(gridmatrix);
+   sym       =  hypre_StructMatrixSymmetric(A);
+   constant_coefficient = hypre_StructMatrixConstantCoefficient(A);
+
+   bi=0;
+
+   east = -cx;
+   west = -cx;
+   north = -cy;
+   south = -cy;
+   top = -cz;
+   bottom = -cz;
+   center = 2.0*cx;
+   if (dim > 1) center += 2.0*cy;
+   if (dim > 2) center += 2.0*cz;
+
+   stencil_size = 1 + (2 - sym) * dim;
+   stencil_indices = hypre_CTAlloc(HYPRE_Int, stencil_size);
+   for (s = 0; s < stencil_size; s++)
+   {
+      stencil_indices[s] = s;
+   }
+
+   if(sym)
+   {
+      if ( constant_coefficient==0 )
+      {
+         hypre_ForBoxI(bi, gridboxes)
+         {
+            box      = hypre_BoxArrayBox(gridboxes, bi);
+            volume   =  hypre_BoxVolume(box);
+            values   = hypre_CTAlloc(HYPRE_Real, stencil_size*volume);
+
+            for (i = 0; i < stencil_size*volume; i += stencil_size)
+            {
+               switch (dim)
+               {
+                  case 1:
+                     values[i  ] = west;
+                     values[i+1] = center;
+                     break;
+                  case 2:
+                     values[i  ] = west;
+                     values[i+1] = south;
+                     values[i+2] = center;
+                     break;
+                  case 3:
+                     values[i  ] = west;
+                     values[i+1] = south;
+                     values[i+2] = bottom;
+                     values[i+3] = center;
+                     break;
+               }
+            }
+            ilower = hypre_BoxIMin(box);
+            iupper = hypre_BoxIMax(box);
+            HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, stencil_size,
+                                           stencil_indices, values);
+            hypre_TFree(values);
+         }
+      }
+      else if ( constant_coefficient==1 )
+      {
+         values   = hypre_CTAlloc(HYPRE_Real, stencil_size);
+         switch (dim)
+         {
+            case 1:
+               values[0] = west;
+               values[1] = center;
+               break;
+            case 2:
+               values[0] = west;
+               values[1] = south;
+               values[2] = center;
+               break;
+            case 3:
+               values[0] = west;
+               values[1] = south;
+               values[2] = bottom;
+               values[3] = center;
+               break;
+         }
+         if (hypre_BoxArraySize(gridboxes) > 0)
+         {
+            HYPRE_StructMatrixSetConstantValues(A, stencil_size,
+                                                stencil_indices, values);
+         }
+         hypre_TFree(values);
+      }
+      else
+      {
+         hypre_assert( constant_coefficient==2 );
+
+         /* stencil index for the center equals dim, so it's easy to leave out */
+         values   = hypre_CTAlloc(HYPRE_Real, stencil_size-1);
+         switch (dim)
+         {
+            case 1:
+               values[0] = west;
+               break;
+            case 2:
+               values[0] = west;
+               values[1] = south;
+               break;
+            case 3:
+               values[0] = west;
+               values[1] = south;
+               values[2] = bottom;
+               break;
+         }
+         if (hypre_BoxArraySize(gridboxes) > 0)
+         {
+            HYPRE_StructMatrixSetConstantValues(A, stencil_size-1,
+                                                stencil_indices, values);
+         }
+         hypre_TFree(values);
+
+         hypre_ForBoxI(bi, gridboxes)
+         {
+            box      = hypre_BoxArrayBox(gridboxes, bi);
+            volume   =  hypre_BoxVolume(box);
+            values   = hypre_CTAlloc(HYPRE_Real, volume);
+
+            for ( i=0; i < volume; ++i )
+            {
+               values[i] = center;
+            }
+            ilower = hypre_BoxIMin(box);
+            iupper = hypre_BoxIMax(box);
+            HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1,
+                                           stencil_indices+dim, values);
+            hypre_TFree(values);
+         }
+      }
+   }
+   else
+   {
+      if (conx > 0.0)
+      {
+         west   -= conx;
+         center += conx;
+      }
+      else if (conx < 0.0) 
+      {
+         east   += conx;
+         center -= conx;
+      }
+      if (cony > 0.0)
+      {
+         south  -= cony;
+         center += cony;
+      }
+      else if (cony < 0.0) 
+      {
+         north  += cony;
+         center -= cony;
+      }
+      if (conz > 0.0)
+      {
+         bottom -= conz;
+         center += conz;
+      }
+      else if (cony < 0.0) 
+      {
+         top    += conz;
+         center -= conz;
+      }
+
+      if ( constant_coefficient==0 )
+      {
+         hypre_ForBoxI(bi, gridboxes)
+         {
+            box      = hypre_BoxArrayBox(gridboxes, bi);
+            volume   =  hypre_BoxVolume(box);
+            values   = hypre_CTAlloc(HYPRE_Real, stencil_size*volume);
+
+            for (i = 0; i < stencil_size*volume; i += stencil_size)
+            {
+               switch (dim)
+               {
+                  case 1:
+                     values[i  ] = west;
+                     values[i+1] = center;
+                     values[i+2] = east;
+                     break;
+                  case 2:
+                     values[i  ] = west;
+                     values[i+1] = south;
+                     values[i+2] = center;
+                     values[i+3] = east;
+                     values[i+4] = north;
+                     break;
+                  case 3:
+                     values[i  ] = west;
+                     values[i+1] = south;
+                     values[i+2] = bottom;
+                     values[i+3] = center;
+                     values[i+4] = east;
+                     values[i+5] = north;
+                     values[i+6] = top;
+                     break;
+               }
+            }
+            ilower = hypre_BoxIMin(box);
+            iupper = hypre_BoxIMax(box);
+            HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, stencil_size,
+                                           stencil_indices, values);
+
+            hypre_TFree(values);
+         }
+      }
+      else if ( constant_coefficient==1 )
+      {
+         values = hypre_CTAlloc( HYPRE_Real, stencil_size );
+
+         switch (dim)
+         {
+            case 1:
+               values[0] = west;
+               values[1] = center;
+               values[2] = east;
+               break;
+            case 2:
+               values[0] = west;
+               values[1] = south;
+               values[2] = center;
+               values[3] = east;
+               values[4] = north;
+               break;
+            case 3:
+               values[0] = west;
+               values[1] = south;
+               values[2] = bottom;
+               values[3] = center;
+               values[4] = east;
+               values[5] = north;
+               values[6] = top;
+               break;
+         }
+
+         if (hypre_BoxArraySize(gridboxes) > 0)
+         {
+            HYPRE_StructMatrixSetConstantValues(A, stencil_size,
+                                                stencil_indices, values);
+         }
+
+         hypre_TFree(values);
+      }
+      else
+      {
+         hypre_assert( constant_coefficient==2 );
+         values = hypre_CTAlloc( HYPRE_Real, stencil_size-1 );
+         switch (dim)
+         {  /* no center in stencil_indices and values */
+            case 1:
+               stencil_indices[0] = 0;
+               stencil_indices[1] = 2;
+               values[0] = west;
+               values[1] = east;
+               break;
+            case 2:
+               stencil_indices[0] = 0;
+               stencil_indices[1] = 1;
+               stencil_indices[2] = 3;
+               stencil_indices[3] = 4;
+               values[0] = west;
+               values[1] = south;
+               values[2] = east;
+               values[3] = north;
+               break;
+            case 3:
+               stencil_indices[0] = 0;
+               stencil_indices[1] = 1;
+               stencil_indices[2] = 2;
+               stencil_indices[3] = 4;
+               stencil_indices[4] = 5;
+               stencil_indices[5] = 6;
+               values[0] = west;
+               values[1] = south;
+               values[2] = bottom;
+               values[3] = east;
+               values[4] = north;
+               values[5] = top;
+               break;
+         }
+
+         if (hypre_BoxArraySize(gridboxes) > 0)
+         {
+            HYPRE_StructMatrixSetConstantValues(A, stencil_size,
+                                                stencil_indices, values);
+         }
+         hypre_TFree(values);
+
+
+         /* center is variable */
+         stencil_indices[0] = dim; /* refers to center */
+         hypre_ForBoxI(bi, gridboxes)
+         {
+            box      = hypre_BoxArrayBox(gridboxes, bi);
+            volume   =  hypre_BoxVolume(box);
+            values   = hypre_CTAlloc(HYPRE_Real, volume);
+
+            for ( i=0; i < volume; ++i )
+            {
+               values[i] = center;
+            }
+            ilower = hypre_BoxIMin(box);
+            iupper = hypre_BoxIMax(box);
+            HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1,
+                                           stencil_indices, values);
+            hypre_TFree(values);
+         }
+      }
+   }
+
+   hypre_TFree(stencil_indices);
+
+   return ierr;
+}
+
+/*********************************************************************************
+ * this function sets to zero the stencil entries that are on the boundary
+ * Grid, matrix and the period are needed. 
+ *********************************************************************************/ 
+
+HYPRE_Int
+SetStencilBndry(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix,HYPRE_Int* period)
+{
+
+   HYPRE_Int ierr=0;
+   hypre_BoxArray    *gridboxes;
+   HYPRE_Int          size,i,j,d,ib;
+   HYPRE_Int        **ilower;
+   HYPRE_Int        **iupper;
+   HYPRE_Int         *vol;
+   HYPRE_Int         *istart, *iend;
+   hypre_Box         *box;
+   hypre_Box         *dummybox;
+   hypre_Box         *boundingbox;
+   HYPRE_Real        *values;
+   HYPRE_Int          volume, dim;
+   HYPRE_Int         *stencil_indices;
+   HYPRE_Int          constant_coefficient;
+
+   gridboxes       = hypre_StructGridBoxes(gridmatrix);
+   boundingbox     = hypre_StructGridBoundingBox(gridmatrix);
+   istart          = hypre_BoxIMin(boundingbox);
+   iend            = hypre_BoxIMax(boundingbox);
+   size            = hypre_StructGridNumBoxes(gridmatrix);
+   dim             = hypre_StructGridNDim(gridmatrix);
+   stencil_indices = hypre_CTAlloc(HYPRE_Int, 1);
+
+   constant_coefficient = hypre_StructMatrixConstantCoefficient(A);
+   if ( constant_coefficient>0 ) return 1;
+   /*...no space dependence if constant_coefficient==1,
+     and space dependence only for diagonal if constant_coefficient==2 --
+     and this function only touches off-diagonal entries */
+
+   vol    = hypre_CTAlloc(HYPRE_Int, size);
+   ilower = hypre_CTAlloc(HYPRE_Int*, size);
+   iupper = hypre_CTAlloc(HYPRE_Int*, size);
+   for (i = 0; i < size; i++)
+   {
+      ilower[i] = hypre_CTAlloc(HYPRE_Int, dim);
+      iupper[i] = hypre_CTAlloc(HYPRE_Int, dim);
+   }
+
+   i = 0;
+   ib = 0;
+   hypre_ForBoxI(i, gridboxes)
+   {
+      dummybox = hypre_BoxCreate(dim);
+      box      = hypre_BoxArrayBox(gridboxes, i);
+      volume   =  hypre_BoxVolume(box);
+      vol[i]   = volume;
+      hypre_CopyBox(box,dummybox);
+      for (d = 0; d < dim; d++)
+      {
+         ilower[ib][d] = hypre_BoxIMinD(dummybox,d);
+         iupper[ib][d] = hypre_BoxIMaxD(dummybox,d);
+      }
+      ib++ ;
+      hypre_BoxDestroy(dummybox);
+   }
+
+   if ( constant_coefficient==0 )
+   {
+      for (d = 0; d < dim; d++)
+      {
+         for (ib = 0; ib < size; ib++)
+         {
+            values = hypre_CTAlloc(HYPRE_Real, vol[ib]);
+        
+            for (i = 0; i < vol[ib]; i++)
+            {
+               values[i] = 0.0;
+            }
+
+            if( ilower[ib][d] == istart[d] && period[d] == 0 )
+            {
+               j = iupper[ib][d];
+               iupper[ib][d] = istart[d];
+               stencil_indices[0] = d;
+               HYPRE_StructMatrixSetBoxValues(A, ilower[ib], iupper[ib],
+                                              1, stencil_indices, values);
+               iupper[ib][d] = j;
+            }
+
+            if( iupper[ib][d] == iend[d] && period[d] == 0 )
+            {
+               j = ilower[ib][d];
+               ilower[ib][d] = iend[d];
+               stencil_indices[0] = dim + 1 + d;
+               HYPRE_StructMatrixSetBoxValues(A, ilower[ib], iupper[ib],
+                                              1, stencil_indices, values);
+               ilower[ib][d] = j;
+            }
+            hypre_TFree(values);
+         }
+      }
+   }
+  
+   hypre_TFree(vol);
+   hypre_TFree(stencil_indices);
+   for (ib =0 ; ib < size ; ib++)
+   {
+      hypre_TFree(ilower[ib]);
+      hypre_TFree(iupper[ib]);
+   }
+   hypre_TFree(ilower);
+   hypre_TFree(iupper);
+
+   return ierr;
+}
diff -Nru hypre-2.11.2/src/test/TEST_examples/complex.jobs hypre-2.13.0/src/test/TEST_examples/complex.jobs
--- hypre-2.11.2/src/test/TEST_examples/complex.jobs	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/test/TEST_examples/complex.jobs	2017-10-20 17:42:22.000000000 +0000
@@ -15,5 +15,5 @@
 # Run complex examples
 #=============================================================================
 
-mpirun -np 16 ex18comp -n 4 > complex.out.1
+mpirun -np 16 ./ex18comp -n 4 > complex.out.1
 
diff -Nru hypre-2.11.2/src/test/TEST_examples/maxdim.jobs hypre-2.13.0/src/test/TEST_examples/maxdim.jobs
--- hypre-2.11.2/src/test/TEST_examples/maxdim.jobs	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/test/TEST_examples/maxdim.jobs	2017-10-20 17:42:22.000000000 +0000
@@ -15,6 +15,6 @@
 # Run maxdim examples
 #=============================================================================
 
-mpirun -np 16 ex17 -n 10 > maxdim.out.1
+mpirun -np 16 ./ex17 -n 10 > maxdim.out.1
 
-mpirun -np 16 ex18 -n 4 > maxdim.out.2
+mpirun -np 16 ./ex18 -n 4 > maxdim.out.2
diff -Nru hypre-2.11.2/src/test/TEST_ij/smoother.jobs hypre-2.13.0/src/test/TEST_ij/smoother.jobs
--- hypre-2.11.2/src/test/TEST_ij/smoother.jobs	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/test/TEST_ij/smoother.jobs	2017-10-20 17:42:22.000000000 +0000
@@ -36,6 +36,7 @@
 #   14: Polynomial (Chebyshev 3rd order) on 4 procs
 #   15: FCF Jacobi on 4 procs
 #   16: CG smoother on 4 procs
+#   17-20: Polynomial (Chebyshev 2nd order) with various options
 #=============================================================================
 
 mpirun -np 3 ./ij -rhsrand -n 15 30 10 -w 1.1 -owl 1.0 0 \
@@ -87,3 +88,21 @@
 mpirun -np 4 ./ij -rhsrand -solver 1 -rlx 15 -n 20 20 10 -P 2 2 1 \
 > smoother.out.15
 
+mpirun -np 4 ./ij -rhsrand -solver 1 -rlx 16 -cheby_scale 0 -n 20 20 20 \
+-P 2 2 1 -27pt > smoother.out.16
+
+mpirun -np 4 ./ij -rhsrand -solver 1 -rlx 16 -cheby_variant 1 -n 20 20 20 \
+-P 2 2 1 > smoother.out.17
+
+mpirun -np 4 ./ij -solver 3 -rlx 16 -cheby_eig_est 0 -n 40 40 20 \
+-P 2 2 1 -difconv -a 10 10 10  > smoother.out.18
+
+mpirun -np 4 ./ij -rhsrand -solver 1 -rlx 16 -rotate -alpha 60 -eps 0.1 -cheby_fraction 0.2 -n 200 200 \
+-P 2 2  > smoother.out.19
+
+mpirun -np 4 ./ij -solver 1 -rlx 16 -cheby_eig_est 5 -n 40 40 20 \
+-P 2 2 1 -vardifconv -eps 0.1  > smoother.out.20
+
+
+
+
diff -Nru hypre-2.11.2/src/test/TEST_ij/smoother.saved hypre-2.13.0/src/test/TEST_ij/smoother.saved
--- hypre-2.11.2/src/test/TEST_ij/smoother.saved	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/test/TEST_ij/smoother.saved	2017-10-20 17:42:22.000000000 +0000
@@ -74,3 +74,23 @@
 Iterations = 15
 Final Relative Residual Norm = 5.807749e-09
 
+# Output file: smoother.out.16
+Iterations = 6
+Final Relative Residual Norm = 1.555966e-09
+
+# Output file: smoother.out.17
+Iterations = 7
+Final Relative Residual Norm = 2.088732e-09
+
+# Output file: smoother.out.18
+GMRES Iterations = 11
+Final GMRES Relative Residual Norm = 8.192864e-09
+
+# Output file: smoother.out.19
+Iterations = 6
+Final Relative Residual Norm = 8.887087e-10
+
+# Output file: smoother.out.20
+Iterations = 11
+Final Relative Residual Norm = 3.089502e-09
+
diff -Nru hypre-2.11.2/src/test/TEST_ij/smoother.sh hypre-2.13.0/src/test/TEST_ij/smoother.sh
--- hypre-2.11.2/src/test/TEST_ij/smoother.sh	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/test/TEST_ij/smoother.sh	2017-10-20 17:42:22.000000000 +0000
@@ -43,6 +43,11 @@
  ${TNAME}.out.13\
  ${TNAME}.out.14\
  ${TNAME}.out.15\
+ ${TNAME}.out.16\
+ ${TNAME}.out.17\
+ ${TNAME}.out.18\
+ ${TNAME}.out.19\
+ ${TNAME}.out.20\
 "
 
 for i in $FILES
diff -Nru hypre-2.11.2/src/test/TEST_ij/solvers.jobs hypre-2.13.0/src/test/TEST_ij/solvers.jobs
--- hypre-2.11.2/src/test/TEST_ij/solvers.jobs	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/test/TEST_ij/solvers.jobs	2017-10-20 17:42:22.000000000 +0000
@@ -82,6 +82,9 @@
 mpirun -np 4 ./ij -n 20 20 20 -P 2 2 1 -agg_nl 1 -solver 1 -simple 0 > solvers.out.111
 mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -additive 1 > solvers.out.112
 mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -add_Pmx 5 > solvers.out.113
+mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -add_Pmx 5 -add_end 2 > solvers.out.118
+mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 ns 2 > solvers.out.119
+mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -rlx 18 -ns 2 -rlx_coarse 18 -ns_coarse 2 > solvers.out.120
 
 #nonGalerkin version
 mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 1 0.03 > solvers.out.114
@@ -90,3 +93,30 @@
 #RAP options
 mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 0 > solvers.out.116
 mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 1 > solvers.out.117
+
+#
+# MGR and MGR-PCG
+#
+# coarse grid solver checks (1-level MGR == AMG (or coarse grid solver))
+# Also checks for keeping coarse nodes to coarsest level
+# coarse grid size in output should be ~ mgr_num_reserved_nodes
+mpirun -np 2 ./ij -solver 70 -mgr_nlevels 0 -mgr_bsize 2 -mgr_num_reserved_nodes 0 > solvers.out.200
+mpirun -np 2 ./ij -solver 70 -mgr_nlevels 0 -mgr_bsize 2 -mgr_num_reserved_nodes 100 > solvers.out.201
+mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 1 -mgr_num_reserved_nodes 0 > solvers.out.202
+mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 1 -mgr_num_reserved_nodes 100 > solvers.out.203
+# multi level MGR tests with different coarse grid type strategies 
+# Fix non C points to F points with different F-relaxation methods (single/multileve F-relaxation)
+# with/ without reserved coarse nodes
+mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 2 mgr_non_c_to_f 1 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.204
+mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 2 mgr_non_c_to_f 1 -mgr_frelax_method 0 -mgr_num_reserved_nodes 100 > solvers.out.205
+mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 2 mgr_non_c_to_f 1 -mgr_frelax_method 1 -mgr_num_reserved_nodes 0 > solvers.out.206
+mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 2 mgr_non_c_to_f 1 -mgr_frelax_method 1 -mgr_num_reserved_nodes 100 > solvers.out.207
+# Not fixed non C points to F points with different F-relaxation methods (single/multileve F-relaxation)
+# with/ without reserved coarse nodes
+mpirun -np 2 ./ij -solver 70 -mgr_nlevels 5 -mgr_bsize 2 mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.208
+mpirun -np 2 ./ij -solver 70 -mgr_nlevels 5 -mgr_bsize 2 mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 100 > solvers.out.209
+mpirun -np 2 ./ij -solver 70 -mgr_nlevels 5 -mgr_bsize 2 mgr_non_c_to_f 0 -mgr_frelax_method 1 -mgr_num_reserved_nodes 0 > solvers.out.210
+mpirun -np 2 ./ij -solver 70 -mgr_nlevels 5 -mgr_bsize 2 mgr_non_c_to_f 0 -mgr_frelax_method 1 -mgr_num_reserved_nodes 100 > solvers.out.211
+# MGR-PCG tests
+mpirun -np 2 ./ij -solver 71 -mgr_nlevels 0 -mgr_bsize 2 mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.212
+mpirun -np 2 ./ij -solver 71 -mgr_nlevels 1 -mgr_bsize 2 mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.213
diff -Nru hypre-2.11.2/src/test/TEST_ij/solvers.saved hypre-2.13.0/src/test/TEST_ij/solvers.saved
--- hypre-2.11.2/src/test/TEST_ij/solvers.saved	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/test/TEST_ij/solvers.saved	2017-10-20 17:42:22.000000000 +0000
@@ -37,22 +37,22 @@
 Final Relative Residual Norm = 6.698760e-09
 
 # Output file: solvers.out.9
-Iterations = 12
-PCG_Iterations = 8
+Iterations = 11
+PCG_Iterations = 7
 DSCG_Iterations = 4
-Final Relative Residual Norm = 2.111630e-09
+Final Relative Residual Norm = 6.805037e-10
 
 # Output file: solvers.out.10
-Iterations = 10
-PCG_Iterations = 8
+Iterations = 9
+PCG_Iterations = 7
 DSCG_Iterations = 2
-Final Relative Residual Norm = 6.719344e-09
+Final Relative Residual Norm = 4.842561e-09
 
 # Output file: solvers.out.11
 Iterations = 7
 PCG_Iterations = 4
 DSCG_Iterations = 3
-Final Relative Residual Norm = 6.859548e-09
+Final Relative Residual Norm = 1.289994e-10
 
 # Output file: solvers.out.sysh
  Average Convergence Factor = 0.122031
@@ -143,3 +143,71 @@
 GMRES Iterations = 10
 Final GMRES Relative Residual Norm = 1.006494e-09
 
+# Output file: solvers.out.118
+GMRES Iterations = 25
+Final GMRES Relative Residual Norm = 9.464475e-09
+
+# Output file: solvers.out.119
+GMRES Iterations = 23
+Final GMRES Relative Residual Norm = 9.269998e-09
+
+# Output file: solvers.out.120
+GMRES Iterations = 17
+Final GMRES Relative Residual Norm = 3.995718e-09
+
+# Output file: solvers.out.200
+MGR Iterations = 6
+Final Relative Residual Norm = 6.980278e-10
+
+# Output file: solvers.out.201
+MGR Iterations = 6
+Final Relative Residual Norm = 5.136059e-09
+
+# Output file: solvers.out.202
+MGR Iterations = 6
+Final Relative Residual Norm = 6.980278e-10
+
+# Output file: solvers.out.203
+MGR Iterations = 6
+Final Relative Residual Norm = 5.136059e-09
+
+# Output file: solvers.out.204
+MGR Iterations = 74
+Final Relative Residual Norm = 8.553484e-09
+
+# Output file: solvers.out.205
+MGR Iterations = 70
+Final Relative Residual Norm = 9.518056e-09
+
+# Output file: solvers.out.206
+MGR Iterations = 9
+Final Relative Residual Norm = 3.330705e-09
+
+# Output file: solvers.out.207
+MGR Iterations = 9
+Final Relative Residual Norm = 6.199197e-09
+
+# Output file: solvers.out.208
+MGR Iterations = 57
+Final Relative Residual Norm = 7.786878e-09
+
+# Output file: solvers.out.209
+MGR Iterations = 54
+Final Relative Residual Norm = 7.641495e-09
+
+# Output file: solvers.out.210
+MGR Iterations = 9
+Final Relative Residual Norm = 1.103348e-09
+
+# Output file: solvers.out.211
+MGR Iterations = 9
+Final Relative Residual Norm = 2.211486e-09
+
+# Output file: solvers.out.212
+Iterations = 5
+Final Relative Residual Norm = 1.371340e-09
+
+# Output file: solvers.out.213
+Iterations = 29
+Final Relative Residual Norm = 5.204677e-09
+
diff -Nru hypre-2.11.2/src/test/TEST_ij/solvers.sh hypre-2.13.0/src/test/TEST_ij/solvers.sh
--- hypre-2.11.2/src/test/TEST_ij/solvers.sh	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/test/TEST_ij/solvers.sh	2017-10-20 17:42:22.000000000 +0000
@@ -33,6 +33,18 @@
 diff ${TNAME}.testdata ${TNAME}.testdata.temp >&2
 
 #=============================================================================
+# IJ: MGR case nlevels < 1 and bsize < 2 should be the same 
+#                    compare results
+#=============================================================================
+
+tail -17 ${TNAME}.out.200 | head -6 > ${TNAME}.mgr_testdata
+
+#=============================================================================
+
+tail -17 ${TNAME}.out.202 | head -6 > ${TNAME}.mgr_testdata.temp
+diff ${TNAME}.mgr_testdata ${TNAME}.mgr_testdata.temp >&2
+
+#=============================================================================
 # compare with baseline case
 #=============================================================================
 
@@ -96,6 +108,32 @@
  ${TNAME}.out.115\
  ${TNAME}.out.116\
  ${TNAME}.out.117\
+ ${TNAME}.out.118\
+ ${TNAME}.out.119\
+ ${TNAME}.out.120\
+"
+
+for i in $FILES
+do
+  echo "# Output file: $i"
+  tail -3 $i
+done >> ${TNAME}.out
+
+FILES="\
+ ${TNAME}.out.200\
+ ${TNAME}.out.201\
+ ${TNAME}.out.202\
+ ${TNAME}.out.203\
+ ${TNAME}.out.204\
+ ${TNAME}.out.205\
+ ${TNAME}.out.206\
+ ${TNAME}.out.207\
+ ${TNAME}.out.208\
+ ${TNAME}.out.209\
+ ${TNAME}.out.210\
+ ${TNAME}.out.211\
+ ${TNAME}.out.212\
+ ${TNAME}.out.213\
 "
 
 for i in $FILES
@@ -121,3 +159,4 @@
 #=============================================================================
 
 rm -f ${TNAME}.testdata*
+rm -r ${TNAME}.mgr_testdata*
diff -Nru hypre-2.11.2/src/test/TEST_longdouble/solvers_ij.jobs hypre-2.13.0/src/test/TEST_longdouble/solvers_ij.jobs
--- hypre-2.11.2/src/test/TEST_longdouble/solvers_ij.jobs	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/test/TEST_longdouble/solvers_ij.jobs	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,92 @@
+#!/bin/sh
+#BHEADER**********************************************************************
+# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+# This file is part of HYPRE.  See file COPYRIGHT for details.
+#
+# HYPRE is free software; you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License (as published by the Free
+# Software Foundation) version 2.1 dated February 1999.
+#
+# $Revision$
+#EHEADER**********************************************************************
+
+
+
+
+
+
+
+#=============================================================================
+# ij: Run default case with different solvers
+#    1: BoomerAMG_PCG
+#    2: DS_PCG
+#    3: BoomerAMG_GMRES
+#    4: DS_GMRES
+#    5: BoomerAMG_CGNR
+#    6: DS_CGNR
+#    7: PILUT_GMRES
+#    8: ParaSails_PCG
+#   20: Hybrid_PCG
+#
+
+
+
+# ij: test systems AMG
+#	  unknown approach
+#	  hybrid approach with block smoother
+# 	  nodal approach
+#       more solvers:
+#        51: BoomerAMG_LGMRES
+#        50: DS_LGMRES
+#        61: BoomerAMG_FlexGMRES
+#        60: DS_FlexGMRES
+#
+#=============================================================================
+
+mpirun -np 2 ./ij -solver 1 -tol 1.e-16 -rhsrand > solvers_ij.out.0
+mpirun -np 2 ./ij -solver 2 -tol 1.e-16 -rhsrand > solvers_ij.out.1
+mpirun -np 2 ./ij -solver 3 -tol 1.e-16 -rhsrand > solvers_ij.out.2
+mpirun -np 2 ./ij -solver 4 -tol 1.e-16 -rhsrand > solvers_ij.out.3
+mpirun -np 2 ./ij -solver 5 -tol 1.e-16 -rhsrand -w 0.67 -ns 2 > solvers_ij.out.4
+mpirun -np 2 ./ij -solver 6 -tol 1.e-16 -rhsrand > solvers_ij.out.5
+mpirun -np 2 ./ij -solver 7 -tol 1.e-16 -rhsrand > solvers_ij.out.6
+mpirun -np 2 ./ij -solver 8 -tol 1.e-16 -rhsrand > solvers_ij.out.7
+mpirun -np 2 ./ij -solver 20 -tol 1.e-16 -rhsrand > solvers_ij.out.8
+mpirun -np 2 ./ij -solver 20 -tol 1.e-16 -cf 0.5 -rhsrand > solvers_ij.out.9
+mpirun -np 2 ./ij -solver 20 -tol 1.e-16 -cf 0.5 -rhsrand -solver_type 2 > solvers_ij.out.10
+mpirun -np 2 ./ij -solver 20 -tol 1.e-16 -cf 0.5 -rhsrand -solver_type 3 > solvers_ij.out.11
+
+#systems AMG run ...unknown approach, hybrid approach, nodal approach
+mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -tol 1.e-16 > solvers_ij.out.sysu
+mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -nodal 1 -smtype 6 -smlv 10 -dom 1 -ov 0 -tol 1.e-16 > solvers_ij.out.sysh
+mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -interptype 10 -Pmx 6 -tol 1.e-16 > solvers_ij.out.sysn
+
+#LGMRS and FlexGMRES
+mpirun -np 2 ./ij -solver 50 -tol 1.e-16 -rhsrand > solvers_ij.out.101
+mpirun -np 2 ./ij -solver 51 -tol 1.e-16 -rhsrand > solvers_ij.out.102
+mpirun -np 2 ./ij -solver 60 -tol 1.e-16 -rhsrand > solvers_ij.out.103
+mpirun -np 2 ./ij -solver 61 -tol 1.e-16 -rhsrand > solvers_ij.out.104
+
+#agglomerated coarse grid solve
+mpirun -np 8 ./ij -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -tol 1.e-16 > solvers_ij.out.105
+mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 -tol 1.e-16 > solvers_ij.out.107
+
+#redundant coarse grid solve
+mpirun -np 8 ./ij -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -red 1 -tol 1.e-16 > solvers_ij.out.106
+mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 -red 1 -tol 1.e-16 > solvers_ij.out.108
+
+#additive cycles
+mpirun -np 2 ./ij -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -rlx 0 -w 0.7 -rlx_coarse 0 -ns_coarse 2 -tol 1.e-16 > solvers_ij.out.109
+mpirun -np 2 ./ij -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -add_rlx 0 -add_w 0.7 -mult_add 0 -tol 1.e-16 > solvers_ij.out.110
+mpirun -np 4 ./ij -n 20 20 20 -P 2 2 1 -agg_nl 1 -solver 1 -simple 0 -tol 1.e-16 > solvers_ij.out.111
+mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -additive 1 -tol 1.e-16 > solvers_ij.out.112
+mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -add_Pmx 5 -tol 1.e-16 > solvers_ij.out.113
+
+#nonGalerkin version
+mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 1 0.03 -tol 1.e-16 > solvers_ij.out.114
+mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 3 0.0 0.01 0.05 -tol 1.e-16 > solvers_ij.out.115
+
+#RAP options
+mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 0 -tol 1.e-16 > solvers_ij.out.116
+mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 1 -tol 1.e-16 > solvers_ij.out.117
diff -Nru hypre-2.11.2/src/test/TEST_longdouble/solvers_ij.saved hypre-2.13.0/src/test/TEST_longdouble/solvers_ij.saved
--- hypre-2.11.2/src/test/TEST_longdouble/solvers_ij.saved	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/test/TEST_longdouble/solvers_ij.saved	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,145 @@
+# Output file: solvers_ij.out.0
+Iterations = 14
+Final Relative Residual Norm = 7.837531e-18
+
+# Output file: solvers_ij.out.1
+Iterations = 65
+Final Relative Residual Norm = 5.127733e-17
+
+# Output file: solvers_ij.out.2
+GMRES Iterations = 14
+Final GMRES Relative Residual Norm = 2.081070e-17
+
+# Output file: solvers_ij.out.3
+GMRES Iterations = 199
+Final GMRES Relative Residual Norm = 9.121629e-17
+
+# Output file: solvers_ij.out.4
+Iterations = 18
+Final Relative Residual Norm = 2.567047e-17
+
+# Output file: solvers_ij.out.5
+Iterations = 375
+Final Relative Residual Norm = 9.100420e-17
+
+# Output file: solvers_ij.out.6
+GMRES Iterations = 65
+Final GMRES Relative Residual Norm = 9.781966e-17
+
+# Output file: solvers_ij.out.7
+Iterations = 42
+Final Relative Residual Norm = 4.868667e-17
+
+# Output file: solvers_ij.out.8
+Iterations = 65
+PCG_Iterations = 0
+DSCG_Iterations = 65
+Final Relative Residual Norm = 5.127733e-17
+
+# Output file: solvers_ij.out.9
+Iterations = 17
+PCG_Iterations = 13
+DSCG_Iterations = 4
+Final Relative Residual Norm = 2.361559e-17
+
+# Output file: solvers_ij.out.10
+Iterations = 16
+PCG_Iterations = 14
+DSCG_Iterations = 2
+Final Relative Residual Norm = 2.081070e-17
+
+# Output file: solvers_ij.out.11
+Iterations = 10
+PCG_Iterations = 7
+DSCG_Iterations = 3
+Final Relative Residual Norm = 1.094045e-17
+
+# Output file: solvers_ij.out.sysh
+ Average Convergence Factor = 0.122527
+
+     Complexity:    grid = 1.613750
+                operator = 2.860373
+                   cycle = 5.720578
+
+# Output file: solvers_ij.out.sysn
+ Average Convergence Factor = 0.222219
+
+     Complexity:    grid = 1.592000
+                operator = 2.633619
+                   cycle = 11.267164
+
+# Output file: solvers_ij.out.sysu
+ Average Convergence Factor = 0.430735
+
+     Complexity:    grid = 1.614937
+                operator = 2.866488
+                   cycle = 5.732598
+
+# Output file: solvers_ij.out.101
+LGMRES Iterations = 175
+Final LGMRES Relative Residual Norm = 9.264825e-17
+
+# Output file: solvers_ij.out.102
+LGMRES Iterations = 17
+Final LGMRES Relative Residual Norm = 2.721454e-17
+
+# Output file: solvers_ij.out.103
+FlexGMRES Iterations = 199
+Final FlexGMRES Relative Residual Norm = 9.121063e-17
+
+# Output file: solvers_ij.out.104
+FlexGMRES Iterations = 14
+Final FlexGMRES Relative Residual Norm = 2.081457e-17
+
+# Output file: solvers_ij.out.105
+Iterations = 27
+Final Relative Residual Norm = 4.195866e-17
+
+# Output file: solvers_ij.out.106
+Iterations = 27
+Final Relative Residual Norm = 4.195866e-17
+
+# Output file: solvers_ij.out.107
+Iterations = 41
+Final Relative Residual Norm = 5.542245e-17
+
+# Output file: solvers_ij.out.108
+Iterations = 41
+Final Relative Residual Norm = 5.542245e-17
+
+# Output file: solvers_ij.out.109
+Iterations = 32
+Final Relative Residual Norm = 8.814032e-17
+
+# Output file: solvers_ij.out.110
+Iterations = 32
+Final Relative Residual Norm = 8.814032e-17
+
+# Output file: solvers_ij.out.111
+Iterations = 56
+Final Relative Residual Norm = 8.514971e-17
+
+# Output file: solvers_ij.out.112
+GMRES Iterations = 43
+Final GMRES Relative Residual Norm = 8.418244e-17
+
+# Output file: solvers_ij.out.113
+GMRES Iterations = 50
+Final GMRES Relative Residual Norm = 7.078678e-17
+
+# Output file: solvers_ij.out.114
+BoomerAMG Iterations = 33
+Final Relative Residual Norm = 3.984250e-17
+
+# Output file: solvers_ij.out.115
+BoomerAMG Iterations = 33
+Final Relative Residual Norm = 8.096418e-17
+
+# Output file: solvers_ij.out.116
+GMRES Iterations = 18
+Final GMRES Relative Residual Norm = 3.003469e-17
+
+# Output file: solvers_ij.out.117
+GMRES Iterations = 18
+Final GMRES Relative Residual Norm = 3.024076e-17
+
diff -Nru hypre-2.11.2/src/test/TEST_longdouble/solvers_ij.sh hypre-2.13.0/src/test/TEST_longdouble/solvers_ij.sh
--- hypre-2.11.2/src/test/TEST_longdouble/solvers_ij.sh	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/test/TEST_longdouble/solvers_ij.sh	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,123 @@
+#!/bin/sh
+#BHEADER**********************************************************************
+# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+# This file is part of HYPRE.  See file COPYRIGHT for details.
+#
+# HYPRE is free software; you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License (as published by the Free
+# Software Foundation) version 2.1 dated February 1999.
+#
+# $Revision$
+#EHEADER**********************************************************************
+
+TNAME=`basename $0 .sh`
+CONVTOL=$1
+
+# Set default check tolerance
+if [ x$CONVTOL = "x" ];
+then
+    CONVTOL=0.0
+fi
+#echo "tol = $CONVTOL"
+#=============================================================================
+# IJ: Run multiplicative and mult_additive cycle and compare results 
+#                    should be the same
+#=============================================================================
+
+tail -17 ${TNAME}.out.109 | head -6 > ${TNAME}.testdata
+
+#=============================================================================
+
+tail -17 ${TNAME}.out.110 | head -6 > ${TNAME}.testdata.temp
+diff ${TNAME}.testdata ${TNAME}.testdata.temp >&2
+
+#=============================================================================
+# compare with baseline case
+#=============================================================================
+
+FILES="\
+ ${TNAME}.out.0\
+ ${TNAME}.out.1\
+ ${TNAME}.out.2\
+ ${TNAME}.out.3\
+ ${TNAME}.out.4\
+ ${TNAME}.out.5\
+ ${TNAME}.out.6\
+ ${TNAME}.out.7\
+"
+
+for i in $FILES
+do
+  echo "# Output file: $i"
+  tail -3 $i
+done > ${TNAME}.out
+
+FILES="\
+ ${TNAME}.out.8\
+ ${TNAME}.out.9\
+ ${TNAME}.out.10\
+ ${TNAME}.out.11\
+"
+
+for i in $FILES
+do
+  echo "# Output file: $i"
+  tail -5 $i
+done >> ${TNAME}.out
+
+FILES="\
+ ${TNAME}.out.sysh\
+ ${TNAME}.out.sysn\
+ ${TNAME}.out.sysu\
+"
+
+for i in $FILES
+do
+  echo "# Output file: $i"
+  tail -21 $i | head -6
+done >> ${TNAME}.out
+
+FILES="\
+ ${TNAME}.out.101\
+ ${TNAME}.out.102\
+ ${TNAME}.out.103\
+ ${TNAME}.out.104\
+ ${TNAME}.out.105\
+ ${TNAME}.out.106\
+ ${TNAME}.out.107\
+ ${TNAME}.out.108\
+ ${TNAME}.out.109\
+ ${TNAME}.out.110\
+ ${TNAME}.out.111\
+ ${TNAME}.out.112\
+ ${TNAME}.out.113\
+ ${TNAME}.out.114\
+ ${TNAME}.out.115\
+ ${TNAME}.out.116\
+ ${TNAME}.out.117\
+"
+
+for i in $FILES
+do
+  echo "# Output file: $i"
+  tail -3 $i
+done >> ${TNAME}.out
+
+# Make sure that the output files are reasonable
+CHECK_LINE="Complexity"
+OUT_COUNT=`grep "$CHECK_LINE" ${TNAME}.out | wc -l`
+SAVED_COUNT=`grep "$CHECK_LINE" ${TNAME}.saved | wc -l`
+if [ "$OUT_COUNT" != "$SAVED_COUNT" ]; then
+   echo "Incorrect number of \"$CHECK_LINE\" lines in ${TNAME}.out" >&2
+fi
+
+if [ -z $HYPRE_NO_SAVED ]; then
+   (../runcheck.sh ${TNAME}.out ${TNAME}.saved $CONVTOL) >&2
+fi
+
+#=============================================================================
+# remove temporary files
+#=============================================================================
+
+rm -f ${TNAME}.testdata*
diff -Nru hypre-2.11.2/src/test/TEST_longdouble/solvers_struct.jobs hypre-2.13.0/src/test/TEST_longdouble/solvers_struct.jobs
--- hypre-2.11.2/src/test/TEST_longdouble/solvers_struct.jobs	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/test/TEST_longdouble/solvers_struct.jobs	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,63 @@
+#!/bin/sh
+#BHEADER**********************************************************************
+# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+# This file is part of HYPRE.  See file COPYRIGHT for details.
+#
+# HYPRE is free software; you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License (as published by the Free
+# Software Foundation) version 2.1 dated February 1999.
+#
+# $Revision$
+#EHEADER**********************************************************************
+
+
+
+
+
+
+
+#=============================================================================
+# struct: Run SMG-CG, PFMG-CG, DSCG, CG, and Hybrid.
+#=============================================================================
+
+mpirun -np 3 ./struct -P 1 1 3 -solver 10 -tol 1.0e-16 > solvers_struct.out.0
+mpirun -np 3 ./struct -P 1 3 1 -solver 11 -tol 1.0e-16 > solvers_struct.out.1
+mpirun -np 3 ./struct -P 3 1 1 -solver 17 -tol 1.0e-16 > solvers_struct.out.2
+mpirun -np 1 ./struct -P 1 1 1 -solver 18 -tol 1.0e-16 > solvers_struct.out.3
+mpirun -np 1 ./struct -P 1 1 1 -solver 19 -tol 1.0e-16 > solvers_struct.out.4
+
+
+#=============================================================================
+# Run default case with all available PCG preconditioners (solvers): 
+#    10: SMG (default)
+#    11: PFMG
+#    17: 2-step Jacobi
+#    18: Diagonal scaling
+#    19: none
+#=============================================================================
+
+# PCG run... LOBPCG with one eigenpair .... LOBPCG with 5 eigenpairs
+mpirun -np 2 ./struct -solver 10 -tol 1.e-16 > solvers_struct.out.10.lobpcg
+mpirun -np 2 ./struct -lobpcg -solver 10 -tol 1.e-16 -pcgitr 0 -seed 1 -vrand 1 > solvers_struct.out.10.lobpcg.1
+mpirun -np 2 ./struct -lobpcg -solver 10 -tol 1.e-16 -pcgitr 0  -seed 1 -vrand 5  > solvers_struct.out.10.lobpcg.5
+
+# PCG run... LOBPCG with one eigenpair .... LOBPCG with 5 eigenpairs
+mpirun -np 2 ./struct -solver 11 -tol 1.e-16  > solvers_struct.out.11.lobpcg
+mpirun -np 2 ./struct -lobpcg -solver 11 -tol 1.e-16  -pcgitr 0 -seed 1 -vrand 1  > solvers_struct.out.11.lobpcg.1
+mpirun -np 2 ./struct -lobpcg -solver 11 -tol 1.e-16  -pcgitr 0 -seed 1 -vrand 5  > solvers_struct.out.11.lobpcg.5
+
+# PCG run... LOBPCG with one eigenpair .... LOBPCG with 5 eigenpairs
+mpirun -np 2 ./struct -solver 17 -tol 1.e-16 > solvers_struct.out.17.lobpcg
+mpirun -np 2 ./struct -lobpcg -solver 17 -tol 1.e-16  -pcgitr 10 -seed 1 -vrand 1  > solvers_struct.out.17.lobpcg.1
+mpirun -np 2 ./struct -lobpcg -solver 17 -tol 1.e-16  -pcgitr 10 -seed 1 -vrand 5  > solvers_struct.out.17.lobpcg.5
+
+# PCG run... LOBPCG with one eigenpair .... LOBPCG with 5 eigenpairs
+mpirun -np 2 ./struct -solver 18  -tol 1.e-16 > solvers_struct.out.18.lobpcg
+mpirun -np 2 ./struct -lobpcg -solver 18 -tol 1.e-16  -pcgitr 10 -seed 1 -vrand 1  > solvers_struct.out.18.lobpcg.1
+mpirun -np 2 ./struct -lobpcg -solver 18 -tol 1.e-16  -pcgitr 10 -seed 1 -vrand 5  > solvers_struct.out.18.lobpcg.5
+
+# PCG run... LOBPCG with one eigenpair .... LOBPCG with 5 eigenpairs
+mpirun -np 2 ./struct -solver 19  1.e-16 > solvers_struct.out.19.lobpcg
+mpirun -np 2 ./struct -lobpcg -solver 19 -tol 1.e-16  -pcgitr 10 -seed 1 -vrand 1  > solvers_struct.out.19.lobpcg.1
+mpirun -np 2 ./struct -lobpcg -solver 19 -tol 1.e-16  -pcgitr 10 -seed 1 -vrand 5  > solvers_struct.out.19.lobpcg.5
diff -Nru hypre-2.11.2/src/test/TEST_longdouble/solvers_struct.saved hypre-2.13.0/src/test/TEST_longdouble/solvers_struct.saved
--- hypre-2.11.2/src/test/TEST_longdouble/solvers_struct.saved	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/test/TEST_longdouble/solvers_struct.saved	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,120 @@
+# Output file: solvers_struct.out.0
+Iterations = 10
+Final Relative Residual Norm = 6.581106e-18
+
+# Output file: solvers_struct.out.1
+Iterations = 21
+Final Relative Residual Norm = 1.596699e-17
+
+# Output file: solvers_struct.out.2
+Iterations = 38
+Final Relative Residual Norm = 7.177049e-17
+
+# Output file: solvers_struct.out.3
+Iterations = 34
+Final Relative Residual Norm = 3.265691e-18
+
+# Output file: solvers_struct.out.4
+Iterations = 34
+Final Relative Residual Norm = 3.219688e-18
+
+# Output file: solvers_struct.out.10.lobpcg
+Iterations = 9
+Final Relative Residual Norm = 8.224720e-17
+
+# Output file: solvers_struct.out.10.lobpcg.1
+Eigenvalue lambda   1.84366453091756e-01
+Residual   6.51777159501743e-08
+
+# Output file: solvers_struct.out.10.lobpcg.5
+Eigenvalue lambda   1.84366453091754e-01
+Eigenvalue lambda   2.50882493969729e-01
+Eigenvalue lambda   3.60090369737174e-01
+Eigenvalue lambda   4.20845334658399e-01
+Eigenvalue lambda   4.20845334658517e-01
+Residual   3.01898591710117e-08
+Residual   4.55422767371506e-08
+Residual   3.17971068849731e-07
+Residual   1.36018481189308e-07
+Residual   3.39522268990133e-07
+
+# Output file: solvers_struct.out.11.lobpcg
+Iterations = 20
+Final Relative Residual Norm = 4.957736e-17
+
+# Output file: solvers_struct.out.11.lobpcg.1
+Eigenvalue lambda   1.84366453091756e-01
+Residual   5.90808378149077e-08
+
+# Output file: solvers_struct.out.11.lobpcg.5
+Eigenvalue lambda   1.84366453091753e-01
+Eigenvalue lambda   2.50882493969729e-01
+Eigenvalue lambda   3.60090369737179e-01
+Eigenvalue lambda   4.20845334658427e-01
+Eigenvalue lambda   4.20845334658489e-01
+Residual   3.21720429474309e-08
+Residual   6.44224703495813e-08
+Residual   2.42812454193691e-07
+Residual   3.37502149337777e-07
+Residual   3.28950848883713e-07
+
+# Output file: solvers_struct.out.17.lobpcg
+Iterations = 34
+Final Relative Residual Norm = 4.087402e-17
+
+# Output file: solvers_struct.out.17.lobpcg.1
+Eigenvalue lambda   1.84366453091770e-01
+Residual   1.03034775753859e-07
+
+# Output file: solvers_struct.out.17.lobpcg.5
+Eigenvalue lambda   1.84366453091753e-01
+Eigenvalue lambda   2.50882493969729e-01
+Eigenvalue lambda   3.60090369737173e-01
+Eigenvalue lambda   4.20845334658410e-01
+Eigenvalue lambda   4.20845334658418e-01
+Residual   2.01876274958806e-08
+Residual   2.36598865456373e-07
+Residual   1.80211166022567e-07
+Residual   1.64172205059144e-07
+Residual   1.88447274083737e-07
+
+# Output file: solvers_struct.out.18.lobpcg
+Iterations = 59
+Final Relative Residual Norm = 5.038238e-17
+
+# Output file: solvers_struct.out.18.lobpcg.1
+Eigenvalue lambda   1.84366453091755e-01
+Residual   8.10727379998635e-08
+
+# Output file: solvers_struct.out.18.lobpcg.5
+Eigenvalue lambda   1.84366453091753e-01
+Eigenvalue lambda   2.50882493969729e-01
+Eigenvalue lambda   3.60090369737172e-01
+Eigenvalue lambda   4.20845334658451e-01
+Eigenvalue lambda   4.20845334658495e-01
+Residual   9.97341733992902e-08
+Residual   1.68228046062857e-07
+Residual   1.17106600356615e-07
+Residual   3.61346668971189e-07
+Residual   3.12170077450817e-07
+
+# Output file: solvers_struct.out.19.lobpcg
+Iterations = 32
+Final Relative Residual Norm = 8.259590e-07
+
+# Output file: solvers_struct.out.19.lobpcg.1
+Eigenvalue lambda   1.84366453091755e-01
+Residual   8.10727379998105e-08
+
+# Output file: solvers_struct.out.19.lobpcg.5
+Eigenvalue lambda   1.84366453091753e-01
+Eigenvalue lambda   2.50882493969730e-01
+Eigenvalue lambda   3.60090369737172e-01
+Eigenvalue lambda   4.20845334658451e-01
+Eigenvalue lambda   4.20845334658495e-01
+Residual   9.97341733991334e-08
+Residual   1.68228046056442e-07
+Residual   1.17106600356972e-07
+Residual   3.61346507824777e-07
+Residual   3.12150570425402e-07
+
diff -Nru hypre-2.11.2/src/test/TEST_longdouble/solvers_struct.sh hypre-2.13.0/src/test/TEST_longdouble/solvers_struct.sh
--- hypre-2.11.2/src/test/TEST_longdouble/solvers_struct.sh	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/test/TEST_longdouble/solvers_struct.sh	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,79 @@
+#!/bin/sh
+#BHEADER**********************************************************************
+# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+# This file is part of HYPRE.  See file COPYRIGHT for details.
+#
+# HYPRE is free software; you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License (as published by the Free
+# Software Foundation) version 2.1 dated February 1999.
+#
+# $Revision$
+#EHEADER**********************************************************************
+
+
+
+
+
+TNAME=`basename $0 .sh`
+CONVTOL=$1
+
+# Set default check tolerance
+if [ x$CONVTOL = "x" ];
+then
+    CONVTOL=0.0
+fi
+#echo "tol = $CONVTOL"
+#=============================================================================
+# compare with baseline case
+#=============================================================================
+
+FILES="\
+ ${TNAME}.out.0\
+ ${TNAME}.out.1\
+ ${TNAME}.out.2\
+ ${TNAME}.out.3\
+ ${TNAME}.out.4\
+"
+
+for i in $FILES
+do
+  echo "# Output file: $i"
+  tail -3 $i
+done > ${TNAME}.out
+
+FILES="\
+ ${TNAME}.out.10.lobpcg\
+ ${TNAME}.out.11.lobpcg\
+ ${TNAME}.out.17.lobpcg\
+ ${TNAME}.out.18.lobpcg\
+ ${TNAME}.out.19.lobpcg\
+"
+
+for i in $FILES
+do
+  echo "# Output file: $i"
+  tail -3 $i
+  echo "# Output file: $i.1"
+  tail -13 $i.1 | head -3
+  echo "# Output file: $i.5"
+  tail -21 $i.5 | head -11
+done >> ${TNAME}.out
+
+# Make sure that the output files are reasonable
+CHECK_LINE="Iterations"
+OUT_COUNT=`grep "$CHECK_LINE" ${TNAME}.out | wc -l`
+SAVED_COUNT=`grep "$CHECK_LINE" ${TNAME}.saved | wc -l`
+if [ "$OUT_COUNT" != "$SAVED_COUNT" ]; then
+   echo "Incorrect number of \"$CHECK_LINE\" lines in ${TNAME}.out" >&2
+fi
+
+if [ -z $HYPRE_NO_SAVED ]; then
+   (../runcheck.sh ${TNAME}.out ${TNAME}.saved $CONVTOL) >&2
+fi
+
+#=============================================================================
+# remove temporary files
+#=============================================================================
+
+# rm -f ${TNAME}.testdata*
diff -Nru hypre-2.11.2/src/test/TEST_single/solvers_ij.jobs hypre-2.13.0/src/test/TEST_single/solvers_ij.jobs
--- hypre-2.11.2/src/test/TEST_single/solvers_ij.jobs	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/test/TEST_single/solvers_ij.jobs	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,92 @@
+#!/bin/sh
+#BHEADER**********************************************************************
+# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+# This file is part of HYPRE.  See file COPYRIGHT for details.
+#
+# HYPRE is free software; you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License (as published by the Free
+# Software Foundation) version 2.1 dated February 1999.
+#
+# $Revision$
+#EHEADER**********************************************************************
+
+
+
+
+
+
+
+#=============================================================================
+# ij: Run default case with different solvers
+#    1: BoomerAMG_PCG
+#    2: DS_PCG
+#    3: BoomerAMG_GMRES
+#    4: DS_GMRES
+#    5: BoomerAMG_CGNR
+#    6: DS_CGNR
+#    7: PILUT_GMRES
+#    8: ParaSails_PCG
+#   20: Hybrid_PCG
+#
+
+
+
+# ij: test systems AMG
+#	  unknown approach
+#	  hybrid approach with block smoother
+# 	  nodal approach
+#       more solvers:
+#        51: BoomerAMG_LGMRES
+#        50: DS_LGMRES
+#        61: BoomerAMG_FlexGMRES
+#        60: DS_FlexGMRES
+#
+#=============================================================================
+
+mpirun -np 2 ./ij -solver 1 -tol 1.e-4 -rhsrand > solvers_ij.out.0
+mpirun -np 2 ./ij -solver 2 -tol 1.e-4 -rhsrand > solvers_ij.out.1
+mpirun -np 2 ./ij -solver 3 -tol 1.e-4 -rhsrand > solvers_ij.out.2
+mpirun -np 2 ./ij -solver 4 -tol 1.e-4 -rhsrand > solvers_ij.out.3
+mpirun -np 2 ./ij -solver 5 -tol 1.e-4 -rhsrand -w 0.67 -ns 2 > solvers_ij.out.4
+mpirun -np 2 ./ij -solver 6 -tol 1.e-4 -rhsrand > solvers_ij.out.5
+mpirun -np 2 ./ij -solver 7 -tol 1.e-4 -rhsrand > solvers_ij.out.6
+mpirun -np 2 ./ij -solver 8 -tol 1.e-4 -rhsrand > solvers_ij.out.7
+mpirun -np 2 ./ij -solver 20 -tol 1.e-4 -rhsrand > solvers_ij.out.8
+mpirun -np 2 ./ij -solver 20 -tol 1.e-4 -cf 0.5 -rhsrand > solvers_ij.out.9
+mpirun -np 2 ./ij -solver 20 -tol 1.e-4 -cf 0.5 -rhsrand -solver_type 2 > solvers_ij.out.10
+mpirun -np 2 ./ij -solver 20 -tol 1.e-4 -cf 0.5 -rhsrand -solver_type 3 > solvers_ij.out.11
+
+#systems AMG run ...unknown approach, hybrid approach, nodal approach
+mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -tol 1.e-4 > solvers_ij.out.sysu
+mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -nodal 1 -smtype 6 -smlv 10 -dom 1 -ov 0 -tol 1.e-4 > solvers_ij.out.sysh
+mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -interptype 10 -Pmx 6 -tol 1.e-4 > solvers_ij.out.sysn
+
+#LGMRS and FlexGMRES
+mpirun -np 2 ./ij -solver 50 -tol 1.e-4 -rhsrand > solvers_ij.out.101
+mpirun -np 2 ./ij -solver 51 -tol 1.e-4 -rhsrand > solvers_ij.out.102
+mpirun -np 2 ./ij -solver 60 -tol 1.e-4 -rhsrand > solvers_ij.out.103
+mpirun -np 2 ./ij -solver 61 -tol 1.e-4 -rhsrand > solvers_ij.out.104
+
+#agglomerated coarse grid solve
+mpirun -np 8 ./ij -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -tol 1.e-4 > solvers_ij.out.105
+mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 -tol 1.e-4 > solvers_ij.out.107
+
+#redundant coarse grid solve
+mpirun -np 8 ./ij -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -red 1 -tol 1.e-4 > solvers_ij.out.106
+mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 -red 1 -tol 1.e-4 > solvers_ij.out.108
+
+#additive cycles
+mpirun -np 2 ./ij -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -rlx 0 -w 0.7 -rlx_coarse 0 -ns_coarse 2 -tol 1.e-4 > solvers_ij.out.109
+mpirun -np 2 ./ij -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -add_rlx 0 -add_w 0.7 -mult_add 0 -tol 1.e-4 > solvers_ij.out.110
+mpirun -np 4 ./ij -n 20 20 20 -P 2 2 1 -agg_nl 1 -solver 1 -simple 0 -tol 1.e-4 > solvers_ij.out.111
+mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -additive 1 -tol 1.e-4 > solvers_ij.out.112
+mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -add_Pmx 5 -tol 1.e-4 > solvers_ij.out.113
+
+#nonGalerkin version
+mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 1 0.03 -tol 1.e-4 > solvers_ij.out.114
+mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 3 0.0 0.01 0.05 -tol 1.e-4 > solvers_ij.out.115
+
+#RAP options
+mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 0 -tol 1.e-4 > solvers_ij.out.116
+mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 1 -tol 1.e-4 > solvers_ij.out.117
diff -Nru hypre-2.11.2/src/test/TEST_single/solvers_ij.saved hypre-2.13.0/src/test/TEST_single/solvers_ij.saved
--- hypre-2.11.2/src/test/TEST_single/solvers_ij.saved	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/test/TEST_single/solvers_ij.saved	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,145 @@
+# Output file: solvers_ij.out.0
+Iterations = 4
+Final Relative Residual Norm = 1.622019e-05
+
+# Output file: solvers_ij.out.1
+Iterations = 26
+Final Relative Residual Norm = 7.193490e-05
+
+# Output file: solvers_ij.out.2
+GMRES Iterations = 4
+Final GMRES Relative Residual Norm = 1.566634e-05
+
+# Output file: solvers_ij.out.3
+GMRES Iterations = 39
+Final GMRES Relative Residual Norm = 9.040770e-05
+
+# Output file: solvers_ij.out.4
+Iterations = 5
+Final Relative Residual Norm = 1.566360e-05
+
+# Output file: solvers_ij.out.5
+Iterations = 103
+Final Relative Residual Norm = 8.784404e-05
+
+# Output file: solvers_ij.out.6
+GMRES Iterations = 15
+Final GMRES Relative Residual Norm = 7.131740e-05
+
+# Output file: solvers_ij.out.7
+Iterations = 13
+Final Relative Residual Norm = 7.750608e-05
+
+# Output file: solvers_ij.out.8
+Iterations = 26
+PCG_Iterations = 0
+DSCG_Iterations = 26
+Final Relative Residual Norm = 7.193490e-05
+
+# Output file: solvers_ij.out.9
+Iterations = 7
+PCG_Iterations = 3
+DSCG_Iterations = 4
+Final Relative Residual Norm = 6.677458e-05
+
+# Output file: solvers_ij.out.10
+Iterations = 6
+PCG_Iterations = 4
+DSCG_Iterations = 2
+Final Relative Residual Norm = 1.566634e-05
+
+# Output file: solvers_ij.out.11
+Iterations = 5
+PCG_Iterations = 2
+DSCG_Iterations = 3
+Final Relative Residual Norm = 9.148609e-06
+
+# Output file: solvers_ij.out.sysh
+ Average Convergence Factor = 0.122042
+
+     Complexity:    grid = 1.613750
+                operator = 2.860298
+                   cycle = 5.720429
+
+# Output file: solvers_ij.out.sysn
+ Average Convergence Factor = 0.241065
+
+     Complexity:    grid = 1.592000
+                operator = 2.633619
+                   cycle = 11.267164
+
+# Output file: solvers_ij.out.sysu
+ Average Convergence Factor = 0.413666
+
+     Complexity:    grid = 1.614812
+                operator = 2.865126
+                   cycle = 5.730247
+
+# Output file: solvers_ij.out.101
+LGMRES Iterations = 39
+Final LGMRES Relative Residual Norm = 7.223550e-05
+
+# Output file: solvers_ij.out.102
+LGMRES Iterations = 4
+Final LGMRES Relative Residual Norm = 1.566634e-05
+
+# Output file: solvers_ij.out.103
+FlexGMRES Iterations = 39
+Final FlexGMRES Relative Residual Norm = 9.040783e-05
+
+# Output file: solvers_ij.out.104
+FlexGMRES Iterations = 4
+Final FlexGMRES Relative Residual Norm = 1.567694e-05
+
+# Output file: solvers_ij.out.105
+Iterations = 9
+Final Relative Residual Norm = 2.436101e-05
+
+# Output file: solvers_ij.out.106
+Iterations = 9
+Final Relative Residual Norm = 2.436101e-05
+
+# Output file: solvers_ij.out.107
+Iterations = 12
+Final Relative Residual Norm = 6.371664e-05
+
+# Output file: solvers_ij.out.108
+Iterations = 12
+Final Relative Residual Norm = 6.371664e-05
+
+# Output file: solvers_ij.out.109
+Iterations = 9
+Final Relative Residual Norm = 8.952725e-05
+
+# Output file: solvers_ij.out.110
+Iterations = 9
+Final Relative Residual Norm = 8.952696e-05
+
+# Output file: solvers_ij.out.111
+Iterations = 15
+Final Relative Residual Norm = 8.953455e-05
+
+# Output file: solvers_ij.out.112
+GMRES Iterations = 12
+Final GMRES Relative Residual Norm = 5.915735e-05
+
+# Output file: solvers_ij.out.113
+GMRES Iterations = 13
+Final GMRES Relative Residual Norm = 7.913169e-05
+
+# Output file: solvers_ij.out.114
+BoomerAMG Iterations = 9
+Final Relative Residual Norm = 3.746825e-05
+
+# Output file: solvers_ij.out.115
+BoomerAMG Iterations = 9
+Final Relative Residual Norm = 3.886564e-05
+
+# Output file: solvers_ij.out.116
+GMRES Iterations = 6
+Final GMRES Relative Residual Norm = 3.062358e-05
+
+# Output file: solvers_ij.out.117
+GMRES Iterations = 6
+Final GMRES Relative Residual Norm = 3.026567e-05
+
diff -Nru hypre-2.11.2/src/test/TEST_single/solvers_ij.sh hypre-2.13.0/src/test/TEST_single/solvers_ij.sh
--- hypre-2.11.2/src/test/TEST_single/solvers_ij.sh	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/test/TEST_single/solvers_ij.sh	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,124 @@
+#!/bin/sh
+#BHEADER**********************************************************************
+# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+# This file is part of HYPRE.  See file COPYRIGHT for details.
+#
+# HYPRE is free software; you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License (as published by the Free
+# Software Foundation) version 2.1 dated February 1999.
+#
+# $Revision$
+#EHEADER**********************************************************************
+
+TNAME=`basename $0 .sh`
+CONVTOL=$1
+
+# Set default check tolerance
+if [ x$CONVTOL = "x" ];
+then
+    CONVTOL=0.0
+fi
+#echo "tol = $CONVTOL"
+#=============================================================================
+# IJ: Run multiplicative and mult_additive cycle and compare results 
+#                    should be the same
+#=============================================================================
+
+tail -17 ${TNAME}.out.109 | head -3 > ${TNAME}.testdata
+
+#=============================================================================
+
+tail -17 ${TNAME}.out.110 | head -3 > ${TNAME}.testdata.temp
+#diff ${TNAME}.testdata ${TNAME}.testdata.temp >&2
+../runcheck.sh ${TNAME}.testdata ${TNAME}.testdata.temp 1.e-4 >&2
+
+#=============================================================================
+# compare with baseline case
+#=============================================================================
+
+FILES="\
+ ${TNAME}.out.0\
+ ${TNAME}.out.1\
+ ${TNAME}.out.2\
+ ${TNAME}.out.3\
+ ${TNAME}.out.4\
+ ${TNAME}.out.5\
+ ${TNAME}.out.6\
+ ${TNAME}.out.7\
+"
+
+for i in $FILES
+do
+  echo "# Output file: $i"
+  tail -3 $i
+done > ${TNAME}.out
+
+FILES="\
+ ${TNAME}.out.8\
+ ${TNAME}.out.9\
+ ${TNAME}.out.10\
+ ${TNAME}.out.11\
+"
+
+for i in $FILES
+do
+  echo "# Output file: $i"
+  tail -5 $i
+done >> ${TNAME}.out
+
+FILES="\
+ ${TNAME}.out.sysh\
+ ${TNAME}.out.sysn\
+ ${TNAME}.out.sysu\
+"
+
+for i in $FILES
+do
+  echo "# Output file: $i"
+  tail -21 $i | head -6
+done >> ${TNAME}.out
+
+FILES="\
+ ${TNAME}.out.101\
+ ${TNAME}.out.102\
+ ${TNAME}.out.103\
+ ${TNAME}.out.104\
+ ${TNAME}.out.105\
+ ${TNAME}.out.106\
+ ${TNAME}.out.107\
+ ${TNAME}.out.108\
+ ${TNAME}.out.109\
+ ${TNAME}.out.110\
+ ${TNAME}.out.111\
+ ${TNAME}.out.112\
+ ${TNAME}.out.113\
+ ${TNAME}.out.114\
+ ${TNAME}.out.115\
+ ${TNAME}.out.116\
+ ${TNAME}.out.117\
+"
+
+for i in $FILES
+do
+  echo "# Output file: $i"
+  tail -3 $i
+done >> ${TNAME}.out
+
+# Make sure that the output files are reasonable
+CHECK_LINE="Complexity"
+OUT_COUNT=`grep "$CHECK_LINE" ${TNAME}.out | wc -l`
+SAVED_COUNT=`grep "$CHECK_LINE" ${TNAME}.saved | wc -l`
+if [ "$OUT_COUNT" != "$SAVED_COUNT" ]; then
+   echo "Incorrect number of \"$CHECK_LINE\" lines in ${TNAME}.out" >&2
+fi
+
+if [ -z $HYPRE_NO_SAVED ]; then
+   (../runcheck.sh ${TNAME}.out ${TNAME}.saved $CONVTOL) >&2
+fi
+
+#=============================================================================
+# remove temporary files
+#=============================================================================
+
+rm -f ${TNAME}.testdata*
diff -Nru hypre-2.11.2/src/test/TEST_single/solvers_struct.jobs hypre-2.13.0/src/test/TEST_single/solvers_struct.jobs
--- hypre-2.11.2/src/test/TEST_single/solvers_struct.jobs	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/test/TEST_single/solvers_struct.jobs	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,63 @@
+#!/bin/sh
+#BHEADER**********************************************************************
+# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+# This file is part of HYPRE.  See file COPYRIGHT for details.
+#
+# HYPRE is free software; you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License (as published by the Free
+# Software Foundation) version 2.1 dated February 1999.
+#
+# $Revision$
+#EHEADER**********************************************************************
+
+
+
+
+
+
+
+#=============================================================================
+# struct: Run SMG-CG, PFMG-CG, DSCG, CG, and Hybrid.
+#=============================================================================
+
+mpirun -np 3 ./struct -P 1 1 3 -solver 10 -tol 1.0e-4 > solvers_struct.out.0
+mpirun -np 3 ./struct -P 1 3 1 -solver 11 -tol 1.0e-4 > solvers_struct.out.1
+mpirun -np 3 ./struct -P 3 1 1 -solver 17 -tol 1.0e-4 > solvers_struct.out.2
+mpirun -np 1 ./struct -P 1 1 1 -solver 18 -tol 1.0e-4 > solvers_struct.out.3
+mpirun -np 1 ./struct -P 1 1 1 -solver 19 -tol 1.0e-4 > solvers_struct.out.4
+
+
+#=============================================================================
+# Run default case with all available PCG preconditioners (solvers): 
+#    10: SMG (default)
+#    11: PFMG
+#    17: 2-step Jacobi
+#    18: Diagonal scaling
+#    19: none
+#=============================================================================
+
+# PCG run... LOBPCG with one eigenpair .... LOBPCG with 3 eigenpairs
+mpirun -np 2 ./struct -solver 10 -tol 1.e-4 > solvers_struct.out.10.lobpcg
+mpirun -np 2 ./struct -lobpcg -solver 10 -tol 1.e-4 -pcgitr 0 -seed 1 -vrand 1 > solvers_struct.out.10.lobpcg.1
+mpirun -np 2 ./struct -lobpcg -solver 10 -tol 1.e-4 -pcgitr 0  -seed 1 -vrand 3  > solvers_struct.out.10.lobpcg.3
+
+# PCG run... LOBPCG with one eigenpair .... LOBPCG with 3 eigenpairs
+mpirun -np 2 ./struct -solver 11 -tol 1.e-4  > solvers_struct.out.11.lobpcg
+mpirun -np 2 ./struct -lobpcg -solver 11 -tol 1.e-4  -pcgitr 0 -seed 1 -vrand 1  > solvers_struct.out.11.lobpcg.1
+mpirun -np 2 ./struct -lobpcg -solver 11 -tol 1.e-4  -pcgitr 0 -seed 1 -vrand 3  > solvers_struct.out.11.lobpcg.3
+
+# PCG run... LOBPCG with one eigenpair .... LOBPCG with 3 eigenpairs
+mpirun -np 2 ./struct -solver 17  > solvers_struct.out.17.lobpcg
+mpirun -np 2 ./struct -lobpcg -solver 17 -tol 1.e-4  -pcgitr 10 -seed 1 -vrand 1  > solvers_struct.out.17.lobpcg.1
+mpirun -np 2 ./struct -lobpcg -solver 17 -tol 1.e-4  -pcgitr 10 -seed 1 -vrand 3  > solvers_struct.out.17.lobpcg.3
+
+# PCG run... LOBPCG with one eigenpair .... LOBPCG with 3 eigenpairs
+mpirun -np 2 ./struct -solver 18  > solvers_struct.out.18.lobpcg
+mpirun -np 2 ./struct -lobpcg -solver 18 -tol 1.e-4  -pcgitr 10 -seed 1 -vrand 1  > solvers_struct.out.18.lobpcg.1
+mpirun -np 2 ./struct -lobpcg -solver 18 -tol 1.e-4  -pcgitr 10 -seed 1 -vrand 3  > solvers_struct.out.18.lobpcg.3
+
+# PCG run... LOBPCG with one eigenpair .... LOBPCG with 3 eigenpairs
+mpirun -np 2 ./struct -solver 19 -tol 1.e-4  > solvers_struct.out.19.lobpcg
+mpirun -np 2 ./struct -lobpcg -solver 19 -tol 1.e-4  -pcgitr 10 -seed 1 -vrand 1  > solvers_struct.out.19.lobpcg.1
+mpirun -np 2 ./struct -lobpcg -solver 19 -tol 1.e-4  -pcgitr 10 -seed 1 -vrand 3  > solvers_struct.out.19.lobpcg.3
diff -Nru hypre-2.11.2/src/test/TEST_single/solvers_struct.saved hypre-2.13.0/src/test/TEST_single/solvers_struct.saved
--- hypre-2.11.2/src/test/TEST_single/solvers_struct.saved	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/test/TEST_single/solvers_struct.saved	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,120 @@
+# Output file: solvers_struct.out.0
+Iterations = 3
+Final Relative Residual Norm = 3.246673e-05
+
+# Output file: solvers_struct.out.1
+Iterations = 6
+Final Relative Residual Norm = 2.055852e-05
+
+# Output file: solvers_struct.out.2
+Iterations = 16
+Final Relative Residual Norm = 5.377689e-05
+
+# Output file: solvers_struct.out.3
+Iterations = 16
+Final Relative Residual Norm = 3.726248e-05
+
+# Output file: solvers_struct.out.4
+Iterations = 16
+Final Relative Residual Norm = 3.718718e-05
+
+# Output file: solvers_struct.out.10.lobpcg
+Iterations = 3
+Final Relative Residual Norm = 6.275783e-06
+
+# Output file: solvers_struct.out.10.lobpcg.1
+Eigenvalue lambda   1.84366211295128e-01
+Residual   2.48430933424970e-05
+
+# Output file: solvers_struct.out.10.lobpcg.3
+Iteration 10 	bsize 2 	maxres   4.35155990999192e-04
+Iteration 11 	bsize 1 	maxres   2.05302669201046e-04
+Iteration 12 	bsize 1 	maxres   8.52039884193800e-05
+
+Eigenvalue lambda   1.84366419911385e-01
+Eigenvalue lambda   2.50882804393768e-01
+Eigenvalue lambda   3.60089868307114e-01
+Residual   7.40827381378040e-05
+Residual   4.10445172747131e-05
+Residual   8.52039884193800e-05
+
+# Output file: solvers_struct.out.11.lobpcg
+Iterations = 6
+Final Relative Residual Norm = 2.112822e-05
+
+# Output file: solvers_struct.out.11.lobpcg.1
+Eigenvalue lambda   1.84366270899773e-01
+Residual   3.14826756948605e-05
+
+# Output file: solvers_struct.out.11.lobpcg.3
+Iteration 11 	bsize 2 	maxres   6.89935637637973e-04
+Iteration 12 	bsize 2 	maxres   2.52041267231107e-04
+Iteration 13 	bsize 1 	maxres   7.04026824678294e-05
+
+Eigenvalue lambda   1.84366539120674e-01
+Eigenvalue lambda   2.50883042812347e-01
+Eigenvalue lambda   3.60092163085938e-01
+Residual   5.59787331440020e-05
+Residual   2.58176805800758e-05
+Residual   7.04026824678294e-05
+
+# Output file: solvers_struct.out.17.lobpcg
+Iterations = 20
+Final Relative Residual Norm = 4.194806e-07
+
+# Output file: solvers_struct.out.17.lobpcg.1
+Eigenvalue lambda   1.84366390109062e-01
+Residual   1.95900083781453e-05
+
+# Output file: solvers_struct.out.17.lobpcg.3
+Iteration 10 	bsize 2 	maxres   3.62457707524300e-04
+Iteration 11 	bsize 1 	maxres   1.69860562891699e-04
+Iteration 12 	bsize 1 	maxres   7.12833571014926e-05
+
+Eigenvalue lambda   1.84366583824158e-01
+Eigenvalue lambda   2.50883996486664e-01
+Eigenvalue lambda   3.60090583562851e-01
+Residual   5.53683385078330e-05
+Residual   3.08582348225173e-05
+Residual   7.12833571014926e-05
+
+# Output file: solvers_struct.out.18.lobpcg
+Iterations = 33
+Final Relative Residual Norm = 8.027236e-07
+
+# Output file: solvers_struct.out.18.lobpcg.1
+Eigenvalue lambda   1.84366077184677e-01
+Residual   4.44860852439888e-05
+
+# Output file: solvers_struct.out.18.lobpcg.3
+Iteration 10 	bsize 2 	maxres   5.79534214921296e-04
+Iteration 11 	bsize 1 	maxres   1.98537352844141e-04
+Iteration 12 	bsize 1 	maxres   9.26745269680396e-05
+
+Eigenvalue lambda   1.84366509318352e-01
+Eigenvalue lambda   2.50894546508789e-01
+Eigenvalue lambda   3.60091388225555e-01
+Residual   9.26745269680396e-05
+Residual   8.93285541678779e-05
+Residual   5.54441285203211e-05
+
+# Output file: solvers_struct.out.19.lobpcg
+Iterations = 25
+Final Relative Residual Norm = 7.712499e-05
+
+# Output file: solvers_struct.out.19.lobpcg.1
+Eigenvalue lambda   1.84366524219513e-01
+Residual   4.43533899670001e-05
+
+# Output file: solvers_struct.out.19.lobpcg.3
+Iteration 10 	bsize 2 	maxres   5.82181091886014e-04
+Iteration 11 	bsize 1 	maxres   1.99127141968347e-04
+Iteration 12 	bsize 1 	maxres   9.26545544643886e-05
+
+Eigenvalue lambda   1.84366345405579e-01
+Eigenvalue lambda   2.50889092683792e-01
+Eigenvalue lambda   3.60089659690857e-01
+Residual   9.26545544643886e-05
+Residual   8.75688710948452e-05
+Residual   5.51430639461614e-05
+
diff -Nru hypre-2.11.2/src/test/TEST_single/solvers_struct.sh hypre-2.13.0/src/test/TEST_single/solvers_struct.sh
--- hypre-2.11.2/src/test/TEST_single/solvers_struct.sh	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/test/TEST_single/solvers_struct.sh	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,79 @@
+#!/bin/sh
+#BHEADER**********************************************************************
+# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+# This file is part of HYPRE.  See file COPYRIGHT for details.
+#
+# HYPRE is free software; you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License (as published by the Free
+# Software Foundation) version 2.1 dated February 1999.
+#
+# $Revision$
+#EHEADER**********************************************************************
+
+
+
+
+
+TNAME=`basename $0 .sh`
+CONVTOL=$1
+
+# Set default check tolerance
+if [ x$CONVTOL = "x" ];
+then
+    CONVTOL=0.0
+fi
+#echo "tol = $CONVTOL"
+#=============================================================================
+# compare with baseline case
+#=============================================================================
+
+FILES="\
+ ${TNAME}.out.0\
+ ${TNAME}.out.1\
+ ${TNAME}.out.2\
+ ${TNAME}.out.3\
+ ${TNAME}.out.4\
+"
+
+for i in $FILES
+do
+  echo "# Output file: $i"
+  tail -3 $i
+done > ${TNAME}.out
+
+FILES="\
+ ${TNAME}.out.10.lobpcg\
+ ${TNAME}.out.11.lobpcg\
+ ${TNAME}.out.17.lobpcg\
+ ${TNAME}.out.18.lobpcg\
+ ${TNAME}.out.19.lobpcg\
+"
+
+for i in $FILES
+do
+  echo "# Output file: $i"
+  tail -3 $i
+  echo "# Output file: $i.1"
+  tail -13 $i.1 | head -3
+  echo "# Output file: $i.3"
+  tail -21 $i.3 | head -11
+done >> ${TNAME}.out
+
+# Make sure that the output files are reasonable
+CHECK_LINE="Iterations"
+OUT_COUNT=`grep "$CHECK_LINE" ${TNAME}.out | wc -l`
+SAVED_COUNT=`grep "$CHECK_LINE" ${TNAME}.saved | wc -l`
+if [ "$OUT_COUNT" != "$SAVED_COUNT" ]; then
+   echo "Incorrect number of \"$CHECK_LINE\" lines in ${TNAME}.out" >&2
+fi
+
+if [ -z $HYPRE_NO_SAVED ]; then
+   (../runcheck.sh ${TNAME}.out ${TNAME}.saved $CONVTOL) >&2
+fi
+
+#=============================================================================
+# remove temporary files
+#=============================================================================
+
+# rm -f ${TNAME}.testdata*
diff -Nru hypre-2.11.2/src/test/TEST_sstruct/solvers.saved hypre-2.13.0/src/test/TEST_sstruct/solvers.saved
--- hypre-2.11.2/src/test/TEST_sstruct/solvers.saved	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/test/TEST_sstruct/solvers.saved	2017-10-20 17:42:22.000000000 +0000
@@ -87,70 +87,70 @@
 Final Relative Residual Norm = 6.429522e-07
 
 # Output file: solvers.out.10.lobpcg.1
-Eigenvalue lambda   1.34883860790786e+00
-Residual   2.40758202827198e-03
+Eigenvalue lambda   1.34880853089918e+00
+Residual   1.40515754419491e-04
 
 # Output file: solvers.out.10.lobpcg.4
-Eigenvalue lambda   1.34880848246868e+00
-Eigenvalue lambda   1.36560847743331e+00
-Eigenvalue lambda   1.47908699641224e+00
-Eigenvalue lambda   1.49589925871990e+00
-Residual   2.16895015738541e-06
-Residual   2.26924940977357e-06
-Residual   2.30461087038239e-06
-Residual   2.31349855264711e-06
+Eigenvalue lambda   1.34880848246852e+00
+Eigenvalue lambda   1.36560847743334e+00
+Eigenvalue lambda   1.47908699641636e+00
+Eigenvalue lambda   1.49589925873578e+00
+Residual   1.88411068757809e-06
+Residual   1.92352730596892e-06
+Residual   2.62402461132032e-06
+Residual   5.33772292781341e-06
 
 # Output file: solvers.out.11.lobpcg
 Iterations = 24
 Final Relative Residual Norm = 6.654613e-07
 
 # Output file: solvers.out.11.lobpcg.1
-Eigenvalue lambda   1.34884333064988e+00
-Residual   2.59663483715351e-03
+Eigenvalue lambda   1.34880853049745e+00
+Residual   1.27609473406123e-04
 
 # Output file: solvers.out.11.lobpcg.4
-Eigenvalue lambda   1.34880848246851e+00
-Eigenvalue lambda   1.36560847743324e+00
-Eigenvalue lambda   1.47908699641209e+00
-Eigenvalue lambda   1.49589925871908e+00
-Residual   1.80601168111517e-06
-Residual   2.08224805414380e-06
-Residual   2.43864085668235e-06
-Residual   2.07089097060174e-06
+Eigenvalue lambda   1.34880848246853e+00
+Eigenvalue lambda   1.36560847743866e+00
+Eigenvalue lambda   1.47908699650267e+00
+Eigenvalue lambda   1.49589925889085e+00
+Residual   2.34138605050087e-06
+Residual   2.11050112570845e-06
+Residual   2.34273161452232e-06
+Residual   4.55133814558691e-06
 
 # Output file: solvers.out.18.lobpcg
 Iterations = 25
 Final Relative Residual Norm = 9.124482e-07
 
 # Output file: solvers.out.18.lobpcg.1
-Eigenvalue lambda   1.34880848247360e+00
-Residual   1.27710890197410e-06
+Eigenvalue lambda   1.34880848247421e+00
+Residual   1.68760664468992e-06
 
 # Output file: solvers.out.18.lobpcg.4
-Eigenvalue lambda   1.34880848246838e+00
-Eigenvalue lambda   1.36560847743320e+00
-Eigenvalue lambda   1.47908699641198e+00
-Eigenvalue lambda   1.49589925871866e+00
-Residual   1.35084619640744e-06
-Residual   1.98965928996447e-06
-Residual   1.13053053095074e-06
-Residual   2.44142594272676e-06
+Eigenvalue lambda   1.34880848246833e+00
+Eigenvalue lambda   1.36560847743301e+00
+Eigenvalue lambda   1.47908699641137e+00
+Eigenvalue lambda   1.49589925871590e+00
+Residual   7.29467860779249e-07
+Residual   9.62135023675930e-07
+Residual   1.05238502112812e-06
+Residual   1.42836272383011e-06
 
 # Output file: solvers.out.19.lobpcg
 Iterations = 25
 Final Relative Residual Norm = 9.124482e-07
 
 # Output file: solvers.out.19.lobpcg.1
-Eigenvalue lambda   1.34880848247360e+00
-Residual   1.27710890239225e-06
+Eigenvalue lambda   1.34880848247421e+00
+Residual   1.68760664640186e-06
 
 # Output file: solvers.out.19.lobpcg.4
-Eigenvalue lambda   1.34880848246840e+00
-Eigenvalue lambda   1.36560847743319e+00
-Eigenvalue lambda   1.47908699641203e+00
-Eigenvalue lambda   1.49589925871871e+00
-Residual   1.35084619677939e-06
-Residual   1.98965929201569e-06
-Residual   1.13053053130770e-06
-Residual   2.44142594408303e-06
+Eigenvalue lambda   1.34880848246915e+00
+Eigenvalue lambda   1.36560847743475e+00
+Eigenvalue lambda   1.47908699641975e+00
+Eigenvalue lambda   1.49589925875392e+00
+Residual   7.29467866482847e-07
+Residual   9.62135024900930e-07
+Residual   1.05238500241244e-06
+Residual   1.42836275555007e-06
 
diff -Nru hypre-2.11.2/src/test/TEST_struct/solvers.saved hypre-2.13.0/src/test/TEST_struct/solvers.saved
--- hypre-2.11.2/src/test/TEST_struct/solvers.saved	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/test/TEST_struct/solvers.saved	2017-10-20 17:42:22.000000000 +0000
@@ -23,98 +23,98 @@
 Final Relative Residual Norm = 1.355288e-07
 
 # Output file: solvers.out.10.lobpcg.1
-Eigenvalue lambda   1.84366453092285e-01
-Residual   9.48102341087739e-07
+Eigenvalue lambda   1.84366453091860e-01
+Residual   4.52048636947317e-07
 
 # Output file: solvers.out.10.lobpcg.5
-Eigenvalue lambda   1.84366453091761e-01
-Eigenvalue lambda   2.50882493969895e-01
-Eigenvalue lambda   3.60090369737266e-01
-Eigenvalue lambda   4.20845334658830e-01
-Eigenvalue lambda   4.20845334659278e-01
-Residual   3.22224736401280e-07
-Residual   4.14402445882080e-07
-Residual   9.42143466466521e-07
-Residual   7.77020993206880e-07
-Residual   7.94159301934379e-07
+Eigenvalue lambda   1.84366453091770e-01
+Eigenvalue lambda   2.50882493969728e-01
+Eigenvalue lambda   3.60090369737200e-01
+Eigenvalue lambda   4.20845334658571e-01
+Eigenvalue lambda   4.20845334658890e-01
+Residual   3.35930189890736e-07
+Residual   3.11260814535858e-07
+Residual   7.33053164875689e-07
+Residual   5.14259223167152e-07
+Residual   7.32161520953280e-07
 
 # Output file: solvers.out.11.lobpcg
 Iterations = 8
 Final Relative Residual Norm = 4.807900e-07
 
 # Output file: solvers.out.11.lobpcg.1
-Eigenvalue lambda   1.84366453091815e-01
-Residual   3.85721107269604e-07
+Eigenvalue lambda   1.84366453092320e-01
+Residual   1.05361158868911e-06
 
 # Output file: solvers.out.11.lobpcg.5
-Eigenvalue lambda   1.84366453091761e-01
-Eigenvalue lambda   2.50882493969760e-01
-Eigenvalue lambda   3.60090369737228e-01
-Eigenvalue lambda   4.20845334658737e-01
-Eigenvalue lambda   4.20845334659947e-01
-Residual   2.05382831978917e-07
-Residual   4.59866546803063e-07
-Residual   9.90818911866409e-07
-Residual   1.33480975037863e-06
-Residual   1.29716729646718e-06
+Eigenvalue lambda   1.84366453091757e-01
+Eigenvalue lambda   2.50882493969849e-01
+Eigenvalue lambda   3.60090369737247e-01
+Eigenvalue lambda   4.20845334660032e-01
+Eigenvalue lambda   4.20845334660337e-01
+Residual   1.94648827543750e-07
+Residual   1.22160192258141e-06
+Residual   7.71834588037176e-07
+Residual   1.28057049961329e-06
+Residual   1.26100802396762e-06
 
 # Output file: solvers.out.17.lobpcg
 Iterations = 17
 Final Relative Residual Norm = 8.241147e-07
 
 # Output file: solvers.out.17.lobpcg.1
-Eigenvalue lambda   1.84366453091981e-01
-Residual   6.58030973127707e-07
+Eigenvalue lambda   1.84366453091822e-01
+Residual   2.74559569275018e-07
 
 # Output file: solvers.out.17.lobpcg.5
-Eigenvalue lambda   1.84366453091754e-01
-Eigenvalue lambda   2.50882493969533e-01
-Eigenvalue lambda   3.60090369737259e-01
-Eigenvalue lambda   4.20845334658560e-01
-Eigenvalue lambda   4.20845334659094e-01
-Residual   2.18600906457648e-07
-Residual   2.01409973718392e-07
-Residual   7.10622693277451e-07
-Residual   8.92153133859236e-07
-Residual   9.10269958132537e-07
+Eigenvalue lambda   1.84366453091761e-01
+Eigenvalue lambda   2.50882493969758e-01
+Eigenvalue lambda   3.60090369737186e-01
+Eigenvalue lambda   4.20845334658611e-01
+Eigenvalue lambda   4.20845334658816e-01
+Residual   2.41984718651313e-07
+Residual   1.16310823945663e-06
+Residual   6.95498018578113e-07
+Residual   7.11395856759138e-07
+Residual   7.20734654889730e-07
 
 # Output file: solvers.out.18.lobpcg
 Iterations = 32
 Final Relative Residual Norm = 8.259590e-07
 
 # Output file: solvers.out.18.lobpcg.1
-Eigenvalue lambda   1.84366453091822e-01
-Residual   2.06395470109285e-07
+Eigenvalue lambda   1.84366453091920e-01
+Residual   4.04811751101852e-07
 
 # Output file: solvers.out.18.lobpcg.5
-Eigenvalue lambda   1.84366453091753e-01
-Eigenvalue lambda   2.50882493969752e-01
-Eigenvalue lambda   3.60090369737251e-01
-Eigenvalue lambda   4.20845334658871e-01
-Eigenvalue lambda   4.20845334659104e-01
-Residual   1.23291603592449e-07
-Residual   4.17547991069515e-07
-Residual   1.17426104064181e-06
-Residual   1.09268026031814e-06
-Residual   9.56432798770891e-07
+Eigenvalue lambda   1.84366453091760e-01
+Eigenvalue lambda   2.50882493969440e-01
+Eigenvalue lambda   3.60090369737180e-01
+Eigenvalue lambda   4.20845334658791e-01
+Eigenvalue lambda   4.20845334659400e-01
+Residual   6.51620151548372e-07
+Residual   1.04113397676966e-06
+Residual   1.05795257133655e-06
+Residual   1.01625954175797e-06
+Residual   1.09288008877697e-06
 
 # Output file: solvers.out.19.lobpcg
 Iterations = 32
 Final Relative Residual Norm = 8.259590e-07
 
 # Output file: solvers.out.19.lobpcg.1
-Eigenvalue lambda   1.84366453091822e-01
-Residual   2.06395470089861e-07
+Eigenvalue lambda   1.84366453091920e-01
+Residual   4.04811751079993e-07
 
 # Output file: solvers.out.19.lobpcg.5
-Eigenvalue lambda   1.84366453091755e-01
-Eigenvalue lambda   2.50882493969748e-01
-Eigenvalue lambda   3.60090369737247e-01
-Eigenvalue lambda   4.20845334658872e-01
-Eigenvalue lambda   4.20845334659103e-01
-Residual   1.23291603639448e-07
-Residual   4.17547991296164e-07
-Residual   1.17426103999361e-06
-Residual   1.09271838882886e-06
-Residual   9.55993938911220e-07
+Eigenvalue lambda   1.84366453091760e-01
+Eigenvalue lambda   2.50882493969706e-01
+Eigenvalue lambda   3.60090369737178e-01
+Eigenvalue lambda   4.20845334658819e-01
+Eigenvalue lambda   4.20845334659484e-01
+Residual   6.51620151941224e-07
+Residual   1.04113396974827e-06
+Residual   1.05795252287838e-06
+Residual   1.01635836602686e-06
+Residual   1.09013903468312e-06
 
diff -Nru hypre-2.11.2/src/test/zboxloop.c hypre-2.13.0/src/test/zboxloop.c
--- hypre-2.11.2/src/test/zboxloop.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/test/zboxloop.c	2017-10-20 17:42:22.000000000 +0000
@@ -38,7 +38,8 @@
    HYPRE_Int         rep, reps, fail, sum;
    HYPRE_Int         size;
    hypre_Box        *x1_data_box, *x2_data_box, *x3_data_box, *x4_data_box;
-   HYPRE_Int         xi1, xi2, xi3, xi4;
+   //HYPRE_Int         xi1, xi2, xi3, xi4;
+   HYPRE_Int         xi1;
    HYPRE_Real       *xp1, *xp2, *xp3, *xp4;
    hypre_Index       loop_size, start, unit_stride, index;
    
@@ -192,7 +193,7 @@
    zypre_BoxLoop1Begin(dim, loop_size,
                        x1_data_box, start, unit_stride, xi1);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ZYPRE_BOX_PRIVATE,xi1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(ZYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
    zypre_BoxLoop1For(xi1)
    {
@@ -203,9 +204,8 @@
    /* Use old boxloop to check that values are set to 1 */
    fail = 0;
    sum = 0;
-   hypre_BoxLoop1Begin(3, loop_size,
-                       x1_data_box, start, unit_stride, xi1);
-   hypre_BoxLoop1For(xi1)
+   hypre_SerialBoxLoop1Begin(3, loop_size,
+			     x1_data_box, start, unit_stride, xi1);
    {
       sum += xp1[xi1];
       if (xp1[xi1] != 1)
@@ -216,7 +216,7 @@
          fail = 1;
       }
    }
-   hypre_BoxLoop1End(xi1);
+   hypre_SerialBoxLoop1End(xi1);
 
    if (sum != (nx*ny*nz))
    {
@@ -251,7 +251,7 @@
       hypre_BoxLoop0For()
       {
          xp1[xi1] += xp1[xi1];
-         xi1++;
+         //xi1++;
       }
       hypre_BoxLoop0End();
    }
@@ -265,7 +265,7 @@
       hypre_BoxLoop1Begin(3, loop_size,
                           x1_data_box, start, unit_stride, xi1);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop1For(xi1)
       {
@@ -284,7 +284,7 @@
                           x1_data_box, start, unit_stride, xi1,
                           x2_data_box, start, unit_stride, xi2);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi1,xi2) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(xi1, xi2)
       {
@@ -304,7 +304,7 @@
                           x2_data_box, start, unit_stride, xi2,
                           x3_data_box, start, unit_stride, xi3);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi1,xi2,xi3) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop3For(xi1, xi2, xi3)
       {
@@ -325,7 +325,7 @@
                           x3_data_box, start, unit_stride, xi3,
                           x4_data_box, start, unit_stride, xi4);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi1,xi2,xi3,xi4) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(xi1, xi2, xi3, xi4)
       {
@@ -370,7 +370,7 @@
       zypre_BoxLoop1Begin(dim, loop_size,
                           x1_data_box, start, unit_stride, xi1);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ZYPRE_BOX_PRIVATE,xi1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(ZYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       zypre_BoxLoop1For(xi1)
       {
@@ -389,7 +389,7 @@
                           x1_data_box, start, unit_stride, xi1,
                           x2_data_box, start, unit_stride, xi2);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ZYPRE_BOX_PRIVATE,xi1,xi2) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(ZYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       zypre_BoxLoop2For(xi1, xi2)
       {
@@ -409,7 +409,7 @@
                           x2_data_box, start, unit_stride, xi2,
                           x3_data_box, start, unit_stride, xi3);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ZYPRE_BOX_PRIVATE,xi1,xi2,xi3) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(ZYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       zypre_BoxLoop3For(xi1, xi2, xi3)
       {
@@ -430,7 +430,7 @@
                           x3_data_box, start, unit_stride, xi3,
                           x4_data_box, start, unit_stride, xi4);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ZYPRE_BOX_PRIVATE,xi1,xi2,xi3,xi4) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(ZYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       zypre_BoxLoop4For(xi1, xi2, xi3, xi4)
       {
diff -Nru hypre-2.11.2/src/utilities/amg_linklist.h hypre-2.13.0/src/utilities/amg_linklist.h
--- hypre-2.11.2/src/utilities/amg_linklist.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/utilities/amg_linklist.h	2017-10-20 17:42:22.000000000 +0000
@@ -10,8 +10,6 @@
  * $Revision$
  ***********************************************************************EHEADER*/
 
-
-
 /******************************************************************************
  *
  * Header file link lists
@@ -46,3 +44,4 @@
 #endif
 
 #endif
+
diff -Nru hypre-2.11.2/src/utilities/caliper_instrumentation.h hypre-2.13.0/src/utilities/caliper_instrumentation.h
--- hypre-2.11.2/src/utilities/caliper_instrumentation.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/utilities/caliper_instrumentation.h	2017-10-20 17:42:22.000000000 +0000
@@ -36,3 +36,4 @@
 #endif
 
 #endif /* CALIPER_INSTRUMENTATION_HEADER */
+
diff -Nru hypre-2.11.2/src/utilities/exchange_data.h hypre-2.13.0/src/utilities/exchange_data.h
--- hypre-2.11.2/src/utilities/exchange_data.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/utilities/exchange_data.h	2017-10-20 17:42:22.000000000 +0000
@@ -10,7 +10,6 @@
  * $Revision$
  ***********************************************************************EHEADER*/
 
-
 #ifndef hypre_EXCHANGE_DATA_HEADER
 #define hypre_EXCHANGE_DATA_HEADER
 
@@ -19,7 +18,6 @@
 #define hypre_BinaryTreeChildIds(tree)      (tree->child_id)
 #define hypre_BinaryTreeChildId(tree, i)    (tree->child_id[i])
 
-
 typedef struct
 {
    HYPRE_Int                   parent_id;
@@ -27,8 +25,6 @@
    HYPRE_Int		        *child_id;
 } hypre_BinaryTree;
 
-
-
 /* In the fill_response() function the user needs to set the recv__buf
    and the response_message_size.  Memory of size send_response_storage has been
    alllocated for the send_buf (in exchange_data) - if more is needed, then
@@ -37,7 +33,6 @@
    If the response is an empty "confirmation" message, then set
    response_message_size =0 (and do not modify the send_buf) */
 
-
 typedef struct
 {
    HYPRE_Int    (*fill_response)(void* recv_buf, HYPRE_Int contact_size, 
@@ -51,11 +46,9 @@
    
 } hypre_DataExchangeResponse;
 
-
 HYPRE_Int hypre_CreateBinaryTree(HYPRE_Int, HYPRE_Int, hypre_BinaryTree*);
 HYPRE_Int hypre_DestroyBinaryTree(hypre_BinaryTree*);
 
-
 HYPRE_Int hypre_DataExchangeList(HYPRE_Int num_contacts, 
 		     HYPRE_Int *contact_proc_list, void *contact_send_buf, 
 		     HYPRE_Int *contact_send_buf_starts, HYPRE_Int contact_obj_size, 
@@ -64,5 +57,5 @@
                      HYPRE_Int rnum, MPI_Comm comm,  void **p_response_recv_buf, 
                      HYPRE_Int **p_response_recv_buf_starts);
 
-
 #endif /* end of header */
+
diff -Nru hypre-2.11.2/src/utilities/general.h hypre-2.13.0/src/utilities/general.h
--- hypre-2.11.2/src/utilities/general.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/utilities/general.h	2017-10-20 17:42:22.000000000 +0000
@@ -10,7 +10,6 @@
  * $Revision$
  ***********************************************************************EHEADER*/
 
-
 /******************************************************************************
  *
  * General structures and values
@@ -20,6 +19,15 @@
 #ifndef hypre_GENERAL_HEADER
 #define hypre_GENERAL_HEADER
 
+/* This allows us to consistently avoid 'int' throughout hypre */
+typedef int               hypre_int;
+typedef long int          hypre_longint;
+typedef unsigned int      hypre_uint;
+typedef unsigned long int hypre_ulongint;
+
+/* This allows us to consistently avoid 'double' throughout hypre */
+typedef double            hypre_double;
+
 /*--------------------------------------------------------------------------
  * Define various functions
  *--------------------------------------------------------------------------*/
@@ -44,3 +52,4 @@
 #endif
 
 #endif
+
diff -Nru hypre-2.11.2/src/utilities/gpgpu.h hypre-2.13.0/src/utilities/gpgpu.h
--- hypre-2.11.2/src/utilities/gpgpu.h	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/utilities/gpgpu.h	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,19 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+#if defined(HYPRE_USE_GPU) && defined(HYPRE_USE_MANAGED)
+//#define CUDAMEMATTACHTYPE cudaMemAttachGlobal
+//#define CUDAMEMATTACHTYPE cudaMemAttachHost
+#define HYPRE_GPU_USE_PINNED 1
+#define HYPRE_USE_MANAGED_SCALABLE 1
+#endif
+
diff -Nru hypre-2.11.2/src/utilities/gpuErrorCheck.c hypre-2.13.0/src/utilities/gpuErrorCheck.c
--- hypre-2.11.2/src/utilities/gpuErrorCheck.c	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/utilities/gpuErrorCheck.c	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,111 @@
+
+#include "_hypre_utilities.h"
+
+#if defined(HYPRE_USE_GPU) || defined(HYPRE_USE_MANAGED)
+#include <signal.h>
+#ifdef HYPRE_USE_GPU
+extern const char *cusparseErrorCheck(cusparseStatus_t error);
+extern void gpuAssert(cudaError_t code, const char *file, int line);
+extern void cusparseAssert(cusparseStatus_t code, const char *file, int line);
+#endif
+
+/*
+  cudaSafeFree frees Managed memory allocated in hypre_MAlloc,hypre_CAlloc and hypre_ReAlloc
+  It checks if the memory is managed before freeing and emits a warning if it is not memory
+  allocated using the above routines. This behaviour can be changed by defining ABORT_ON_RAW_POINTER.
+  The core file can then be used to find the location of the anomalous hypre_Free.
+ */
+void cudaSafeFree(void *ptr,int padding)
+{
+  PUSH_RANGE("SAFE_FREE",3);
+  struct cudaPointerAttributes ptr_att;
+  size_t *sptr=(size_t*)ptr-padding;
+  cudaError_t err;
+
+  err=cudaPointerGetAttributes(&ptr_att,ptr);
+  if (err!=cudaSuccess){
+    cudaGetLastError(); 
+#define FULL_WARN
+#ifndef ABORT_ON_RAW_POINTER
+#ifdef FULL_WARN
+    if (err==cudaErrorInvalidValue) fprintf(stderr,"WARNING :: Raw pointer passed to cudaSafeFree %p\n",ptr);
+    if (err==cudaErrorInvalidDevice) fprintf(stderr,"WARNING :: cudaSafeFree :: INVALID DEVICE on ptr = %p\n",ptr);
+    //PrintPointerAttributes(ptr);
+#endif
+#else
+    fprintf(stderr,"ERROR:: cudaSafeFree Aborting on raw unmanaged pointer %p\n",ptr);
+    raise(SIGABRT);
+#endif
+    free(ptr); /* Free the nonManaged pointer */
+    return;
+  }
+  if (ptr_att.isManaged){
+#if defined(HYPRE_USE_GPU) && defined(HYPRE_MEASURE_GPU_HWM)
+    size_t mfree,mtotal;
+    gpuErrchk(cudaMemGetInfo(&mfree,&mtotal));
+    HYPRE_GPU_HWM=hypre_max((mtotal-mfree),HYPRE_GPU_HWM);
+#endif
+    /* Code below for handling managed memory pointers not allocated using hypre_CTAlloc oir hypre_TAlooc */
+    if (PointerAttributes(ptr)!=PointerAttributes(sptr)){
+      //fprintf(stderr,"ERROR IN Pointer for freeing %p %p\n",ptr,sptr);
+      gpuErrchk(cudaFree(ptr)); 
+      return;
+    }
+    gpuErrchk(cudaFree(sptr)); 
+  } else {
+    /* It is a pinned memory pointer */
+    //printf("ERROR:: NON-managed pointer passed to cudaSafeFree\n");
+    if (ptr_att.memoryType==cudaMemoryTypeHost){
+      gpuErrchk(cudaFreeHost(sptr));
+    } else if (ptr_att.memoryType==cudaMemoryTypeDevice){
+      gpuErrchk(cudaFree(sptr)); 
+    }
+  }
+  POP_RANGE;
+  return;
+}
+hypre_int PrintPointerAttributes(const void *ptr){
+  struct cudaPointerAttributes ptr_att;
+  if (cudaPointerGetAttributes(&ptr_att,ptr)!=cudaSuccess){
+    cudaGetLastError(); 
+    fprintf(stderr,"PrintPointerAttributes:: Raw pointer %p\n",ptr);
+    return HYPRE_HOST_POINTER;
+  }
+  if (ptr_att.isManaged){
+    fprintf(stderr,"PrintPointerAttributes:: Managed pointer\n");
+    fprintf(stderr,"Host address = %p, Device Address = %p\n",ptr_att.hostPointer, ptr_att.devicePointer);
+    if (ptr_att.memoryType==cudaMemoryTypeHost) fprintf(stderr,"Memory is located on host\n");
+    if (ptr_att.memoryType==cudaMemoryTypeDevice) fprintf(stderr,"Memory is located on device\n");
+    fprintf(stderr,"Device associated with this pointer is %d\n",ptr_att.device);
+    return HYPRE_MANAGED_POINTER;
+  } else {
+    fprintf(stderr,"PrintPointerAttributes:: Non-Managed & non-raw pointer\n Probably pinned host pointer\n");
+    if (ptr_att.memoryType==cudaMemoryTypeHost) {
+      fprintf(stderr,"Memory is located on host\n");
+      return HYPRE_PINNED_POINTER;
+    }
+    if (ptr_att.memoryType==cudaMemoryTypeDevice) {
+      fprintf(stderr,"Memory is located on device\n");
+      return HYPRE_DEVICE_POINTER ;
+    }
+    return HYPRE_UNDEFINED_POINTER1;
+  }
+  return HYPRE_UNDEFINED_POINTER2;
+}
+hypre_int PointerAttributes(const void *ptr){
+  struct cudaPointerAttributes ptr_att;
+  if (cudaPointerGetAttributes(&ptr_att,ptr)!=cudaSuccess){
+     cudaGetLastError(); 
+     return HYPRE_HOST_POINTER;
+  }
+  if (ptr_att.isManaged){
+    return HYPRE_MANAGED_POINTER; 
+  } else {
+    if (ptr_att.memoryType==cudaMemoryTypeHost) return HYPRE_PINNED_POINTER; /* Host pointer from cudaMallocHost */
+    if (ptr_att.memoryType==cudaMemoryTypeDevice) return HYPRE_DEVICE_POINTER ; /* cudadevice pointer */
+    return HYPRE_UNDEFINED_POINTER1; /* Shouldn't happen */
+  }
+  return HYPRE_UNDEFINED_POINTER2; /* Shouldnt happen */
+}
+
+#endif
diff -Nru hypre-2.11.2/src/utilities/gpuErrorCheck.h hypre-2.13.0/src/utilities/gpuErrorCheck.h
--- hypre-2.11.2/src/utilities/gpuErrorCheck.h	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/utilities/gpuErrorCheck.h	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,153 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+#ifdef HYPRE_USE_MANAGED
+#include <cuda_runtime_api.h>
+#define CUDAMEMATTACHTYPE cudaMemAttachGlobal
+#define MEM_PAD_LEN 1
+#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, const char *file, int line)
+{
+   if (code != cudaSuccess) 
+   {
+     fprintf(stderr,"CUDA ERROR ( Code = %d) in line %d of file %s\n",code,line,file);
+     fprintf(stderr,"CUDA ERROR : %s \n", cudaGetErrorString(code));
+     exit(2);
+   }
+}
+#define HYPRE_HOST_POINTER 0
+#define HYPRE_MANAGED_POINTER 1
+#define HYPRE_PINNED_POINTER 2
+#define HYPRE_DEVICE_POINTER 3
+#define HYPRE_UNDEFINED_POINTER1 4
+#define HYPRE_UNDEFINED_POINTER2 5
+void cudaSafeFree(void *ptr,int padding);
+hypre_int PrintPointerAttributes(const void *ptr);
+hypre_int PointerAttributes(const void *ptr);
+#endif
+
+#if defined(HYPRE_USE_GPU) && defined(HYPRE_USE_MANAGED)
+#ifndef __cusparseErrorCheck__
+#define __cusparseErrorCheck__
+#include <cusparse.h>
+#include <cublas_v2.h>
+#include <stdio.h>
+//#include <cuda_runtime_api.h>
+#include <stdlib.h>
+inline const char *cusparseErrorCheck(cusparseStatus_t error)
+{
+    switch (error)
+    {
+        case CUSPARSE_STATUS_SUCCESS:
+            return "CUSPARSE_STATUS_SUCCESS";
+
+        case CUSPARSE_STATUS_NOT_INITIALIZED:
+            return "CUSPARSE_STATUS_NOT_INITIALIZED";
+
+        case CUSPARSE_STATUS_ALLOC_FAILED:
+            return "CUSPARSE_STATUS_ALLOC_FAILED";
+
+        case CUSPARSE_STATUS_INVALID_VALUE:
+            return "CUSPARSE_STATUS_INVALID_VALUE";
+
+        case CUSPARSE_STATUS_ARCH_MISMATCH:
+            return "CUSPARSE_STATUS_ARCH_MISMATCH";
+
+        case CUSPARSE_STATUS_MAPPING_ERROR:
+            return "CUSPARSE_STATUS_MAPPING_ERROR";
+
+        case CUSPARSE_STATUS_EXECUTION_FAILED:
+            return "CUSPARSE_STATUS_EXECUTION_FAILED";
+
+        case CUSPARSE_STATUS_INTERNAL_ERROR:
+            return "CUSPARSE_STATUS_INTERNAL_ERROR";
+
+        case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+            return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+        default:
+	    return "Unknown error in cusparseErrorCheck";
+    }
+    
+}
+inline const char *cublasErrorCheck(cublasStatus_t error)
+{
+    switch (error)
+    {
+        case CUBLAS_STATUS_SUCCESS:
+            return "CUBLAS_STATUS_SUCCESS";
+
+        case CUBLAS_STATUS_NOT_INITIALIZED:
+            return "CUBLAS_STATUS_NOT_INITIALIZED";
+
+        case CUBLAS_STATUS_ALLOC_FAILED:
+            return "CUBLAS_STATUS_ALLOC_FAILED";
+
+        case CUBLAS_STATUS_INVALID_VALUE:
+            return "CUBLAS_STATUS_INVALID_VALUE";
+
+        case CUBLAS_STATUS_ARCH_MISMATCH:
+            return "CUBLAS_STATUS_ARCH_MISMATCH";
+
+        case CUBLAS_STATUS_MAPPING_ERROR:
+            return "CUBLAS_STATUS_MAPPING_ERROR";
+
+        case CUBLAS_STATUS_EXECUTION_FAILED:
+            return "CUBLAS_STATUS_EXECUTION_FAILED";
+
+        case CUBLAS_STATUS_INTERNAL_ERROR:
+            return "CUBLAS_STATUS_INTERNAL_ERROR";
+
+        case CUBLAS_STATUS_NOT_SUPPORTED:
+            return "CUBLAS_STATUS_NOT_SUPPORTED";
+        case CUBLAS_STATUS_LICENSE_ERROR:
+	    return "CUBLAS_STATUS_LICENSE_ERROR";
+        default:
+	    return "Unknown error in cublasErrorCheck";
+    }
+
+}
+//#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+//inline void gpuAssert(cudaError_t code, const char *file, int line)
+//{
+//   if (code != cudaSuccess) 
+//   {
+//     fprintf(stderr,"CUDA ERROR ( Code = %d) in line %d of file %s\n",code,line,file);
+//     fprintf(stderr,"CUDA ERROR : %s \n", cudaGetErrorString(code));
+//     exit(2);
+//   }
+//}
+#define cusparseErrchk(ans) { cusparseAssert((ans), __FILE__, __LINE__); }
+inline void cusparseAssert(cusparseStatus_t code, const char *file, int line)
+{
+   if (code != CUSPARSE_STATUS_SUCCESS) 
+   {
+     fprintf(stderr,"CUSPARSE ERROR  ( Code = %d) IN CUDA CALL line %d of file %s\n",code,line,file);
+     fprintf(stderr,"CUSPARSE ERROR : %s \n", cusparseErrorCheck(code));
+   }
+}
+#define cublasErrchk(ans){ cublasAssert((ans), __FILE__, __LINE__); }
+inline void cublasAssert(cublasStatus_t code, const char *file, int line)
+{
+   if (code != CUBLAS_STATUS_SUCCESS) 
+   {
+     fprintf(stderr,"CUBLAS ERROR  ( Code = %d) IN CUDA CALL line %d of file %s\n",code,line,file);
+     fprintf(stderr,"CUBLAS ERROR : %s \n", cublasErrorCheck(code));
+   }
+}
+//int PointerType(const void *ptr);
+void cudaSafeFree(void *ptr,int padding);
+//void PrintPointerAttributes(const void *ptr);
+//size_t mempush(void* ptr, size_t size,int purge);
+//int memloc(void *ptr, int device);
+#endif
+#endif
+
diff -Nru hypre-2.11.2/src/utilities/gpuMem.c hypre-2.13.0/src/utilities/gpuMem.c
--- hypre-2.11.2/src/utilities/gpuMem.c	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/utilities/gpuMem.c	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,513 @@
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include "_hypre_utilities.h"
+#if defined(HYPRE_USE_GPU) && defined(HYPRE_USE_MANAGED)
+#include <stdlib.h>
+#include <stdint.h>
+
+#include <sched.h>
+#include <errno.h>
+hypre_int ggc(hypre_int id);
+
+/* Global struct that holds device,library handles etc */
+struct hypre__global_struct hypre__global_handle = { .initd=0, .device=0, .device_count=1,.memoryHWM=0};
+
+
+/* Initialize GPU branch of Hypre AMG */
+/* use_device =-1 */
+/* Application passes device number it is using or -1 to let Hypre decide on which device to use */
+void hypre_GPUInit(hypre_int use_device){
+  char pciBusId[80];
+  hypre_int myid;
+  hypre_int nDevices;
+  hypre_int device;
+  if (!HYPRE_GPU_HANDLE){
+    HYPRE_GPU_HANDLE=1;
+    HYPRE_DEVICE=0;
+    gpuErrchk(cudaGetDeviceCount(&nDevices));
+    HYPRE_DEVICE_COUNT=nDevices;
+    
+    if (use_device<0){
+      if (nDevices==1){
+	/* with mpibind each process will only see 1 GPU */
+	HYPRE_DEVICE=0;
+	gpuErrchk(cudaSetDevice(HYPRE_DEVICE));
+	cudaDeviceGetPCIBusId ( pciBusId, 80, HYPRE_DEVICE);
+      } else if (nDevices>1) {
+	/* No mpibind or it is a single rank run */
+	hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid );
+	//affs(myid);
+	MPI_Comm node_comm;
+	MPI_Info info;
+	MPI_Info_create(&info);
+	MPI_Comm_split_type(hypre_MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, myid, info, &node_comm);
+	hypre_int round_robin=1;
+	hypre_int myNodeid, NodeSize;
+	MPI_Comm_rank(node_comm, &myNodeid);
+	MPI_Comm_size(node_comm, &NodeSize);
+	if (round_robin){
+	  /* Round robin allocation of GPUs. Does not account for affinities */
+	  HYPRE_DEVICE=myNodeid%nDevices; 
+	  gpuErrchk(cudaSetDevice(HYPRE_DEVICE));
+	  cudaDeviceGetPCIBusId ( pciBusId, 80, HYPRE_DEVICE);
+	  hypre_printf("WARNING:: Code running without mpibind\n");
+	  hypre_printf("Global ID = %d , Node ID %d running on device %d of %d \n",myid,myNodeid,HYPRE_DEVICE,nDevices);
+	} else {
+	  /* Try to set the GPU based on process binding */
+	  /* works correcly for all cases */
+	  MPI_Comm numa_comm;
+	  MPI_Comm_split(node_comm,getnuma(),myNodeid,&numa_comm);
+	  hypre_int myNumaId,NumaSize;
+	  MPI_Comm_rank(numa_comm, &myNumaId);
+	  MPI_Comm_size(numa_comm, &NumaSize);
+	  hypre_int domain_devices=nDevices/2; /* Again hardwired for 2 NUMA domains */
+	  HYPRE_DEVICE = getnuma()*2+myNumaId%domain_devices;
+	  gpuErrchk(cudaSetDevice(HYPRE_DEVICE));
+	  hypre_printf("WARNING:: Code running without mpibind\n");
+	  hypre_printf("NUMA %d GID %d , NodeID %d NumaID %d running on device %d (RR=%d) of %d \n",getnuma(),myid,myNodeid,myNumaId,HYPRE_DEVICE,myNodeid%nDevices,nDevices);
+	  
+	}
+	
+	MPI_Info_free(&info);
+      } else {
+	/* No device found  */
+	hypre_fprintf(stderr,"ERROR:: NO GPUS found \n");
+	exit(2);
+      }
+    } else {
+      HYPRE_DEVICE = use_device;
+      gpuErrchk(cudaSetDevice(HYPRE_DEVICE));
+    }
+      
+      /* Create NVTX domain for all the nvtx calls in HYPRE */
+      HYPRE_DOMAIN=nvtxDomainCreateA("Hypre");
+      
+      /* Initialize streams */
+      hypre_int jj;
+      for(jj=0;jj<MAX_HGS_ELEMENTS;jj++)
+	gpuErrchk(cudaStreamCreateWithFlags(&(HYPRE_STREAM(jj)),cudaStreamNonBlocking));
+      
+      /* Initialize the library handles and streams */
+      
+    cusparseErrchk(cusparseCreate(&(HYPRE_CUSPARSE_HANDLE)));
+    cusparseErrchk(cusparseSetStream(HYPRE_CUSPARSE_HANDLE,HYPRE_STREAM(4)));
+    cusparseErrchk(cusparseCreateMatDescr(&(HYPRE_CUSPARSE_MAT_DESCR))); 
+    cusparseErrchk(cusparseSetMatType(HYPRE_CUSPARSE_MAT_DESCR,CUSPARSE_MATRIX_TYPE_GENERAL));
+    cusparseErrchk(cusparseSetMatIndexBase(HYPRE_CUSPARSE_MAT_DESCR,CUSPARSE_INDEX_BASE_ZERO));
+
+    cublasErrchk(cublasCreate(&(HYPRE_CUBLAS_HANDLE)));
+    cublasErrchk(cublasSetStream(HYPRE_CUBLAS_HANDLE,HYPRE_STREAM(4)));
+    if (!checkDeviceProps()) hypre_printf("WARNING:: Concurrent memory access not allowed\n");
+    /* Check if the arch flags used for compiling the cuda kernels match the device */
+    CudaCompileFlagCheck();
+  }
+}
+
+
+void hypre_GPUFinalize(){
+  
+  cusparseErrchk(cusparseDestroy(HYPRE_CUSPARSE_HANDLE));
+  
+  cublasErrchk(cublasDestroy(HYPRE_CUBLAS_HANDLE));
+#if defined(HYPRE_USE_GPU) && defined(HYPRE_MEASURE_GPU_HWM)
+  hypre_printf("GPU Memory High Water Mark(per MPI_RANK) %f MB \n",(HYPRE_Real)HYPRE_GPU_HWM/1024/1024);
+#endif
+  /* Destroy streams */
+  hypre_int jj;
+  for(jj=0;jj<MAX_HGS_ELEMENTS;jj++)
+    gpuErrchk(cudaStreamDestroy(HYPRE_STREAM(jj)));
+  
+}
+
+void MemAdviseReadOnly(const void* ptr, hypre_int device){
+  if (ptr==NULL) return;
+    size_t size=mempush(ptr,0,0);
+    if (size==0) printf("WARNING:: Operations with 0 size vector \n");
+    gpuErrchk(cudaMemAdvise(ptr,size,cudaMemAdviseSetReadMostly,device));
+}
+
+void MemAdviseUnSetReadOnly(const void* ptr, hypre_int device){
+  if (ptr==NULL) return;
+    size_t size=mempush(ptr,0,0);
+    if (size==0) printf("WARNING:: Operations with 0 size vector \n");
+    gpuErrchk(cudaMemAdvise(ptr,size,cudaMemAdviseUnsetReadMostly,device));
+}
+
+
+void MemAdviseSetPrefLocDevice(const void *ptr, hypre_int device){
+  if (ptr==NULL) return;
+  gpuErrchk(cudaMemAdvise(ptr,mempush(ptr,0,0),cudaMemAdviseSetPreferredLocation,device));
+}
+
+void MemAdviseSetPrefLocHost(const void *ptr){
+  if (ptr==NULL) return;
+  gpuErrchk(cudaMemAdvise(ptr,mempush(ptr,0,0),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId));
+}
+
+
+void MemPrefetch(const void *ptr,hypre_int device,cudaStream_t stream){
+  if (ptr==NULL) return;
+  size_t size;
+  size=memsize(ptr);
+  PUSH_RANGE("MemPreFetchForce",4);
+  /* Do a prefetch every time until a possible UM bug is fixed */
+  if (size>0){
+    PrintPointerAttributes(ptr);
+     gpuErrchk(cudaMemPrefetchAsync(ptr,size,device,stream));
+    gpuErrchk(cudaStreamSynchronize(stream));
+    POP_RANGE;
+  return;
+  } 
+  return;
+}
+
+
+void MemPrefetchForce(const void *ptr,hypre_int device,cudaStream_t stream){
+  if (ptr==NULL) return;
+  size_t size=memsize(ptr);
+  PUSH_RANGE_PAYLOAD("MemPreFetchForce",4,size);
+  gpuErrchk(cudaMemPrefetchAsync(ptr,size,device,stream));
+  POP_RANGE;
+  return;
+}
+
+void MemPrefetchSized(const void *ptr,size_t size,hypre_int device,cudaStream_t stream){
+  if (ptr==NULL) return;
+  PUSH_RANGE_DOMAIN("MemPreFetchSized",4,0);
+  /* Do a prefetch every time until a possible UM bug is fixed */
+  if (size>0){
+    gpuErrchk(cudaMemPrefetchAsync(ptr,size,device,stream));
+    POP_RANGE_DOMAIN(0);
+    return;
+  } 
+  return;
+}
+
+
+/* Returns the same cublas handle with every call */
+cublasHandle_t getCublasHandle(){
+  cublasStatus_t stat;
+  static cublasHandle_t handle;
+  static hypre_int firstcall=1;
+  if (firstcall){
+    firstcall=0;
+    stat = cublasCreate(&handle);
+    if (stat!=CUBLAS_STATUS_SUCCESS) {
+      printf("ERROR:: CUBLAS Library initialization failed\n");
+      handle=0;
+      exit(2);
+    }
+    cublasErrchk(cublasSetStream(handle,HYPRE_STREAM(4)));
+  } else return handle;
+  return handle;
+}
+
+/* Returns the same cusparse handle with every call */
+cusparseHandle_t getCusparseHandle(){
+  cusparseStatus_t status;
+  static cusparseHandle_t handle;
+  static hypre_int firstcall=1;
+  if (firstcall){
+    firstcall=0;
+    status= cusparseCreate(&handle);
+    if (status != CUSPARSE_STATUS_SUCCESS) {
+      printf("ERROR:: CUSPARSE Library initialization failed\n");
+      handle=0;
+      exit(2);
+    }
+    cusparseErrchk(cusparseSetStream(handle,HYPRE_STREAM(4)));
+  } else return handle;
+  return handle;
+}
+
+/* C version of mempush using linked lists */
+
+size_t mempush(const void *ptr, size_t size, hypre_int action){
+  static node* head=NULL;
+  static hypre_int nc=0;
+  node *found=NULL;
+  if (!head){
+    if ((size<=0)||(action==1)) {
+      fprintf(stderr,"mempush can start only with an insertion or a size call \n");
+      return 0;
+    }
+    head = (node*)malloc(sizeof(node));
+    head->ptr=ptr;
+    head->size=size;
+    head->next=NULL;
+    nc++;
+    return size;
+  } else {
+    // Purge an address
+    if (action==1){
+      found=memfind(head,ptr);
+      if (found){
+	memdel(&head, found);
+	nc--;
+	return 0;
+      } else {
+#ifdef FULL_WARN
+	fprintf(stderr,"ERROR :: Pointer for deletion not found in linked list %p\n",ptr);
+#endif
+	return 0;
+      }
+    } // End purge
+    
+    // Insertion
+    if (size>0){
+      found=memfind(head,ptr);
+      if (found){
+#ifdef FULL_WARN
+	fprintf(stderr,"ERROR :: Pointer for insertion already in use in linked list %p\n",ptr);
+	//printlist(head,nc);
+#endif
+	return 0;
+      } else {
+	nc++;
+	meminsert(&head,ptr,size);
+	return 0;
+      }
+    }
+
+    // Getting allocation size
+    found=memfind(head,ptr);
+    if (found){
+      return found->size;
+    } else{
+#ifdef FULL_WARN
+      fprintf(stderr,"ERROR :: Pointer for size check NOT found in linked list\n");
+#endif
+      return 0;
+    }
+  }
+}
+
+node *memfind(node *head, const void *ptr){
+  node *next;
+  next=head;
+  while(next!=NULL){
+    if (next->ptr==ptr) return next;
+    next=next->next;
+  }
+  return NULL;
+}
+
+void memdel(node **head, node *found){
+  node *next;
+  if (found==*head){
+    next=(*head)->next;
+    free(*head);
+    *head=next;
+    return;
+  }
+  next=*head;
+  while(next->next!=found){
+    next=next->next;
+  }
+  next->next=next->next->next;
+  free(found);
+  return;
+}
+void meminsert(node **head, const void  *ptr,size_t size){
+  node *nhead;
+  nhead = (node*)malloc(sizeof(node));
+  nhead->ptr=ptr;
+  nhead->size=size;
+  nhead->next=*head;
+  *head=nhead;
+  return;
+}
+
+void printlist(node *head,hypre_int nc){
+  node *next;
+  next=head;
+  printf("Node count %d \n",nc);
+  while(next!=NULL){
+    printf("Address %p of size %zu \n",next->ptr,next->size);
+    next=next->next;
+  }
+}
+
+cudaStream_t getstreamOlde(hypre_int i){
+  static hypre_int firstcall=1;
+  const hypre_int MAXSTREAMS=10;
+  static cudaStream_t s[MAXSTREAMS];
+  if (firstcall){
+    hypre_int jj;
+    for(jj=0;jj<MAXSTREAMS;jj++)
+      gpuErrchk(cudaStreamCreateWithFlags(&s[jj],cudaStreamNonBlocking));
+    //printf("Created streams ..\n");
+    firstcall=0;
+  }
+  if (i<MAXSTREAMS) return s[i];
+  fprintf(stderr,"ERROR in HYPRE_STREAM in utilities/gpuMem.c %d is greater than MAXSTREAMS = %d\n Returning default stream",i,MAXSTREAMS);
+  return 0;
+}
+
+nvtxDomainHandle_t getdomain(hypre_int i){
+    static hypre_int firstcall=1;
+    const hypre_int MAXDOMAINS=1;
+    static nvtxDomainHandle_t h[MAXDOMAINS];
+    if (firstcall){
+      h[0]= nvtxDomainCreateA("HYPRE_A");
+      firstcall=0;
+    }
+    if (i<MAXDOMAINS) return h[i];
+    fprintf(stderr,"ERROR in getdomain in utilities/gpuMem.c %d  is greater than MAXDOMAINS = %d \n Returning default domain",i,MAXDOMAINS);
+    return NULL;
+  }
+
+cudaEvent_t getevent(hypre_int i){
+  static hypre_int firstcall=1;
+  const hypre_int MAXEVENTS=10;
+  static cudaEvent_t s[MAXEVENTS];
+  if (firstcall){
+    hypre_int jj;
+    for(jj=0;jj<MAXEVENTS;jj++)
+      gpuErrchk(cudaEventCreateWithFlags(&s[jj],cudaEventDisableTiming));
+    //printf("Created events ..\n");
+    firstcall=0;
+  }
+  if (i<MAXEVENTS) return s[i];
+  fprintf(stderr,"ERROR in getevent in utilities/gpuMem.c %d is greater than MAXEVENTS = %d\n Returning default stream",i,MAXEVENTS);
+  return 0;
+}
+
+hypre_int getsetasyncmode(hypre_int mode, hypre_int action){
+  static hypre_int async_mode=0;
+  if (action==0) async_mode = mode;
+  if (action==1) return async_mode;
+  return async_mode;
+}
+
+void SetAsyncMode(hypre_int mode){
+  getsetasyncmode(mode,0);
+}
+
+hypre_int GetAsyncMode(){
+  return getsetasyncmode(0,1);
+}
+
+void branchStream(hypre_int i, hypre_int j){
+  gpuErrchk(cudaEventRecord(getevent(i),HYPRE_STREAM(i)));
+  gpuErrchk(cudaStreamWaitEvent(HYPRE_STREAM(j),getevent(i),0));
+}
+
+void joinStreams(hypre_int i, hypre_int j, hypre_int k){
+  gpuErrchk(cudaEventRecord(getevent(i),HYPRE_STREAM(i)));
+  gpuErrchk(cudaEventRecord(getevent(j),HYPRE_STREAM(j)));
+  gpuErrchk(cudaStreamWaitEvent(HYPRE_STREAM(k),getevent(i),0));
+  gpuErrchk(cudaStreamWaitEvent(HYPRE_STREAM(k),getevent(j),0));
+}
+
+void affs(hypre_int myid){
+  const hypre_int NCPUS=160;
+  cpu_set_t* mask = CPU_ALLOC(NCPUS);
+  size_t size = CPU_ALLOC_SIZE(NCPUS);
+  hypre_int cpus[NCPUS],i;
+  hypre_int retval=sched_getaffinity(0, size,mask);
+  if (!retval){
+    for(i=0;i<NCPUS;i++){
+      if (CPU_ISSET(i,mask)) 
+	cpus[i]=1; 
+      else
+	cpus[i]=0;
+    }
+    printf("Node(%d)::",myid);
+    for(i=0;i<160;i++)printf("%d",cpus[i]);
+    printf("\n");
+  } else {
+    fprintf(stderr,"sched_affinity failed\n");
+    switch(errno){
+    case EFAULT:
+      printf("INVALID MEMORY ADDRESS\n");
+      break;
+    case EINVAL:
+      printf("EINVAL:: NO VALID CPUS\n");
+      break;
+    default:
+      printf("%d something else\n",errno);
+    }
+  }
+  
+  CPU_FREE(mask);
+  
+}
+hypre_int getcore(){
+  const hypre_int NCPUS=160;
+  cpu_set_t* mask = CPU_ALLOC(NCPUS);
+  size_t size = CPU_ALLOC_SIZE(NCPUS);
+  hypre_int cpus[NCPUS],i;
+  hypre_int retval=sched_getaffinity(0, size,mask);
+  if (!retval){
+    for(i=0;i<NCPUS;i+=20){
+      if (CPU_ISSET(i,mask)) {
+	CPU_FREE(mask);
+	return i;
+      }
+    }
+  } else {
+    fprintf(stderr,"sched_affinity failed\n");
+    switch(errno){
+    case EFAULT:
+      printf("INVALID MEMORY ADDRESS\n");
+      break;
+    case EINVAL:
+      printf("EINVAL:: NO VALID CPUS\n");
+      break;
+    default:
+      printf("%d something else\n",errno);
+    }
+  }
+  return 0;
+  CPU_FREE(mask);
+  
+}
+hypre_int getnuma(){
+  const hypre_int NCPUS=160;
+  cpu_set_t* mask = CPU_ALLOC(NCPUS);
+  size_t size = CPU_ALLOC_SIZE(NCPUS);
+  hypre_int retval=sched_getaffinity(0, size,mask);
+  /* HARDWIRED FOR 2 NUMA DOMAINS */
+  if (!retval){
+    hypre_int sum0=0,i;
+    for(i=0;i<NCPUS/2;i++) 
+      if (CPU_ISSET(i,mask)) sum0++;
+    hypre_int sum1=0;
+    for(i=NCPUS/2;i<NCPUS;i++) 
+      if (CPU_ISSET(i,mask)) sum1++;
+    CPU_FREE(mask);
+    if (sum0>sum1) return 0;
+    else return 1;
+  } else {
+    fprintf(stderr,"sched_affinity failed\n");
+    switch(errno){
+    case EFAULT:
+      printf("INVALID MEMORY ADDRESS\n");
+      break;
+    case EINVAL:
+      printf("EINVAL:: NO VALID CPUS\n");
+      break;
+    default:
+      printf("%d something else\n",errno);
+    }
+  }
+  return 0;
+  CPU_FREE(mask);
+  
+}
+hypre_int checkDeviceProps(){
+  struct cudaDeviceProp prop;
+  gpuErrchk(cudaGetDeviceProperties(&prop, HYPRE_DEVICE));
+  HYPRE_GPU_CMA=prop.concurrentManagedAccess;
+  return HYPRE_GPU_CMA;
+}
+hypre_int pointerIsManaged(const void *ptr){
+  struct cudaPointerAttributes ptr_att;
+  if (cudaPointerGetAttributes(&ptr_att,ptr)!=cudaSuccess) {
+    return 0;
+  }
+  return ptr_att.isManaged;
+}
+#endif
diff -Nru hypre-2.11.2/src/utilities/gpuMem.h hypre-2.13.0/src/utilities/gpuMem.h
--- hypre-2.11.2/src/utilities/gpuMem.h	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/utilities/gpuMem.h	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,104 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+#if defined(HYPRE_USE_GPU) && defined(HYPRE_USE_MANAGED)
+#ifndef __GPUMEM_H__
+#define  __GPUMEM_H__
+#ifdef HYPRE_USE_GPU
+#include <cuda_runtime_api.h>
+void hypre_GPUInit(hypre_int use_device);
+void hypre_GPUFinalize();
+int VecScaleScalar(double *u, const double alpha,  int num_rows,cudaStream_t s);
+void VecCopy(double* tgt, const double* src, int size,cudaStream_t s);
+void VecSet(double* tgt, int size, double value, cudaStream_t s);
+void VecScale(double *u, double *v, double *l1_norm, int num_rows,cudaStream_t s);
+void VecScaleSplit(double *u, double *v, double *l1_norm, int num_rows,cudaStream_t s);
+void CudaCompileFlagCheck();
+#endif
+
+cudaStream_t getstreamOlde(hypre_int i);
+nvtxDomainHandle_t getdomain(hypre_int i);
+cudaEvent_t getevent(hypre_int i);
+void MemAdviseReadOnly(const void *ptr, hypre_int device);
+void MemAdviseUnSetReadOnly(const void *ptr, hypre_int device);
+void MemAdviseSetPrefLocDevice(const void *ptr, hypre_int device);
+void MemAdviseSetPrefLocHost(const void *ptr);
+void MemPrefetch(const void *ptr,hypre_int device,cudaStream_t stream);
+void MemPrefetchSized(const void *ptr,size_t size,hypre_int device,cudaStream_t stream);
+void MemPrefetchForce(const void *ptr,hypre_int device,cudaStream_t stream);
+cublasHandle_t getCublasHandle();
+cusparseHandle_t getCusparseHandle();
+typedef struct node {
+  const void *ptr;
+  size_t size;
+  struct node *next;
+} node;
+size_t mempush(const void *ptr, size_t size, hypre_int action);
+node *memfind(node *head, const void *ptr);
+void memdel(node **head, node *found);
+void meminsert(node **head, const void *ptr,size_t size);
+void printlist(node *head,hypre_int nc);
+//#define MEM_PAD_LEN 1
+size_t memsize(const void *ptr);
+hypre_int getsetasyncmode(hypre_int mode, hypre_int action);
+void SetAsyncMode(hypre_int mode);
+hypre_int GetAsyncMode();
+void branchStream(hypre_int i, hypre_int j);
+void joinStreams(hypre_int i, hypre_int j, hypre_int k);
+void affs(hypre_int myid);
+hypre_int getcore();
+hypre_int getnuma();
+hypre_int checkDeviceProps();
+hypre_int pointerIsManaged(const void *ptr);
+/*
+ * Global struct for keeping HYPRE GPU Init state
+ */
+
+#define MAX_HGS_ELEMENTS 10
+struct hypre__global_struct{
+  hypre_int initd;
+  hypre_int device;
+  hypre_int device_count;
+  cublasHandle_t cublas_handle;
+  cusparseHandle_t cusparse_handle;
+  cusparseMatDescr_t cusparse_mat_descr;
+  cudaStream_t streams[MAX_HGS_ELEMENTS];
+  nvtxDomainHandle_t nvtx_domain;
+  hypre_int concurrent_managed_access;
+  size_t memoryHWM;
+};
+
+extern struct hypre__global_struct hypre__global_handle ;
+
+/*
+ * Macros for accessing elements of the global handle
+ */
+#define HYPRE_GPU_HANDLE hypre__global_handle.initd
+#define HYPRE_CUBLAS_HANDLE hypre__global_handle.cublas_handle
+#define HYPRE_CUSPARSE_HANDLE hypre__global_handle.cusparse_handle
+#define HYPRE_DEVICE hypre__global_handle.device
+#define HYPRE_DEVICE_COUNT hypre__global_handle.device_count
+#define HYPRE_CUSPARSE_MAT_DESCR hypre__global_handle.cusparse_mat_descr
+#define HYPRE_STREAM(index) (hypre__global_handle.streams[index])
+#define HYPRE_DOMAIN  hypre__global_handle.nvtx_domain
+#define HYPRE_GPU_CMA hypre__global_handle.concurrent_managed_access
+#define HYPRE_GPU_HWM hypre__global_handle.memoryHWM
+
+#endif
+
+#else
+
+#define hypre_GPUInit(use_device)
+#define hypre_GPUFinalize()
+
+#endif
+
diff -Nru hypre-2.11.2/src/utilities/headers hypre-2.13.0/src/utilities/headers
--- hypre-2.11.2/src/utilities/headers	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/utilities/headers	2017-10-20 17:42:22.000000000 +0000
@@ -19,11 +19,18 @@
 
 cat > $INTERNAL_HEADER <<@
 
+/*** DO NOT EDIT THIS FILE DIRECTLY (use 'headers' to generate) ***/
+
+
 #ifndef hypre_UTILITIES_HEADER
 #define hypre_UTILITIES_HEADER
 
 #include "HYPRE_utilities.h"
 
+#ifdef HYPRE_USING_OPENMP
+#include <omp.h>
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -38,19 +45,19 @@
 cat mpistubs.h                 >> $INTERNAL_HEADER
 cat hypre_smp.h                >> $INTERNAL_HEADER
 cat hypre_memory.h             >> $INTERNAL_HEADER
-cat thread_mpistubs.h          >> $INTERNAL_HEADER
 cat threading.h                >> $INTERNAL_HEADER		          
 cat timing.h                   >> $INTERNAL_HEADER
 cat amg_linklist.h             >> $INTERNAL_HEADER
 cat exchange_data.h            >> $INTERNAL_HEADER
 cat hypre_error.h              >> $INTERNAL_HEADER
-./protos amg_linklist.c        >> $INTERNAL_HEADER
-./protos binsearch.c           >> $INTERNAL_HEADER
-./protos hypre_printf.c        >> $INTERNAL_HEADER
-./protos hypre_qsort.c         >> $INTERNAL_HEADER
-./protos qsplit.c              >> $INTERNAL_HEADER
-./protos random.c              >> $INTERNAL_HEADER
-		          
+cat caliper_instrumentation.h  >> $INTERNAL_HEADER
+cat gpgpu.h                    >> $INTERNAL_HEADER
+cat hypre_nvtx.h               >> $INTERNAL_HEADER
+cat gpuErrorCheck.h            >> $INTERNAL_HEADER
+cat gpuMem.h                   >> $INTERNAL_HEADER
+
+cat protos.h                   >> $INTERNAL_HEADER
+
 #===========================================================================
 # Include guards
 #===========================================================================
diff -Nru hypre-2.11.2/src/utilities/hypre_error.h hypre-2.13.0/src/utilities/hypre_error.h
--- hypre-2.11.2/src/utilities/hypre_error.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/utilities/hypre_error.h	2017-10-20 17:42:22.000000000 +0000
@@ -10,8 +10,6 @@
  * $Revision$
  ***********************************************************************EHEADER*/
 
-
-
 #ifndef hypre_ERROR_HEADER
 #define hypre_ERROR_HEADER
 
@@ -37,3 +35,4 @@
 #endif
 
 #endif
+
diff -Nru hypre-2.11.2/src/utilities/hypre_memory.c hypre-2.13.0/src/utilities/hypre_memory.c
--- hypre-2.11.2/src/utilities/hypre_memory.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/utilities/hypre_memory.c	2017-10-20 17:42:22.000000000 +0000
@@ -16,12 +16,14 @@
  *
  *****************************************************************************/
 
+#define HYPRE_USE_MANAGED_SCALABLE 1
 #include "_hypre_utilities.h"
-
+//#include "gpgpu.h"
+//#include "hypre_nvtx.h"
+//#include "gpuMem.h"
 #ifdef HYPRE_USE_UMALLOC
 #undef HYPRE_USE_UMALLOC
 #endif
-
 /******************************************************************************
  *
  * Standard routines
@@ -54,10 +56,23 @@
 
    if (size > 0)
    {
+     PUSH_RANGE_PAYLOAD("MALLOC",2,size);
 #ifdef HYPRE_USE_UMALLOC
       HYPRE_Int threadid = hypre_GetThreadID();
-
+#ifdef HYPRE_USE_MANAGED
+      printf("ERROR HYPRE_USE_UMALLOC AND HYPRE_USE_MANAGED are mutually exclusive\n");
+#endif
       ptr = _umalloc_(size);
+#elif HYPRE_USE_MANAGED
+#ifdef HYPRE_USE_MANAGED_SCALABLE
+      gpuErrchk( cudaMallocManaged(&ptr,size+sizeof(size_t)*MEM_PAD_LEN,CUDAMEMATTACHTYPE) );
+      size_t *sp=(size_t*)ptr;
+      *sp=size;
+      ptr=(void*)(&sp[MEM_PAD_LEN]);
+#else
+      gpuErrchk( cudaMallocManaged(&ptr,size,CUDAMEMATTACHTYPE) );
+      mempush(ptr,size,0);
+#endif
 #else
       ptr = malloc(size);
 #endif
@@ -68,6 +83,7 @@
         hypre_OutOfMemory(size);
       }
 #endif
+      POP_RANGE;
    }
    else
    {
@@ -90,10 +106,23 @@
 
    if (size > 0)
    {
+     PUSH_RANGE_PAYLOAD("MALLOC",4,size);
 #ifdef HYPRE_USE_UMALLOC
+#ifdef HYPRE_USE_MANAGED
+      printf("ERROR HYPRE_USE_UMALLOC AND HYPRE_USE_MANAGED are mutually exclusive\n");
+#endif
       HYPRE_Int threadid = hypre_GetThreadID();
 
       ptr = _ucalloc_(count, elt_size);
+#elif HYPRE_USE_MANAGED
+#ifdef HYPRE_USE_MANAGED_SCALABLE
+      ptr=(void*)hypre_MAlloc(size);
+      memset(ptr,0,count*elt_size);
+#else
+      gpuErrchk( cudaMallocManaged(&ptr,size,CUDAMEMATTACHTYPE) );
+      memset(ptr,0,count*elt_size);
+      mempush(ptr,size,0);
+#endif
 #else
       ptr = calloc(count, elt_size);
 #endif
@@ -104,6 +133,7 @@
         hypre_OutOfMemory(size);
       }
 #endif
+      POP_RANGE;
    }
    else
    {
@@ -113,6 +143,11 @@
    return(char*) ptr;
 }
 
+#ifdef HYPRE_USE_MANAGED
+size_t memsize(const void *ptr){
+return ((size_t*)ptr)[-MEM_PAD_LEN];
+}
+#endif
 /*--------------------------------------------------------------------------
  * hypre_ReAlloc
  *--------------------------------------------------------------------------*/
@@ -135,6 +170,32 @@
       HYPRE_Int threadid = hypre_GetThreadID();
       ptr = (char*)_urealloc_(ptr, size);
    }
+#elif HYPRE_USE_MANAGED
+   if (ptr == NULL)
+   {
+
+      ptr = hypre_MAlloc(size);
+   }
+   else if (size == 0)
+   {
+     hypre_Free(ptr);
+     return NULL;
+   }
+   else
+   {
+     void *nptr = hypre_MAlloc(size);
+#ifdef HYPRE_USE_MANAGED_SCALABLE
+     size_t old_size=memsize((void*)ptr);
+#else
+     size_t old_size=mempush((void*)ptr,0,0);
+#endif
+     if (size>old_size)
+       memcpy(nptr,ptr,old_size);
+     else
+       memcpy(nptr,ptr,size);
+     hypre_Free(ptr);
+     ptr=(char*) nptr;
+   }
 #else
    if (ptr == NULL)
    {
@@ -156,6 +217,7 @@
    return ptr;
 }
 
+
 /*--------------------------------------------------------------------------
  * hypre_Free
  *--------------------------------------------------------------------------*/
@@ -169,6 +231,182 @@
       HYPRE_Int threadid = hypre_GetThreadID();
 
       _ufree_(ptr);
+#elif HYPRE_USE_MANAGED
+      //size_t size=mempush(ptr,0,0);
+#ifdef HYPRE_USE_MANAGED_SCALABLE
+      cudaSafeFree(ptr,MEM_PAD_LEN);
+#else
+      mempush(ptr,0,1);
+      cudaSafeFree(ptr,0);
+#endif
+      //gpuErrchk(cudaFree((void*)ptr));
+#else
+      free(ptr);
+#endif
+   }
+}
+/*--------------------------------------------------------------------------
+ * hypre_MAllocPinned
+ *--------------------------------------------------------------------------*/
+
+char *
+hypre_MAllocPinned( size_t size )
+{
+   void *ptr;
+
+   if (size > 0)
+   {
+     PUSH_RANGE_PAYLOAD("MALLOC",2,size);
+#ifdef HYPRE_USE_UMALLOC
+      HYPRE_Int threadid = hypre_GetThreadID();
+#ifdef HYPRE_USE_MANAGED
+      printf("ERROR HYPRE_USE_UMALLOC AND HYPRE_USE_MANAGED are mutually exclusive\n");
+#endif
+      ptr = _umalloc_(size);
+#elif HYPRE_USE_MANAGED
+#ifdef HYPRE_USE_MANAGED_SCALABLE
+#ifdef HYPRE_GPU_USE_PINNED
+      gpuErrchk( cudaHostAlloc(&ptr,size+sizeof(size_t)*MEM_PAD_LEN,cudaHostAllocMapped));
+#else
+      gpuErrchk( cudaMallocManaged(&ptr,size+sizeof(size_t)*MEM_PAD_LEN,CUDAMEMATTACHTYPE) );
+#endif
+      size_t *sp=(size_t*)ptr;
+      *sp=size;
+      ptr=(void*)(&sp[MEM_PAD_LEN]);
+#else
+      gpuErrchk( cudaMallocManaged(&ptr,size,CUDAMEMATTACHTYPE) );
+      mempush(ptr,size,0);
+#endif
+#else
+      ptr = malloc(size);
+#endif
+
+#if 1
+      if (ptr == NULL)
+      {
+        hypre_OutOfMemory(size);
+      }
+#endif
+      POP_RANGE;
+   }
+   else
+   {
+      ptr = NULL;
+   }
+
+   return (char*)ptr;
+}
+/*--------------------------------------------------------------------------
+ * hypre_MAllocHost
+ *--------------------------------------------------------------------------*/
+
+char *
+hypre_MAllocHost( size_t size )
+{
+   void *ptr;
+
+   if (size > 0)
+   {
+     ptr = malloc(size);
+#if 1
+      if (ptr == NULL)
+      {
+        hypre_OutOfMemory(size);
+      }
+#endif
+      POP_RANGE;
+   }
+   else
+   {
+      ptr = NULL;
+   }
+
+   return (char*)ptr;
+}
+
+/*--------------------------------------------------------------------------
+ * hypre_CAllocHost
+ *--------------------------------------------------------------------------*/
+
+char *
+hypre_CAllocHost( size_t count,
+		  size_t elt_size )
+{
+   void   *ptr;
+   size_t  size = count*elt_size;
+
+   if (size > 0)
+   {
+     PUSH_RANGE_PAYLOAD("CAllocHost",4,size);
+#ifdef HYPRE_USE_UMALLOC
+#ifdef HYPRE_USE_MANAGED
+      printf("ERROR HYPRE_USE_UMALLOC AND HYPRE_USE_MANAGED are mutually exclusive\n");
+#endif
+      HYPRE_Int threadid = hypre_GetThreadID();
+
+ptr = _ucalloc_(count, elt_size);
+
+#else
+     ptr = calloc(count, elt_size);
+#endif
+
+#if 1
+      if (ptr == NULL)
+      {
+        hypre_OutOfMemory(size);
+      }
+#endif
+      POP_RANGE;
+   }
+   else
+   {
+      ptr = NULL;
+   }
+
+   return(char*) ptr;
+}
+/*--------------------------------------------------------------------------
+ * hypre_ReAllocHost
+ *--------------------------------------------------------------------------*/
+
+char *
+hypre_ReAllocHost( char   *ptr,
+               size_t  size )
+{
+  if (ptr == NULL)
+   {
+          ptr = (char*)malloc(size);
+   }
+   else
+   {
+
+	   ptr = (char*)realloc(ptr, size);
+   }
+
+#if 1
+   if ((ptr == NULL) && (size > 0))
+   {
+      hypre_OutOfMemory(size);
+   }
+#endif
+
+   return ptr;
+}
+
+/*--------------------------------------------------------------------------
+ * hypre_CHFree
+ *--------------------------------------------------------------------------*/
+
+void
+hypre_FreeHost( char *ptr )
+{
+   if (ptr)
+   {
+#ifdef HYPRE_USE_UMALLOC
+      HYPRE_Int threadid = hypre_GetThreadID();
+
+      _ufree_(ptr);
+
 #else
       free(ptr);
 #endif
diff -Nru hypre-2.11.2/src/utilities/hypre_memory.h hypre-2.13.0/src/utilities/hypre_memory.h
--- hypre-2.11.2/src/utilities/hypre_memory.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/utilities/hypre_memory.h	2017-10-20 17:42:22.000000000 +0000
@@ -26,6 +26,157 @@
 extern "C" {
 #endif
 
+#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_MANAGED)
+#ifdef __cplusplus
+extern "C++" {
+#endif
+#include <cuda.h>
+#include <cuda_runtime.h>
+#ifdef __cplusplus
+}
+#endif
+#define HYPRE_CUDA_GLOBAL __host__ __device__
+  
+#if defined(HYPRE_MEMORY_GPU)
+#define hypre_DeviceTAlloc(type, count) \
+  ({									\
+    type * ptr;								\
+    cudaError_t cudaerr = cudaMalloc((void**)&ptr,sizeof(type)*(count)); \
+    if ( cudaerr != cudaSuccess ) {					\
+      printf("\n ERROR hypre_DataTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+      HYPRE_Int *p = NULL; *p = 1;						\
+    }									\
+    ptr;})
+	
+#define hypre_DeviceCTAlloc(type, count) \
+	({								   \
+	type * ptr;						   \
+	cudaError_t cudaerr = cudaMalloc((void**)&ptr,sizeof(type)*(count)); \
+	if ( cudaerr != cudaSuccess ) {										\
+		printf("\n hypre_DataCTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+		HYPRE_Int *p = NULL; *p = 1;\
+	}		\
+	cudaMemset(ptr,0,sizeof(type)*(count));	   \
+	ptr;})									   \
+	
+#define hypre_DeviceTReAlloc(ptr, type, count) {type *newptr;				\
+	                                         cudaMalloc((void**)&,sizeof(type)*(count), cudaMemAttachGlobal);	\
+											 memcpy(newptr, ptr, sizeof(type)*(count)); \
+											 cudaFree(ptr);				\
+											 ptr = newptr;}
+#else
+ #define hypre_DeviceTAlloc(type, count) \
+	({																	\
+	type * ptr;															\
+	cudaError_t cudaerr = cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal);\
+	if ( cudaerr != cudaSuccess ) {										\
+		printf("\n ERROR hypre_DataTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+		HYPRE_Int *p = NULL; *p = 1;\
+	}\
+	ptr;})
+	
+#define hypre_DeviceCTAlloc(type, count) \
+	({								   \
+	type * ptr;						   \
+	cudaError_t cudaerr = cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal); \
+	if ( cudaerr != cudaSuccess ) {										\
+		printf("\n hypre_DataCTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+		HYPRE_Int *p = NULL; *p = 1;\
+	}		\
+	cudaMemset(ptr,0,sizeof(type)*(count));	   \
+	ptr;})									   \
+	
+#define hypre_DeviceTReAlloc(ptr, type, count) {type *newptr;				\
+	                                      cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal);	\
+					      memcpy(newptr, ptr, sizeof(type)*(count)); \
+					      cudaFree(ptr);		\
+					      ptr = newptr;} 
+#endif
+  
+#define hypre_DeviceTFree(ptr) \
+	{											\
+		cudaError_t cudaerr = cudaFree(ptr);							\
+		if ( cudaerr != cudaSuccess ) {									\
+			printf("\n CudaFree : %s in %s(%d) function %s\n",cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+			HYPRE_Int *p = NULL; *p = 1;										\
+		}																\
+	}																	\
+	
+
+#define hypre_DataCopyToData(ptrH,ptrD,type,count)						\
+	{cudaError_t cudaerr = cudaMemcpy(ptrD, ptrH, sizeof(type)*count, cudaMemcpyHostToDevice); \
+if ( cudaerr != cudaSuccess ) {										\
+		printf("\n hypre_DataCopyToData %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+		HYPRE_Int *p = NULL; *p = 1;\
+}							  \
+	}
+	
+	
+#define hypre_DataCopyFromData(ptrH,ptrD,type,count)						\
+	{cudaError_t cudaerr = cudaMemcpy(ptrH, ptrD, sizeof(type)*count, cudaMemcpyDeviceToHost); \
+	if ( cudaerr != cudaSuccess ) {										\
+		printf("\n hypre_DataCTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+		HYPRE_Int *p = NULL; *p = 1;\
+	}\
+	}
+
+#define hypre_DeviceMemset(ptr,value,type,count)	\
+	cudaMemset(ptr,value,count*sizeof(type));
+	
+#define hypre_UMTAlloc(type, count)				\
+  ({									\
+      type * ptr;								\
+      cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal); \
+      ptr;								\
+  })
+	
+#define hypre_UMCTAlloc(type, count)					\
+  ({									\
+    type * ptr;								\
+    cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal); \
+    cudaMemset(ptr,0,sizeof(type)*(count));				\
+    ptr;})								\
+  
+  
+#define hypre_UMTReAlloc(type, count)\
+  ({							 \
+    type * ptr;								\
+    type *newptr;							\
+    cudaMallocManaged((void**)&newptr,sizeof(type)*(count), cudaMemAttachGlobal); \
+    cudaFree(ptr);							\
+    ptr = newptr;							\
+    ptr;})								\
+  
+#define hypre_UMTFree(ptr) \
+      cudaFree(ptr)
+
+#define hypre_InitMemoryDebug(id)
+#define hypre_FinalizeMemoryDebug()
+
+#define hypre_TAlloc(type, count) \
+( (type *)hypre_MAlloc((size_t)(sizeof(type) * (count))) )
+
+#define hypre_CTAlloc(type, count) \
+( (type *)hypre_CAlloc((size_t)(count), (size_t)sizeof(type)) )
+
+#define hypre_TReAlloc(ptr, type, count) \
+( (type *)hypre_ReAlloc((char *)ptr, (size_t)(sizeof(type) * (count))) )
+
+#define hypre_TFree(ptr) \
+( hypre_Free((char *)ptr), ptr = NULL )
+  
+  //#define hypre_TAlloc(type, count)  hypre_UMTAlloc(type, count)
+  //#define hypre_CTAlloc(type, count) hypre_UMCTAlloc(type, count)
+  //#define hypre_TReAlloc(ptr, type, count) hypre_UMTReAlloc(type, count)
+  //#define hypre_TFree(ptr) hypre_UMTFree(ptr)
+
+#define hypre_SharedTAlloc(type, count) hypre_TAlloc(type, (count))
+#define hypre_SharedCTAlloc(type, count) hypre_CTAlloc(type, (count))
+#define hypre_SharedTReAlloc(type, count) hypre_TReAlloc(type, (count))
+#define hypre_SharedTFree(ptr) hypre_TFree(ptr)
+#else
+#define HYPRE_CUDA_GLOBAL 
+
 /*--------------------------------------------------------------------------
  * Use "Debug Malloc Library", dmalloc
  *--------------------------------------------------------------------------*/
@@ -79,6 +230,34 @@
 #define hypre_SharedTReAlloc(type, count) hypre_TReAlloc(type, (count))
 #define hypre_SharedTFree(ptr) hypre_TFree(ptr)
 
+#define hypre_DeviceTAlloc(type, count) hypre_TAlloc(type, (count))
+#define hypre_DeviceCTAlloc(type, count) hypre_CTAlloc(type, (count))
+#define hypre_DeviceTReAlloc(type, count) hypre_TReAlloc(type, (count))
+#define hypre_DeviceTFree(ptr) hypre_TFree(ptr)
+#define hypre_DataCopyToData(ptrH,ptrD,type,count) memcpy(ptrD, ptrH, sizeof(type)*(count))
+#define hypre_DataCopyFromData(ptrH,ptrD,type,count) memcpy(ptrH, ptrD, sizeof(type)*(count))
+#define hypre_DeviceMemset(ptr,value,type,count)	memset(ptr,value,count*sizeof(type))
+#define hypre_UMTAlloc(type, count) hypre_TAlloc(type, (count))
+#define hypre_UMCTAlloc(type, count) hypre_CTAlloc(type, (count))
+#define hypre_UMTReAlloc(type, count) hypre_TReAlloc(type, (count))
+#define hypre_UMTFree(ptr) hypre_TFree(ptr)
+#endif
+  
+#define hypre_PinnedTAlloc(type, count)\
+( (type *)hypre_MAllocPinned((size_t)(sizeof(type) * (count))) )
+
+#define hypre_HostTAlloc(type, count) \
+( (type *)hypre_MAllocHost((size_t)(sizeof(type) * (count))) )
+
+#define hypre_HostCTAlloc(type, count) \
+( (type *)hypre_CAllocHost((size_t)(count), (size_t)sizeof(type)) )
+
+#define hypre_HostTReAlloc(ptr, type, count) \
+( (type *)hypre_ReAllocHost((char *)ptr, (size_t)(sizeof(type) * (count))) )
+
+#define hypre_HostTFree(ptr) \
+( hypre_FreeHost((char *)ptr), ptr = NULL )
+
 /*--------------------------------------------------------------------------
  * Prototypes
  *--------------------------------------------------------------------------*/
@@ -87,8 +266,13 @@
 HYPRE_Int hypre_OutOfMemory ( size_t size );
 char *hypre_MAlloc ( size_t size );
 char *hypre_CAlloc ( size_t count , size_t elt_size );
+char *hypre_MAllocPinned( size_t size );
 char *hypre_ReAlloc ( char *ptr , size_t size );
 void hypre_Free ( char *ptr );
+char *hypre_CAllocHost( size_t count,size_t elt_size );
+char *hypre_MAllocHost( size_t size );
+char *hypre_ReAllocHost( char   *ptr,size_t  size );
+void hypre_FreeHost( char *ptr );
 char *hypre_SharedMAlloc ( size_t size );
 char *hypre_SharedCAlloc ( size_t count , size_t elt_size );
 char *hypre_SharedReAlloc ( char *ptr , size_t size );
@@ -108,3 +292,4 @@
 #endif
 
 #endif
+
diff -Nru hypre-2.11.2/src/utilities/hypre_nvtx.h hypre-2.13.0/src/utilities/hypre_nvtx.h
--- hypre-2.11.2/src/utilities/hypre_nvtx.h	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/utilities/hypre_nvtx.h	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,72 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+#ifdef USE_NVTX
+#include "nvToolsExt.h"
+#include "nvToolsExtCudaRt.h"
+
+static const uint32_t colors[] = { 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff };
+static const int num_colors = sizeof(colors)/sizeof(uint32_t);
+
+#define PUSH_RANGE(name,cid) { \
+    int color_id = cid; \
+    color_id = color_id%num_colors;\
+    nvtxEventAttributes_t eventAttrib = {0}; \
+    eventAttrib.version = NVTX_VERSION; \
+    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
+    eventAttrib.colorType = NVTX_COLOR_ARGB; \
+    eventAttrib.color = colors[color_id]; \
+    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
+    eventAttrib.message.ascii = name; \
+    nvtxDomainRangePushEx(HYPRE_DOMAIN,&eventAttrib);	\
+}
+
+#define PUSH_RANGE_PAYLOAD(name,cid,load) {		\
+    int color_id = cid; \
+    color_id = color_id%num_colors;\
+    nvtxEventAttributes_t eventAttrib = {0}; \
+    eventAttrib.version = NVTX_VERSION; \
+    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
+    eventAttrib.colorType = NVTX_COLOR_ARGB; \
+    eventAttrib.color = colors[color_id]; \
+    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
+    eventAttrib.message.ascii = name; \
+    eventAttrib.payloadType = NVTX_PAYLOAD_TYPE_INT64; \
+    eventAttrib.payload.llValue = load; \
+    eventAttrib.category=1; \
+    nvtxDomainRangePushEx(HYPRE_DOMAIN,&eventAttrib); \
+}
+
+#define PUSH_RANGE_DOMAIN(name,cid,dId) {				\
+    int color_id = cid; \
+    color_id = color_id%num_colors;\
+    nvtxEventAttributes_t eventAttrib = {0}; \
+    eventAttrib.version = NVTX_VERSION; \
+    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
+    eventAttrib.colorType = NVTX_COLOR_ARGB; \
+    eventAttrib.color = colors[color_id]; \
+    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
+    eventAttrib.message.ascii = name; \
+    nvtxDomainRangePushEx(getdomain(dId),&eventAttrib);	\
+}
+
+#define POP_RANGE nvtxDomainRangePop(HYPRE_DOMAIN);
+#define POP_RANGE_DOMAIN(dId) {			\
+  nvtxDomainRangePop(getdomain(dId));		\
+  }
+#else
+#define PUSH_RANGE(name,cid)
+#define POP_RANGE
+#define PUSH_RANGE_PAYLOAD(name,cid,load)
+#define PUSH_RANGE_DOMAIN(name,cid,domainName)
+#endif
+
diff -Nru hypre-2.11.2/src/utilities/hypre_printf.c hypre-2.13.0/src/utilities/hypre_printf.c
--- hypre-2.11.2/src/utilities/hypre_printf.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/utilities/hypre_printf.c	2017-10-20 17:42:22.000000000 +0000
@@ -14,7 +14,7 @@
 #include <stdarg.h>
 #include <stdio.h>
 
-#ifdef HYPRE_BIGINT
+// #ifdef HYPRE_BIGINT
 
 /* these prototypes are missing by default for some compilers */
 int vscanf( const char *format , va_list arg );
@@ -42,32 +42,50 @@
       }
       else if (foundpercent)
       {
+         if (*fp == 'l')
+         {
+            fp++; /* remove 'l' and maybe add it back in switch statement */
+            if (*fp == 'l')
+            {
+               fp++; /* remove second 'l' if present */
+            }
+         }
          switch(*fp)
          {
             case 'd':
+            case 'i':
+#if defined(HYPRE_BIGINT)
                *nfp = 'l'; nfp++;
                *nfp = 'l'; nfp++;
-            case 'c':
+#endif
+               foundpercent = 0; break;
+            case 'f':
             case 'e':
             case 'E':
-            case 'f':
             case 'g':
             case 'G':
-            case 'i':
+#if defined(HYPRE_SINGLE)          /* no modifier */
+#elif defined(HYPRE_LONG_DOUBLE)   /* modify with 'L' */
+               *nfp = 'L'; nfp++;
+#else                              /* modify with 'l' (default is _double_) */
+               *nfp = 'l'; nfp++;
+#endif
+               foundpercent = 0; break;
+            case 'c':
             case 'n':
             case 'o':
             case 'p':
             case 's':
             case 'u':
             case 'x':
-            case 'S':
+            case 'X':
             case '%':
-               foundpercent = 0;
+               foundpercent = 0; break;
          }
       }
       *nfp = *fp; nfp++;
    }
-   *nfp = *fp; nfp++;
+   *nfp = *fp;
 
    *newformat_ptr = newformat;
 
@@ -79,9 +97,7 @@
 HYPRE_Int
 free_format( char *newformat )
 {
-#ifdef HYPRE_BIGINT
    hypre_TFree(newformat);
-#endif
 
    return 0;
 }
@@ -186,9 +202,9 @@
    return ierr;
 }
 
-#else
-
-/* this is used only to eliminate compiler warnings */
-HYPRE_Int hypre_printf_empty;
-
-#endif
+// #else
+// 
+// /* this is used only to eliminate compiler warnings */
+// HYPRE_Int hypre_printf_empty;
+// 
+// #endif
diff -Nru hypre-2.11.2/src/utilities/_hypre_utilities.h hypre-2.13.0/src/utilities/_hypre_utilities.h
--- hypre-2.11.2/src/utilities/_hypre_utilities.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/utilities/_hypre_utilities.h	2017-10-20 17:42:22.000000000 +0000
@@ -1,14 +1,6 @@
-/*BHEADER**********************************************************************
- * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
- * Produced at the Lawrence Livermore National Laboratory.
- * This file is part of HYPRE.  See file COPYRIGHT for details.
- *
- * HYPRE is free software; you can redistribute it and/or modify it under the
- * terms of the GNU Lesser General Public License (as published by the Free
- * Software Foundation) version 2.1 dated February 1999.
- *
- * $Revision$
- ***********************************************************************EHEADER*/
+
+/*** DO NOT EDIT THIS FILE DIRECTLY (use 'headers' to generate) ***/
+
 
 #ifndef hypre_UTILITIES_HEADER
 #define hypre_UTILITIES_HEADER
@@ -19,15 +11,6 @@
 #include <omp.h>
 #endif
 
-/* This allows us to consistently avoid 'int' throughout hypre */
-typedef int               hypre_int;
-typedef long int          hypre_longint;
-typedef unsigned int      hypre_uint;
-typedef unsigned long int hypre_ulongint;
-
-/* This allows us to consistently avoid 'double' throughout hypre */
-typedef double            hypre_double;
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -53,6 +36,15 @@
 #ifndef hypre_GENERAL_HEADER
 #define hypre_GENERAL_HEADER
 
+/* This allows us to consistently avoid 'int' throughout hypre */
+typedef int               hypre_int;
+typedef long int          hypre_longint;
+typedef unsigned int      hypre_uint;
+typedef unsigned long int hypre_ulongint;
+
+/* This allows us to consistently avoid 'double' throughout hypre */
+typedef double            hypre_double;
+
 /*--------------------------------------------------------------------------
  * Define various functions
  *--------------------------------------------------------------------------*/
@@ -130,7 +122,9 @@
 
 #define MPI_BOTTOM  	    hypre_MPI_BOTTOM
 
+#define MPI_FLOAT           hypre_MPI_FLOAT
 #define MPI_DOUBLE          hypre_MPI_DOUBLE           
+#define MPI_LONG_DOUBLE     hypre_MPI_LONG_DOUBLE
 #define MPI_INT             hypre_MPI_INT              
 #define MPI_LONG_LONG_INT   hypre_MPI_INT              
 #define MPI_CHAR            hypre_MPI_CHAR             
@@ -142,6 +136,7 @@
 #define MPI_MIN             hypre_MPI_MIN              
 #define MPI_MAX             hypre_MPI_MAX              
 #define MPI_LOR             hypre_MPI_LOR              
+#define MPI_LAND            hypre_MPI_LAND             
 #define MPI_SUCCESS         hypre_MPI_SUCCESS
 #define MPI_STATUSES_IGNORE hypre_MPI_STATUSES_IGNORE
 
@@ -232,18 +227,21 @@
 
 #define  hypre_MPI_BOTTOM  0x0
 
-#define  hypre_MPI_DOUBLE 0
-#define  hypre_MPI_INT 1
-#define  hypre_MPI_CHAR 2
-#define  hypre_MPI_LONG 3
-#define  hypre_MPI_BYTE 4
-#define  hypre_MPI_REAL 5
-#define  hypre_MPI_COMPLEX 6
+#define  hypre_MPI_FLOAT 0
+#define  hypre_MPI_DOUBLE 1
+#define  hypre_MPI_LONG_DOUBLE 2
+#define  hypre_MPI_INT 3
+#define  hypre_MPI_CHAR 4
+#define  hypre_MPI_LONG 5
+#define  hypre_MPI_BYTE 6
+#define  hypre_MPI_REAL 7
+#define  hypre_MPI_COMPLEX 8
 
 #define  hypre_MPI_SUM 0
 #define  hypre_MPI_MIN 1
 #define  hypre_MPI_MAX 2
 #define  hypre_MPI_LOR 3
+#define  hypre_MPI_LAND 4
 #define  hypre_MPI_SUCCESS 0
 #define  hypre_MPI_STATUSES_IGNORE 0
 
@@ -272,7 +270,9 @@
 #define  hypre_MPI_BOTTOM     MPI_BOTTOM
 #define  hypre_MPI_COMM_SELF  MPI_COMM_SELF
 
+#define  hypre_MPI_FLOAT   MPI_FLOAT
 #define  hypre_MPI_DOUBLE  MPI_DOUBLE
+#define  hypre_MPI_LONG_DOUBLE  MPI_LONG_DOUBLE
 /* HYPRE_MPI_INT is defined in HYPRE_utilities.h */
 #define  hypre_MPI_INT     HYPRE_MPI_INT
 #define  hypre_MPI_CHAR    MPI_CHAR
@@ -288,6 +288,7 @@
 #define  hypre_MPI_MAX MPI_MAX
 #define  hypre_MPI_LOR MPI_LOR
 #define  hypre_MPI_SUCCESS MPI_SUCCESS
+#define  hypre_MPI_STATUSES_IGNORE MPI_STATUSES_IGNORE
 
 #define  hypre_MPI_UNDEFINED       MPI_UNDEFINED   
 #define  hypre_MPI_REQUEST_NULL    MPI_REQUEST_NULL
@@ -295,7 +296,6 @@
 #define  hypre_MPI_ANY_TAG         MPI_ANY_TAG
 #define  hypre_MPI_SOURCE          MPI_SOURCE
 #define  hypre_MPI_TAG             MPI_TAG
-#define  hypre_MPI_STATUSES_IGNORE MPI_STATUSES_IGNORE
 #define  hypre_MPI_LAND            MPI_LAND
 
 #endif
@@ -415,6 +415,157 @@
 extern "C" {
 #endif
 
+#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_MANAGED)
+#ifdef __cplusplus
+extern "C++" {
+#endif
+#include <cuda.h>
+#include <cuda_runtime.h>
+#ifdef __cplusplus
+}
+#endif
+#define HYPRE_CUDA_GLOBAL __host__ __device__
+  
+#if defined(HYPRE_MEMORY_GPU)
+#define hypre_DeviceTAlloc(type, count) \
+  ({									\
+    type * ptr;								\
+    cudaError_t cudaerr = cudaMalloc((void**)&ptr,sizeof(type)*(count)); \
+    if ( cudaerr != cudaSuccess ) {					\
+      printf("\n ERROR hypre_DataTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+      HYPRE_Int *p = NULL; *p = 1;						\
+    }									\
+    ptr;})
+	
+#define hypre_DeviceCTAlloc(type, count) \
+	({								   \
+	type * ptr;						   \
+	cudaError_t cudaerr = cudaMalloc((void**)&ptr,sizeof(type)*(count)); \
+	if ( cudaerr != cudaSuccess ) {										\
+		printf("\n hypre_DataCTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+		HYPRE_Int *p = NULL; *p = 1;\
+	}		\
+	cudaMemset(ptr,0,sizeof(type)*(count));	   \
+	ptr;})									   \
+	
+#define hypre_DeviceTReAlloc(ptr, type, count) {type *newptr;				\
+	                                         cudaMalloc((void**)&,sizeof(type)*(count), cudaMemAttachGlobal);	\
+											 memcpy(newptr, ptr, sizeof(type)*(count)); \
+											 cudaFree(ptr);				\
+											 ptr = newptr;}
+#else
+ #define hypre_DeviceTAlloc(type, count) \
+	({																	\
+	type * ptr;															\
+	cudaError_t cudaerr = cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal);\
+	if ( cudaerr != cudaSuccess ) {										\
+		printf("\n ERROR hypre_DataTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+		HYPRE_Int *p = NULL; *p = 1;\
+	}\
+	ptr;})
+	
+#define hypre_DeviceCTAlloc(type, count) \
+	({								   \
+	type * ptr;						   \
+	cudaError_t cudaerr = cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal); \
+	if ( cudaerr != cudaSuccess ) {										\
+		printf("\n hypre_DataCTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+		HYPRE_Int *p = NULL; *p = 1;\
+	}		\
+	cudaMemset(ptr,0,sizeof(type)*(count));	   \
+	ptr;})									   \
+	
+#define hypre_DeviceTReAlloc(ptr, type, count) {type *newptr;				\
+	                                      cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal);	\
+					      memcpy(newptr, ptr, sizeof(type)*(count)); \
+					      cudaFree(ptr);		\
+					      ptr = newptr;} 
+#endif
+  
+#define hypre_DeviceTFree(ptr) \
+	{											\
+		cudaError_t cudaerr = cudaFree(ptr);							\
+		if ( cudaerr != cudaSuccess ) {									\
+			printf("\n CudaFree : %s in %s(%d) function %s\n",cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+			HYPRE_Int *p = NULL; *p = 1;										\
+		}																\
+	}																	\
+	
+
+#define hypre_DataCopyToData(ptrH,ptrD,type,count)						\
+	{cudaError_t cudaerr = cudaMemcpy(ptrD, ptrH, sizeof(type)*count, cudaMemcpyHostToDevice); \
+if ( cudaerr != cudaSuccess ) {										\
+		printf("\n hypre_DataCopyToData %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+		HYPRE_Int *p = NULL; *p = 1;\
+}							  \
+	}
+	
+	
+#define hypre_DataCopyFromData(ptrH,ptrD,type,count)						\
+	{cudaError_t cudaerr = cudaMemcpy(ptrH, ptrD, sizeof(type)*count, cudaMemcpyDeviceToHost); \
+	if ( cudaerr != cudaSuccess ) {										\
+		printf("\n hypre_DataCTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+		HYPRE_Int *p = NULL; *p = 1;\
+	}\
+	}
+
+#define hypre_DeviceMemset(ptr,value,type,count)	\
+	cudaMemset(ptr,value,count*sizeof(type));
+	
+#define hypre_UMTAlloc(type, count)				\
+  ({									\
+      type * ptr;								\
+      cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal); \
+      ptr;								\
+  })
+	
+#define hypre_UMCTAlloc(type, count)					\
+  ({									\
+    type * ptr;								\
+    cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal); \
+    cudaMemset(ptr,0,sizeof(type)*(count));				\
+    ptr;})								\
+  
+  
+#define hypre_UMTReAlloc(type, count)\
+  ({							 \
+    type * ptr;								\
+    type *newptr;							\
+    cudaMallocManaged((void**)&newptr,sizeof(type)*(count), cudaMemAttachGlobal); \
+    cudaFree(ptr);							\
+    ptr = newptr;							\
+    ptr;})								\
+  
+#define hypre_UMTFree(ptr) \
+      cudaFree(ptr)
+
+#define hypre_InitMemoryDebug(id)
+#define hypre_FinalizeMemoryDebug()
+
+#define hypre_TAlloc(type, count) \
+( (type *)hypre_MAlloc((size_t)(sizeof(type) * (count))) )
+
+#define hypre_CTAlloc(type, count) \
+( (type *)hypre_CAlloc((size_t)(count), (size_t)sizeof(type)) )
+
+#define hypre_TReAlloc(ptr, type, count) \
+( (type *)hypre_ReAlloc((char *)ptr, (size_t)(sizeof(type) * (count))) )
+
+#define hypre_TFree(ptr) \
+( hypre_Free((char *)ptr), ptr = NULL )
+  
+  //#define hypre_TAlloc(type, count)  hypre_UMTAlloc(type, count)
+  //#define hypre_CTAlloc(type, count) hypre_UMCTAlloc(type, count)
+  //#define hypre_TReAlloc(ptr, type, count) hypre_UMTReAlloc(type, count)
+  //#define hypre_TFree(ptr) hypre_UMTFree(ptr)
+
+#define hypre_SharedTAlloc(type, count) hypre_TAlloc(type, (count))
+#define hypre_SharedCTAlloc(type, count) hypre_CTAlloc(type, (count))
+#define hypre_SharedTReAlloc(type, count) hypre_TReAlloc(type, (count))
+#define hypre_SharedTFree(ptr) hypre_TFree(ptr)
+#else
+#define HYPRE_CUDA_GLOBAL 
+
 /*--------------------------------------------------------------------------
  * Use "Debug Malloc Library", dmalloc
  *--------------------------------------------------------------------------*/
@@ -468,6 +619,34 @@
 #define hypre_SharedTReAlloc(type, count) hypre_TReAlloc(type, (count))
 #define hypre_SharedTFree(ptr) hypre_TFree(ptr)
 
+#define hypre_DeviceTAlloc(type, count) hypre_TAlloc(type, (count))
+#define hypre_DeviceCTAlloc(type, count) hypre_CTAlloc(type, (count))
+#define hypre_DeviceTReAlloc(type, count) hypre_TReAlloc(type, (count))
+#define hypre_DeviceTFree(ptr) hypre_TFree(ptr)
+#define hypre_DataCopyToData(ptrH,ptrD,type,count) memcpy(ptrD, ptrH, sizeof(type)*(count))
+#define hypre_DataCopyFromData(ptrH,ptrD,type,count) memcpy(ptrH, ptrD, sizeof(type)*(count))
+#define hypre_DeviceMemset(ptr,value,type,count)	memset(ptr,value,count*sizeof(type))
+#define hypre_UMTAlloc(type, count) hypre_TAlloc(type, (count))
+#define hypre_UMCTAlloc(type, count) hypre_CTAlloc(type, (count))
+#define hypre_UMTReAlloc(type, count) hypre_TReAlloc(type, (count))
+#define hypre_UMTFree(ptr) hypre_TFree(ptr)
+#endif
+  
+#define hypre_PinnedTAlloc(type, count)\
+( (type *)hypre_MAllocPinned((size_t)(sizeof(type) * (count))) )
+
+#define hypre_HostTAlloc(type, count) \
+( (type *)hypre_MAllocHost((size_t)(sizeof(type) * (count))) )
+
+#define hypre_HostCTAlloc(type, count) \
+( (type *)hypre_CAllocHost((size_t)(count), (size_t)sizeof(type)) )
+
+#define hypre_HostTReAlloc(ptr, type, count) \
+( (type *)hypre_ReAllocHost((char *)ptr, (size_t)(sizeof(type) * (count))) )
+
+#define hypre_HostTFree(ptr) \
+( hypre_FreeHost((char *)ptr), ptr = NULL )
+
 /*--------------------------------------------------------------------------
  * Prototypes
  *--------------------------------------------------------------------------*/
@@ -476,8 +655,13 @@
 HYPRE_Int hypre_OutOfMemory ( size_t size );
 char *hypre_MAlloc ( size_t size );
 char *hypre_CAlloc ( size_t count , size_t elt_size );
+char *hypre_MAllocPinned( size_t size );
 char *hypre_ReAlloc ( char *ptr , size_t size );
 void hypre_Free ( char *ptr );
+char *hypre_CAllocHost( size_t count,size_t elt_size );
+char *hypre_MAllocHost( size_t size );
+char *hypre_ReAllocHost( char   *ptr,size_t  size );
+void hypre_FreeHost( char *ptr );
 char *hypre_SharedMAlloc ( size_t size );
 char *hypre_SharedCAlloc ( size_t count , size_t elt_size );
 char *hypre_SharedReAlloc ( char *ptr , size_t size );
@@ -577,11 +761,12 @@
 #ifndef HYPRE_TIMING
 
 #define hypre_InitializeTiming(name) 0
+#define hypre_FinalizeTiming(index)
 #define hypre_IncFLOPCount(inc)
 #define hypre_BeginTiming(i)
 #define hypre_EndTiming(i)
 #define hypre_PrintTiming(heading, comm)
-#define hypre_FinalizeTiming(index)
+#define hypre_ClearTiming()
 
 /*--------------------------------------------------------------------------
  * With timing on
@@ -747,11 +932,9 @@
    
 } hypre_DataExchangeResponse;
 
-
 HYPRE_Int hypre_CreateBinaryTree(HYPRE_Int, HYPRE_Int, hypre_BinaryTree*);
 HYPRE_Int hypre_DestroyBinaryTree(hypre_BinaryTree*);
 
-
 HYPRE_Int hypre_DataExchangeList(HYPRE_Int num_contacts, 
 		     HYPRE_Int *contact_proc_list, void *contact_send_buf, 
 		     HYPRE_Int *contact_send_buf_starts, HYPRE_Int contact_obj_size, 
@@ -760,7 +943,6 @@
                      HYPRE_Int rnum, MPI_Comm comm,  void **p_response_recv_buf, 
                      HYPRE_Int **p_response_recv_buf_starts);
 
-
 #endif /* end of header */
 
 /*BHEADER**********************************************************************
@@ -840,9 +1022,365 @@
 
 #endif /* CALIPER_INSTRUMENTATION_HEADER */
 
-/*--------------------------------------------------------------------------
- * Other prototypes
- *--------------------------------------------------------------------------*/
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+#if defined(HYPRE_USE_GPU) && defined(HYPRE_USE_MANAGED)
+//#define CUDAMEMATTACHTYPE cudaMemAttachGlobal
+//#define CUDAMEMATTACHTYPE cudaMemAttachHost
+#define HYPRE_GPU_USE_PINNED 1
+#define HYPRE_USE_MANAGED_SCALABLE 1
+#endif
+
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+#ifdef USE_NVTX
+#include "nvToolsExt.h"
+#include "nvToolsExtCudaRt.h"
+
+static const uint32_t colors[] = { 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff };
+static const int num_colors = sizeof(colors)/sizeof(uint32_t);
+
+#define PUSH_RANGE(name,cid) { \
+    int color_id = cid; \
+    color_id = color_id%num_colors;\
+    nvtxEventAttributes_t eventAttrib = {0}; \
+    eventAttrib.version = NVTX_VERSION; \
+    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
+    eventAttrib.colorType = NVTX_COLOR_ARGB; \
+    eventAttrib.color = colors[color_id]; \
+    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
+    eventAttrib.message.ascii = name; \
+    nvtxDomainRangePushEx(HYPRE_DOMAIN,&eventAttrib);	\
+}
+
+#define PUSH_RANGE_PAYLOAD(name,cid,load) {		\
+    int color_id = cid; \
+    color_id = color_id%num_colors;\
+    nvtxEventAttributes_t eventAttrib = {0}; \
+    eventAttrib.version = NVTX_VERSION; \
+    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
+    eventAttrib.colorType = NVTX_COLOR_ARGB; \
+    eventAttrib.color = colors[color_id]; \
+    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
+    eventAttrib.message.ascii = name; \
+    eventAttrib.payloadType = NVTX_PAYLOAD_TYPE_INT64; \
+    eventAttrib.payload.llValue = load; \
+    eventAttrib.category=1; \
+    nvtxDomainRangePushEx(HYPRE_DOMAIN,&eventAttrib); \
+}
+
+#define PUSH_RANGE_DOMAIN(name,cid,dId) {				\
+    int color_id = cid; \
+    color_id = color_id%num_colors;\
+    nvtxEventAttributes_t eventAttrib = {0}; \
+    eventAttrib.version = NVTX_VERSION; \
+    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
+    eventAttrib.colorType = NVTX_COLOR_ARGB; \
+    eventAttrib.color = colors[color_id]; \
+    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
+    eventAttrib.message.ascii = name; \
+    nvtxDomainRangePushEx(getdomain(dId),&eventAttrib);	\
+}
+
+#define POP_RANGE nvtxDomainRangePop(HYPRE_DOMAIN);
+#define POP_RANGE_DOMAIN(dId) {			\
+  nvtxDomainRangePop(getdomain(dId));		\
+  }
+#else
+#define PUSH_RANGE(name,cid)
+#define POP_RANGE
+#define PUSH_RANGE_PAYLOAD(name,cid,load)
+#define PUSH_RANGE_DOMAIN(name,cid,domainName)
+#endif
+
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+#ifdef HYPRE_USE_MANAGED
+#include <cuda_runtime_api.h>
+#define CUDAMEMATTACHTYPE cudaMemAttachGlobal
+#define MEM_PAD_LEN 1
+#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, const char *file, int line)
+{
+   if (code != cudaSuccess) 
+   {
+     fprintf(stderr,"CUDA ERROR ( Code = %d) in line %d of file %s\n",code,line,file);
+     fprintf(stderr,"CUDA ERROR : %s \n", cudaGetErrorString(code));
+     exit(2);
+   }
+}
+#define HYPRE_HOST_POINTER 0
+#define HYPRE_MANAGED_POINTER 1
+#define HYPRE_PINNED_POINTER 2
+#define HYPRE_DEVICE_POINTER 3
+#define HYPRE_UNDEFINED_POINTER1 4
+#define HYPRE_UNDEFINED_POINTER2 5
+void cudaSafeFree(void *ptr,int padding);
+hypre_int PrintPointerAttributes(const void *ptr);
+hypre_int PointerAttributes(const void *ptr);
+#endif
+
+#if defined(HYPRE_USE_GPU) && defined(HYPRE_USE_MANAGED)
+#ifndef __cusparseErrorCheck__
+#define __cusparseErrorCheck__
+#include <cusparse.h>
+#include <cublas_v2.h>
+#include <stdio.h>
+//#include <cuda_runtime_api.h>
+#include <stdlib.h>
+inline const char *cusparseErrorCheck(cusparseStatus_t error)
+{
+    switch (error)
+    {
+        case CUSPARSE_STATUS_SUCCESS:
+            return "CUSPARSE_STATUS_SUCCESS";
+
+        case CUSPARSE_STATUS_NOT_INITIALIZED:
+            return "CUSPARSE_STATUS_NOT_INITIALIZED";
+
+        case CUSPARSE_STATUS_ALLOC_FAILED:
+            return "CUSPARSE_STATUS_ALLOC_FAILED";
+
+        case CUSPARSE_STATUS_INVALID_VALUE:
+            return "CUSPARSE_STATUS_INVALID_VALUE";
+
+        case CUSPARSE_STATUS_ARCH_MISMATCH:
+            return "CUSPARSE_STATUS_ARCH_MISMATCH";
+
+        case CUSPARSE_STATUS_MAPPING_ERROR:
+            return "CUSPARSE_STATUS_MAPPING_ERROR";
+
+        case CUSPARSE_STATUS_EXECUTION_FAILED:
+            return "CUSPARSE_STATUS_EXECUTION_FAILED";
+
+        case CUSPARSE_STATUS_INTERNAL_ERROR:
+            return "CUSPARSE_STATUS_INTERNAL_ERROR";
+
+        case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+            return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+        default:
+	    return "Unknown error in cusparseErrorCheck";
+    }
+    
+}
+inline const char *cublasErrorCheck(cublasStatus_t error)
+{
+    switch (error)
+    {
+        case CUBLAS_STATUS_SUCCESS:
+            return "CUBLAS_STATUS_SUCCESS";
+
+        case CUBLAS_STATUS_NOT_INITIALIZED:
+            return "CUBLAS_STATUS_NOT_INITIALIZED";
+
+        case CUBLAS_STATUS_ALLOC_FAILED:
+            return "CUBLAS_STATUS_ALLOC_FAILED";
+
+        case CUBLAS_STATUS_INVALID_VALUE:
+            return "CUBLAS_STATUS_INVALID_VALUE";
+
+        case CUBLAS_STATUS_ARCH_MISMATCH:
+            return "CUBLAS_STATUS_ARCH_MISMATCH";
+
+        case CUBLAS_STATUS_MAPPING_ERROR:
+            return "CUBLAS_STATUS_MAPPING_ERROR";
+
+        case CUBLAS_STATUS_EXECUTION_FAILED:
+            return "CUBLAS_STATUS_EXECUTION_FAILED";
+
+        case CUBLAS_STATUS_INTERNAL_ERROR:
+            return "CUBLAS_STATUS_INTERNAL_ERROR";
+
+        case CUBLAS_STATUS_NOT_SUPPORTED:
+            return "CUBLAS_STATUS_NOT_SUPPORTED";
+        case CUBLAS_STATUS_LICENSE_ERROR:
+	    return "CUBLAS_STATUS_LICENSE_ERROR";
+        default:
+	    return "Unknown error in cublasErrorCheck";
+    }
+
+}
+//#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+//inline void gpuAssert(cudaError_t code, const char *file, int line)
+//{
+//   if (code != cudaSuccess) 
+//   {
+//     fprintf(stderr,"CUDA ERROR ( Code = %d) in line %d of file %s\n",code,line,file);
+//     fprintf(stderr,"CUDA ERROR : %s \n", cudaGetErrorString(code));
+//     exit(2);
+//   }
+//}
+#define cusparseErrchk(ans) { cusparseAssert((ans), __FILE__, __LINE__); }
+inline void cusparseAssert(cusparseStatus_t code, const char *file, int line)
+{
+   if (code != CUSPARSE_STATUS_SUCCESS) 
+   {
+     fprintf(stderr,"CUSPARSE ERROR  ( Code = %d) IN CUDA CALL line %d of file %s\n",code,line,file);
+     fprintf(stderr,"CUSPARSE ERROR : %s \n", cusparseErrorCheck(code));
+   }
+}
+#define cublasErrchk(ans){ cublasAssert((ans), __FILE__, __LINE__); }
+inline void cublasAssert(cublasStatus_t code, const char *file, int line)
+{
+   if (code != CUBLAS_STATUS_SUCCESS) 
+   {
+     fprintf(stderr,"CUBLAS ERROR  ( Code = %d) IN CUDA CALL line %d of file %s\n",code,line,file);
+     fprintf(stderr,"CUBLAS ERROR : %s \n", cublasErrorCheck(code));
+   }
+}
+//int PointerType(const void *ptr);
+void cudaSafeFree(void *ptr,int padding);
+//void PrintPointerAttributes(const void *ptr);
+//size_t mempush(void* ptr, size_t size,int purge);
+//int memloc(void *ptr, int device);
+#endif
+#endif
+
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+#if defined(HYPRE_USE_GPU) && defined(HYPRE_USE_MANAGED)
+#ifndef __GPUMEM_H__
+#define  __GPUMEM_H__
+#ifdef HYPRE_USE_GPU
+#include <cuda_runtime_api.h>
+void hypre_GPUInit(hypre_int use_device);
+void hypre_GPUFinalize();
+int VecScaleScalar(double *u, const double alpha,  int num_rows,cudaStream_t s);
+void VecCopy(double* tgt, const double* src, int size,cudaStream_t s);
+void VecSet(double* tgt, int size, double value, cudaStream_t s);
+void VecScale(double *u, double *v, double *l1_norm, int num_rows,cudaStream_t s);
+void VecScaleSplit(double *u, double *v, double *l1_norm, int num_rows,cudaStream_t s);
+void CudaCompileFlagCheck();
+#endif
+
+cudaStream_t getstreamOlde(hypre_int i);
+nvtxDomainHandle_t getdomain(hypre_int i);
+cudaEvent_t getevent(hypre_int i);
+void MemAdviseReadOnly(const void *ptr, hypre_int device);
+void MemAdviseUnSetReadOnly(const void *ptr, hypre_int device);
+void MemAdviseSetPrefLocDevice(const void *ptr, hypre_int device);
+void MemAdviseSetPrefLocHost(const void *ptr);
+void MemPrefetch(const void *ptr,hypre_int device,cudaStream_t stream);
+void MemPrefetchSized(const void *ptr,size_t size,hypre_int device,cudaStream_t stream);
+void MemPrefetchForce(const void *ptr,hypre_int device,cudaStream_t stream);
+cublasHandle_t getCublasHandle();
+cusparseHandle_t getCusparseHandle();
+typedef struct node {
+  const void *ptr;
+  size_t size;
+  struct node *next;
+} node;
+size_t mempush(const void *ptr, size_t size, hypre_int action);
+node *memfind(node *head, const void *ptr);
+void memdel(node **head, node *found);
+void meminsert(node **head, const void *ptr,size_t size);
+void printlist(node *head,hypre_int nc);
+//#define MEM_PAD_LEN 1
+size_t memsize(const void *ptr);
+hypre_int getsetasyncmode(hypre_int mode, hypre_int action);
+void SetAsyncMode(hypre_int mode);
+hypre_int GetAsyncMode();
+void branchStream(hypre_int i, hypre_int j);
+void joinStreams(hypre_int i, hypre_int j, hypre_int k);
+void affs(hypre_int myid);
+hypre_int getcore();
+hypre_int getnuma();
+hypre_int checkDeviceProps();
+hypre_int pointerIsManaged(const void *ptr);
+/*
+ * Global struct for keeping HYPRE GPU Init state
+ */
+
+#define MAX_HGS_ELEMENTS 10
+struct hypre__global_struct{
+  hypre_int initd;
+  hypre_int device;
+  hypre_int device_count;
+  cublasHandle_t cublas_handle;
+  cusparseHandle_t cusparse_handle;
+  cusparseMatDescr_t cusparse_mat_descr;
+  cudaStream_t streams[MAX_HGS_ELEMENTS];
+  nvtxDomainHandle_t nvtx_domain;
+  hypre_int concurrent_managed_access;
+  size_t memoryHWM;
+};
+
+extern struct hypre__global_struct hypre__global_handle ;
+
+/*
+ * Macros for accessing elements of the global handle
+ */
+#define HYPRE_GPU_HANDLE hypre__global_handle.initd
+#define HYPRE_CUBLAS_HANDLE hypre__global_handle.cublas_handle
+#define HYPRE_CUSPARSE_HANDLE hypre__global_handle.cusparse_handle
+#define HYPRE_DEVICE hypre__global_handle.device
+#define HYPRE_DEVICE_COUNT hypre__global_handle.device_count
+#define HYPRE_CUSPARSE_MAT_DESCR hypre__global_handle.cusparse_mat_descr
+#define HYPRE_STREAM(index) (hypre__global_handle.streams[index])
+#define HYPRE_DOMAIN  hypre__global_handle.nvtx_domain
+#define HYPRE_GPU_CMA hypre__global_handle.concurrent_managed_access
+#define HYPRE_GPU_HWM hypre__global_handle.memoryHWM
+
+#endif
+
+#else
+
+#define hypre_GPUInit(use_device)
+#define hypre_GPUFinalize()
+
+#endif
+
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
 
 /* amg_linklist.c */
 void hypre_dispose_elt ( hypre_LinkList element_ptr );
@@ -869,21 +1407,21 @@
 #endif
 
 /* hypre_printf.c */
-#ifdef HYPRE_BIGINT
+// #ifdef HYPRE_BIGINT
 HYPRE_Int hypre_printf( const char *format , ... );
 HYPRE_Int hypre_fprintf( FILE *stream , const char *format, ... );
 HYPRE_Int hypre_sprintf( char *s , const char *format, ... );
 HYPRE_Int hypre_scanf( const char *format , ... );
 HYPRE_Int hypre_fscanf( FILE *stream , const char *format, ... );
 HYPRE_Int hypre_sscanf( char *s , const char *format, ... );
-#else
-#define hypre_printf  printf
-#define hypre_fprintf fprintf
-#define hypre_sprintf sprintf
-#define hypre_scanf   scanf
-#define hypre_fscanf  fscanf
-#define hypre_sscanf  sscanf
-#endif
+// #else
+// #define hypre_printf  printf
+// #define hypre_fprintf fprintf
+// #define hypre_sprintf sprintf
+// #define hypre_scanf   scanf
+// #define hypre_fscanf  fscanf
+// #define hypre_sscanf  sscanf
+// #endif
 
 /* hypre_qsort.c */
 void hypre_swap ( HYPRE_Int *v , HYPRE_Int i , HYPRE_Int j );
@@ -906,8 +1444,9 @@
 HYPRE_Int hypre_DoubleQuickSplit ( HYPRE_Real *values , HYPRE_Int *indices , HYPRE_Int list_length , HYPRE_Int NumberKept );
 
 /* random.c */
-void hypre_SeedRand ( HYPRE_Int seed );
-HYPRE_Real hypre_Rand ( void );
+HYPRE_CUDA_GLOBAL void hypre_SeedRand ( HYPRE_Int seed );
+HYPRE_CUDA_GLOBAL HYPRE_Int hypre_RandI ( void );
+HYPRE_CUDA_GLOBAL HYPRE_Real hypre_Rand ( void );
 
 /* hypre_prefix_sum.c */
 /**
@@ -1057,7 +1596,5 @@
 }
 #endif
 
-/*#include "hypre_hopscotch_hash.h"*/
-
 #endif
 
diff -Nru hypre-2.11.2/src/utilities/HYPRE_utilities.h hypre-2.13.0/src/utilities/HYPRE_utilities.h
--- hypre-2.11.2/src/utilities/HYPRE_utilities.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/utilities/HYPRE_utilities.h	2017-10-20 17:42:22.000000000 +0000
@@ -43,25 +43,51 @@
  * Big int stuff
  *--------------------------------------------------------------------------*/
 
-#ifdef HYPRE_BIGINT
+#if defined(HYPRE_BIGINT)
 typedef long long int HYPRE_Int;
 #define HYPRE_MPI_INT MPI_LONG_LONG_INT
-#else 
+
+#else /* default */
 typedef int HYPRE_Int;
 #define HYPRE_MPI_INT MPI_INT
 #endif
 
 /*--------------------------------------------------------------------------
- * Complex stuff
+ * Real and Complex types
  *--------------------------------------------------------------------------*/
 
+#include <float.h>
+
+#if defined(HYPRE_SINGLE)
+typedef float HYPRE_Real;
+#define HYPRE_REAL_MAX FLT_MAX
+#define HYPRE_REAL_MIN FLT_MIN
+#define HYPRE_REAL_EPSILON FLT_EPSILON
+#define HYPRE_REAL_MIN_EXP FLT_MIN_EXP
+#define HYPRE_MPI_REAL MPI_FLOAT
+
+#elif defined(HYPRE_LONG_DOUBLE)
+typedef long double HYPRE_Real;
+#define HYPRE_REAL_MAX LDBL_MAX
+#define HYPRE_REAL_MIN LDBL_MIN
+#define HYPRE_REAL_EPSILON LDBL_EPSILON
+#define HYPRE_REAL_MIN_EXP DBL_MIN_EXP
+#define HYPRE_MPI_REAL MPI_LONG_DOUBLE
+
+#else /* default */
 typedef double HYPRE_Real;
+#define HYPRE_REAL_MAX DBL_MAX
+#define HYPRE_REAL_MIN DBL_MIN
+#define HYPRE_REAL_EPSILON DBL_EPSILON
+#define HYPRE_REAL_MIN_EXP DBL_MIN_EXP
 #define HYPRE_MPI_REAL MPI_DOUBLE
+#endif
 
-#ifdef HYPRE_COMPLEX
+#if defined(HYPRE_COMPLEX)
 typedef double _Complex HYPRE_Complex;
 #define HYPRE_MPI_COMPLEX MPI_C_DOUBLE_COMPLEX  /* or MPI_LONG_DOUBLE ? */
-#else 
+
+#else  /* default */
 typedef HYPRE_Real HYPRE_Complex;
 #define HYPRE_MPI_COMPLEX HYPRE_MPI_REAL
 #endif
diff -Nru hypre-2.11.2/src/utilities/Makefile hypre-2.13.0/src/utilities/Makefile
--- hypre-2.11.2/src/utilities/Makefile	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/utilities/Makefile	2017-10-20 17:42:22.000000000 +0000
@@ -60,7 +60,9 @@
  F90_HYPRE_error.c\
  hypre_prefix_sum.c\
  hypre_merge_sort.c\
- hypre_hopscotch_hash.c
+ hypre_hopscotch_hash.c\
+ gpuErrorCheck.c\
+ gpuMem.c
 
 OBJS = ${FILES:.c=.o}
 
diff -Nru hypre-2.11.2/src/utilities/mpistubs.c hypre-2.13.0/src/utilities/mpistubs.c
--- hypre-2.11.2/src/utilities/mpistubs.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/utilities/mpistubs.c	2017-10-20 17:42:22.000000000 +0000
@@ -25,7 +25,7 @@
 #ifdef HYPRE_HAVE_MPI_COMM_F2C
    return (hypre_MPI_Comm) MPI_Comm_f2c(comm);
 #else
-   return (hypre_MPI_Comm) comm;
+   return (hypre_MPI_Comm) (size_t)comm;
 #endif
 }
 
diff -Nru hypre-2.11.2/src/utilities/mpistubs.h hypre-2.13.0/src/utilities/mpistubs.h
--- hypre-2.11.2/src/utilities/mpistubs.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/utilities/mpistubs.h	2017-10-20 17:42:22.000000000 +0000
@@ -46,10 +46,13 @@
 
 #define MPI_COMM_WORLD      hypre_MPI_COMM_WORLD       
 #define MPI_COMM_NULL       hypre_MPI_COMM_NULL
+#define MPI_COMM_SELF       hypre_MPI_COMM_SELF
 
 #define MPI_BOTTOM  	    hypre_MPI_BOTTOM
 
+#define MPI_FLOAT           hypre_MPI_FLOAT
 #define MPI_DOUBLE          hypre_MPI_DOUBLE           
+#define MPI_LONG_DOUBLE     hypre_MPI_LONG_DOUBLE
 #define MPI_INT             hypre_MPI_INT              
 #define MPI_LONG_LONG_INT   hypre_MPI_INT              
 #define MPI_CHAR            hypre_MPI_CHAR             
@@ -61,6 +64,9 @@
 #define MPI_MIN             hypre_MPI_MIN              
 #define MPI_MAX             hypre_MPI_MAX              
 #define MPI_LOR             hypre_MPI_LOR              
+#define MPI_LAND            hypre_MPI_LAND             
+#define MPI_SUCCESS         hypre_MPI_SUCCESS
+#define MPI_STATUSES_IGNORE hypre_MPI_STATUSES_IGNORE
 
 #define MPI_UNDEFINED       hypre_MPI_UNDEFINED        
 #define MPI_REQUEST_NULL    hypre_MPI_REQUEST_NULL        
@@ -68,8 +74,6 @@
 #define MPI_ANY_TAG         hypre_MPI_ANY_TAG
 #define MPI_SOURCE          hypre_MPI_SOURCE
 #define MPI_TAG             hypre_MPI_TAG
-#define MPI_SUCCESS         hypre_MPI_SUCCESS
-#define MPI_STATUSES_IGNORE hypre_MPI_STATUSES_IGNORE
 
 #define MPI_Init            hypre_MPI_Init             
 #define MPI_Finalize        hypre_MPI_Finalize         
@@ -93,9 +97,9 @@
 #define MPI_Allgather       hypre_MPI_Allgather        
 #define MPI_Allgatherv      hypre_MPI_Allgatherv       
 #define MPI_Gather          hypre_MPI_Gather       
-#define MPI_Gatherv          hypre_MPI_Gatherv       
+#define MPI_Gatherv         hypre_MPI_Gatherv       
 #define MPI_Scatter         hypre_MPI_Scatter       
-#define MPI_Scatterv         hypre_MPI_Scatterv       
+#define MPI_Scatterv        hypre_MPI_Scatterv       
 #define MPI_Bcast           hypre_MPI_Bcast            
 #define MPI_Send            hypre_MPI_Send             
 #define MPI_Recv            hypre_MPI_Recv             
@@ -122,6 +126,9 @@
 #define MPI_Type_struct     hypre_MPI_Type_struct      
 #define MPI_Type_commit     hypre_MPI_Type_commit
 #define MPI_Type_free       hypre_MPI_Type_free        
+#define MPI_Op_free         hypre_MPI_Op_free        
+#define MPI_Op_create       hypre_MPI_Op_create
+#define MPI_User_function   hypre_MPI_User_function
 
 /*--------------------------------------------------------------------------
  * Types, etc.
@@ -132,6 +139,7 @@
 typedef HYPRE_Int hypre_MPI_Group;
 typedef HYPRE_Int hypre_MPI_Request;
 typedef HYPRE_Int hypre_MPI_Datatype;
+typedef void (hypre_MPI_User_function) ();
 
 typedef struct
 {
@@ -141,23 +149,27 @@
 typedef HYPRE_Int  hypre_MPI_Op;
 typedef HYPRE_Int  hypre_MPI_Aint;
 
+#define  hypre_MPI_COMM_SELF 1
 #define  hypre_MPI_COMM_WORLD 0
 #define  hypre_MPI_COMM_NULL  -1
 
 #define  hypre_MPI_BOTTOM  0x0
 
-#define  hypre_MPI_DOUBLE 0
-#define  hypre_MPI_INT 1
-#define  hypre_MPI_CHAR 2
-#define  hypre_MPI_LONG 3
-#define  hypre_MPI_BYTE 4
-#define  hypre_MPI_REAL 5
-#define  hypre_MPI_COMPLEX 6
+#define  hypre_MPI_FLOAT 0
+#define  hypre_MPI_DOUBLE 1
+#define  hypre_MPI_LONG_DOUBLE 2
+#define  hypre_MPI_INT 3
+#define  hypre_MPI_CHAR 4
+#define  hypre_MPI_LONG 5
+#define  hypre_MPI_BYTE 6
+#define  hypre_MPI_REAL 7
+#define  hypre_MPI_COMPLEX 8
 
 #define  hypre_MPI_SUM 0
 #define  hypre_MPI_MIN 1
 #define  hypre_MPI_MAX 2
 #define  hypre_MPI_LOR 3
+#define  hypre_MPI_LAND 4
 #define  hypre_MPI_SUCCESS 0
 #define  hypre_MPI_STATUSES_IGNORE 0
 
@@ -179,13 +191,16 @@
 typedef MPI_Status   hypre_MPI_Status;
 typedef MPI_Op       hypre_MPI_Op;
 typedef MPI_Aint     hypre_MPI_Aint;
+typedef MPI_User_function    hypre_MPI_User_function;
 
 #define  hypre_MPI_COMM_WORLD MPI_COMM_WORLD
 #define  hypre_MPI_COMM_NULL  MPI_COMM_NULL
 #define  hypre_MPI_BOTTOM     MPI_BOTTOM
-#define  hypre_MPI_SUCCESS    MPI_SUCCESS
+#define  hypre_MPI_COMM_SELF  MPI_COMM_SELF
 
+#define  hypre_MPI_FLOAT   MPI_FLOAT
 #define  hypre_MPI_DOUBLE  MPI_DOUBLE
+#define  hypre_MPI_LONG_DOUBLE  MPI_LONG_DOUBLE
 /* HYPRE_MPI_INT is defined in HYPRE_utilities.h */
 #define  hypre_MPI_INT     HYPRE_MPI_INT
 #define  hypre_MPI_CHAR    MPI_CHAR
@@ -200,6 +215,8 @@
 #define  hypre_MPI_MIN MPI_MIN
 #define  hypre_MPI_MAX MPI_MAX
 #define  hypre_MPI_LOR MPI_LOR
+#define  hypre_MPI_SUCCESS MPI_SUCCESS
+#define  hypre_MPI_STATUSES_IGNORE MPI_STATUSES_IGNORE
 
 #define  hypre_MPI_UNDEFINED       MPI_UNDEFINED   
 #define  hypre_MPI_REQUEST_NULL    MPI_REQUEST_NULL
@@ -207,7 +224,6 @@
 #define  hypre_MPI_ANY_TAG         MPI_ANY_TAG
 #define  hypre_MPI_SOURCE          MPI_SOURCE
 #define  hypre_MPI_TAG             MPI_TAG
-#define  hypre_MPI_STATUSES_IGNORE MPI_STATUSES_IGNORE
 #define  hypre_MPI_LAND            MPI_LAND
 
 #endif
@@ -272,9 +288,12 @@
 HYPRE_Int hypre_MPI_Type_struct( HYPRE_Int count , HYPRE_Int *array_of_blocklengths , hypre_MPI_Aint *array_of_displacements , hypre_MPI_Datatype *array_of_types , hypre_MPI_Datatype *newtype );
 HYPRE_Int hypre_MPI_Type_commit( hypre_MPI_Datatype *datatype );
 HYPRE_Int hypre_MPI_Type_free( hypre_MPI_Datatype *datatype );
+HYPRE_Int hypre_MPI_Op_free( hypre_MPI_Op *op );
+HYPRE_Int hypre_MPI_Op_create( hypre_MPI_User_function *function , hypre_int commute , hypre_MPI_Op *op );
 
 #ifdef __cplusplus
 }
 #endif
 
 #endif
+
diff -Nru hypre-2.11.2/src/utilities/protos.h hypre-2.13.0/src/utilities/protos.h
--- hypre-2.11.2/src/utilities/protos.h	1970-01-01 00:00:00.000000000 +0000
+++ hypre-2.13.0/src/utilities/protos.h	2017-10-20 17:42:22.000000000 +0000
@@ -0,0 +1,221 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+/* amg_linklist.c */
+void hypre_dispose_elt ( hypre_LinkList element_ptr );
+void hypre_remove_point ( hypre_LinkList *LoL_head_ptr , hypre_LinkList *LoL_tail_ptr , HYPRE_Int measure , HYPRE_Int index , HYPRE_Int *lists , HYPRE_Int *where );
+hypre_LinkList hypre_create_elt ( HYPRE_Int Item );
+void hypre_enter_on_lists ( hypre_LinkList *LoL_head_ptr , hypre_LinkList *LoL_tail_ptr , HYPRE_Int measure , HYPRE_Int index , HYPRE_Int *lists , HYPRE_Int *where );
+
+/* binsearch.c */
+HYPRE_Int hypre_BinarySearch ( HYPRE_Int *list , HYPRE_Int value , HYPRE_Int list_length );
+HYPRE_Int hypre_BinarySearch2 ( HYPRE_Int *list , HYPRE_Int value , HYPRE_Int low , HYPRE_Int high , HYPRE_Int *spot );
+HYPRE_Int *hypre_LowerBound( HYPRE_Int *first, HYPRE_Int *last, HYPRE_Int value );
+
+/* hypre_complex.c */
+#ifdef HYPRE_COMPLEX
+HYPRE_Complex hypre_conj( HYPRE_Complex value );
+HYPRE_Real    hypre_cabs( HYPRE_Complex value );
+HYPRE_Real    hypre_creal( HYPRE_Complex value );
+HYPRE_Real    hypre_cimag( HYPRE_Complex value );
+#else
+#define hypre_conj(value)  value
+#define hypre_cabs(value)  fabs(value)
+#define hypre_creal(value) value
+#define hypre_cimag(value) 0.0
+#endif
+
+/* hypre_printf.c */
+// #ifdef HYPRE_BIGINT
+HYPRE_Int hypre_printf( const char *format , ... );
+HYPRE_Int hypre_fprintf( FILE *stream , const char *format, ... );
+HYPRE_Int hypre_sprintf( char *s , const char *format, ... );
+HYPRE_Int hypre_scanf( const char *format , ... );
+HYPRE_Int hypre_fscanf( FILE *stream , const char *format, ... );
+HYPRE_Int hypre_sscanf( char *s , const char *format, ... );
+// #else
+// #define hypre_printf  printf
+// #define hypre_fprintf fprintf
+// #define hypre_sprintf sprintf
+// #define hypre_scanf   scanf
+// #define hypre_fscanf  fscanf
+// #define hypre_sscanf  sscanf
+// #endif
+
+/* hypre_qsort.c */
+void hypre_swap ( HYPRE_Int *v , HYPRE_Int i , HYPRE_Int j );
+void hypre_swap2 ( HYPRE_Int *v , HYPRE_Real *w , HYPRE_Int i , HYPRE_Int j );
+void hypre_swap2i ( HYPRE_Int *v , HYPRE_Int *w , HYPRE_Int i , HYPRE_Int j );
+void hypre_swap3i ( HYPRE_Int *v , HYPRE_Int *w , HYPRE_Int *z , HYPRE_Int i , HYPRE_Int j );
+void hypre_swap3_d ( HYPRE_Real *v , HYPRE_Int *w , HYPRE_Int *z , HYPRE_Int i , HYPRE_Int j );
+void hypre_swap4_d ( HYPRE_Real *v , HYPRE_Int *w , HYPRE_Int *z , HYPRE_Int *y , HYPRE_Int i , HYPRE_Int j );
+void hypre_swap_d ( HYPRE_Real *v , HYPRE_Int i , HYPRE_Int j );
+void hypre_qsort0 ( HYPRE_Int *v , HYPRE_Int left , HYPRE_Int right );
+void hypre_qsort1 ( HYPRE_Int *v , HYPRE_Real *w , HYPRE_Int left , HYPRE_Int right );
+void hypre_qsort2i ( HYPRE_Int *v , HYPRE_Int *w , HYPRE_Int left , HYPRE_Int right );
+void hypre_qsort2 ( HYPRE_Int *v , HYPRE_Real *w , HYPRE_Int left , HYPRE_Int right );
+void hypre_qsort3i ( HYPRE_Int *v , HYPRE_Int *w , HYPRE_Int *z , HYPRE_Int left , HYPRE_Int right );
+void hypre_qsort3_abs ( HYPRE_Real *v , HYPRE_Int *w , HYPRE_Int *z , HYPRE_Int left , HYPRE_Int right );
+void hypre_qsort4_abs ( HYPRE_Real *v , HYPRE_Int *w , HYPRE_Int *z , HYPRE_Int *y , HYPRE_Int left , HYPRE_Int right );
+void hypre_qsort_abs ( HYPRE_Real *w , HYPRE_Int left , HYPRE_Int right );
+
+/* qsplit.c */
+HYPRE_Int hypre_DoubleQuickSplit ( HYPRE_Real *values , HYPRE_Int *indices , HYPRE_Int list_length , HYPRE_Int NumberKept );
+
+/* random.c */
+HYPRE_CUDA_GLOBAL void hypre_SeedRand ( HYPRE_Int seed );
+HYPRE_CUDA_GLOBAL HYPRE_Int hypre_RandI ( void );
+HYPRE_CUDA_GLOBAL HYPRE_Real hypre_Rand ( void );
+
+/* hypre_prefix_sum.c */
+/**
+ * Assumed to be called within an omp region.
+ * Let x_i be the input of ith thread.
+ * The output of ith thread y_i = x_0 + x_1 + ... + x_{i-1}
+ * Additionally, sum = x_0 + x_1 + ... + x_{nthreads - 1}
+ * Note that always y_0 = 0
+ *
+ * @param workspace at least with length (nthreads+1)
+ *                  workspace[tid] will contain result for tid
+ *                  workspace[nthreads] will contain sum
+ */
+void hypre_prefix_sum(HYPRE_Int *in_out, HYPRE_Int *sum, HYPRE_Int *workspace);
+/**
+ * This version does prefix sum in pair.
+ * Useful when we prefix sum of diag and offd in tandem.
+ *
+ * @param worksapce at least with length 2*(nthreads+1)
+ *                  workspace[2*tid] and workspace[2*tid+1] will contain results for tid
+ *                  workspace[3*nthreads] and workspace[3*nthreads + 1] will contain sums
+ */
+void hypre_prefix_sum_pair(HYPRE_Int *in_out1, HYPRE_Int *sum1, HYPRE_Int *in_out2, HYPRE_Int *sum2, HYPRE_Int *workspace);
+/**
+ * @param workspace at least with length 3*(nthreads+1)
+ *                  workspace[3*tid:3*tid+3) will contain results for tid
+ */
+void hypre_prefix_sum_triple(HYPRE_Int *in_out1, HYPRE_Int *sum1, HYPRE_Int *in_out2, HYPRE_Int *sum2, HYPRE_Int *in_out3, HYPRE_Int *sum3, HYPRE_Int *workspace);
+
+/**
+ * n prefix-sums together.
+ * workspace[n*tid:n*(tid+1)) will contain results for tid
+ * workspace[nthreads*tid:nthreads*(tid+1)) will contain sums
+ *
+ * @param workspace at least with length n*(nthreads+1)
+ */
+void hypre_prefix_sum_multiple(HYPRE_Int *in_out, HYPRE_Int *sum, HYPRE_Int n, HYPRE_Int *workspace);
+
+/* hypre_merge_sort.c */
+/**
+ * Why merge sort?
+ * 1) Merge sort can take advantage of eliminating duplicates.
+ * 2) Merge sort is more efficiently parallelizable than qsort
+ */
+
+/**
+ * Out of place merge sort with duplicate elimination
+ * @ret number of unique elements
+ */
+HYPRE_Int hypre_merge_sort_unique(HYPRE_Int *in, HYPRE_Int *out, HYPRE_Int len);
+/**
+ * Out of place merge sort with duplicate elimination
+ *
+ * @param out pointer to output can be in or temp
+ * @ret number of unique elements
+ */
+HYPRE_Int hypre_merge_sort_unique2(HYPRE_Int *in, HYPRE_Int *temp, HYPRE_Int len, HYPRE_Int **out);
+
+void hypre_merge_sort(HYPRE_Int *in, HYPRE_Int *temp, HYPRE_Int len, HYPRE_Int **sorted);
+
+/* hypre_hopscotch_hash.c */
+
+#ifdef HYPRE_USING_OPENMP
+
+/* Check if atomic operations are available to use concurrent hopscotch hash table */
+#if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__) && (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) > 40100
+#define HYPRE_USING_ATOMIC 
+//#elif defined _MSC_VER // JSP: haven't tested, so comment out for now
+//#define HYPRE_USING_ATOMIC
+//#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_ATOMICS__)
+// JSP: not many compilers have implemented this, so comment out for now
+//#define HYPRE_USING_ATOMIC
+//#include <stdatomic.h>
+#endif
+
+#endif // HYPRE_USING_OPENMP
+
+#ifdef HYPRE_HOPSCOTCH
+#ifdef HYPRE_USING_ATOMIC
+// concurrent hopscotch hashing is possible only with atomic supports
+#define HYPRE_CONCURRENT_HOPSCOTCH 
+#endif 
+#endif 
+
+#ifdef HYPRE_CONCURRENT_HOPSCOTCH
+typedef struct {
+  HYPRE_Int volatile timestamp;
+  omp_lock_t         lock;
+} hypre_HopscotchSegment;
+#endif
+
+/**
+ * The current typical use case of unordered set is putting input sequence
+ * with lots of duplication (putting all colidx received from other ranks),
+ * followed by one sweep of enumeration.
+ * Since the capacity is set to the number of inputs, which is much larger
+ * than the number of unique elements, we optimize for initialization and
+ * enumeration whose time is proportional to the capacity.
+ * For initialization and enumeration, structure of array (SoA) is better
+ * for vectorization, cache line utilization, and so on.
+ */
+typedef struct
+{
+	HYPRE_Int  volatile              segmentMask;
+	HYPRE_Int  volatile              bucketMask;
+#ifdef HYPRE_CONCURRENT_HOPSCOTCH
+	hypre_HopscotchSegment* volatile segments;
+#endif
+  HYPRE_Int *volatile              key;
+  hypre_uint *volatile             hopInfo;
+	HYPRE_Int *volatile	             hash;
+} hypre_UnorderedIntSet;
+
+typedef struct
+{
+  hypre_uint volatile hopInfo;
+  HYPRE_Int  volatile hash;
+  HYPRE_Int  volatile key;
+  HYPRE_Int  volatile data;
+} hypre_HopscotchBucket;
+
+/**
+ * The current typical use case of unoredered map is putting input sequence
+ * with no duplication (inverse map of a bijective mapping) followed by
+ * lots of lookups.
+ * For lookup, array of structure (AoS) gives better cache line utilization.
+ */
+typedef struct
+{
+	HYPRE_Int  volatile              segmentMask;
+	HYPRE_Int  volatile              bucketMask;
+#ifdef HYPRE_CONCURRENT_HOPSCOTCH
+	hypre_HopscotchSegment*	volatile segments;
+#endif
+	hypre_HopscotchBucket* volatile	 table;
+} hypre_UnorderedIntMap;
+
+/**
+ * Sort array "in" with length len and put result in array "out"
+ * "in" will be deallocated unless in == *out
+ * inverse_map is an inverse hash table s.t. inverse_map[i] = j iff (*out)[j] = i
+ */
+void hypre_sort_and_create_inverse_map(
+  HYPRE_Int *in, HYPRE_Int len, HYPRE_Int **out, hypre_UnorderedIntMap *inverse_map);
diff -Nru hypre-2.11.2/src/utilities/random.c hypre-2.13.0/src/utilities/random.c
--- hypre-2.11.2/src/utilities/random.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/utilities/random.c	2017-10-20 17:42:22.000000000 +0000
@@ -38,25 +38,39 @@
 
 #include "_hypre_utilities.h"
 
-/*--------------------------------------------------------------------------
- * Static variables
- *--------------------------------------------------------------------------*/
-
+#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_MANAGED)
+__managed__ __device__
+#endif
+
+/*-------------------------------------------------------------------------------
+ * Static global variable: Seed
+ * ``... all initial seeds between 1 and 2147483646 (2^31-2) are equally valid''
+ *-------------------------------------------------------------------------------*/
 static HYPRE_Int Seed = 13579;
 
-#define a  16807
-#define m  2147483647
-#define q  127773
-#define r  2836
+#define a  16807      /* 7^5 */
+#define m  2147483647 /* 2*31 - 1 */
+#define q  127773     /* m div a */
+#define r  2836       /* m mod a */
 
 /*--------------------------------------------------------------------------
  * Initializes the pseudo-random number generator to a place in the sequence.
  *
  * @param seed an HYPRE_Int containing the seed for the RNG.
  *--------------------------------------------------------------------------*/
-
+HYPRE_CUDA_GLOBAL
 void  hypre_SeedRand( HYPRE_Int seed )
 {
+   /* RL: seed must be between 1 and 2^31-2 */
+   if (seed < 1) 
+   {
+      seed = 1;
+   }
+   else if (seed >= m)
+   {
+     seed = m - 1;
+   }
+
    Seed = seed;
 }
 
@@ -64,11 +78,10 @@
  * Computes the next pseudo-random number in the sequence using the global
  * variable Seed.
  *
- * @return a HYPRE_Real containing the next number in the sequence divided by
- * 2147483647 so that the numbers are in (0, 1].
+ * @return a HYPRE_Int between (0, 2147483647]
  *--------------------------------------------------------------------------*/
-
-HYPRE_Real  hypre_Rand()
+HYPRE_CUDA_GLOBAL
+HYPRE_Int  hypre_RandI()
 {
    HYPRE_Int  low, high, test;
 
@@ -84,5 +97,19 @@
       Seed = test + m;
    }
 
-   return ((HYPRE_Real)(Seed) / m);
+   return Seed;
 }
+
+/*--------------------------------------------------------------------------
+ * Computes the next pseudo-random number in the sequence using the global
+ * variable Seed.
+ *
+ * @return a HYPRE_Real containing the next number in the sequence divided by
+ * 2147483647 so that the numbers are in (0, 1].
+ *--------------------------------------------------------------------------*/
+HYPRE_CUDA_GLOBAL
+HYPRE_Real  hypre_Rand()
+{
+   return ((HYPRE_Real)(hypre_RandI()) / m);
+}
+
diff -Nru hypre-2.11.2/src/utilities/threading.h hypre-2.13.0/src/utilities/threading.h
--- hypre-2.11.2/src/utilities/threading.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/utilities/threading.h	2017-10-20 17:42:22.000000000 +0000
@@ -30,3 +30,4 @@
 void hypre_GetSimpleThreadPartition( HYPRE_Int *begin, HYPRE_Int *end, HYPRE_Int n );
 
 #endif
+
diff -Nru hypre-2.11.2/src/utilities/timing.c hypre-2.13.0/src/utilities/timing.c
--- hypre-2.11.2/src/utilities/timing.c	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/utilities/timing.c	2017-10-20 17:42:22.000000000 +0000
@@ -328,9 +328,9 @@
          local_wall_time = hypre_TimingWallTime(i);
          local_cpu_time  = hypre_TimingCPUTime(i);
          hypre_MPI_Allreduce(&local_wall_time, &wall_time, 1,
-                       hypre_MPI_DOUBLE, hypre_MPI_MAX, comm);
+                       hypre_MPI_REAL, hypre_MPI_MAX, comm);
          hypre_MPI_Allreduce(&local_cpu_time, &cpu_time, 1,
-                       hypre_MPI_DOUBLE, hypre_MPI_MAX, comm);
+                       hypre_MPI_REAL, hypre_MPI_MAX, comm);
 
          if (myrank == 0)
          {
diff -Nru hypre-2.11.2/src/utilities/timing.h hypre-2.13.0/src/utilities/timing.h
--- hypre-2.11.2/src/utilities/timing.h	2017-03-13 19:37:24.000000000 +0000
+++ hypre-2.13.0/src/utilities/timing.h	2017-10-20 17:42:22.000000000 +0000
@@ -48,8 +48,8 @@
 #define hypre_IncFLOPCount(inc)
 #define hypre_BeginTiming(i)
 #define hypre_EndTiming(i)
-#define hypre_ClearTiming()
 #define hypre_PrintTiming(heading, comm)
+#define hypre_ClearTiming()
 
 /*--------------------------------------------------------------------------
  * With timing on
@@ -119,3 +119,4 @@
 #endif
 
 #endif
+